mirror of
https://github.com/vale981/ray
synced 2025-03-04 17:41:43 -05:00
[docs] Editing pass on clusters docs, removing legacy material and fixing style issues (#27816)
This commit is contained in:
parent
9a0c1f5e0a
commit
52f7b89865
127 changed files with 581 additions and 8136 deletions
27
doc/BUILD
27
doc/BUILD
|
@ -9,33 +9,6 @@ exports_files(["test_myst_doc.py"])
|
|||
# root directory.
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
# Support for Dask has been dropped in 3.6.
|
||||
py_test(
|
||||
name = "dask_xgboost",
|
||||
size = "medium",
|
||||
main = "test_myst_doc.py",
|
||||
srcs = ["test_myst_doc.py"],
|
||||
tags = ["exclusive", "team:ml", "py37"],
|
||||
data = ["//doc/source/ray-core/examples:core_examples"],
|
||||
args = ["--path", "doc/source/ray-core/examples/dask_xgboost/dask_xgboost.ipynb",
|
||||
"--smoke-test", "--address ''", "--num-actors 4", "--cpus-per-actor 1", "--num-actors-inference 4",
|
||||
"--cpus-per-actor-inference 1"]
|
||||
)
|
||||
|
||||
# Support for Modin has been dropped in 3.6.
|
||||
py_test(
|
||||
name = "modin_xgboost",
|
||||
size = "medium",
|
||||
main = "test_myst_doc.py",
|
||||
srcs = ["test_myst_doc.py"],
|
||||
tags = ["exclusive", "team:ml", "py37"],
|
||||
data = ["//doc/source/ray-core/examples:core_examples"],
|
||||
args = ["--path", "doc/source/ray-core/examples/modin_xgboost/modin_xgboost.ipynb",
|
||||
"--smoke-test", "--address ''", "--num-actors 4",
|
||||
"--cpus-per-actor 1", "--num-actors-inference 4",
|
||||
"--cpus-per-actor-inference 1"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "highly_parallel",
|
||||
size = "medium",
|
||||
|
|
|
@ -1,8 +0,0 @@
|
|||
.. Comment this out for now.
|
||||
|
||||
..
|
||||
.. admonition:: We're hiring!
|
||||
|
||||
`Anyscale Inc. <https://anyscale.com>`__, the company behind Ray, is hiring interns and full-time **software engineers** to help advance and maintain Ray autoscaler, cluster launcher, cloud providers, the Kubernetes operator, and Ray Client.
|
||||
If you have a background in distributed computing/cluster orchestration/Kubernetes and are interested in making Ray **the** industry-leading open-source platform for distributed computing, `apply here today <https://jobs.lever.co/anyscale/814c0d0e-08f5-419a-bdd8-0819b8b8df24>`__.
|
||||
We'd be thrilled to welcome you on the team!
|
|
@ -217,10 +217,7 @@ parts:
|
|||
- file: ray-more-libs/joblib
|
||||
- file: ray-more-libs/multiprocessing
|
||||
- file: ray-more-libs/ray-collective
|
||||
- file: ray-more-libs/ray-lightning
|
||||
- file: ray-core/examples/using-ray-with-pytorch-lightning
|
||||
- file: ray-core/examples/dask_xgboost/dask_xgboost
|
||||
- file: ray-core/examples/modin_xgboost/modin_xgboost
|
||||
- file: workflows/concepts
|
||||
title: Ray Workflows
|
||||
sections:
|
||||
|
@ -258,23 +255,22 @@ parts:
|
|||
- file: cluster/key-concepts
|
||||
title: Key Concepts
|
||||
- file: cluster/kubernetes/index
|
||||
title: Deploying a Ray Cluster on Kubernetes
|
||||
title: Deploying on Kubernetes
|
||||
sections:
|
||||
- file: cluster/kubernetes/getting-started
|
||||
- file: cluster/kubernetes/user-guides
|
||||
sections:
|
||||
- file: cluster/kubernetes/user-guides/k8s-cluster-setup.md
|
||||
- file: cluster/kubernetes/user-guides/config.md
|
||||
- file: cluster/kubernetes/user-guides/autoscaling.md
|
||||
- file: cluster/kubernetes/user-guides/configuring-autoscaling.md
|
||||
- file: cluster/kubernetes/user-guides/logging.md
|
||||
- file: cluster/kubernetes/user-guides/gpu.md
|
||||
- file: cluster/kubernetes/user-guides/kuberay-vs-legacy.md
|
||||
- file: cluster/kubernetes/examples
|
||||
sections:
|
||||
- file: cluster/kubernetes/examples/ml-example.md
|
||||
- file: cluster/kubernetes/references
|
||||
- file: cluster/vms/index
|
||||
title: Deploying a Ray Cluster on VMs
|
||||
title: Deploying on VMs
|
||||
sections:
|
||||
- file: cluster/vms/getting-started
|
||||
- file: cluster/vms/user-guides/index
|
||||
|
@ -296,25 +292,13 @@ parts:
|
|||
- file: cluster/vms/examples/ml-example
|
||||
- file: cluster/vms/references/index
|
||||
- file: cluster/running-applications/index
|
||||
title: Running Applications on Ray Clusters
|
||||
- file: cluster/references/index
|
||||
title: References
|
||||
title: Applications Guide
|
||||
|
||||
- file: cluster-deprecated/index
|
||||
title: Deprecated Ray Clusters Docs
|
||||
sections:
|
||||
- file: cluster-deprecated/key-concepts
|
||||
- file: cluster-deprecated/cloud
|
||||
- file: cluster-deprecated/quickstart
|
||||
- file: cluster-deprecated/usage-stats
|
||||
- file: cluster-deprecated/user-guide
|
||||
- file: cluster-deprecated/cluster_under_construction.md
|
||||
- file: cluster-deprecated/deploy
|
||||
- file: cluster-deprecated/api
|
||||
|
||||
- caption: References
|
||||
chapters:
|
||||
- file: ray-references/api
|
||||
- file: cluster/usage-stats
|
||||
|
||||
- caption: Developer Guides
|
||||
chapters:
|
||||
|
|
|
@ -1,11 +0,0 @@
|
|||
.. _ref-cluster-api:
|
||||
|
||||
Ray Cluster API
|
||||
===============
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Ray Cluster API References
|
||||
|
||||
../cluster-deprecated/reference.rst
|
||||
../cluster-deprecated/jobs-package-ref.rst
|
|
@ -1,316 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _aws-cluster:
|
||||
|
||||
AWS Configurations
|
||||
-------------------
|
||||
|
||||
.. _aws-cluster-efs:
|
||||
|
||||
Using Amazon EFS
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
To use Amazon EFS, install some utilities and mount the EFS in ``setup_commands``. Note that these instructions only work if you are using the AWS Autoscaler.
|
||||
|
||||
.. note::
|
||||
|
||||
You need to replace the ``{{FileSystemId}}`` to your own EFS ID before using the config. You may also need to set correct ``SecurityGroupIds`` for the instances in the config file.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
setup_commands:
|
||||
- sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1`;
|
||||
sudo pkill -9 apt-get;
|
||||
sudo pkill -9 dpkg;
|
||||
sudo dpkg --configure -a;
|
||||
sudo apt-get -y install binutils;
|
||||
cd $HOME;
|
||||
git clone https://github.com/aws/efs-utils;
|
||||
cd $HOME/efs-utils;
|
||||
./build-deb.sh;
|
||||
sudo apt-get -y install ./build/amazon-efs-utils*deb;
|
||||
cd $HOME;
|
||||
mkdir efs;
|
||||
sudo mount -t efs {{FileSystemId}}:/ efs;
|
||||
sudo chmod 777 efs;
|
||||
|
||||
.. _aws-cluster-s3:
|
||||
|
||||
Configure worker nodes to access Amazon S3
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
In various scenarios, worker nodes may need write access to the S3 bucket.
|
||||
E.g. Ray Tune has the option that worker nodes write distributed checkpoints to S3 instead of syncing back to the driver using rsync.
|
||||
|
||||
If you see errors like "Unable to locate credentials", make sure that the correct ``IamInstanceProfile`` is configured for worker nodes in ``cluster.yaml`` file.
|
||||
This may look like:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
worker_nodes:
|
||||
InstanceType: m5.xlarge
|
||||
ImageId: latest_dlami
|
||||
IamInstanceProfile:
|
||||
Arn: arn:aws:iam::YOUR_AWS_ACCOUNT:YOUR_INSTANCE_PROFILE
|
||||
|
||||
You can verify if the set up is correct by entering one worker node and do
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
aws configure list
|
||||
|
||||
You should see something like
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
Name Value Type Location
|
||||
---- ----- ---- --------
|
||||
profile <not set> None None
|
||||
access_key ****************XXXX iam-role
|
||||
secret_key ****************YYYY iam-role
|
||||
region <not set> None None
|
||||
|
||||
Please refer to `this discussion <https://github.com/ray-project/ray/issues/9327>`__ for more details.
|
||||
|
||||
|
||||
.. _aws-cluster-cloudwatch:
|
||||
|
||||
Using Amazon CloudWatch
|
||||
-----------------------
|
||||
|
||||
Amazon CloudWatch is a monitoring and observability service that provides data and actionable insights to monitor your applications, respond to system-wide performance changes, and optimize resource utilization.
|
||||
CloudWatch integration with Ray requires an AMI (or Docker image) with the Unified CloudWatch Agent pre-installed.
|
||||
|
||||
AMIs with the Unified CloudWatch Agent pre-installed are provided by the Amazon Ray Team, and are currently available in the us-east-1, us-east-2, us-west-1, and us-west-2 regions.
|
||||
Please direct any questions, comments, or issues to the `Amazon Ray Team <https://github.com/amzn/amazon-ray/issues/new/choose>`_.
|
||||
|
||||
The table below lists AMIs with the Unified CloudWatch Agent pre-installed in each region, and you can also find AMIs at `amazon-ray README <https://github.com/amzn/amazon-ray>`_.
|
||||
|
||||
.. list-table:: All available unified CloudWatch agent images
|
||||
|
||||
* - Base AMI
|
||||
- AMI ID
|
||||
- Region
|
||||
- Unified CloudWatch Agent Version
|
||||
* - AWS Deep Learning AMI (Ubuntu 18.04, 64-bit)
|
||||
- ami-069f2811478f86c20
|
||||
- us-east-1
|
||||
- v1.247348.0b251302
|
||||
* - AWS Deep Learning AMI (Ubuntu 18.04, 64-bit)
|
||||
- ami-058cc0932940c2b8b
|
||||
- us-east-2
|
||||
- v1.247348.0b251302
|
||||
* - AWS Deep Learning AMI (Ubuntu 18.04, 64-bit)
|
||||
- ami-044f95c9ef12883ef
|
||||
- us-west-1
|
||||
- v1.247348.0b251302
|
||||
* - AWS Deep Learning AMI (Ubuntu 18.04, 64-bit)
|
||||
- ami-0d88d9cbe28fac870
|
||||
- us-west-2
|
||||
- v1.247348.0b251302
|
||||
|
||||
.. note::
|
||||
|
||||
Using Amazon CloudWatch will incur charges, please refer to `CloudWatch pricing <https://aws.amazon.com/cloudwatch/pricing/>`_ for details.
|
||||
|
||||
Getting started
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
1. Create a minimal cluster config YAML named ``cloudwatch-basic.yaml`` with the following contents:
|
||||
====================================================================================================
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
availability_zone: us-west-2a
|
||||
# Start by defining a `cloudwatch` section to enable CloudWatch integration with your Ray cluster.
|
||||
cloudwatch:
|
||||
agent:
|
||||
# Path to Unified CloudWatch Agent config file
|
||||
config: "cloudwatch/example-cloudwatch-agent-config.json"
|
||||
dashboard:
|
||||
# CloudWatch Dashboard name
|
||||
name: "example-dashboard-name"
|
||||
# Path to the CloudWatch Dashboard config file
|
||||
config: "cloudwatch/example-cloudwatch-dashboard-config.json"
|
||||
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
available_node_types:
|
||||
ray.head.default:
|
||||
node_config:
|
||||
InstanceType: c5a.large
|
||||
ImageId: ami-0d88d9cbe28fac870 # Unified CloudWatch agent pre-installed AMI, us-west-2
|
||||
resources: {}
|
||||
ray.worker.default:
|
||||
node_config:
|
||||
InstanceType: c5a.large
|
||||
ImageId: ami-0d88d9cbe28fac870 # Unified CloudWatch agent pre-installed AMI, us-west-2
|
||||
IamInstanceProfile:
|
||||
Name: ray-autoscaler-cloudwatch-v1
|
||||
resources: {}
|
||||
min_workers: 0
|
||||
|
||||
2. Download CloudWatch Agent and Dashboard config.
|
||||
==================================================
|
||||
|
||||
First, create a ``cloudwatch`` directory in the same directory as ``cloudwatch-basic.yaml``.
|
||||
Then, download the example `CloudWatch Agent <https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/aws/cloudwatch/example-cloudwatch-agent-config.json>`_ and `CloudWatch Dashboard <https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/aws/cloudwatch/example-cloudwatch-dashboard-config.json>`_ config files to the ``cloudwatch`` directory.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ mkdir cloudwatch
|
||||
$ cd cloudwatch
|
||||
$ wget https://raw.githubusercontent.com/ray-project/ray/master/python/ray/autoscaler/aws/cloudwatch/example-cloudwatch-agent-config.json
|
||||
$ wget https://raw.githubusercontent.com/ray-project/ray/master/python/ray/autoscaler/aws/cloudwatch/example-cloudwatch-dashboard-config.json
|
||||
|
||||
3. Run ``ray up cloudwatch-basic.yaml`` to start your Ray Cluster.
|
||||
==================================================================
|
||||
|
||||
This will launch your Ray cluster in ``us-west-2`` by default. When launching a cluster for a different region, you'll need to change your cluster config YAML file's ``region`` AND ``ImageId``.
|
||||
See the "Unified CloudWatch Agent Images" table above for available AMIs by region.
|
||||
|
||||
4. Check out your Ray cluster's logs, metrics, and dashboard in the `CloudWatch Console <https://console.aws.amazon.com/cloudwatch/>`_!
|
||||
=======================================================================================================================================
|
||||
|
||||
A tail can be acquired on all logs written to a CloudWatch log group by ensuring that you have the `AWS CLI V2+ installed <https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html>`_ and then running:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
aws logs tail $log_group_name --follow
|
||||
|
||||
Advanced Setup
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
Refer to `example-cloudwatch.yaml <https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/aws/example-cloudwatch.yaml>`_ for a complete example.
|
||||
|
||||
1. Choose an AMI with the Unified CloudWatch Agent pre-installed.
|
||||
=================================================================
|
||||
|
||||
Ensure that you're launching your Ray EC2 cluster in the same region as the AMI,
|
||||
then specify the ``ImageId`` to use with your cluster's head and worker nodes in your cluster config YAML file.
|
||||
|
||||
The following CLI command returns the latest available Unified CloudWatch Agent Image for ``us-west-2``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
aws ec2 describe-images --region us-west-2 --filters "Name=owner-id,Values=160082703681" "Name=name,Values=*cloudwatch*" --query 'Images[*].[ImageId,CreationDate]' --output text | sort -k2 -r | head -n1
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
available_node_types:
|
||||
ray.head.default:
|
||||
node_config:
|
||||
InstanceType: c5a.large
|
||||
ImageId: ami-0d88d9cbe28fac870
|
||||
ray.worker.default:
|
||||
node_config:
|
||||
InstanceType: c5a.large
|
||||
ImageId: ami-0d88d9cbe28fac870
|
||||
|
||||
To build your own AMI with the Unified CloudWatch Agent installed:
|
||||
|
||||
1. Follow the `CloudWatch Agent Installation <https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/install-CloudWatch-Agent-on-EC2-Instance.html>`_ user guide to install the Unified CloudWatch Agent on an EC2 instance.
|
||||
2. Follow the `EC2 AMI Creation <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html#creating-an-ami>`_ user guide to create an AMI from this EC2 instance.
|
||||
|
||||
2. Define your own CloudWatch Agent, Dashboard, and Alarm JSON config files.
|
||||
============================================================================
|
||||
|
||||
You can start by using the example `CloudWatch Agent <https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/aws/cloudwatch/example-cloudwatch-agent-config.json>`_, `CloudWatch Dashboard <https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/aws/cloudwatch/example-cloudwatch-dashboard-config.json>`_ and `CloudWatch Alarm <https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/aws/cloudwatch/example-cloudwatch-alarm-config.json>`_ config files.
|
||||
|
||||
These example config files include the following features:
|
||||
|
||||
**Logs and Metrics**: Logs written to ``/tmp/ray/session_*/logs/**.out`` will be available in the ``{cluster_name}-ray_logs_out`` log group,
|
||||
and logs written to ``/tmp/ray/session_*/logs/**.err`` will be available in the ``{cluster_name}-ray_logs_err`` log group.
|
||||
Log streams are named after the EC2 instance ID that emitted their logs.
|
||||
Extended EC2 metrics including CPU/Disk/Memory usage and process statistics can be found in the ``{cluster_name}-ray-CWAgent`` metric namespace.
|
||||
|
||||
**Dashboard**: You will have a cluster-level dashboard showing total cluster CPUs and available object store memory.
|
||||
Process counts, disk usage, memory usage, and CPU utilization will be displayed as both cluster-level sums and single-node maximums/averages.
|
||||
|
||||
**Alarms**: Node-level alarms tracking prolonged high memory, disk, and CPU usage are configured. Alarm actions are NOT set,
|
||||
and must be manually provided in your alarm config file.
|
||||
|
||||
For more advanced options, see the `Agent <https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html>`_, `Dashboard <https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/CloudWatch-Dashboard-Body-Structure.html>`_ and `Alarm <https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_PutMetricAlarm.html>`_ config user guides.
|
||||
|
||||
CloudWatch Agent, Dashboard, and Alarm JSON config files support the following variables:
|
||||
|
||||
``{instance_id}``: Replaced with each EC2 instance ID in your Ray cluster.
|
||||
|
||||
``{region}``: Replaced with your Ray cluster's region.
|
||||
|
||||
``{cluster_name}``: Replaced with your Ray cluster name.
|
||||
|
||||
See CloudWatch Agent `Configuration File Details <https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html>`_ for additional variables supported natively by the Unified CloudWatch Agent.
|
||||
|
||||
.. note::
|
||||
Remember to replace the ``AlarmActions`` placeholder in your CloudWatch Alarm config file!
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
"AlarmActions":[
|
||||
"TODO: Add alarm actions! See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html"
|
||||
]
|
||||
|
||||
3. Reference your CloudWatch JSON config files in your cluster config YAML.
|
||||
===========================================================================
|
||||
|
||||
Specify the file path to your CloudWatch JSON config files relative to the working directory that you will run ``ray up`` from:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
provider:
|
||||
cloudwatch:
|
||||
agent:
|
||||
config: "cloudwatch/example-cloudwatch-agent-config.json"
|
||||
|
||||
|
||||
4. Set your IAM Role and EC2 Instance Profile.
|
||||
==============================================
|
||||
|
||||
By default the ``ray-autoscaler-cloudwatch-v1`` IAM role and EC2 instance profile is created at Ray cluster launch time.
|
||||
This role contains all additional permissions required to integrate CloudWatch with Ray, namely the ``CloudWatchAgentAdminPolicy``, ``AmazonSSMManagedInstanceCore``, ``ssm:SendCommand``, ``ssm:ListCommandInvocations``, and ``iam:PassRole`` managed policies.
|
||||
|
||||
Ensure that all worker nodes are configured to use the ``ray-autoscaler-cloudwatch-v1`` EC2 instance profile in your cluster config YAML:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
ray.worker.default:
|
||||
node_config:
|
||||
InstanceType: c5a.large
|
||||
IamInstanceProfile:
|
||||
Name: ray-autoscaler-cloudwatch-v1
|
||||
|
||||
5. Export Ray system metrics to CloudWatch.
|
||||
===========================================
|
||||
|
||||
To export Ray's Prometheus system metrics to CloudWatch, first ensure that your cluster has the
|
||||
Ray Dashboard installed, then uncomment the ``head_setup_commands`` section in `example-cloudwatch.yaml file <https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/aws/example-cloudwatch.yaml>`_ file.
|
||||
You can find Ray Prometheus metrics in the ``{cluster_name}-ray-prometheus`` metric namespace.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
head_setup_commands:
|
||||
# Make `ray_prometheus_waiter.sh` executable.
|
||||
- >-
|
||||
RAY_INSTALL_DIR=`pip show ray | grep -Po "(?<=Location:).*"`
|
||||
&& sudo chmod +x $RAY_INSTALL_DIR/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh
|
||||
# Copy `prometheus.yml` to Unified CloudWatch Agent folder
|
||||
- >-
|
||||
RAY_INSTALL_DIR=`pip show ray | grep -Po "(?<=Location:).*"`
|
||||
&& sudo cp -f $RAY_INSTALL_DIR/ray/autoscaler/aws/cloudwatch/prometheus.yml /opt/aws/amazon-cloudwatch-agent/etc
|
||||
# First get current cluster name, then let the Unified CloudWatch Agent restart and use `AmazonCloudWatch-ray_agent_config_{cluster_name}` parameter at SSM Parameter Store.
|
||||
- >-
|
||||
nohup sudo sh -c "`pip show ray | grep -Po "(?<=Location:).*"`/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh
|
||||
`cat ~/ray_bootstrap_config.yaml | jq '.cluster_name'`
|
||||
>> '/opt/aws/amazon-cloudwatch-agent/logs/ray_prometheus_waiter.out' 2>> '/opt/aws/amazon-cloudwatch-agent/logs/ray_prometheus_waiter.err'" &
|
||||
|
||||
6. Update CloudWatch Agent, Dashboard and Alarm config files.
|
||||
=============================================================
|
||||
|
||||
You can apply changes to the CloudWatch Logs, Metrics, Dashboard, and Alarms for your cluster by simply modifying the CloudWatch config files referenced by your Ray cluster config YAML and re-running ``ray up example-cloudwatch.yaml``.
|
||||
The Unified CloudWatch Agent will be automatically restarted on all cluster nodes, and your config changes will be applied.
|
||||
|
|
@ -1,444 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _cluster-cloud:
|
||||
|
||||
Launching Cloud Clusters
|
||||
========================
|
||||
|
||||
This section provides instructions for configuring the Ray Cluster Launcher to use with various cloud providers or on a private cluster of host machines.
|
||||
|
||||
See this blog post for a `step by step guide`_ to using the Ray Cluster Launcher.
|
||||
|
||||
To learn about deploying Ray on an existing Kubernetes cluster, refer to the guide :ref:`here<kuberay-index>`.
|
||||
|
||||
.. _`step by step guide`: https://medium.com/distributed-computing-with-ray/a-step-by-step-guide-to-scaling-your-first-python-application-in-the-cloud-8761fe331ef1
|
||||
|
||||
.. _ref-cloud-setup:
|
||||
|
||||
Ray with cloud providers
|
||||
------------------------
|
||||
|
||||
.. toctree::
|
||||
:hidden:
|
||||
|
||||
/cluster-deprecated/aws-tips.rst
|
||||
|
||||
.. tabbed:: AWS
|
||||
|
||||
First, install boto (``pip install boto3``) and configure your AWS credentials in ``~/.aws/credentials``,
|
||||
as described in `the boto docs <http://boto3.readthedocs.io/en/latest/guide/configuration.html>`__.
|
||||
|
||||
Once boto is configured to manage resources on your AWS account, you should be ready to launch your cluster. The provided `ray/python/ray/autoscaler/aws/example-full.yaml <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/aws/example-full.yaml>`__ cluster config file will create a small cluster with an m5.large head node (on-demand) configured to autoscale up to two m5.large `spot workers <https://aws.amazon.com/ec2/spot/>`__.
|
||||
|
||||
Test that it works by running the following commands from your local machine:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Create or update the cluster. When the command finishes, it will print
|
||||
# out the command that can be used to SSH into the cluster head node.
|
||||
$ ray up ray/python/ray/autoscaler/aws/example-full.yaml
|
||||
|
||||
# Get a remote screen on the head node.
|
||||
$ ray attach ray/python/ray/autoscaler/aws/example-full.yaml
|
||||
$ # Try running a Ray program.
|
||||
|
||||
# Tear down the cluster.
|
||||
$ ray down ray/python/ray/autoscaler/aws/example-full.yaml
|
||||
|
||||
|
||||
AWS Node Provider Maintainers (GitHub handles): pdames, Zyiqin-Miranda, DmitriGekhtman, wuisawesome
|
||||
|
||||
See :ref:`aws-cluster` for recipes on customizing AWS clusters.
|
||||
.. tabbed:: Azure
|
||||
|
||||
First, install the Azure CLI (``pip install azure-cli azure-identity``) then login using (``az login``).
|
||||
|
||||
Set the subscription to use from the command line (``az account set -s <subscription_id>``) or by modifying the provider section of the config provided e.g: `ray/python/ray/autoscaler/azure/example-full.yaml`
|
||||
|
||||
Once the Azure CLI is configured to manage resources on your Azure account, you should be ready to launch your cluster. The provided `ray/python/ray/autoscaler/azure/example-full.yaml <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/azure/example-full.yaml>`__ cluster config file will create a small cluster with a Standard DS2v3 head node (on-demand) configured to autoscale up to two Standard DS2v3 `spot workers <https://docs.microsoft.com/en-us/azure/virtual-machines/windows/spot-vms>`__. Note that you'll need to fill in your resource group and location in those templates.
|
||||
|
||||
Test that it works by running the following commands from your local machine:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Create or update the cluster. When the command finishes, it will print
|
||||
# out the command that can be used to SSH into the cluster head node.
|
||||
$ ray up ray/python/ray/autoscaler/azure/example-full.yaml
|
||||
|
||||
# Get a remote screen on the head node.
|
||||
$ ray attach ray/python/ray/autoscaler/azure/example-full.yaml
|
||||
# test ray setup
|
||||
$ python -c 'import ray; ray.init()'
|
||||
$ exit
|
||||
# Tear down the cluster.
|
||||
$ ray down ray/python/ray/autoscaler/azure/example-full.yaml
|
||||
|
||||
**Azure Portal**:
|
||||
Alternatively, you can deploy a cluster using Azure portal directly. Please note that autoscaling is done using Azure VM Scale Sets and not through
|
||||
the Ray autoscaler. This will deploy `Azure Data Science VMs (DSVM) <https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/>`_
|
||||
for both the head node and the auto-scalable cluster managed by `Azure Virtual Machine Scale Sets <https://azure.microsoft.com/en-us/services/virtual-machine-scale-sets/>`_.
|
||||
The head node conveniently exposes both SSH as well as JupyterLab.
|
||||
|
||||
.. image:: https://aka.ms/deploytoazurebutton
|
||||
:target: https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fray-project%2Fray%2Fmaster%2Fdoc%2Fazure%2Fazure-ray-template.json
|
||||
:alt: Deploy to Azure
|
||||
|
||||
Once the template is successfully deployed the deployment Outputs page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
|
||||
Use the following code in a Jupyter notebook (using the conda environment specified in the template input, py38_tensorflow by default) to connect to the Ray cluster.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import ray
|
||||
ray.init()
|
||||
|
||||
Note that on each node the `azure-init.sh <https://github.com/ray-project/ray/blob/master/doc/azure/azure-init.sh>`_ script is executed and performs the following actions:
|
||||
|
||||
1. Activates one of the conda environments available on DSVM
|
||||
2. Installs Ray and any other user-specified dependencies
|
||||
3. Sets up a systemd task (``/lib/systemd/system/ray.service``) to start Ray in head or worker mode
|
||||
|
||||
|
||||
Azure Node Provider Maintainers (GitHub handles): gramhagen, eisber, ijrsvt
|
||||
.. note:: The Azure Node Provider is community-maintained. It is maintained by its authors, not the Ray team.
|
||||
|
||||
.. tabbed:: GCP
|
||||
|
||||
First, install the Google API client (``pip install google-api-python-client``), set up your GCP credentials, and create a new GCP project.
|
||||
|
||||
Once the API client is configured to manage resources on your GCP account, you should be ready to launch your cluster. The provided `ray/python/ray/autoscaler/gcp/example-full.yaml <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/gcp/example-full.yaml>`__ cluster config file will create a small cluster with a n1-standard-2 head node (on-demand) configured to autoscale up to two n1-standard-2 `preemptible workers <https://cloud.google.com/preemptible-vms/>`__. Note that you'll need to fill in your project id in those templates.
|
||||
|
||||
Test that it works by running the following commands from your local machine:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Create or update the cluster. When the command finishes, it will print
|
||||
# out the command that can be used to SSH into the cluster head node.
|
||||
$ ray up ray/python/ray/autoscaler/gcp/example-full.yaml
|
||||
|
||||
# Get a remote screen on the head node.
|
||||
$ ray attach ray/python/ray/autoscaler/gcp/example-full.yaml
|
||||
$ # Try running a Ray program with 'ray.init()'.
|
||||
|
||||
# Tear down the cluster.
|
||||
$ ray down ray/python/ray/autoscaler/gcp/example-full.yaml
|
||||
|
||||
GCP Node Provider Maintainers (GitHub handles): wuisawesome, DmitriGekhtman, ijrsvt
|
||||
|
||||
.. tabbed:: Aliyun
|
||||
|
||||
First, install the aliyun client package (``pip install aliyun-python-sdk-core aliyun-python-sdk-ecs``). Obtain the AccessKey pair of the Aliyun account as described in `the docs <https://www.alibabacloud.com/help/en/doc-detail/175967.htm>`__ and grant AliyunECSFullAccess/AliyunVPCFullAccess permissions to the RAM user. Finally, set the AccessKey pair in your cluster config file.
|
||||
|
||||
Once the above is done, you should be ready to launch your cluster. The provided `aliyun/example-full.yaml </ray/python/ray/autoscaler/aliyun/example-full.yaml>`__ cluster config file will create a small cluster with an ``ecs.n4.large`` head node (on-demand) configured to autoscale up to two ``ecs.n4.2xlarge`` nodes.
|
||||
|
||||
Make sure your account balance is not less than 100 RMB, otherwise you will receive a `InvalidAccountStatus.NotEnoughBalance` error.
|
||||
|
||||
Test that it works by running the following commands from your local machine:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Create or update the cluster. When the command finishes, it will print
|
||||
# out the command that can be used to SSH into the cluster head node.
|
||||
$ ray up ray/python/ray/autoscaler/aliyun/example-full.yaml
|
||||
|
||||
# Get a remote screen on the head node.
|
||||
$ ray attach ray/python/ray/autoscaler/aliyun/example-full.yaml
|
||||
$ # Try running a Ray program with 'ray.init()'.
|
||||
|
||||
# Tear down the cluster.
|
||||
$ ray down ray/python/ray/autoscaler/aliyun/example-full.yaml
|
||||
|
||||
Aliyun Node Provider Maintainers (GitHub handles): zhuangzhuang131419, chenk008
|
||||
|
||||
.. note:: The Aliyun Node Provider is community-maintained. It is maintained by its authors, not the Ray team.
|
||||
|
||||
|
||||
.. tabbed:: Custom
|
||||
|
||||
Ray also supports external node providers (check `node_provider.py <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/node_provider.py>`__ implementation).
|
||||
You can specify the external node provider using the yaml config:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
provider:
|
||||
type: external
|
||||
module: mypackage.myclass
|
||||
|
||||
The module needs to be in the format ``package.provider_class`` or ``package.sub_package.provider_class``.
|
||||
|
||||
|
||||
.. _cluster-private-setup:
|
||||
|
||||
Local On Premise Cluster (List of nodes)
|
||||
----------------------------------------
|
||||
You would use this mode if you want to run distributed Ray applications on some local nodes available on premise.
|
||||
|
||||
The most preferable way to run a Ray cluster on a private cluster of hosts is via the Ray Cluster Launcher.
|
||||
|
||||
There are two ways of running private clusters:
|
||||
|
||||
- Manually managed, i.e., the user explicitly specifies the head and worker ips.
|
||||
|
||||
- Automatically managed, i.e., the user only specifies a coordinator address to a coordinating server that automatically coordinates its head and worker ips.
|
||||
|
||||
.. tip:: To avoid getting the password prompt when running private clusters make sure to setup your ssh keys on the private cluster as follows:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ssh-keygen
|
||||
$ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
|
||||
|
||||
.. tabbed:: Manually Managed
|
||||
|
||||
|
||||
You can get started by filling out the fields in the provided `ray/python/ray/autoscaler/local/example-full.yaml <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/local/example-full.yaml>`__.
|
||||
Be sure to specify the proper ``head_ip``, list of ``worker_ips``, and the ``ssh_user`` field.
|
||||
|
||||
Test that it works by running the following commands from your local machine:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Create or update the cluster. When the command finishes, it will print
|
||||
# out the command that can be used to get a remote shell into the head node.
|
||||
$ ray up ray/python/ray/autoscaler/local/example-full.yaml
|
||||
|
||||
# Get a remote screen on the head node.
|
||||
$ ray attach ray/python/ray/autoscaler/local/example-full.yaml
|
||||
$ # Try running a Ray program with 'ray.init()'.
|
||||
|
||||
# Tear down the cluster
|
||||
$ ray down ray/python/ray/autoscaler/local/example-full.yaml
|
||||
|
||||
.. tabbed:: Automatically Managed
|
||||
|
||||
|
||||
Start by launching the coordinator server that will manage all the on prem clusters. This server also makes sure to isolate the resources between different users. The script for running the coordinator server is `ray/python/ray/autoscaler/local/coordinator_server.py <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/local/coordinator_server.py>`__. To launch the coordinator server run:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ python coordinator_server.py --ips <list_of_node_ips> --port <PORT>
|
||||
|
||||
where ``list_of_node_ips`` is a comma separated list of all the available nodes on the private cluster. For example, ``160.24.42.48,160.24.42.49,...`` and ``<PORT>`` is the port that the coordinator server will listen on.
|
||||
After running the coordinator server it will print the address of the coordinator server. For example:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
>> INFO:ray.autoscaler.local.coordinator_server:Running on prem coordinator server
|
||||
on address <Host:PORT>
|
||||
|
||||
Next, the user only specifies the ``<Host:PORT>`` printed above in the ``coordinator_address`` entry instead of specific head/worker ips in the provided `ray/python/ray/autoscaler/local/example-full.yaml <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/local/example-full.yaml>`__.
|
||||
|
||||
Now we can test that it works by running the following commands from your local machine:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Create or update the cluster. When the command finishes, it will print
|
||||
# out the command that can be used to get a remote shell into the head node.
|
||||
$ ray up ray/python/ray/autoscaler/local/example-full.yaml
|
||||
|
||||
# Get a remote screen on the head node.
|
||||
$ ray attach ray/python/ray/autoscaler/local/example-full.yaml
|
||||
$ # Try running a Ray program with 'ray.init()'.
|
||||
|
||||
# Tear down the cluster
|
||||
$ ray down ray/python/ray/autoscaler/local/example-full.yaml
|
||||
|
||||
|
||||
.. _manual-cluster:
|
||||
|
||||
Manual Ray Cluster Setup
|
||||
------------------------
|
||||
|
||||
The most preferable way to run a Ray cluster is via the Ray Cluster Launcher. However, it is also possible to start a Ray cluster by hand.
|
||||
|
||||
This section assumes that you have a list of machines and that the nodes in the cluster can communicate with each other. It also assumes that Ray is installed
|
||||
on each machine. To install Ray, follow the `installation instructions`_.
|
||||
|
||||
.. _`installation instructions`: http://docs.ray.io/en/master/installation.html
|
||||
|
||||
Starting Ray on each machine
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
On the head node (just choose one node to be the head node), run the following.
|
||||
If the ``--port`` argument is omitted, Ray will choose port 6379, falling back to a
|
||||
random port.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ray start --head --port=6379
|
||||
...
|
||||
Next steps
|
||||
To connect to this Ray runtime from another node, run
|
||||
ray start --address='<ip address>:6379'
|
||||
|
||||
If connection fails, check your firewall settings and network configuration.
|
||||
|
||||
The command will print out the address of the Ray GCS server that was started
|
||||
(the local node IP address plus the port number you specified).
|
||||
|
||||
.. note::
|
||||
|
||||
If you already has remote Redis instances, you can specify environment variable
|
||||
`RAY_REDIS_ADDRESS=ip1:port1,ip2:port2...` to use them. The first one is
|
||||
primary and rest are shards.
|
||||
|
||||
**Then on each of the other nodes**, run the following. Make sure to replace
|
||||
``<address>`` with the value printed by the command on the head node (it
|
||||
should look something like ``123.45.67.89:6379``).
|
||||
|
||||
Note that if your compute nodes are on their own subnetwork with Network
|
||||
Address Translation, to connect from a regular machine outside that subnetwork,
|
||||
the command printed by the head node will not work. You need to find the
|
||||
address that will reach the head node from the second machine. If the head node
|
||||
has a domain address like compute04.berkeley.edu, you can simply use that in
|
||||
place of an IP address and rely on the DNS.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ray start --address=<address>
|
||||
--------------------
|
||||
Ray runtime started.
|
||||
--------------------
|
||||
|
||||
To terminate the Ray runtime, run
|
||||
ray stop
|
||||
|
||||
If you wish to specify that a machine has 10 CPUs and 1 GPU, you can do this
|
||||
with the flags ``--num-cpus=10`` and ``--num-gpus=1``. See the :ref:`Configuration <configuring-ray>` page for more information.
|
||||
|
||||
If you see ``Unable to connect to GCS at ...``,
|
||||
this means the head node is inaccessible at the given ``--address`` (because, for
|
||||
example, the head node is not actually running, a different version of Ray is
|
||||
running at the specified address, the specified address is wrong, or there are
|
||||
firewall settings preventing access).
|
||||
|
||||
If you see ``Ray runtime started.``, then the node successfully connected to
|
||||
the head node at the ``--address``. You should now be able to connect to the
|
||||
cluster with ``ray.init()``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
If connection fails, check your firewall settings and network configuration.
|
||||
|
||||
If the connection fails, to check whether each port can be reached from a node,
|
||||
you can use a tool such as ``nmap`` or ``nc``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ nmap -sV --reason -p $PORT $HEAD_ADDRESS
|
||||
Nmap scan report for compute04.berkeley.edu (123.456.78.910)
|
||||
Host is up, received echo-reply ttl 60 (0.00087s latency).
|
||||
rDNS record for 123.456.78.910: compute04.berkeley.edu
|
||||
PORT STATE SERVICE REASON VERSION
|
||||
6379/tcp open redis? syn-ack
|
||||
Service detection performed. Please report any incorrect results at https://nmap.org/submit/ .
|
||||
$ nc -vv -z $HEAD_ADDRESS $PORT
|
||||
Connection to compute04.berkeley.edu 6379 port [tcp/*] succeeded!
|
||||
|
||||
If the node cannot access that port at that IP address, you might see
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ nmap -sV --reason -p $PORT $HEAD_ADDRESS
|
||||
Nmap scan report for compute04.berkeley.edu (123.456.78.910)
|
||||
Host is up (0.0011s latency).
|
||||
rDNS record for 123.456.78.910: compute04.berkeley.edu
|
||||
PORT STATE SERVICE REASON VERSION
|
||||
6379/tcp closed redis reset ttl 60
|
||||
Service detection performed. Please report any incorrect results at https://nmap.org/submit/ .
|
||||
$ nc -vv -z $HEAD_ADDRESS $PORT
|
||||
nc: connect to compute04.berkeley.edu port 6379 (tcp) failed: Connection refused
|
||||
|
||||
|
||||
Stopping Ray
|
||||
~~~~~~~~~~~~
|
||||
|
||||
When you want to stop the Ray processes, run ``ray stop`` on each node.
|
||||
|
||||
|
||||
Additional Cloud Providers
|
||||
--------------------------
|
||||
|
||||
To use Ray autoscaling on other Cloud providers or cluster management systems, you can implement the ``NodeProvider`` interface (100 LOC) and register it in `node_provider.py <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/node_provider.py>`__. Contributions are welcome!
|
||||
|
||||
|
||||
Security
|
||||
--------
|
||||
|
||||
On cloud providers, nodes will be launched into their own security group by default, with traffic allowed only between nodes in the same group. A new SSH key will also be created and saved to your local machine for access to the cluster.
|
||||
|
||||
.. _using-ray-on-a-cluster:
|
||||
|
||||
Running a Ray program on the Ray cluster
|
||||
----------------------------------------
|
||||
|
||||
To run a distributed Ray program, you'll need to execute your program on the same machine as one of the nodes.
|
||||
|
||||
.. tabbed:: Python
|
||||
|
||||
Within your program/script, ``ray.init()`` will now automatically find and connect to the latest Ray cluster.
|
||||
For example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
ray.init()
|
||||
# Connecting to existing Ray cluster at address: <IP address>...
|
||||
|
||||
.. tabbed:: Java
|
||||
|
||||
You need to add the ``ray.address`` parameter to your command line (like ``-Dray.address=...``).
|
||||
|
||||
To connect your program to the Ray cluster, run it like this:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
java -classpath <classpath> \
|
||||
-Dray.address=<address> \
|
||||
<classname> <args>
|
||||
|
||||
.. note:: Specifying ``auto`` as the address hasn't been implemented in Java yet. You need to provide the actual address. You can find the address of the server from the output of the ``ray up`` command.
|
||||
|
||||
.. tabbed:: C++
|
||||
|
||||
You need to add the ``RAY_ADDRESS`` env var to your command line (like ``RAY_ADDRESS=...``).
|
||||
|
||||
To connect your program to the Ray cluster, run it like this:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
RAY_ADDRESS=<address> ./<binary> <args>
|
||||
|
||||
.. note:: Specifying ``auto`` as the address hasn't been implemented in C++ yet. You need to provide the actual address. You can find the address of the server from the output of the ``ray up`` command.
|
||||
|
||||
|
||||
.. note:: A common mistake is setting the address to be a cluster node while running the script on your laptop. This will not work because the script needs to be started/executed on one of the Ray nodes.
|
||||
|
||||
To verify that the correct number of nodes have joined the cluster, you can run the following.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import time
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
time.sleep(0.01)
|
||||
return ray._private.services.get_node_ip_address()
|
||||
|
||||
# Get a list of the IP addresses of the nodes that have joined the cluster.
|
||||
set(ray.get([f.remote() for _ in range(1000)]))
|
||||
|
||||
|
||||
What's Next?
|
||||
-------------
|
||||
|
||||
Now that you have a working understanding of the cluster launcher, check out:
|
||||
|
||||
* :ref:`ref-cluster-quick-start`: A end-to-end demo to run an application that autoscales.
|
||||
* :ref:`cluster-config`: A complete reference of how to configure your Ray cluster.
|
||||
* :ref:`cluster-commands`: A short user guide to the various cluster launcher commands.
|
||||
|
||||
|
||||
|
||||
Questions or Issues?
|
||||
--------------------
|
||||
|
||||
.. include:: /_includes/_help.rst
|
|
@ -1,4 +0,0 @@
|
|||
# Ray Clusters (under construction)
|
||||
:::{warning}
|
||||
This page is under construction!
|
||||
:::
|
|
@ -1,234 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _cluster-commands:
|
||||
|
||||
Cluster Launcher Commands
|
||||
=========================
|
||||
|
||||
This document overviews common commands for using the Ray Cluster Launcher.
|
||||
See the :ref:`Cluster Configuration <cluster-config>` docs on how to customize the configuration file.
|
||||
|
||||
Launching a cluster (``ray up``)
|
||||
--------------------------------
|
||||
|
||||
This will start up the machines in the cloud, install your dependencies and run
|
||||
any setup commands that you have, configure the Ray cluster automatically, and
|
||||
prepare you to scale your distributed system. See :ref:`the documentation
|
||||
<ray-up-doc>` for ``ray up``.
|
||||
|
||||
.. tip:: The worker nodes will start only after the head node has finished
|
||||
starting. To monitor the progress of the cluster setup, you can run
|
||||
`ray monitor <cluster yaml>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Replace '<your_backend>' with one of: 'aws', 'gcp', 'kubernetes', or 'local'.
|
||||
$ BACKEND=<your_backend>
|
||||
|
||||
# Create or update the cluster.
|
||||
$ ray up ray/python/ray/autoscaler/$BACKEND/example-full.yaml
|
||||
|
||||
# Tear down the cluster.
|
||||
$ ray down ray/python/ray/autoscaler/$BACKEND/example-full.yaml
|
||||
|
||||
Updating an existing cluster (``ray up``)
|
||||
-----------------------------------------
|
||||
|
||||
If you want to update your cluster configuration (add more files, change dependencies), run ``ray up`` again on the existing cluster.
|
||||
|
||||
This command checks if the local configuration differs from the applied
|
||||
configuration of the cluster. This includes any changes to synced files
|
||||
specified in the ``file_mounts`` section of the config. If so, the new files
|
||||
and config will be uploaded to the cluster. Following that, Ray
|
||||
services/processes will be restarted.
|
||||
|
||||
.. tip:: Don't do this for the cloud provider specifications (e.g., change from
|
||||
AWS to GCP on a running cluster) or change the cluster name (as this
|
||||
will just start a new cluster and orphan the original one).
|
||||
|
||||
|
||||
You can also run ``ray up`` to restart a cluster if it seems to be in a bad
|
||||
state (this will restart all Ray services even if there are no config changes).
|
||||
|
||||
Running ``ray up`` on an existing cluster will do all the following:
|
||||
|
||||
* If the head node matches the cluster specification, the filemounts will be
|
||||
reapplied and the ``setup_commands`` and ``ray start`` commands will be run.
|
||||
There may be some caching behavior here to skip setup/file mounts.
|
||||
* If the head node is out of date from the specified YAML (e.g.,
|
||||
``head_node_type`` has changed on the YAML), then the out of date node will
|
||||
be terminated and a new node will be provisioned to replace it. Setup/file
|
||||
mounts/``ray start`` will be applied.
|
||||
* After the head node reaches a consistent state (after ``ray start`` commands
|
||||
are finished), the same above procedure will be applied to all the worker
|
||||
nodes. The ``ray start`` commands tend to run a ``ray stop`` + ``ray start``,
|
||||
so this will kill currently working jobs.
|
||||
|
||||
If you don't want the update to restart services (e.g., because the changes
|
||||
don't require a restart), pass ``--no-restart`` to the update call.
|
||||
|
||||
If you want to force re-generation of the config to pick up possible changes in
|
||||
the cloud environment, pass ``--no-config-cache`` to the update call.
|
||||
|
||||
If you want to skip the setup commands and only run ``ray stop``/``ray start``
|
||||
on all nodes, pass ``--restart-only`` to the update call.
|
||||
|
||||
See :ref:`the documentation <ray-up-doc>` for ``ray up``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Reconfigure autoscaling behavior without interrupting running jobs.
|
||||
$ ray up ray/python/ray/autoscaler/$BACKEND/example-full.yaml \
|
||||
--max-workers=N --no-restart
|
||||
|
||||
Running shell commands on the cluster (``ray exec``)
|
||||
----------------------------------------------------
|
||||
|
||||
You can use ``ray exec`` to conveniently run commands on clusters. See :ref:`the documentation <ray-exec-doc>` for ``ray exec``.
|
||||
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Run a command on the cluster
|
||||
$ ray exec cluster.yaml 'echo "hello world"'
|
||||
|
||||
# Run a command on the cluster, starting it if needed
|
||||
$ ray exec cluster.yaml 'echo "hello world"' --start
|
||||
|
||||
# Run a command on the cluster, stopping the cluster after it finishes
|
||||
$ ray exec cluster.yaml 'echo "hello world"' --stop
|
||||
|
||||
# Run a command on a new cluster called 'experiment-1', stopping it after
|
||||
$ ray exec cluster.yaml 'echo "hello world"' \
|
||||
--start --stop --cluster-name experiment-1
|
||||
|
||||
# Run a command in a detached tmux session
|
||||
$ ray exec cluster.yaml 'echo "hello world"' --tmux
|
||||
|
||||
# Run a command in a screen (experimental)
|
||||
$ ray exec cluster.yaml 'echo "hello world"' --screen
|
||||
|
||||
If you want to run applications on the cluster that are accessible from a web
|
||||
browser (e.g., Jupyter notebook), you can use the ``--port-forward``. The local
|
||||
port opened is the same as the remote port.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ ray exec cluster.yaml --port-forward=8899 'source ~/anaconda3/bin/activate tensorflow_p36 && jupyter notebook --port=8899'
|
||||
|
||||
.. note:: For Kubernetes clusters, the ``port-forward`` option cannot be used
|
||||
while executing a command. To port forward and run a command you need
|
||||
to call ``ray exec`` twice separately.
|
||||
|
||||
Running Ray scripts on the cluster (``ray submit``)
|
||||
---------------------------------------------------
|
||||
|
||||
You can also use ``ray submit`` to execute Python scripts on clusters. This
|
||||
will ``rsync`` the designated file onto the head node cluster and execute it
|
||||
with the given arguments. See :ref:`the documentation <ray-submit-doc>` for
|
||||
``ray submit``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Run a Python script in a detached tmux session
|
||||
$ ray submit cluster.yaml --tmux --start --stop tune_experiment.py
|
||||
|
||||
# Run a Python script with arguments.
|
||||
# This executes script.py on the head node of the cluster, using
|
||||
# the command: python ~/script.py --arg1 --arg2 --arg3
|
||||
$ ray submit cluster.yaml script.py -- --arg1 --arg2 --arg3
|
||||
|
||||
|
||||
Attaching to a running cluster (``ray attach``)
|
||||
-----------------------------------------------
|
||||
|
||||
You can use ``ray attach`` to attach to an interactive screen session on the
|
||||
cluster. See :ref:`the documentation <ray-attach-doc>` for ``ray attach`` or
|
||||
run ``ray attach --help``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Open a screen on the cluster
|
||||
$ ray attach cluster.yaml
|
||||
|
||||
# Open a screen on a new cluster called 'session-1'
|
||||
$ ray attach cluster.yaml --start --cluster-name=session-1
|
||||
|
||||
# Attach to tmux session on cluster (creates a new one if none available)
|
||||
$ ray attach cluster.yaml --tmux
|
||||
|
||||
.. _ray-rsync:
|
||||
|
||||
Synchronizing files from the cluster (``ray rsync-up/down``)
|
||||
------------------------------------------------------------
|
||||
|
||||
To download or upload files to the cluster head node, use ``ray rsync_down`` or
|
||||
``ray rsync_up``:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ ray rsync_down cluster.yaml '/path/on/cluster' '/local/path'
|
||||
$ ray rsync_up cluster.yaml '/local/path' '/path/on/cluster'
|
||||
|
||||
.. _monitor-cluster:
|
||||
|
||||
Monitoring cluster status (``ray dashboard/status``)
|
||||
-----------------------------------------------------
|
||||
|
||||
The Ray also comes with an online dashboard. The dashboard is accessible via
|
||||
HTTP on the head node (by default it listens on ``localhost:8265``). You can
|
||||
also use the built-in ``ray dashboard`` to set up port forwarding
|
||||
automatically, making the remote dashboard viewable in your local browser at
|
||||
``localhost:8265``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ ray dashboard cluster.yaml
|
||||
|
||||
You can monitor cluster usage and auto-scaling status by running (on the head node):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ ray status
|
||||
|
||||
To see live updates to the status:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ watch -n 1 ray status
|
||||
|
||||
The Ray autoscaler also reports per-node status in the form of instance tags.
|
||||
In your cloud provider console, you can click on a Node, go to the "Tags" pane,
|
||||
and add the ``ray-node-status`` tag as a column. This lets you see per-node
|
||||
statuses at a glance:
|
||||
|
||||
.. image:: /images/autoscaler-status.png
|
||||
|
||||
Common Workflow: Syncing git branches
|
||||
-------------------------------------
|
||||
|
||||
A common use case is syncing a particular local git branch to all workers of
|
||||
the cluster. However, if you just put a `git checkout <branch>` in the setup
|
||||
commands, the autoscaler won't know when to rerun the command to pull in
|
||||
updates. There is a nice workaround for this by including the git SHA in the
|
||||
input (the hash of the file will change if the branch is updated):
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
file_mounts: {
|
||||
"/tmp/current_branch_sha": "/path/to/local/repo/.git/refs/heads/<YOUR_BRANCH_NAME>",
|
||||
}
|
||||
|
||||
setup_commands:
|
||||
- test -e <REPO_NAME> || git clone https://github.com/<REPO_ORG>/<REPO_NAME>.git
|
||||
- cd <REPO_NAME> && git fetch && git checkout `cat /tmp/current_branch_sha`
|
||||
|
||||
This tells ``ray up`` to sync the current git branch SHA from your personal
|
||||
computer to a temporary file on the cluster (assuming you've pushed the branch
|
||||
head already). Then, the setup commands read that file to figure out which SHA
|
||||
they should checkout on the nodes. Note that each command runs in its own
|
||||
session. The final workflow to update the cluster then becomes just this:
|
||||
|
||||
1. Make local changes to a git branch
|
||||
2. Commit the changes with ``git commit`` and ``git push``
|
||||
3. Update files on your Ray cluster with ``ray up``
|
File diff suppressed because it is too large
Load diff
|
@ -1,19 +0,0 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _ref-cluster-setup:
|
||||
|
||||
Ray with Cluster Managers
|
||||
=========================
|
||||
|
||||
.. note::
|
||||
|
||||
If you're using AWS, Azure or GCP you can use the :ref:`Ray Cluster Launcher <cluster-cloud>` to simplify the cluster setup process.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
kubernetes.rst
|
||||
yarn.rst
|
||||
slurm.rst
|
||||
lsf.rst
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
import os
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
|
||||
# trainer.py
|
||||
from collections import Counter
|
||||
|
||||
import ray
|
||||
|
||||
num_cpus = int(sys.argv[1])
|
||||
|
||||
ray.init(address=os.environ["ip_head"])
|
||||
|
||||
print("Nodes in the Ray cluster:")
|
||||
print(ray.nodes())
|
||||
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
time.sleep(1)
|
||||
return socket.gethostbyname(socket.gethostname())
|
||||
|
||||
|
||||
# The following takes one second (assuming that
|
||||
# ray was able to access all of the allocated nodes).
|
||||
for i in range(60):
|
||||
start = time.time()
|
||||
ip_addresses = ray.get([f.remote() for _ in range(num_cpus)])
|
||||
print(Counter(ip_addresses))
|
||||
end = time.time()
|
||||
print(end - start)
|
|
@ -1,8 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
.. _slurm-basic:
|
||||
|
||||
slurm-basic.sh
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
|
|
@ -1,65 +0,0 @@
|
|||
#!/bin/bash
|
||||
# shellcheck disable=SC2206
|
||||
#SBATCH --job-name=test
|
||||
#SBATCH --cpus-per-task=5
|
||||
#SBATCH --mem-per-cpu=1GB
|
||||
#SBATCH --nodes=4
|
||||
#SBATCH --tasks-per-node=1
|
||||
#SBATCH --time=00:30:00
|
||||
|
||||
set -x
|
||||
|
||||
# __doc_head_address_start__
|
||||
|
||||
# Getting the node names
|
||||
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
|
||||
nodes_array=($nodes)
|
||||
|
||||
head_node=${nodes_array[0]}
|
||||
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
|
||||
|
||||
# if we detect a space character in the head node IP, we'll
|
||||
# convert it to an ipv4 address. This step is optional.
|
||||
if [[ "$head_node_ip" == *" "* ]]; then
|
||||
IFS=' ' read -ra ADDR <<<"$head_node_ip"
|
||||
if [[ ${#ADDR[0]} -gt 16 ]]; then
|
||||
head_node_ip=${ADDR[1]}
|
||||
else
|
||||
head_node_ip=${ADDR[0]}
|
||||
fi
|
||||
echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
|
||||
fi
|
||||
# __doc_head_address_end__
|
||||
|
||||
# __doc_head_ray_start__
|
||||
port=6379
|
||||
ip_head=$head_node_ip:$port
|
||||
export ip_head
|
||||
echo "IP Head: $ip_head"
|
||||
|
||||
echo "Starting HEAD at $head_node"
|
||||
srun --nodes=1 --ntasks=1 -w "$head_node" \
|
||||
ray start --head --node-ip-address="$head_node_ip" --port=$port \
|
||||
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
|
||||
# __doc_head_ray_end__
|
||||
|
||||
# __doc_worker_ray_start__
|
||||
# optional, though may be useful in certain versions of Ray < 1.0.
|
||||
sleep 10
|
||||
|
||||
# number of nodes other than the head node
|
||||
worker_num=$((SLURM_JOB_NUM_NODES - 1))
|
||||
|
||||
for ((i = 1; i <= worker_num; i++)); do
|
||||
node_i=${nodes_array[$i]}
|
||||
echo "Starting WORKER $i at $node_i"
|
||||
srun --nodes=1 --ntasks=1 -w "$node_i" \
|
||||
ray start --address "$ip_head" \
|
||||
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
|
||||
sleep 5
|
||||
done
|
||||
# __doc_worker_ray_end__
|
||||
|
||||
# __doc_script_start__
|
||||
# ray/doc/source/cluster-deprecated/examples/simple-trainer.py
|
||||
python -u simple-trainer.py "$SLURM_CPUS_PER_TASK"
|
|
@ -1,108 +0,0 @@
|
|||
# slurm-launch.py
|
||||
# Usage:
|
||||
# python slurm-launch.py --exp-name test \
|
||||
# --command "rllib train --run PPO --env CartPole-v0"
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
template_file = Path(__file__).parent / "slurm-template.sh"
|
||||
JOB_NAME = "${JOB_NAME}"
|
||||
NUM_NODES = "${NUM_NODES}"
|
||||
NUM_GPUS_PER_NODE = "${NUM_GPUS_PER_NODE}"
|
||||
PARTITION_OPTION = "${PARTITION_OPTION}"
|
||||
COMMAND_PLACEHOLDER = "${COMMAND_PLACEHOLDER}"
|
||||
GIVEN_NODE = "${GIVEN_NODE}"
|
||||
LOAD_ENV = "${LOAD_ENV}"
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--exp-name",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The job name and path to logging file (exp_name.log).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-nodes", "-n", type=int, default=1, help="Number of nodes to use."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--node",
|
||||
"-w",
|
||||
type=str,
|
||||
help="The specified nodes to use. Same format as the "
|
||||
"return of 'sinfo'. Default: ''.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-gpus",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of GPUs to use in each node. (Default: 0)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--partition",
|
||||
"-p",
|
||||
type=str,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--load-env",
|
||||
type=str,
|
||||
help="The script to load your environment ('module load cuda/10.1')",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--command",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The command you wish to execute. For example: "
|
||||
" --command 'python test.py'. "
|
||||
"Note that the command must be a string.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.node:
|
||||
# assert args.num_nodes == 1
|
||||
node_info = "#SBATCH -w {}".format(args.node)
|
||||
else:
|
||||
node_info = ""
|
||||
|
||||
job_name = "{}_{}".format(
|
||||
args.exp_name, time.strftime("%m%d-%H%M", time.localtime())
|
||||
)
|
||||
|
||||
partition_option = (
|
||||
"#SBATCH --partition={}".format(args.partition) if args.partition else ""
|
||||
)
|
||||
|
||||
# ===== Modified the template script =====
|
||||
with open(template_file, "r") as f:
|
||||
text = f.read()
|
||||
text = text.replace(JOB_NAME, job_name)
|
||||
text = text.replace(NUM_NODES, str(args.num_nodes))
|
||||
text = text.replace(NUM_GPUS_PER_NODE, str(args.num_gpus))
|
||||
text = text.replace(PARTITION_OPTION, partition_option)
|
||||
text = text.replace(COMMAND_PLACEHOLDER, str(args.command))
|
||||
text = text.replace(LOAD_ENV, str(args.load_env))
|
||||
text = text.replace(GIVEN_NODE, node_info)
|
||||
text = text.replace(
|
||||
"# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO " "PRODUCTION!",
|
||||
"# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE "
|
||||
"RUNNABLE!",
|
||||
)
|
||||
|
||||
# ===== Save the script =====
|
||||
script_file = "{}.sh".format(job_name)
|
||||
with open(script_file, "w") as f:
|
||||
f.write(text)
|
||||
|
||||
# ===== Submit the job =====
|
||||
print("Starting to submit job!")
|
||||
subprocess.Popen(["sbatch", script_file])
|
||||
print(
|
||||
"Job submitted! Script file is at: <{}>. Log file is at: <{}>".format(
|
||||
script_file, "{}.log".format(job_name)
|
||||
)
|
||||
)
|
||||
sys.exit(0)
|
|
@ -1,8 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
.. _slurm-launch:
|
||||
|
||||
slurm-launch.py
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /cluster-deprecated/examples/slurm-launch.py
|
|
@ -1,9 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
.. _slurm-template:
|
||||
|
||||
slurm-template.sh
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /cluster-deprecated/examples/slurm-template.sh
|
||||
:language: bash
|
|
@ -1,64 +0,0 @@
|
|||
#!/bin/bash
|
||||
# shellcheck disable=SC2206
|
||||
# THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT!
|
||||
# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION!
|
||||
${PARTITION_OPTION}
|
||||
#SBATCH --job-name=${JOB_NAME}
|
||||
#SBATCH --output=${JOB_NAME}.log
|
||||
${GIVEN_NODE}
|
||||
### This script works for any number of nodes, Ray will find and manage all resources
|
||||
#SBATCH --nodes=${NUM_NODES}
|
||||
#SBATCH --exclusive
|
||||
### Give all resources to a single Ray task, ray can manage the resources internally
|
||||
#SBATCH --ntasks-per-node=1
|
||||
#SBATCH --gpus-per-task=${NUM_GPUS_PER_NODE}
|
||||
|
||||
# Load modules or your own conda environment here
|
||||
# module load pytorch/v1.4.0-gpu
|
||||
# conda activate ${CONDA_ENV}
|
||||
${LOAD_ENV}
|
||||
|
||||
# ===== DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING =====
|
||||
# This script is a modification to the implementation suggest by gregSchwartz18 here:
|
||||
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
|
||||
redis_password=$(uuidgen)
|
||||
export redis_password
|
||||
|
||||
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names
|
||||
nodes_array=($nodes)
|
||||
|
||||
node_1=${nodes_array[0]}
|
||||
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address
|
||||
|
||||
# if we detect a space character in the head node IP, we'll
|
||||
# convert it to an ipv4 address. This step is optional.
|
||||
if [[ "$ip" == *" "* ]]; then
|
||||
IFS=' ' read -ra ADDR <<< "$ip"
|
||||
if [[ ${#ADDR[0]} -gt 16 ]]; then
|
||||
ip=${ADDR[1]}
|
||||
else
|
||||
ip=${ADDR[0]}
|
||||
fi
|
||||
echo "IPV6 address detected. We split the IPV4 address as $ip"
|
||||
fi
|
||||
|
||||
port=6379
|
||||
ip_head=$ip:$port
|
||||
export ip_head
|
||||
echo "IP Head: $ip_head"
|
||||
|
||||
echo "STARTING HEAD at $node_1"
|
||||
srun --nodes=1 --ntasks=1 -w "$node_1" \
|
||||
ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block &
|
||||
sleep 30
|
||||
|
||||
worker_num=$((SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node
|
||||
for ((i = 1; i <= worker_num; i++)); do
|
||||
node_i=${nodes_array[$i]}
|
||||
echo "STARTING WORKER $i at $node_i"
|
||||
srun --nodes=1 --ntasks=1 -w "$node_i" ray start --address "$ip_head" --redis-password="$redis_password" --block &
|
||||
sleep 5
|
||||
done
|
||||
|
||||
# ===== Call your code below =====
|
||||
${COMMAND_PLACEHOLDER}
|
|
@ -1,312 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _deployment-guide:
|
||||
|
||||
Cluster Deployment Guide
|
||||
========================
|
||||
|
||||
This page provides an overview of how to deploy a multi-node Ray cluster, including how to:
|
||||
|
||||
* Launch the cluster.
|
||||
* Set up the autoscaler.
|
||||
* Deploy a Ray application.
|
||||
* Monitor a multi-node cluster.
|
||||
* Best practices for setting up large Ray clusters.
|
||||
|
||||
Launching a Ray cluster
|
||||
-----------------------
|
||||
|
||||
The two recommended ways of launching a Ray cluster are:

1. :ref:`The cluster launcher <cluster-cloud>`
2. :ref:`The Kubernetes operator <Ray-operator>`
|
||||
|
||||
Cluster Launcher
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
The goal of :ref:`the cluster launcher <cluster-cloud>` is to make it easy to deploy a Ray cluster on
|
||||
any cloud. It will:
|
||||
|
||||
* Provision a new instance/machine using the cloud provider's SDK.
* Execute shell commands to set up Ray with the provided options.
* (Optionally) run any custom, user-defined setup commands. This can be useful for setting environment variables and installing packages. (To dynamically set up environments after the cluster has been deployed, you can use :ref:`Runtime Environments<runtime-environments>`.)
* Initialize the Ray cluster.
* Deploy an autoscaler process.
|
||||
|
||||
Kubernetes Operator
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The goal of the :ref:`Ray Kubernetes Operator <Ray-operator>` is to make it easy
|
||||
to deploy a Ray cluster on an existing Kubernetes cluster.
|
||||
|
||||
To simplify Operator configuration, Ray provides :ref:`a Helm chart <Ray-helm>`.
|
||||
Installing the Helm chart will create an Operator Deployment.
|
||||
The Operator manages autoscaling Ray clusters; each Ray node runs in its own K8s Pod.
|
||||
|
||||
.. _deployment-guide-autoscaler:
|
||||
|
||||
Autoscaling with Ray
|
||||
--------------------
|
||||
|
||||
Ray is designed to support highly elastic workloads which are most efficient on
|
||||
an autoscaling cluster. At a high level, the autoscaler attempts to
|
||||
launch/terminate nodes in order to ensure that workloads have sufficient
|
||||
resources to run, while minimizing the idle resources.
|
||||
|
||||
It does this by taking into consideration:
|
||||
|
||||
* User-specified hard limits (min/max workers).
* User-specified node types (nodes in a Ray cluster do *not* have to be
  homogeneous).
|
||||
* Information from the Ray core's scheduling layer about the current resource
|
||||
usage/demands of the cluster.
|
||||
* Programmatic autoscaling hints.
|
||||
|
||||
Take a look at :ref:`the cluster reference <cluster-config>` to learn more
|
||||
about configuring the autoscaler.
|
||||
|
||||
|
||||
How does it work?
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
The Ray Cluster Launcher will automatically enable a load-based autoscaler. The
autoscaler's resource demand scheduler looks at the pending task, actor,
and placement group resource demands from the cluster, and tries to add the
minimum list of nodes that can fulfill these demands. The autoscaler uses a
simple bin-packing algorithm to pack the user demands into
the available cluster resources. The remaining unfulfilled demands are placed
on the smallest list of nodes that satisfies the demand while maximizing
utilization (starting from the smallest node).
|
||||
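For example, here is a minimal sketch of how pending resource demands drive scale-up (this assumes a running autoscaling cluster and is illustrative, not a benchmark):

.. code-block:: python

    import ray

    # Connect to the existing cluster (e.g. from the head node).
    ray.init(address="auto")

    @ray.remote(num_cpus=1)
    def square(x):
        return x * x

    # Submitting more tasks than the cluster currently has CPUs for creates
    # pending resource demands. The autoscaler sees these demands and adds
    # nodes (up to the configured maximum) that can fit them; once the tasks
    # finish and nodes sit idle, they are scaled back down.
    futures = [square.remote(i) for i in range(1000)]
    print(sum(ray.get(futures)))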
|
||||
**Downscaling**: When worker nodes are
idle (without active tasks or actors running on them)
|
||||
for more than :ref:`idle_timeout_minutes
|
||||
<cluster-configuration-idle-timeout-minutes>`, they are subject to
|
||||
removal from the cluster. But there are two important additional conditions
|
||||
to note:
|
||||
|
||||
* The head node is never removed unless the cluster is torn down.
|
||||
* If the Ray Object Store is used, and a Worker node still holds objects (including spilled objects on disk), it won't be removed.
|
||||
|
||||
|
||||
|
||||
**Here is "A Glimpse into the Ray Autoscaler" and how to debug/monitor your cluster:**
|
||||
|
||||
2021-01-19 by Ameer Haj-Ali, Anyscale Inc.
|
||||
|
||||
.. youtube:: BJ06eJasdu4
|
||||
|
||||
|
||||
Deploying an application
|
||||
------------------------
|
||||
|
||||
To submit an application to the Ray cluster, use the Ray :ref:`Job submission interface <jobs-overview>`.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
export RAY_ADDRESS=<your_cluster_address>:8265
|
||||
ray job submit ... -- python script.py
|
||||
|
||||
|
||||
To interactively connect to a Ray cluster, connect via the :ref:`Ray Client<ray-client>`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# outside python, set the ``RAY_ADDRESS`` environment variable to the address of the Ray client server
|
||||
ray.init("ray://<host>:<port>")
|
||||
|
||||
|
||||
:ref:`Learn more about setting up the Ray client server here <Ray-client>`.
|
||||
|
||||
You can dynamically specify local files, Python packages, and environment variables for your
|
||||
application using :ref:`Runtime Environments <runtime-environments>`.
|
||||
|
||||
.. note::
|
||||
|
||||
When deploying an application, the job will be killed if the driver
|
||||
disconnects.
|
||||
|
||||
:ref:`A detached actor <actor-lifetimes>` can be used to avoid having a long running driver.
|
||||
|
||||
Monitoring and observability
|
||||
----------------------------
|
||||
|
||||
Ray comes with 3 main observability features:
|
||||
|
||||
1. :ref:`The dashboard <Ray-dashboard>`
|
||||
2. :ref:`ray status <monitor-cluster>`
|
||||
3. :ref:`Prometheus metrics <multi-node-metrics>`
|
||||
|
||||
Monitoring the cluster via the dashboard
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
:ref:`The dashboard provides detailed information about the state of the cluster <Ray-dashboard>`,
|
||||
including the running jobs, actors, workers, nodes, etc.
|
||||
|
||||
By default, the cluster launcher and operator will launch the dashboard, but
|
||||
not publicly expose it.
|
||||
|
||||
If you launch your application via the cluster launcher, you can securely
|
||||
portforward local traffic to the dashboard via the ``ray dashboard`` command
|
||||
(which establishes an SSH tunnel). The dashboard will now be visible at
|
||||
``http://localhost:8265``.
|
||||
|
||||
The Kubernetes Operator makes the dashboard available via a Service targeting the Ray head pod.
|
||||
You can :ref:`access the dashboard <ray-k8s-dashboard>` using ``kubectl port-forward``.
|
||||
|
||||
|
||||
Observing the autoscaler
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The autoscaler makes decisions based on scheduling information and programmatic
hints from the cluster. This information, along with the status of
starting nodes, can be accessed via the ``ray status`` command.
|
||||
|
||||
To dump the current state of a cluster launched via the cluster launcher, you
can run ``ray exec cluster.yaml "ray status"``.

For a more "live" monitoring experience, it is recommended that you run ``ray
status`` in a watch loop: ``ray exec cluster.yaml "watch -n 1 ray status"``.
|
||||
|
||||
With the Kubernetes operator, you should replace ``ray exec cluster.yaml`` with
``kubectl exec <head node pod>``.
|
||||
|
||||
Prometheus metrics
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Ray can produce Prometheus metrics. When enabled, Ray produces some
metrics about Ray core and some internal metrics by default. It also
supports custom, user-defined metrics.

These metrics can be consumed by any metrics infrastructure that can ingest
metrics from the Prometheus server on the head node of the cluster.

:ref:`Learn more about setting up Prometheus here <multi-node-metrics>`.
|
||||
|
||||
Best practices for deploying large clusters
|
||||
-------------------------------------------
|
||||
|
||||
This section aims to document best practices for deploying Ray clusters at
|
||||
large scale.
|
||||
|
||||
Networking configuration
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
End users should only need to directly interact with the head node of the
|
||||
cluster. In particular, there are 2 services which should be exposed to users:
|
||||
|
||||
1. The dashboard
|
||||
2. The Ray client server
|
||||
|
||||
.. note::
|
||||
|
||||
While users only need 2 ports to connect to a cluster, the nodes within a
|
||||
cluster require a much wider range of ports to communicate.
|
||||
|
||||
See :ref:`Ray port configuration <Ray-ports>` for a comprehensive list.
|
||||
|
||||
Applications (such as :ref:`Ray Serve <Rayserve>`) may also require
|
||||
additional ports to work properly.
|
||||
|
||||
System configuration
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
There are a few system level configurations that should be set when using Ray
|
||||
at a large scale.
|
||||
|
||||
* Make sure ``ulimit -n`` is set to at least 65535. Ray opens many direct
|
||||
connections between worker processes to avoid bottlenecks, so it can quickly
|
||||
use a large number of file descriptors.
|
||||
* Make sure ``/dev/shm`` is sufficiently large. Most ML/RL applications rely
|
||||
heavily on the plasma store. By default, Ray will try to use ``/dev/shm`` for
|
||||
the object store, but if it is not large enough (i.e. ``--object-store-memory``
|
||||
> size of ``/dev/shm``), Ray will write the plasma store to disk instead, which
|
||||
may cause significant performance problems.
|
||||
* Use NVMe SSDs (or other high performance storage) if possible. If
  :ref:`object spilling <object-spilling>` is enabled, Ray will spill objects to
  disk if necessary. This is most commonly needed for data processing
  workloads.
|
||||
|
||||
Configuring the head node
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In addition to the above changes, when deploying a large cluster, Ray's
|
||||
architecture means that the head node will have extra stress due to GCS.
|
||||
|
||||
* Make sure the head node has sufficient bandwidth. The most heavily stressed
  resource on the head node is outbound bandwidth. For large clusters (see the
  scalability envelope), we recommend using machines with networking characteristics
  at least as good as an r5dn.16xlarge on AWS EC2.
|
||||
* Set ``resources: {"CPU": 0}`` on the head node. (For Ray clusters deployed using Helm,
  set ``rayResources: {"CPU": 0}``.) Due to the heavy networking
  load (and the GCS and dashboard processes), we recommend setting the number of
  CPUs to 0 on the head node to avoid scheduling additional tasks on it, as in the
  sketch below.
|
||||
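As a rough sketch, with the cluster launcher this is done through the head node type's ``resources`` field in the cluster YAML (the node type name and instance type below are illustrative excerpts, not a complete config):

.. code-block:: yaml

    available_node_types:
        ray.head.default:
            node_config:
                InstanceType: r5dn.16xlarge
            # Don't schedule Ray tasks or actors on the head node.
            resources: {"CPU": 0}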
|
||||
Configuring the autoscaler
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
For large, long running clusters, there are a few parameters that can be tuned.
|
||||
|
||||
* Ensure your quotas for node types are set correctly.
|
||||
* For long running clusters, set the ``AUTOSCALER_MAX_NUM_FAILURES`` environment
  variable to a large number (or ``inf``) to avoid unexpected autoscaler
  crashes. The variable can be set by prepending ``export AUTOSCALER_MAX_NUM_FAILURES=inf;``
  to the head node's Ray start command, as in the sketch below.
  (Note: you may want a separate mechanism to detect if the autoscaler
  errors too often.)
|
||||
* For large clusters, consider tuning ``upscaling_speed`` for faster
|
||||
autoscaling.
|
||||
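For example, with the cluster launcher, the variable can be prepended in ``head_start_ray_commands`` (a sketch; your existing start command may carry additional flags):

.. code-block:: yaml

    head_start_ray_commands:
        - ray stop
        - export AUTOSCALER_MAX_NUM_FAILURES=inf; ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml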
|
||||
Picking nodes
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
Here are some tips for how to set your ``available_node_types`` for a cluster,
using AWS instance types as a concrete example; a configuration sketch follows the
tips below.
|
||||
|
||||
General recommendations with AWS instance types:
|
||||
|
||||
**When to use GPUs**
|
||||
|
||||
* If you’re using some RL/ML framework
|
||||
* You’re doing something with tensorflow/pytorch/jax (some framework that can
|
||||
leverage GPUs well)
|
||||
|
||||
**What type of GPU?**
|
||||
|
||||
* The latest gen GPU is almost always the best bang for your buck (p3 > p2, g4
  > g3); for most well designed applications the performance outweighs the
  price (the instance price may be higher, but you'll use the instance for less
  time).
* You may want to consider using older instances if you're doing dev work and
  won't actually fully utilize the GPUs, though.
|
||||
* If you’re doing training (ML or RL), you should use a P instance. If you’re
|
||||
doing inference, you should use a G instance. The difference is
|
||||
processing:VRAM ratio (training requires more memory).
|
||||
|
||||
**What type of CPU?**
|
||||
|
||||
* Again, stick to the latest generation; they're typically cheaper and faster.
* When in doubt, use M instances; they typically have the highest
  availability.
* If you know your application is memory intensive (memory utilization is full,
  but CPU is not), go with an R instance.
* If you know your application is CPU intensive, go with a C instance.
* If you have a big cluster, make the head node an instance with an n (r5dn or
  c5n).
|
||||
|
||||
**How many CPUs/GPUs?**
|
||||
|
||||
* Focus on your CPU:GPU ratio first and look at the utilization (the Ray dashboard
  should help with this). If your CPU utilization is low, add GPUs, or vice
  versa.
* The exact ratio will be very dependent on your workload.
* Once you find a good ratio, you should be able to scale up and keep the
  same ratio.
* You can't scale infinitely. Eventually, as you add more machines, your
  performance improvements will become sub-linear or not worth it. There may not
  be a good one-size-fits-all strategy at this point.
|
||||
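Putting these tips together, here is a sketch of ``available_node_types`` for an AWS cluster (the node type names, instance types, and worker counts are illustrative, not a recommendation for your workload):

.. code-block:: yaml

    available_node_types:
        cpu_workers:
            node_config:
                InstanceType: m5.4xlarge
            min_workers: 0
            max_workers: 10
        gpu_workers:
            node_config:
                InstanceType: g4dn.xlarge
            min_workers: 0
            max_workers: 4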
|
||||
.. note::
|
||||
|
||||
If you're using RLlib, check out :ref:`the RLlib scaling guide
|
||||
<rllib-scaling-guide>` for RLlib specific recommendations.
|
|
@ -1,85 +0,0 @@
|
|||
.. include:: /_includes/clusters/announcement.rst
|
||||
|
||||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _cluster-index:
|
||||
|
||||
Ray Clusters Overview
|
||||
=====================
|
||||
|
||||
What is a Ray cluster?
|
||||
----------------------
|
||||
|
||||
One of Ray's strengths is the ability to leverage multiple machines for
|
||||
distributed execution. Ray can, of course, be run on a single machine (and often
is), but its real power is using Ray on a cluster of machines.
|
||||
|
||||
Ray can automatically interact with the cloud provider to request or release
|
||||
instances. You can specify :ref:`a configuration <cluster-config>` to launch
|
||||
clusters on :ref:`AWS, GCP, Azure (community-maintained), Aliyun (community-maintained), on-premise, or even on
|
||||
your custom node provider <cluster-cloud>`. Ray can also be run on :ref:`Kubernetes <kuberay-index>` infrastructure.
|
||||
Your cluster can have a fixed size
|
||||
or :ref:`automatically scale up and down<cluster-autoscaler>` depending on the
|
||||
demands of your application.
|
||||
|
||||
Where to go from here?
|
||||
----------------------
|
||||
|
||||
.. panels::
|
||||
:container: text-center
|
||||
:column: col-lg-6 px-2 py-2
|
||||
:card:
|
||||
|
||||
**Quick Start**
|
||||
^^^
|
||||
|
||||
In this quick start tutorial you will take a sample application designed to
|
||||
run on a laptop and scale it up in the cloud.
|
||||
|
||||
+++
|
||||
.. link-button:: ref-cluster-quick-start
|
||||
:type: ref
|
||||
:text: Ray Clusters Quick Start
|
||||
:classes: btn-outline-info btn-block
|
||||
---
|
||||
|
||||
**Key Concepts**
|
||||
^^^
|
||||
|
||||
Understand the key concepts behind Ray Clusters. Learn about the main
|
||||
concepts and the different ways to interact with a cluster.
|
||||
|
||||
+++
|
||||
.. link-button:: cluster-key-concepts
|
||||
:type: ref
|
||||
:text: Learn Key Concepts
|
||||
:classes: btn-outline-info btn-block
|
||||
---
|
||||
|
||||
**Deployment Guide**
|
||||
^^^
|
||||
|
||||
Learn how to set up a distributed Ray cluster and run your workloads on it.
|
||||
|
||||
+++
|
||||
.. link-button:: ref-deployment-guide
|
||||
:type: ref
|
||||
:text: Deploy on a Ray Cluster
|
||||
:classes: btn-outline-info btn-block
|
||||
---
|
||||
|
||||
**API**
|
||||
^^^
|
||||
|
||||
Get more in-depth information about the various APIs to interact with Ray
|
||||
Clusters, including the :ref:`Ray cluster config YAML and CLI<cluster-config>`,
|
||||
the :ref:`Ray Client API<ray-client>` and the
|
||||
:ref:`Ray job submission API<ray-job-submission-api-ref>`.
|
||||
|
||||
+++
|
||||
.. link-button:: ref-cluster-api
|
||||
:type: ref
|
||||
:text: Read the API Reference
|
||||
:classes: btn-outline-info btn-block
|
||||
|
||||
.. include:: /_includes/clusters/announcement_bottom.rst
|
|
@ -1,382 +0,0 @@
|
|||
.. _jobs-overview:
|
||||
|
||||
==================
|
||||
Ray Job Submission
|
||||
==================
|
||||
|
||||
.. note::
|
||||
|
||||
This component is in **beta**. APIs may change before becoming stable. This feature requires a full installation of Ray using ``pip install "ray[default]"``.
|
||||
|
||||
Ray Job submission is a mechanism to submit locally developed and tested applications to a remote Ray cluster. It simplifies the experience of packaging, deploying, and managing a Ray application.
|
||||
|
||||
|
||||
|
||||
Jump to the :ref:`API Reference<ray-job-submission-api-ref>`, or continue reading for a quick overview.
|
||||
|
||||
Concepts
|
||||
--------
|
||||
|
||||
- **Job**: A Ray application submitted to a Ray cluster for execution. Consists of (1) an entrypoint command and (2) a :ref:`runtime environment<runtime-environments>`, which may contain file and package dependencies.
|
||||
|
||||
- **Job Lifecycle**: When a job is submitted, it runs once to completion or failure. Retries or different runs with different parameters should be handled by the submitter. Jobs are bound to the lifetime of a Ray cluster, so if the cluster goes down, all running jobs on that cluster will be terminated.
|
||||
|
||||
- **Job Manager**: An entity external to the Ray cluster that manages the lifecycle of a job (scheduling, killing, polling status, getting logs, and persisting inputs/outputs), and potentially also manages the lifecycle of Ray clusters. Can be any third-party framework with these abilities, such as Apache Airflow or Kubernetes Jobs.
|
||||
|
||||
Quick Start Example
|
||||
-------------------
|
||||
|
||||
Let's start with a sample job that can be run locally. The following script uses Ray APIs to increment a counter, print its value, and print the version of the ``requests`` module it's using:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# script.py
|
||||
|
||||
import ray
|
||||
import requests
|
||||
|
||||
ray.init()
|
||||
|
||||
@ray.remote
|
||||
class Counter:
|
||||
def __init__(self):
|
||||
self.counter = 0
|
||||
|
||||
def inc(self):
|
||||
self.counter += 1
|
||||
|
||||
def get_counter(self):
|
||||
return self.counter
|
||||
|
||||
counter = Counter.remote()
|
||||
|
||||
for _ in range(5):
|
||||
ray.get(counter.inc.remote())
|
||||
print(ray.get(counter.get_counter.remote()))
|
||||
|
||||
print(requests.__version__)
|
||||
|
||||
Put this file in a local directory of your choice, with filename ``script.py``, so your working directory will look like:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
| your_working_directory ("./")
|
||||
| ├── script.py
|
||||
|
||||
|
||||
Next, start a local Ray cluster:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
❯ ray start --head
|
||||
Local node IP: 127.0.0.1
|
||||
INFO services.py:1360 -- View the Ray dashboard at http://127.0.0.1:8265
|
||||
|
||||
Note the address and port returned in the terminal; this is where we will submit job requests, as explained further in the examples below. If you do not see this, ensure the Ray Dashboard is installed by running :code:`pip install "ray[default]"`.
|
||||
|
||||
At this point, the job is ready to be submitted by one of the :ref:`Ray Job APIs<ray-job-apis>`.
|
||||
Continue on to see examples of running and interacting with this sample job.
|
||||
|
||||
.. _ray-job-apis:
|
||||
|
||||
Ray Job Submission APIs
|
||||
-----------------------
|
||||
|
||||
Ray provides three APIs for job submission:
|
||||
|
||||
* A :ref:`command line interface<ray-job-cli>`, the easiest way to get started.
|
||||
* A :ref:`Python SDK<ray-job-sdk>`, the recommended way to submit jobs programmatically.
|
||||
* An :ref:`HTTP REST API<ray-job-rest-api>`. Both the CLI and SDK call into the REST API under the hood.
|
||||
|
||||
All three APIs for job submission share the following key inputs:
|
||||
|
||||
* **Entrypoint**: The shell command to run the job.
|
||||
|
||||
* Example: :code:`python my_ray_script.py`
|
||||
* Example: :code:`echo hello`
|
||||
|
||||
* **Runtime Environment**: Specifies files, packages, and other dependencies for your job. See :ref:`Runtime Environments<runtime-environments>` for details.
|
||||
|
||||
* Example: ``{working_dir="/data/my_files", pip=["requests", "pendulum==2.1.2"]}``
|
||||
* Of special note: the field :code:`working_dir` specifies the files your job needs to run. The entrypoint command will be run in the remote cluster's copy of the `working_dir`, so for the entrypoint ``python my_ray_script.py``, the file ``my_ray_script.py`` must be in the directory specified by ``working_dir``.
|
||||
|
||||
* If :code:`working_dir` is a local directory: It will be automatically zipped and uploaded to the target Ray cluster, then unpacked to where your submitted application runs. This option has a size limit of 100 MB and is recommended for rapid iteration and experimentation.
|
||||
* If :code:`working_dir` is a remote URI hosted on S3, GitHub or others: It will be downloaded and unpacked to where your submitted application runs. This option has no size limit and is recommended for production use. For details, see :ref:`remote-uris`.
|
||||
|
||||
|
||||
.. _ray-job-cli:
|
||||
|
||||
CLI
|
||||
^^^
|
||||
|
||||
The easiest way to get started with Ray job submission is to use the Job Submission CLI.
|
||||
|
||||
Jump to the :ref:`API Reference<ray-job-submission-cli-ref>`, or continue reading for a walkthrough.
|
||||
|
||||
|
||||
Using the CLI on a local cluster
|
||||
""""""""""""""""""""""""""""""""
|
||||
|
||||
First, start a local Ray cluster (e.g. with ``ray start --head``) and open a terminal (on the head node, which is your local machine).
|
||||
|
||||
Next, set the :code:`RAY_ADDRESS` environment variable:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
export RAY_ADDRESS="http://127.0.0.1:8265"
|
||||
|
||||
This tells the jobs CLI how to find your Ray cluster. Here we are specifying port ``8265`` on the head node, the port that the Ray Dashboard listens on.
|
||||
(Note that this port is different from the port used to connect to the cluster via :ref:`Ray Client <ray-client>`, which is ``10001`` by default.)
|
||||
|
||||
Now you are ready to use the CLI.
|
||||
Here are some examples of CLI commands from the Quick Start example and their output:
|
||||
|
||||
.. code-block::
|
||||
|
||||
❯ ray job submit --runtime-env-json='{"working_dir": "./", "pip": ["requests==2.26.0"]}' -- python script.py
|
||||
2021-12-01 23:04:52,672 INFO cli.py:25 -- Creating JobSubmissionClient at address: http://127.0.0.1:8265
|
||||
2021-12-01 23:04:52,809 INFO sdk.py:144 -- Uploading package gcs://_ray_pkg_bbcc8ca7e83b4dc0.zip.
|
||||
2021-12-01 23:04:52,810 INFO packaging.py:352 -- Creating a file package for local directory './'.
|
||||
2021-12-01 23:04:52,878 INFO cli.py:105 -- Job submitted successfully: raysubmit_RXhvSyEPbxhcXtm6.
|
||||
2021-12-01 23:04:52,878 INFO cli.py:106 -- Query the status of the job using: `ray job status raysubmit_RXhvSyEPbxhcXtm6`.
|
||||
|
||||
❯ ray job status raysubmit_RXhvSyEPbxhcXtm6
|
||||
2021-12-01 23:05:00,356 INFO cli.py:25 -- Creating JobSubmissionClient at address: http://127.0.0.1:8265
|
||||
2021-12-01 23:05:00,371 INFO cli.py:127 -- Job status for 'raysubmit_RXhvSyEPbxhcXtm6': PENDING.
|
||||
2021-12-01 23:05:00,371 INFO cli.py:129 -- Job has not started yet, likely waiting for the runtime_env to be set up.
|
||||
|
||||
❯ ray job status raysubmit_RXhvSyEPbxhcXtm6
|
||||
2021-12-01 23:05:37,751 INFO cli.py:25 -- Creating JobSubmissionClient at address: http://127.0.0.1:8265
|
||||
2021-12-01 23:05:37,764 INFO cli.py:127 -- Job status for 'raysubmit_RXhvSyEPbxhcXtm6': SUCCEEDED.
|
||||
2021-12-01 23:05:37,764 INFO cli.py:129 -- Job finished successfully.
|
||||
|
||||
❯ ray job logs raysubmit_RXhvSyEPbxhcXtm6
|
||||
2021-12-01 23:05:59,026 INFO cli.py:25 -- Creating JobSubmissionClient at address: http://127.0.0.1:8265
|
||||
2021-12-01 23:05:23,037 INFO worker.py:851 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379
|
||||
(pid=runtime_env) 2021-12-01 23:05:23,212 WARNING conda.py:54 -- Injecting /Users/jiaodong/Workspace/ray/python to environment /tmp/ray/session_2021-12-01_23-04-44_771129_7693/runtime_resources/conda/99305e1352b2dcc9d5f38c2721c7c1f1cc0551d5 because _inject_current_ray flag is on.
|
||||
(pid=runtime_env) 2021-12-01 23:05:23,212 INFO conda.py:328 -- Finished setting up runtime environment at /tmp/ray/session_2021-12-01_23-04-44_771129_7693/runtime_resources/conda/99305e1352b2dcc9d5f38c2721c7c1f1cc0551d5
|
||||
(pid=runtime_env) 2021-12-01 23:05:23,213 INFO working_dir.py:85 -- Setup working dir for gcs://_ray_pkg_bbcc8ca7e83b4dc0.zip
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
5
|
||||
2.26.0
|
||||
|
||||
❯ ray job list
|
||||
{'raysubmit_AYhLMgDJ6XBQFvFP': JobInfo(status='SUCCEEDED', message='Job finished successfully.', error_type=None, start_time=1645908622, end_time=1645908623, metadata={}, runtime_env={}),
|
||||
'raysubmit_su9UcdUviUZ86b1t': JobInfo(status='SUCCEEDED', message='Job finished successfully.', error_type=None, start_time=1645908669, end_time=1645908670, metadata={}, runtime_env={})}
|
||||
|
||||
.. warning::
|
||||
|
||||
When using the CLI, do not wrap the entrypoint command in quotes. For example, use
|
||||
``ray job submit --working_dir="." -- python script.py`` instead of ``ray job submit --working_dir="." -- "python script.py"``.
|
||||
Otherwise you may encounter the error ``/bin/sh: 1: python script.py: not found``.
|
||||
|
||||
.. tip::
|
||||
|
||||
If your job is stuck in `PENDING`, the runtime environment installation may be stuck.
|
||||
(For example, the `pip` installation or `working_dir` download may be stalled due to internet issues.)
|
||||
You can check the installation logs at `/tmp/ray/session_latest/logs/runtime_env_setup-*.log` for details.
|
||||
|
||||
Using the CLI on a remote cluster
|
||||
"""""""""""""""""""""""""""""""""
|
||||
|
||||
Above, we ran the "Quick Start" example on a local Ray cluster. When connecting to a `remote` cluster via the CLI, you need to be able to access the Ray Dashboard port of the cluster over HTTP.
|
||||
|
||||
One way to do this is to port forward ``127.0.0.1:8265`` on your local machine to ``127.0.0.1:8265`` on the head node.
|
||||
If you started your remote cluster with the :ref:`Ray Cluster Launcher <ref-cluster-quick-start>`, then the port forwarding can be set up automatically using the ``ray dashboard`` command (see :ref:`monitor-cluster` for details).
|
||||
|
||||
To use this, run the following command on your local machine, where ``cluster.yaml`` is the configuration file you used to launch your cluster:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ray dashboard cluster.yaml
|
||||
|
||||
Once this is running, check that you can view the Ray Dashboard in your local browser at ``http://127.0.0.1:8265``.
|
||||
|
||||
Next, set the :code:`RAY_ADDRESS` environment variable:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
export RAY_ADDRESS="http://127.0.0.1:8265"
|
||||
|
||||
(Note that this port is different from the port used to connect to the cluster via :ref:`Ray Client <ray-client>`, which is ``10001`` by default.)
|
||||
|
||||
Now you will be able to use the Jobs CLI on your local machine as in the example above to interact with your remote Ray cluster.
|
||||
|
||||
Using the CLI on Kubernetes
|
||||
"""""""""""""""""""""""""""
|
||||
|
||||
The instructions above still apply, but you can achieve the dashboard port forwarding using ``kubectl port-forward``:
|
||||
https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/
|
||||
|
||||
Alternatively, you can set up Ingress to the dashboard port of the cluster over HTTP: https://kubernetes.io/docs/concepts/services-networking/ingress/
|
||||
|
||||
|
||||
.. _ray-job-sdk:
|
||||
|
||||
Python SDK
|
||||
^^^^^^^^^^
|
||||
|
||||
The Job Submission Python SDK is the recommended way to submit jobs programmatically. Jump to the :ref:`API Reference<ray-job-submission-sdk-ref>`, or continue reading for a quick overview.
|
||||
|
||||
SDK calls are made via a ``JobSubmissionClient`` object. To initialize the client, provide the Ray cluster head node address and the port used by the Ray Dashboard (``8265`` by default). For this example, we'll use a local Ray cluster, but the same example will work for remote Ray cluster addresses.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.job_submission import JobSubmissionClient
|
||||
|
||||
# If using a remote cluster, replace 127.0.0.1 with the head node's IP address.
|
||||
client = JobSubmissionClient("http://127.0.0.1:8265")
|
||||
|
||||
Then we can submit our application to the Ray cluster via the Job SDK.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
job_id = client.submit_job(
|
||||
# Entrypoint shell command to execute
|
||||
entrypoint="python script.py",
|
||||
# Runtime environment for the job, specifying a working directory and pip package
|
||||
runtime_env={
|
||||
"working_dir": "./",
|
||||
"pip": ["requests==2.26.0"]
|
||||
}
|
||||
)
|
||||
|
||||
.. tip::
|
||||
|
||||
By default, the Ray job server will generate a new ``job_id`` and return it, but you can alternatively choose a unique ``job_id`` string first and pass it into :code:`submit_job`.
|
||||
In this case, the Job will be executed with your given id, and will throw an error if the same ``job_id`` is submitted more than once for the same Ray cluster.
|
||||
|
||||
Now we can write a simple polling loop that checks the job status until it reaches a terminal state (namely, ``JobStatus.SUCCEEDED``, ``JobStatus.STOPPED``, or ``JobStatus.FAILED``), and gets the logs at the end.
|
||||
We expect to see the numbers printed from our actor, as well as the correct version of the :code:`requests` module specified in the ``runtime_env``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.job_submission import JobStatus
|
||||
import time
|
||||
|
||||
def wait_until_finish(job_id):
|
||||
start = time.time()
|
||||
timeout = 5
|
||||
while time.time() - start <= timeout:
|
||||
status = client.get_job_status(job_id)
|
||||
print(f"status: {status}")
|
||||
if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
wait_until_finish(job_id)
|
||||
logs = client.get_job_logs(job_id)
|
||||
|
||||
The output should be as follows:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
status: JobStatus.PENDING
|
||||
status: JobStatus.RUNNING
|
||||
status: JobStatus.SUCCEEDED
|
||||
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
5
|
||||
|
||||
2.26.0
|
||||
|
||||
.. tip::
|
||||
|
||||
Instead of a local directory (``"./"`` in this example), you can also specify remote URIs for your job's working directory, such as S3 buckets or Git repositories. See :ref:`remote-uris` for details.
|
||||
|
||||
A submitted job can be stopped by the user before it finishes executing.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
job_id = client.submit_job(
|
||||
# Entrypoint shell command to execute
|
||||
entrypoint="python -c 'import time; time.sleep(60)'",
|
||||
runtime_env={}
|
||||
)
|
||||
wait_until_finish(job_id)
|
||||
client.stop_job(job_id)
|
||||
wait_until_finish(job_id)
|
||||
logs = client.get_job_logs(job_id)
|
||||
|
||||
To get information about all jobs, call ``client.list_jobs()``. This returns a ``Dict[str, JobInfo]`` object mapping Job IDs to their information.
|
||||
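For example, a small sketch that reuses the ``client`` from above and prints a summary line per job:

.. code-block:: python

    jobs = client.list_jobs()
    for job_id, info in jobs.items():
        print(job_id, info.status, info.message)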
|
||||
For full details, see the :ref:`API Reference<ray-job-submission-sdk-ref>`.
|
||||
|
||||
|
||||
.. _ray-job-rest-api:
|
||||
|
||||
REST API
|
||||
^^^^^^^^
|
||||
|
||||
Under the hood, both the Python SDK and the CLI make HTTP calls to the job server running on the Ray head node. You can also directly send requests to the corresponding endpoints via HTTP if needed:
|
||||
|
||||
**Submit Job**
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
|
||||
resp = requests.post(
|
||||
"http://127.0.0.1:8265/api/jobs/",
|
||||
json={
|
||||
"entrypoint": "echo hello",
|
||||
"runtime_env": {},
|
||||
"job_id": None,
|
||||
"metadata": {"job_submission_id": "123"}
|
||||
}
|
||||
)
|
||||
rst = json.loads(resp.text)
|
||||
job_id = rst["job_id"]
|
||||
|
||||
**Query and poll for Job status**
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
start = time.time()
|
||||
while time.time() - start <= 10:
|
||||
resp = requests.get(
|
||||
"http://127.0.0.1:8265/api/jobs/<job_id>"
|
||||
)
|
||||
rst = json.loads(resp.text)
|
||||
status = rst["status"]
|
||||
print(f"status: {status}")
|
||||
if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
**Query for logs**
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
resp = requests.get(
|
||||
"http://127.0.0.1:8265/api/jobs/<job_id>/logs"
|
||||
)
|
||||
rst = json.loads(resp.text)
|
||||
logs = rst["logs"]
|
||||
|
||||
**List all jobs**
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
resp = requests.get(
|
||||
"http://127.0.0.1:8265/api/jobs/"
|
||||
)
|
||||
print(resp.json())
|
||||
# {"job_id": {"metadata": ..., "status": ..., "message": ...}, ...}
|
||||
|
||||
|
||||
Job Submission Architecture
|
||||
----------------------------
|
||||
|
||||
The following diagram shows the underlying structure and steps for each submitted job.
|
||||
|
||||
.. image:: https://raw.githubusercontent.com/ray-project/images/master/docs/job/job_submission_arch_v2.png
|
|
@ -1,75 +0,0 @@
|
|||
.. _ray-job-submission-api-ref:
|
||||
|
||||
Ray Job Submission API
|
||||
======================
|
||||
|
||||
For an overview with examples see :ref:`Ray Job Submission<jobs-overview>`.
|
||||
|
||||
.. _ray-job-submission-cli-ref:
|
||||
|
||||
Job Submission CLI
|
||||
------------------
|
||||
|
||||
.. _ray-job-submit-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:submit
|
||||
:prog: ray job submit
|
||||
|
||||
.. warning::
|
||||
|
||||
When using the CLI, do not wrap the entrypoint command in quotes. For example, use
|
||||
``ray job submit --working_dir="." -- python script.py`` instead of ``ray job submit --working_dir="." -- "python script.py"``.
|
||||
Otherwise you may encounter the error ``/bin/sh: 1: python script.py: not found``.
|
||||
|
||||
.. _ray-job-status-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:status
|
||||
:prog: ray job status
|
||||
:show-nested:
|
||||
|
||||
.. _ray-job-stop-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:stop
|
||||
:prog: ray job stop
|
||||
:show-nested:
|
||||
|
||||
.. _ray-job-logs-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:logs
|
||||
:prog: ray job logs
|
||||
:show-nested:
|
||||
|
||||
.. _ray-job-list-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:list
|
||||
:prog: ray job list
|
||||
:show-nested:
|
||||
|
||||
.. _ray-job-submission-sdk-ref:
|
||||
|
||||
Job Submission SDK
|
||||
------------------
|
||||
|
||||
.. _job-submission-client-ref:
|
||||
|
||||
JobSubmissionClient
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: ray.job_submission.JobSubmissionClient
|
||||
:members:
|
||||
|
||||
.. _job-status-ref:
|
||||
|
||||
JobStatus
|
||||
~~~~~~~~~
|
||||
|
||||
.. autoclass:: ray.job_submission.JobStatus
|
||||
:members:
|
||||
|
||||
.. _job-info-ref:
|
||||
|
||||
JobInfo
|
||||
~~~~~~~
|
||||
|
||||
.. autoclass:: ray.job_submission.JobInfo
|
||||
:members:
|
|
@ -1,107 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _cluster-key-concepts:
|
||||
|
||||
Key Concepts
|
||||
============
|
||||
|
||||
Cluster
|
||||
-------
|
||||
|
||||
A Ray cluster is a set of one or more nodes that are running Ray and share the
|
||||
same :ref:`head node<cluster-node-types>`.
|
||||
|
||||
.. _cluster-node-types:
|
||||
|
||||
Node types
|
||||
----------
|
||||
|
||||
A Ray cluster consists of a :ref:`head node<cluster-head-node>` and a set of
|
||||
:ref:`worker nodes<cluster-worker-node>`.
|
||||
|
||||
.. image:: ray-cluster.jpg
|
||||
:align: center
|
||||
:width: 600px
|
||||
|
||||
.. _cluster-head-node:
|
||||
|
||||
Head node
|
||||
~~~~~~~~~
|
||||
|
||||
The head node is the first node started by the
|
||||
:ref:`Ray cluster launcher<cluster-launcher>` when trying to launch a Ray
|
||||
cluster. Among other things, the head node holds the :ref:`Global Control Store
|
||||
(GCS)<memory>` and runs the :ref:`autoscaler<cluster-autoscaler>`. Once the head
|
||||
node is started, it will be responsible for launching any additional
|
||||
:ref:`worker nodes<cluster-worker-node>`. The head node itself will also execute
|
||||
tasks and actors to utilize its capacity.
|
||||
|
||||
.. _cluster-worker-node:
|
||||
|
||||
Worker node
|
||||
~~~~~~~~~~~
|
||||
|
||||
A worker node is any node in the Ray cluster that is not functioning as the head node.
|
||||
Therefore, worker nodes are simply responsible for executing tasks and actors.
|
||||
When a worker node is launched, it will be given the address of the head node to
|
||||
form a cluster.
|
||||
|
||||
.. _cluster-launcher:
|
||||
|
||||
Cluster launcher
|
||||
----------------
|
||||
|
||||
The cluster launcher is a process responsible for bootstrapping the Ray cluster
|
||||
by launching the :ref:`head node<cluster-head-node>`. For more information on how
|
||||
to use the cluster launcher, refer to
|
||||
:ref:`cluster launcher CLI commands documentation<cluster-commands>` and the
|
||||
corresponding :ref:`documentation for the configuration file<cluster-config>`.
|
||||
|
||||
.. _cluster-autoscaler:
|
||||
|
||||
Autoscaler
|
||||
----------
|
||||
|
||||
The autoscaler is a process that runs on the :ref:`head node<cluster-head-node>`
|
||||
and is responsible for adding or removing :ref:`worker nodes<cluster-worker-node>`
|
||||
to meet the needs of the Ray workload while matching the specification in the
|
||||
:ref:`cluster config file<cluster-config>`. In particular, if the resource
|
||||
demands of the Ray workload exceed the current capacity of the cluster, the
|
||||
autoscaler will try to add nodes. Conversely, if a node is idle for long enough,
|
||||
the autoscaler will remove it from the cluster. To learn more about autoscaling,
|
||||
refer to the :ref:`Ray cluster deployment guide<deployment-guide-autoscaler>`.
|
||||
|
||||
Ray Client
|
||||
----------
|
||||
The Ray Client is an API that connects a Python script to a remote Ray cluster.
|
||||
To learn more about the Ray Client, you can refer to the :ref:`documentation<ray-client>`.
|
||||
|
||||
Job submission
|
||||
--------------
|
||||
|
||||
Ray Job submission is a mechanism to submit locally developed and tested applications
|
||||
to a remote Ray cluster. It simplifies the experience of packaging, deploying,
|
||||
and managing a Ray application. To learn more about Ray jobs, refer to the
|
||||
:ref:`documentation<ray-job-submission-api-ref>`.
|
||||
|
||||
Cloud clusters
|
||||
--------------
|
||||
|
||||
If you’re using AWS, GCP, Azure (community-maintained) or Aliyun (community-maintained), you can use the
|
||||
:ref:`Ray cluster launcher<cluster-launcher>` to launch cloud clusters, which
|
||||
greatly simplifies the cluster setup process.
|
||||
|
||||
Cluster managers
|
||||
----------------
|
||||
|
||||
You can simplify the process of managing Ray clusters using a number of popular
|
||||
cluster managers including :ref:`Kubernetes<kuberay-index>`,
|
||||
:ref:`YARN<ray-yarn-deploy>`, :ref:`Slurm<ray-slurm-deploy>` and :ref:`LSF<ray-LSF-deploy>`.
|
||||
|
||||
Kubernetes (K8s) operator
|
||||
-------------------------
|
||||
|
||||
Deployments of Ray on Kubernetes are managed by the Ray Kubernetes Operator. The
|
||||
Ray Operator makes it easy to deploy clusters of Ray pods within a Kubernetes
|
||||
cluster. To learn more about the K8s operator, refer to
|
||||
the :ref:`documentation<kuberay-index>`.
|
|
@ -1,243 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _k8s-advanced:
|
||||
|
||||
Ray Operator Advanced Configuration
|
||||
===================================
|
||||
This document covers configuration options and other details concerning autoscaling Ray clusters on Kubernetes.
|
||||
We recommend first reading this :ref:`introductory guide<ray-k8s-deploy>`.
|
||||
|
||||
.. _helm-config:
|
||||
|
||||
Helm chart configuration
|
||||
------------------------
|
||||
This section discusses ``RayCluster`` configuration options exposed in the Ray Helm chart's `values.yaml`_ file.
|
||||
The default settings in ``values.yaml`` were chosen for the purposes of demonstration.
|
||||
For production use cases, the values should be modified. For example, you will probably want to increase Ray Pod resource requests.
|
||||
|
||||
Setting custom chart values
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
To configure Helm chart values, you can pass in a custom values ``yaml`` and/or set individual fields.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Pass in a custom values yaml.
|
||||
$ helm install example-cluster -f custom_values.yaml ./ray
|
||||
# Set custom values on the command line.
|
||||
$ helm install example-cluster --set image=rayproject/ray:1.2.0 ./ray
|
||||
|
||||
Refer to the `Helm docs`_ for more information.
|
||||
|
||||
Ray cluster configuration
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
A :ref:`Ray cluster<cluster-index>` consists of a head node and a collection of worker nodes.
|
||||
When deploying Ray on Kubernetes, each Ray node runs in its own Kubernetes Pod.
|
||||
The ``podTypes`` field of ``values.yaml`` represents the pod configurations available for use as nodes in the Ray cluster.
|
||||
The key of each ``podType`` is a user-defined name. The field ``headPodType`` identifies the name of the ``podType`` to use for the Ray head node.
|
||||
The rest of the ``podTypes`` are used as configuration for the Ray worker nodes.
|
||||
|
||||
Each ``podType`` specifies ``minWorkers`` and ``maxWorkers`` fields.
|
||||
The autoscaler will try to maintain at least ``minWorkers`` of the ``podType`` and can scale up to
|
||||
``maxWorkers`` according to the resource demands of the Ray workload. A common pattern is to specify ``minWorkers`` = ``maxWorkers`` = 0
|
||||
for the head ``podType``; this signals that the ``podType`` is to be used only for the head node.
|
||||
You can use `helm upgrade`_ to adjust the fields ``minWorkers`` and ``maxWorkers`` without :ref:`restarting<k8s-restarts>` the Ray cluster.
|
||||
|
||||
The fields ``CPU``, ``GPU``, ``memory``, and ``nodeSelector`` configure the Kubernetes ``PodSpec`` to use for nodes
|
||||
of the ``podType``. The ``image`` field determines the Ray container image used by all nodes in the Ray cluster.
|
||||
|
||||
The ``rayResources`` field of each ``podType`` can be used to signal the presence of custom resources to Ray.
|
||||
To schedule Ray tasks and actors that use custom hardware resources, ``rayResources`` can be used in conjunction with
|
||||
``nodeSelector``:
|
||||
|
||||
- Use ``nodeSelector`` to constrain workers of a ``podType`` to run on a Kubernetes Node with specialized hardware (e.g. a particular GPU accelerator).
- Signal availability of the hardware for that ``podType`` with ``rayResources: {"custom_resource": 3}``.
- Schedule a Ray task or actor to use that resource with ``@ray.remote(resources={"custom_resource": 1})``, as in the sketch below.
|
||||
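Here is a minimal sketch of that last step; the resource name ``custom_resource`` matches the ``rayResources`` entry above, and the task body is illustrative:

.. code-block:: python

    import ray

    ray.init()

    @ray.remote(resources={"custom_resource": 1})
    def on_special_hardware():
        # Only scheduled on nodes of a podType that advertises
        # "custom_resource" via rayResources.
        return "ran on a node with the custom resource"

    print(ray.get(on_special_hardware.remote()))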
|
||||
By default, the fields ``CPU``, ``GPU``, and ``memory`` are used to configure the CPU, GPU, and memory resources advertised to Ray.
However, ``rayResources`` can be used to override this behavior. For example, ``rayResources: {"CPU": 0}`` can be set for the head ``podType``
to avoid scheduling tasks on the Ray head.
|
||||
|
||||
Refer to the documentation in `values.yaml`_ for more details.
|
||||
|
||||
.. note::
|
||||
|
||||
If your application could benefit from additional configuration options in the Ray Helm chart
(e.g. exposing more PodSpec fields), feel free to open a `feature request`_ on
the Ray GitHub or a `discussion thread`_ on the Ray forums.
|
||||
|
||||
For complete configurability, it is also possible to launch a Ray cluster :ref:`without the Helm chart<no-helm>`
or to modify the Helm chart.
|
||||
|
||||
.. note::
|
||||
|
||||
Some things to keep in mind about the scheduling of Ray worker pods and Ray tasks/actors:
|
||||
|
||||
1. The Ray Autoscaler executes scaling decisions by sending pod creation requests to the Kubernetes API server.
|
||||
If your Kubernetes cluster cannot accommodate more worker pods of a given ``podType``, requested pods will enter
|
||||
a ``Pending`` state until the pod can be scheduled or a `timeout`_ expires.
|
||||
|
||||
2. If a Ray task requests more resources than available in any ``podType``, the Ray task cannot be scheduled.
|
||||
|
||||
|
||||
Running multiple Ray clusters
|
||||
-----------------------------
|
||||
The Ray Operator can manage multiple Ray clusters running within a single Kubernetes cluster.
|
||||
Since Helm does not support sharing resources between different releases, an additional Ray cluster
|
||||
must be launched in a Helm release separate from the release used to launch the Operator.
|
||||
|
||||
To enable launching with multiple Ray Clusters, the Ray Helm chart includes two flags:
|
||||
|
||||
- ``operatorOnly``: Start the Operator without launching a Ray cluster.
|
||||
- ``clusterOnly``: Create a RayCluster custom resource without installing the Operator. (If the Operator has already been installed, a new Ray cluster will be launched.)
|
||||
|
||||
The following commands will install the Operator and two Ray Clusters in
|
||||
three separate Helm releases:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Install the operator in its own Helm release.
|
||||
$ helm install ray-operator --set operatorOnly=true ./ray
|
||||
|
||||
# Install a Ray cluster in a new namespace "ray".
|
||||
$ helm -n ray install example-cluster --set clusterOnly=true ./ray --create-namespace
|
||||
|
||||
# Install a second Ray cluster. Launch the second cluster without any workers.
|
||||
$ helm -n ray install example-cluster2 \
|
||||
--set podTypes.rayWorkerType.minWorkers=0 --set clusterOnly=true ./ray
|
||||
|
||||
# Examine the pods in both clusters.
|
||||
$ kubectl -n ray get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
example-cluster-ray-head-type-v6tt9 1/1 Running 0 35s
|
||||
example-cluster-ray-worker-type-fmn4k 1/1 Running 0 22s
|
||||
example-cluster-ray-worker-type-r6m7k 1/1 Running 0 22s
|
||||
example-cluster2-ray-head-type-tj666 1/1 Running 0 15s
|
||||
|
||||
Alternatively, the Operator and one of the Ray Clusters can be installed in the same Helm release:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Start the operator. Install a Ray cluster in a new namespace.
|
||||
helm -n ray install example-cluster --create-namespace ./ray
|
||||
|
||||
# Start another Ray cluster.
|
||||
# The cluster will be managed by the operator created in the last command.
|
||||
$ helm -n ray install example-cluster2 \
|
||||
--set podTypes.rayWorkerType.minWorkers=0 --set clusterOnly=true ./ray
|
||||
|
||||
|
||||
The Operator pod outputs autoscaling logs for all of the Ray clusters it manages.
|
||||
Each line of output is prefixed by the string :code:`<cluster name>,<namespace>`.
|
||||
This string can be used to filter for a specific Ray cluster's logs:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# The last 100 lines of logging output for the cluster with name "example-cluster2" in namespace "ray":
|
||||
$ kubectl logs \
|
||||
$(kubectl get pod -l cluster.ray.io/component=operator -o custom-columns=:metadata.name) \
|
||||
| grep example-cluster2,ray | tail -n 100
|
||||
|
||||
.. _k8s-cleanup:
|
||||
|
||||
Cleaning up resources
|
||||
---------------------
|
||||
When cleaning up,
|
||||
**RayCluster resources must be deleted before the Operator deployment is deleted**.
|
||||
This is because the Operator must remove a `finalizer`_ from the ``RayCluster`` resource to allow
|
||||
deletion of the resource to complete.
|
||||
|
||||
If the Operator and ``RayCluster`` are created as part of the same Helm release,
|
||||
the ``RayCluster`` must be deleted :ref:`before<k8s-cleanup-basic>` uninstalling the Helm release.
|
||||
If the Operator and one or more ``RayClusters`` are created in multiple Helm releases,
|
||||
the ``RayCluster`` releases must be uninstalled before the Operator release.
|
||||
|
||||
To remedy a situation where the Operator deployment was deleted first and ``RayCluster`` deletion is hanging, try one of the following:
|
||||
|
||||
- Manually delete the ``RayCluster``'s finalizers with ``kubectl edit`` or ``kubectl patch``.
|
||||
- Restart the Operator so that it can remove ``RayCluster`` finalizers. Then remove the Operator.
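For the first option, here is a minimal sketch using ``kubectl patch``; the cluster name ``example-cluster`` and namespace ``ray`` are the placeholders used in the examples above.

.. code-block:: shell

    # Clear the finalizers on a RayCluster whose deletion is stuck.
    # Only do this if the Operator is no longer running to remove them for you.
    $ kubectl -n ray patch raycluster example-cluster \
        --type merge -p '{"metadata":{"finalizers":[]}}'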
|
||||
|
||||
Cluster-scoped vs. namespaced operators
|
||||
---------------------------------------
|
||||
By default, the Ray Helm chart installs a ``cluster-scoped`` operator.
|
||||
This means that the operator manages all Ray clusters in your Kubernetes cluster, across all namespaces.
|
||||
The namespace into which the Operator Deployment is launched is determined by the chart field ``operatorNamespace``.
|
||||
If this field is unset, the operator is launched into namespace ``default``.
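As an illustration, the following sketch installs only the Operator into a dedicated namespace; the namespace name ``ray-operator-ns`` is just an example.

.. code-block:: shell

    # Create the target namespace if it does not already exist.
    $ kubectl create namespace ray-operator-ns

    # Install only the Operator, launched into that namespace.
    $ helm install ray-operator --set operatorOnly=true \
        --set operatorNamespace=ray-operator-ns ./ray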
|
||||
|
||||
It is also possible to run a ``namespace-scoped`` Operator.
|
||||
This means that the Operator is launched into the namespace of the Helm release and manages only
|
||||
Ray clusters in that namespace. To run a namespaced Operator, add the flag ``--set namespacedOperator=True``
|
||||
to your Helm install command.
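For example, a sketch that combines this flag with the install commands shown earlier:

.. code-block:: shell

    # Install a namespace-scoped Operator together with a Ray cluster in the
    # namespace "ray". This Operator only manages Ray clusters in that namespace.
    $ helm -n ray install example-cluster --set namespacedOperator=True \
        --create-namespace ./ray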
|
||||
|
||||
.. warning::
|
||||
Do not simultaneously run namespaced and cluster-scoped Ray Operators within one Kubernetes cluster, as this will lead to unintended effects.
|
||||
|
||||
.. _no-helm:
|
||||
|
||||
Deploying without Helm
|
||||
----------------------
|
||||
It is possible to deploy the Ray Operator without Helm.
|
||||
The necessary configuration files are available on the `Ray GitHub`_.
|
||||
The following manifests should be installed in the order listed:
|
||||
|
||||
- The `RayCluster CRD`_.
|
||||
- The Ray Operator, `namespaced`_ or `cluster-scoped`_. Note that the cluster-scoped operator is configured to run in the namespace ``default``. Modify as needed.
|
||||
- A RayCluster custom resource: `example`_.
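A sketch of applying these manifests with ``kubectl``, assuming you have cloned the Ray repository and are running the commands from its root:

.. code-block:: shell

    # 1. The RayCluster CRD.
    $ kubectl apply -f deploy/charts/ray/crds/cluster_crd.yaml

    # 2. The Ray Operator (cluster-scoped variant shown here).
    $ kubectl apply -f deploy/components/operator_cluster_scoped.yaml

    # 3. An example RayCluster custom resource.
    $ kubectl apply -f deploy/components/example_cluster.yaml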
|
||||
|
||||
Ray Cluster Lifecycle
|
||||
---------------------
|
||||
|
||||
.. _k8s-restarts:
|
||||
|
||||
Restart behavior
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
The Ray cluster will restart under the following circumstances:
|
||||
|
||||
- There is an error in the cluster's autoscaling process. This will happen if the Ray head node goes down.
|
||||
- There has been a change to the Ray head pod configuration. In terms of the Ray Helm chart, this means either ``image`` or one of the following fields of the head's ``podType`` has been modified: ``CPU``, ``GPU``, ``memory``, ``nodeSelector``.
|
||||
|
||||
Similarly, all workers of a given ``podType`` will be discarded if
|
||||
|
||||
- There has been a change to ``image`` or one of the following fields of the ``podType``: ``CPU``, ``GPU``, ``memory``, ``nodeSelector``.
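As an illustration, pushing such a configuration change with `helm upgrade`_ is enough to trigger this behavior. The sketch below assumes the worker ``podType`` is named ``rayWorkerType``, as in the examples above.

.. code-block:: shell

    # Changing the memory of the worker podType causes all existing
    # workers of that type to be discarded and recreated.
    $ helm -n ray upgrade example-cluster --set podTypes.rayWorkerType.memory=2Gi ./ray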
|
||||
|
||||
Status information
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Running ``kubectl -n <namespace> get raycluster`` will show all Ray clusters in the namespace with status information.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
kubectl -n ray get rayclusters
|
||||
NAME STATUS RESTARTS AGE
|
||||
example-cluster Running 0 9s
|
||||
|
||||
The ``STATUS`` column reports the RayCluster's ``status.phase`` field. The following values are possible:
|
||||
|
||||
- ``Empty/nil``: This means the RayCluster resource has not yet been registered by the Operator.
|
||||
- ``Updating``: The Operator is launching the Ray cluster or processing an update to the cluster's configuration.
|
||||
- ``Running``: The Ray cluster's autoscaling process is running in a normal state.
|
||||
- ``AutoscalingExceptionRecovery``: The Ray cluster's autoscaling process has crashed. Ray processes will restart. This can happen if the Ray head node goes down.
|
||||
- ``Error``: There was an unexpected error while updating the Ray cluster. (The Ray maintainers would be grateful if you file a `bug report`_ with operator logs.)
|
||||
|
||||
The ``RESTARTS`` column reports the RayCluster's ``status.autoscalerRetries`` field. This tracks the number of times the cluster has restarted due to an autoscaling error.
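If you need these fields in a script, you can read them directly with ``kubectl``. A small sketch, using the ``example-cluster`` name from above:

.. code-block:: shell

    # Print the phase and the autoscaler retry count for one cluster.
    $ kubectl -n ray get raycluster example-cluster \
        -o jsonpath='{.status.phase}{"\n"}{.status.autoscalerRetries}{"\n"}'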
|
||||
|
||||
Questions or Issues?
|
||||
--------------------
|
||||
|
||||
.. include:: /_includes/_help.rst
|
||||
|
||||
.. _`RayCluster CRD`: https://github.com/ray-project/ray/tree/master/deploy/charts/ray/crds/cluster_crd.yaml
|
||||
.. _`finalizer` : https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#finalizers
|
||||
.. _`namespaced`: https://github.com/ray-project/ray/tree/master/deploy/components/operator_namespaced.yaml
|
||||
.. _`cluster-scoped`: https://github.com/ray-project/ray/tree/master/deploy/components/operator_cluster_scoped.yaml
|
||||
.. _`example`: https://github.com/ray-project/ray/tree/master/deploy/components/example_cluster.yaml
|
||||
.. _`values.yaml`: https://github.com/ray-project/ray/tree/master/deploy/charts/ray/values.yaml
|
||||
.. _`bug report`: https://github.com/ray-project/ray/issues/new?assignees=&labels=bug%2C+triage&template=bug_report.md&title=
|
||||
.. _`helm upgrade`: https://helm.sh/docs/helm/helm_upgrade/
|
||||
.. _`feature request`: https://github.com/ray-project/ray/issues/new?assignees=&labels=enhancement&template=feature_request.md&title=
|
||||
.. _`discussion thread`: https://discuss.ray.io/c/ray-clusters/ray-kubernetes/11
|
||||
.. _`timeout`: https://github.com/ray-project/ray/blob/b08b2c5103c634c680de31b237b2bfcceb9bc150/python/ray/autoscaler/_private/constants.py#L22
|
||||
.. _`Helm docs`: https://helm.sh/docs/helm/helm_install/
|
||||
.. _`Ray GitHub`: https://github.com/ray-project/ray/tree/master/deploy/components/
|
|
@ -1,93 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _k8s-gpus:
|
||||
|
||||
GPU Usage with Kubernetes
|
||||
=========================
|
||||
This document provides some notes on GPU usage with Kubernetes.
|
||||
|
||||
To use GPUs on Kubernetes, you will need to both configure your Kubernetes setup and add additional values to your Ray cluster configuration.
|
||||
|
||||
For relevant documentation for GPU usage on different clouds, see instructions for `GKE`_, for `EKS`_, and for `AKS`_.
|
||||
|
||||
The `Ray Docker Hub <https://hub.docker.com/r/rayproject/>`_ hosts CUDA-based images packaged with Ray for use in Kubernetes pods.
|
||||
For example, the image ``rayproject/ray-ml:nightly-gpu`` is ideal for running GPU-based ML workloads with the most recent nightly build of Ray.
|
||||
Read :ref:`here<docker-images>` for further details on Ray images.
|
||||
|
||||
Using Nvidia GPUs requires specifying the relevant resource `limits` in the container fields of your Kubernetes configurations.
|
||||
(Kubernetes `sets <https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/#using-device-plugins>`_
|
||||
the GPU request equal to the limit.) The configuration for a pod running a Ray GPU image and
|
||||
using one Nvidia GPU looks like this:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
    apiVersion: v1
    kind: Pod
    metadata:
      generateName: example-cluster-ray-worker
    spec:
      ...
      containers:
        - name: ray-node
          image: rayproject/ray:nightly-gpu
          ...
          resources:
            requests:
              cpu: 1000m
              memory: 512Mi
            limits:
              memory: 512Mi
              nvidia.com/gpu: 1
|
||||
|
||||
GPU taints and tolerations
|
||||
--------------------------
|
||||
.. note::
|
||||
|
||||
If you are using a managed Kubernetes service, you probably don't need to worry about this section.
|
||||
|
||||
The `Nvidia gpu plugin`_ for Kubernetes applies `taints`_ to GPU nodes; these taints prevent non-GPU pods from being scheduled on GPU nodes.
|
||||
Managed Kubernetes services like GKE, EKS, and AKS automatically apply matching `tolerations`_
|
||||
to pods requesting GPU resources. Tolerations are applied by means of Kubernetes's `ExtendedResourceToleration`_ `admission controller`_.
|
||||
If this admission controller is not enabled for your Kubernetes cluster, you may need to manually add a GPU toleration to each of your GPU pod configurations. For example,
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
    apiVersion: v1
    kind: Pod
    metadata:
      generateName: example-cluster-ray-worker
    spec:
      ...
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      ...
      containers:
        - name: ray-node
          image: rayproject/ray:nightly-gpu
          ...
|
||||
|
||||
Further reference and discussion
|
||||
--------------------------------
|
||||
Read about Kubernetes device plugins `here <https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/>`__,
|
||||
about Kubernetes GPU plugins `here <https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus>`__,
|
||||
and about Nvidia's GPU plugin for Kubernetes `here <https://github.com/NVIDIA/k8s-device-plugin>`__.
|
||||
|
||||
If you run into problems setting up GPUs for your Ray cluster on Kubernetes, please reach out to us at `<https://discuss.ray.io>`_.
|
||||
|
||||
Questions or Issues?
|
||||
--------------------
|
||||
|
||||
.. include:: /_includes/_help.rst
|
||||
|
||||
.. _`GKE`: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus
|
||||
.. _`EKS`: https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html
|
||||
.. _`AKS`: https://docs.microsoft.com/en-us/azure/aks/gpu-cluster
|
||||
|
||||
.. _`tolerations`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
|
||||
.. _`taints`: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
|
||||
.. _`Nvidia gpu plugin`: https://github.com/NVIDIA/k8s-device-plugin
|
||||
.. _`admission controller`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/
|
||||
.. _`ExtendedResourceToleration`: https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/#extendedresourcetoleration
|
|
@ -1,159 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _ray-k8s-static:
|
||||
|
||||
Deploying a Static Ray Cluster on Kubernetes
|
||||
============================================
|
||||
|
||||
This document gives an example of how to manually deploy a non-autoscaling Ray cluster on Kubernetes.
|
||||
|
||||
- Learn about deploying an autoscaling Ray cluster using the :ref:`Ray Helm chart<kuberay-index>`.
|
||||
|
||||
Creating a Ray Namespace
|
||||
------------------------
|
||||
|
||||
First, create a `Kubernetes Namespace`_ for Ray resources on your cluster. The
|
||||
following commands will create resources under this Namespace, so if you want
|
||||
to use a different one than ``ray``, please be sure to also change the
|
||||
``namespace`` fields in the provided ``yaml`` files and anytime you see a ``-n``
|
||||
flag passed to ``kubectl``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ kubectl create namespace ray
|
||||
|
||||
Starting a Ray Cluster
|
||||
----------------------
|
||||
|
||||
|
||||
A Ray cluster consists of a single head node and a set of worker nodes (the
|
||||
provided `ray-cluster.yaml <https://github.com/ray-project/ray/blob/master/doc/kubernetes/ray-cluster.yaml>`__ file will start 3 worker nodes). In the example
|
||||
Kubernetes configuration, this is implemented as:
|
||||
|
||||
- A ``ray-head`` `Kubernetes Service`_ that enables the worker nodes to discover the location of the head node on start up.
|
||||
This Service also enables access to the Ray Client and Ray Dashboard.
|
||||
- A ``ray-head`` `Kubernetes Deployment`_ that backs the ``ray-head`` Service with a single head node pod (replica).
|
||||
- A ``ray-worker`` `Kubernetes Deployment`_ with multiple worker node pods (replicas) that connect to the ``ray-head`` pod using the ``ray-head`` Service.
|
||||
|
||||
Note that because the head and worker nodes are Deployments, Kubernetes will
|
||||
automatically restart pods that crash to maintain the correct number of
|
||||
replicas.
|
||||
|
||||
- If a worker node goes down, a replacement pod will be started and joined to the cluster.
|
||||
- If the head node goes down, it will be restarted. This will start a new Ray cluster. Worker nodes that were connected to the old head node will crash and be restarted, connecting to the new head node when they come back up.
|
||||
|
||||
Try deploying a cluster with the provided Kubernetes config by running the
|
||||
following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml
|
||||
|
||||
Verify that the pods are running by running ``kubectl get pods -n ray``. You
|
||||
may have to wait up to a few minutes for the pods to enter the 'Running'
|
||||
state on the first run.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ kubectl -n ray get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
ray-head-5455bb66c9-6bxvz 1/1 Running 0 10s
|
||||
ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 5s
|
||||
ray-worker-5c49b7cc57-d9m86 1/1 Running 0 5s
|
||||
ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 5s
|
||||
|
||||
.. note::
|
||||
|
||||
You might see a nonzero number of RESTARTS for the worker pods. That can
|
||||
happen when the worker pods start up before the head pod and the workers
|
||||
aren't able to connect. This shouldn't affect the behavior of the cluster.
|
||||
|
||||
To change the number of worker nodes in the cluster, change the ``replicas``
|
||||
field in the worker deployment configuration in that file and then re-apply
|
||||
the config as follows:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Edit 'ray/doc/kubernetes/ray-cluster.yaml' and change the 'replicas'
|
||||
# field under the ray-worker deployment to, e.g., 4.
|
||||
|
||||
# Re-apply the new configuration to the running deployment.
|
||||
$ kubectl apply -f ray/doc/kubernetes/ray-cluster.yaml
|
||||
service/ray-head unchanged
|
||||
deployment.apps/ray-head unchanged
|
||||
deployment.apps/ray-worker configured
|
||||
|
||||
# Verify that there are now the correct number of worker pods running.
|
||||
$ kubectl -n ray get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
ray-head-5455bb66c9-6bxvz 1/1 Running 0 30s
|
||||
ray-worker-5c49b7cc57-c6xs8 1/1 Running 0 25s
|
||||
ray-worker-5c49b7cc57-d9m86 1/1 Running 0 25s
|
||||
ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 25s
|
||||
ray-worker-5c49b7cc57-zzfg2 1/1 Running 0 0s
|
||||
|
||||
To validate that the restart behavior is working properly, try killing pods
|
||||
and checking that they are restarted by Kubernetes:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Delete a worker pod.
|
||||
$ kubectl -n ray delete pod ray-worker-5c49b7cc57-c6xs8
|
||||
pod "ray-worker-5c49b7cc57-c6xs8" deleted
|
||||
|
||||
# Check that a new worker pod was started (this may take a few seconds).
|
||||
$ kubectl -n ray get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
ray-head-5455bb66c9-6bxvz 1/1 Running 0 45s
|
||||
ray-worker-5c49b7cc57-d9m86 1/1 Running 0 40s
|
||||
ray-worker-5c49b7cc57-kzk4s 1/1 Running 0 40s
|
||||
ray-worker-5c49b7cc57-ypq8x 1/1 Running 0 0s
|
||||
|
||||
# Delete the head pod.
|
||||
$ kubectl -n ray delete pod ray-head-5455bb66c9-6bxvz
|
||||
pod "ray-head-5455bb66c9-6bxvz" deleted
|
||||
|
||||
# Check that a new head pod was started and the worker pods were restarted.
|
||||
$ kubectl -n ray get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
ray-head-5455bb66c9-gqzql 1/1 Running 0 0s
|
||||
ray-worker-5c49b7cc57-d9m86 1/1 Running 1 50s
|
||||
ray-worker-5c49b7cc57-kzk4s 1/1 Running 1 50s
|
||||
ray-worker-5c49b7cc57-ypq8x 1/1 Running 1 10s
|
||||
|
||||
# You can even try deleting all of the pods in the Ray namespace and checking
|
||||
# that Kubernetes brings the right number back up.
|
||||
$ kubectl -n ray delete pods --all
|
||||
$ kubectl -n ray get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
ray-head-5455bb66c9-7l6xj 1/1 Running 0 10s
|
||||
ray-worker-5c49b7cc57-57tpv 1/1 Running 0 10s
|
||||
ray-worker-5c49b7cc57-6m4kp 1/1 Running 0 10s
|
||||
ray-worker-5c49b7cc57-jx2w2 1/1 Running 0 10s
|
||||
|
||||
Now that we have a running cluster, :ref:`we can execute Ray programs <ray-k8s-client>`.
|
||||
|
||||
Cleaning Up
|
||||
-----------
|
||||
|
||||
To delete a running Ray cluster, you can run the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
kubectl delete -f ray/doc/kubernetes/ray-cluster.yaml
|
||||
|
||||
|
||||
Questions or Issues?
|
||||
--------------------
|
||||
|
||||
.. include:: /_includes/_help.rst
|
||||
|
||||
|
||||
.. _`Kubernetes Namespace`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/
|
||||
.. _`Kubernetes Service`: https://kubernetes.io/docs/concepts/services-networking/service/
|
||||
.. _`Kubernetes Deployment`: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/
|
||||
.. _`Kubernetes Job`: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/
|
||||
|
||||
.. _`Discussion Board`: https://discuss.ray.io/
|
|
@ -1,338 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _ray-k8s-deploy:
|
||||
|
||||
The legacy Ray Kubernetes Operator
|
||||
==================================
|
||||
|
||||
.. note::
|
||||
|
||||
This documentation describes deploying Ray on Kubernetes using the legacy Ray Operator hosted in
|
||||
the Ray repo.
|
||||
Going forward, the :ref:`preferred tool for deploying Ray on Kubernetes<kuberay-index>` will be the `KubeRay operator`_.
|
||||
The legacy operator described on this page can still be used to deploy on Kubernetes. However, the legacy operator
|
||||
will enter maintenance mode in a future Ray release.
|
||||
|
||||
To learn more about KubeRay, see the links below:
|
||||
|
||||
- :ref:`Ray's guides for deploying using KubeRay<kuberay-index>`.
|
||||
- `The KubeRay documentation`_.
|
||||
- `The KubeRay GitHub`_.
|
||||
- :ref:`A comparison of KubeRay and the legacy Ray Operator<kuberay-vs-legacy>`.
|
||||
|
||||
|
||||
Overview
|
||||
--------
|
||||
You can leverage your `Kubernetes`_ cluster as a substrate for execution of distributed Ray programs.
|
||||
The :ref:`Ray Autoscaler<cluster-index>` spins up and deletes Kubernetes `Pods`_ according to the resource demands of the Ray workload. Each Ray node runs in its own Kubernetes Pod.
|
||||
|
||||
Quick Guide
|
||||
-----------
|
||||
|
||||
This document covers the following topics:
|
||||
|
||||
- :ref:`Intro to the Ray Kubernetes Operator<ray-operator>`
|
||||
- :ref:`Launching Ray clusters with the Ray Helm Chart<ray-helm>`
|
||||
- :ref:`Monitoring Ray clusters<ray-k8s-monitor>`
|
||||
- :ref:`Running Ray programs using Ray Client<ray-k8s-client>`
|
||||
|
||||
You can find more information at the following links:
|
||||
|
||||
- :ref:`Ray Operator and Helm chart configuration<k8s-advanced>`
|
||||
- :ref:`GPU usage with Kubernetes<k8s-gpus>`
|
||||
- :ref:`Using Ray Tune on your Kubernetes cluster<tune-kubernetes>`
|
||||
- :ref:`How to manually set up a non-autoscaling Ray cluster on Kubernetes<ray-k8s-static>`
|
||||
|
||||
.. _ray-operator:
|
||||
|
||||
The Ray Kubernetes Operator
|
||||
---------------------------
|
||||
Deployments of Ray on Kubernetes are managed by the ``Ray Kubernetes Operator``.
|
||||
The Ray Operator follows the standard Kubernetes `Operator pattern`_. The main players are
|
||||
|
||||
- A `Custom Resource`_ called a ``RayCluster``, which describes the desired state of the Ray cluster.
|
||||
- A `Custom Controller`_, the ``Ray Operator``, which processes ``RayCluster`` resources and manages the Ray cluster.
|
||||
|
||||
Under the hood, the Operator uses the :ref:`Ray Autoscaler<cluster-index>` to launch and scale your Ray cluster.
|
||||
|
||||
The rest of this document explains how to launch a small example Ray cluster on Kubernetes.
|
||||
|
||||
- :ref:`Ray on Kubernetes Configuration and Advanced Usage<k8s-advanced>`.
|
||||
|
||||
.. _ray-helm:
|
||||
|
||||
Installing the Ray Operator with Helm
|
||||
-------------------------------------
|
||||
Ray provides a `Helm`_ chart to simplify deployment of the Ray Operator and Ray clusters.
|
||||
|
||||
The `Ray Helm chart`_ is available as part of the Ray GitHub repository.
|
||||
The chart will be published to a public Helm repository as part of a future Ray release.
|
||||
|
||||
Preparation
|
||||
~~~~~~~~~~~
|
||||
|
||||
- Configure `kubectl`_ to access your Kubernetes cluster.
|
||||
- Install `Helm 3`_.
|
||||
- Download the `Ray Helm chart`_.
|
||||
|
||||
To run the default example in this document, make sure your Kubernetes cluster can accommodate
|
||||
additional resource requests of 4 CPU and 2.5Gi memory.
|
||||
|
||||
Installation
|
||||
~~~~~~~~~~~~
|
||||
|
||||
You can install a small Ray cluster with a single ``helm`` command.
|
||||
The default cluster configuration consists of a Ray head pod and two worker pods,
|
||||
with scaling allowed up to three workers.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Navigate to the directory containing the chart
|
||||
$ cd ray/deploy/charts
|
||||
|
||||
# Install a small Ray cluster with the default configuration
|
||||
# in a new namespace called "ray". Let's name the Helm release "example-cluster."
|
||||
$ helm -n ray install example-cluster --create-namespace ./ray
|
||||
NAME: example-cluster
|
||||
LAST DEPLOYED: Fri May 14 11:44:06 2021
|
||||
NAMESPACE: ray
|
||||
STATUS: deployed
|
||||
REVISION: 1
|
||||
TEST SUITE: None
|
||||
|
||||
View the installed resources as follows.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# The custom resource representing the state of the Ray cluster.
|
||||
$ kubectl -n ray get rayclusters
|
||||
NAME STATUS RESTARTS AGE
|
||||
example-cluster Running 0 53s
|
||||
|
||||
# The Ray head node and two Ray worker nodes.
|
||||
$ kubectl -n ray get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
example-cluster-ray-head-type-5926k 1/1 Running 0 57s
|
||||
example-cluster-ray-worker-type-8gbwx 1/1 Running 0 40s
|
||||
example-cluster-ray-worker-type-l6cvx 1/1 Running 0 40s
|
||||
|
||||
# A service exposing the Ray head node.
|
||||
$ kubectl -n ray get service
|
||||
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
|
||||
example-cluster-ray-head ClusterIP 10.8.11.17 <none> 10001/TCP,8265/TCP,8000/TCP 115s
|
||||
|
||||
# The operator deployment.
|
||||
# By default, the deployment is launched in namespace "default".
|
||||
$ kubectl get deployment ray-operator
|
||||
NAME READY UP-TO-DATE AVAILABLE AGE
|
||||
ray-operator 1/1 1 1 3m1s
|
||||
|
||||
# The single pod of the operator deployment.
|
||||
$ kubectl get pod -l cluster.ray.io/component=operator
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
ray-operator-84f5d57b7f-xkvtm 1/1 Running 0 3m35
|
||||
|
||||
# The Custom Resource Definition defining a RayCluster.
|
||||
$ kubectl get crd rayclusters.cluster.ray.io
|
||||
NAME CREATED AT
|
||||
rayclusters.cluster.ray.io 2021-05-14T18:44:02
|
||||
|
||||
.. _ray-k8s-monitor:
|
||||
|
||||
Observability
|
||||
-------------
|
||||
|
||||
To view autoscaling logs, run a ``kubectl logs`` command on the operator pod:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# The last 100 lines of logs.
|
||||
$ kubectl logs \
|
||||
$(kubectl get pod -l cluster.ray.io/component=operator -o custom-columns=:metadata.name) \
|
||||
| tail -n 100
|
||||
|
||||
.. _ray-k8s-dashboard:
|
||||
|
||||
The :ref:`Ray dashboard<ray-dashboard>` can be accessed on the Ray head node at port ``8265``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Forward the relevant port from the service exposing the Ray head.
|
||||
$ kubectl -n ray port-forward service/example-cluster-ray-head 8265:8265
|
||||
|
||||
# The dashboard can now be viewed in a browser at http://localhost:8265
|
||||
|
||||
.. _ray-k8s-client:
|
||||
|
||||
Running Ray programs with Ray Jobs Submission
|
||||
---------------------------------------------
|
||||
|
||||
:ref:`Ray Job Submission <jobs-overview>` can be used to submit Ray programs to your Ray cluster.
|
||||
To do this, you must be able to access the Ray Dashboard, which runs on the Ray head node on port ``8265``.
|
||||
One way to do this is to port forward ``127.0.0.1:8265`` on your local machine to ``127.0.0.1:8265`` on the head node using the :ref:`Kubernetes port-forwarding command<ray-k8s-dashboard>`.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ kubectl -n ray port-forward service/example-cluster-ray-head 8265:8265
|
||||
|
||||
Then in a new shell, you can run a job using the CLI:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ export RAY_ADDRESS="http://127.0.0.1:8265"
|
||||
|
||||
$ ray job submit --runtime-env-json='{"working_dir": "./", "pip": ["requests==2.26.0"]}' -- python script.py
|
||||
2021-12-01 23:04:52,672 INFO cli.py:25 -- Creating JobSubmissionClient at address: http://127.0.0.1:8265
|
||||
2021-12-01 23:04:52,809 INFO sdk.py:144 -- Uploading package gcs://_ray_pkg_bbcc8ca7e83b4dc0.zip.
|
||||
2021-12-01 23:04:52,810 INFO packaging.py:352 -- Creating a file package for local directory './'.
|
||||
2021-12-01 23:04:52,878 INFO cli.py:105 -- Job submitted successfully: raysubmit_RXhvSyEPbxhcXtm6.
|
||||
2021-12-01 23:04:52,878 INFO cli.py:106 -- Query the status of the job using: `ray job status raysubmit_RXhvSyEPbxhcXtm6`.
|
||||
|
||||
For more ways to run jobs, including a Python SDK and a REST API, see :ref:`Ray Job Submission <jobs-overview>`.
|
||||
|
||||
|
||||
|
||||
Running Ray programs with Ray Client
|
||||
------------------------------------
|
||||
|
||||
:ref:`Ray Client <ray-client>` can be used to interactively execute Ray programs on your Ray cluster. The Ray Client server runs on the Ray head node, on port ``10001``.
|
||||
|
||||
.. note::
|
||||
|
||||
Connecting with Ray client requires using matching minor versions of Python (for example 3.7)
|
||||
on the server and client end, that is, on the Ray head node and in the environment where
|
||||
``ray.init("ray://<host>:<port>")`` is invoked. Note that the default ``rayproject/ray`` images use Python 3.7.
|
||||
The latest official Ray release builds are available for Python 3.6 and 3.8 at the `Ray Docker Hub <https://hub.docker.com/r/rayproject/ray>`_.
|
||||
|
||||
Connecting with Ray client also requires matching Ray versions. To connect from a local machine to a cluster running the examples in this document, the :ref:`latest release version<installation>` of Ray must be installed locally.
|
||||
|
||||
Using Ray Client to connect from outside the Kubernetes cluster
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
One way to connect to the Ray cluster from outside your Kubernetes cluster
|
||||
is to forward the Ray Client server port:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ kubectl -n ray port-forward service/example-cluster-ray-head 10001:10001
|
||||
|
||||
Then open a new shell and try out a `sample Ray program`_:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ python ray/doc/kubernetes/example_scripts/run_local_example.py
|
||||
|
||||
The program in this example uses ``ray.init("ray://127.0.0.1:10001")`` to connect to the Ray cluster.
|
||||
The program waits for three Ray nodes to connect and then tests object transfer
|
||||
between the nodes.
|
||||
|
||||
|
||||
Using Ray Client to connect from within the Kubernetes cluster
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
You can also connect to your Ray cluster from another pod in the same Kubernetes cluster.
|
||||
|
||||
For example, you can submit a Ray application to run on the Kubernetes cluster as a `Kubernetes
|
||||
Job`_. The Job will run a single pod running the Ray driver program to
|
||||
completion, then terminate the pod but allow you to access the logs.
|
||||
|
||||
The following command submits a Job which executes an `example Ray program`_.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
$ kubectl -n ray create -f https://raw.githubusercontent.com/ray-project/ray/master/doc/kubernetes/job-example.yaml
|
||||
job.batch/ray-test-job created
|
||||
|
||||
The program executed by the job uses the name of the Ray cluster's head Service to connect:
|
||||
``ray.init("ray://example-cluster-ray-head:10001")``.
|
||||
The program waits for three Ray nodes to connect and then tests object transfer
|
||||
between the nodes.
|
||||
|
||||
To view the output of the Job, first find the name of the pod that ran it,
|
||||
then fetch its logs:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ kubectl -n ray get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
example-cluster-ray-head-type-5926k 1/1 Running 0 21m
|
||||
example-cluster-ray-worker-type-8gbwx 1/1 Running 0 21m
|
||||
example-cluster-ray-worker-type-l6cvx 1/1 Running 0 21m
|
||||
ray-test-job-dl9fv 1/1 Running 0 3s
|
||||
|
||||
# Fetch the logs. You should see repeated output for 10 iterations and then
|
||||
# 'Success!'
|
||||
$ kubectl -n ray logs ray-test-job-dl9fv
|
||||
|
||||
# Cleanup
|
||||
$ kubectl -n ray delete job ray-test-job
|
||||
job.batch "ray-test-job" deleted
|
||||
|
||||
.. tip::
|
||||
|
||||
Code dependencies for a given Ray task or actor must be installed on each Ray node that might run the task or actor.
|
||||
Typically, this means that all Ray nodes need to have the same dependencies installed.
|
||||
To achieve this, you can build a custom container image, using one of the `official Ray images <https://hub.docker.com/r/rayproject/ray>`_ as the base.
|
||||
Alternatively, try out :ref:`Runtime Environments<runtime-environments>`.
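As a sketch of the custom-image route, the snippet below builds on an official Ray image; the package list, image tag, and registry name are only placeholders.

.. code-block:: shell

    # Build a custom image on top of an official Ray image and push it
    # to a registry that your Kubernetes cluster can pull from.
    $ cat > Dockerfile <<'EOF'
    FROM rayproject/ray:latest
    RUN pip install --no-cache-dir torch pandas
    EOF
    $ docker build -t my-registry/my-ray:latest .
    $ docker push my-registry/my-ray:latest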
|
||||
|
||||
.. _k8s-cleanup-basic:
|
||||
|
||||
Cleanup
|
||||
-------
|
||||
|
||||
To remove a Ray Helm release and the associated API resources, use `kubectl delete`_ and `helm uninstall`_.
|
||||
Note the order of the commands below.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# First, delete the RayCluster custom resource.
|
||||
$ kubectl -n ray delete raycluster example-cluster
|
||||
raycluster.cluster.ray.io "example-cluster" deleted
|
||||
|
||||
# Delete the Ray release.
|
||||
$ helm -n ray uninstall example-cluster
|
||||
release "example-cluster" uninstalled
|
||||
|
||||
# Optionally, delete the namespace created for our Ray release.
|
||||
$ kubectl delete namespace ray
|
||||
namespace "ray" deleted
|
||||
|
||||
Note that ``helm uninstall`` `does not delete`_ the RayCluster CRD. If you wish to delete the CRD,
|
||||
make sure all Ray Helm releases have been uninstalled, then run ``kubectl delete crd rayclusters.cluster.ray.io``.
|
||||
|
||||
- :ref:`More details on resource cleanup<k8s-cleanup>`
|
||||
|
||||
Next steps
|
||||
----------
|
||||
:ref:`Ray Operator Advanced Configuration<k8s-advanced>`
|
||||
|
||||
Questions or Issues?
|
||||
--------------------
|
||||
|
||||
.. include:: /_includes/_help.rst
|
||||
|
||||
.. _`Kubernetes`: https://kubernetes.io/
|
||||
.. _`Kubernetes Job`: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/
|
||||
.. _`Kubernetes Service`: https://kubernetes.io/docs/concepts/services-networking/service/
|
||||
.. _`operator pattern`: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/
|
||||
.. _`Custom Resource`: https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/
|
||||
.. _`Custom Controller`: https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/#custom-controllers
|
||||
.. _`Kubernetes Custom Resource Definition`: https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/
|
||||
.. _`annotation`: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/#attaching-metadata-to-objects
|
||||
.. _`permissions`: https://kubernetes.io/docs/reference/access-authn-authz/rbac/
|
||||
.. _`minikube`: https://minikube.sigs.k8s.io/docs/start/
|
||||
.. _`namespace`: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/
|
||||
.. _`Deployment`: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/
|
||||
.. _`Ray Helm chart`: https://github.com/ray-project/ray/tree/master/deploy/charts/ray/
|
||||
.. _`kubectl`: https://kubernetes.io/docs/tasks/tools/
|
||||
.. _`Helm 3`: https://helm.sh/
|
||||
.. _`Helm`: https://helm.sh/
|
||||
.. _`kubectl delete`: https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#delete
|
||||
.. _`helm uninstall`: https://helm.sh/docs/helm/helm_uninstall/
|
||||
.. _`does not delete`: https://helm.sh/docs/chart_best_practices/custom_resource_definitions/
|
||||
.. _`Pods`: https://kubernetes.io/docs/concepts/workloads/pods/
|
||||
.. _`example Ray program`: https://github.com/ray-project/ray/tree/master/doc/kubernetes/example_scripts/job_example.py
|
||||
.. _`sample Ray program`: https://github.com/ray-project/ray/tree/master/doc/kubernetes/example_scripts/run_local_example.py
|
||||
.. _`official Ray images`: https://hub.docker.com/r/rayproject/ray
|
||||
.. _`Ray Docker Hub`: https://hub.docker.com/r/rayproject/ray
|
||||
.. _`KubeRay operator`: https://github.com/ray-project/kuberay
|
||||
.. _`The KubeRay GitHub`: https://github.com/ray-project/kuberay
|
||||
.. _`The KubeRay documentation`: https://ray-project.github.io/kuberay/
|
|
@ -1,20 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _ray-LSF-deploy:
|
||||
|
||||
Deploying on LSF
|
||||
================
|
||||
|
||||
This document describes the high-level steps to run a Ray cluster on LSF.
|
||||
|
||||
1) Obtain the desired nodes from the LSF scheduler using bsub directives.
2) Obtain free ports on the desired nodes to start Ray services such as the dashboard and GCS.
3) Start the Ray head node on one of the available nodes.
4) Connect all the worker nodes to the head node.
5) Perform port forwarding to access the Ray dashboard.
|
||||
|
||||
Steps 1-4 have been automated and can easily be run as a script. Refer to the GitHub repo below to access the script and run sample workloads:
|
||||
|
||||
- `ray_LSF`_: Ray with LSF. Users can start up a Ray cluster on LSF and run DL workloads through it in either batch or interactive mode.
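For orientation, here is a minimal sketch of the Ray-side commands behind steps 3-5, assuming the LSF scheduler has already allocated the nodes and that ``<head_ip>`` is reachable from the worker nodes; the port numbers are only examples.

.. code-block:: shell

    # Step 3: on the chosen head node, start the Ray head process.
    $ ray start --head --port=6379 --dashboard-port=8265

    # Step 4: on each worker node, connect to the head node.
    $ ray start --address=<head_ip>:6379

    # Step 5: from your workstation, forward the dashboard port over SSH.
    $ ssh -N -L 8265:localhost:8265 <user>@<head_ip>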
|
||||
|
||||
.. _`ray_LSF`: https://github.com/IBMSpectrumComputing/ray-integration
|
|
@ -1,248 +0,0 @@
|
|||
.. include:: /_includes/clusters/announcement.rst
|
||||
|
||||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _ref-cluster-quick-start:
|
||||
|
||||
Ray Clusters Quick Start
|
||||
========================
|
||||
|
||||
This quick start demonstrates the capabilities of the Ray cluster. Using the Ray cluster, we'll take a sample application designed to run on a laptop and scale it up in the cloud. Ray will launch clusters and scale Python with just a few commands.
|
||||
|
||||
For launching a Ray cluster manually, you can refer to the :ref:`on-premise cluster setup <cluster-private-setup>` guide.
|
||||
|
||||
About the demo
|
||||
--------------
|
||||
|
||||
This demo will walk through an end-to-end flow:
|
||||
|
||||
1. Create a (basic) Python application.
|
||||
2. Launch a cluster on a cloud provider.
|
||||
3. Run the application in the cloud.
|
||||
|
||||
Requirements
|
||||
~~~~~~~~~~~~
|
||||
|
||||
To run this demo, you will need:
|
||||
|
||||
* Python installed on your development machine (typically your laptop), and
|
||||
* an account at your preferred cloud provider (AWS, Azure or GCP).
|
||||
|
||||
Setup
|
||||
~~~~~
|
||||
|
||||
Before we start, you will need to install some Python dependencies as follows:
|
||||
|
||||
.. tabbed:: AWS
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ pip install -U "ray[default]" boto3
|
||||
|
||||
.. tabbed:: Azure
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ pip install -U "ray[default]" azure-cli azure-core
|
||||
|
||||
.. tabbed:: GCP
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ pip install -U "ray[default]" google-api-python-client
|
||||
|
||||
Next, if you're not set up to use your cloud provider from the command line, you'll have to configure your credentials:
|
||||
|
||||
.. tabbed:: AWS
|
||||
|
||||
Configure your credentials in ``~/.aws/credentials`` as described in `the AWS docs <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html>`_.
|
||||
|
||||
.. tabbed:: Azure
|
||||
|
||||
Log in using ``az login``, then configure your credentials with ``az account set -s <subscription_id>``.
|
||||
|
||||
.. tabbed:: GCP
|
||||
|
||||
Set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable as described in `the GCP docs <https://cloud.google.com/docs/authentication/getting-started>`_.
|
||||
|
||||
Create a (basic) Python application
|
||||
-----------------------------------
|
||||
|
||||
We will write a simple Python application that tracks the IP addresses of the machines that its tasks are executed on:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
    from collections import Counter
    import socket
    import time

    def f():
        time.sleep(0.001)
        # Return IP address.
        return socket.gethostbyname(socket.gethostname())

    ip_addresses = [f() for _ in range(10000)]
    print(Counter(ip_addresses))
|
||||
|
||||
Save this application as ``script.py`` and execute it by running the command ``python script.py``. The application should take 10 seconds to run and output something similar to ``Counter({'127.0.0.1': 10000})``.
|
||||
|
||||
With some small changes, we can make this application run on Ray (for more information on how to do this, refer to :ref:`the Ray Core Walkthrough<core-walkthrough>`):
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
    from collections import Counter
    import socket
    import time

    import ray

    ray.init()

    @ray.remote
    def f():
        time.sleep(0.001)
        # Return IP address.
        return socket.gethostbyname(socket.gethostname())

    object_ids = [f.remote() for _ in range(10000)]
    ip_addresses = ray.get(object_ids)
    print(Counter(ip_addresses))
|
||||
|
||||
Finally, let's add some code to make the output more interesting:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
    from collections import Counter
    import socket
    import time

    import ray

    ray.init()

    print('''This cluster consists of
        {} nodes in total
        {} CPU resources in total
    '''.format(len(ray.nodes()), ray.cluster_resources()['CPU']))

    @ray.remote
    def f():
        time.sleep(0.001)
        # Return IP address.
        return socket.gethostbyname(socket.gethostname())

    object_ids = [f.remote() for _ in range(10000)]
    ip_addresses = ray.get(object_ids)

    print('Tasks executed')
    for ip_address, num_tasks in Counter(ip_addresses).items():
        print('    {} tasks on {}'.format(num_tasks, ip_address))
|
||||
|
||||
Running ``python script.py`` should now output something like:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
This cluster consists of
|
||||
1 nodes in total
|
||||
4.0 CPU resources in total
|
||||
|
||||
Tasks executed
|
||||
10000 tasks on 127.0.0.1
|
||||
|
||||
Launch a cluster on a cloud provider
|
||||
------------------------------------
|
||||
|
||||
To start a Ray Cluster, first we need to define the cluster configuration. The cluster configuration is defined within a YAML file that will be used by the Cluster Launcher to launch the head node, and by the Autoscaler to launch worker nodes.
|
||||
|
||||
A minimal sample cluster configuration file looks as follows:
|
||||
|
||||
.. tabbed:: AWS
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: minimal
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
|
||||
.. tabbed:: Azure
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: minimal
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: azure
|
||||
location: westus2
|
||||
resource_group: ray-cluster
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# changes to this should match what is specified in file_mounts
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
.. tabbed:: GCP
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: minimal
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: gcp
|
||||
region: us-west1
|
||||
|
||||
Save this configuration file as ``config.yaml``. You can specify a lot more details in the configuration file: instance types to use, minimum and maximum number of workers to start, autoscaling strategy, files to sync, and more. For a full reference on the available configuration properties, please refer to the :ref:`cluster YAML configuration options reference <cluster-config>`.
|
||||
|
||||
After defining our configuration, we will use the Ray Cluster Launcher to start a cluster on the cloud, creating a designated "head node" and worker nodes. To start the Ray cluster, we will use the :ref:`Ray CLI <ray-cli>`. Run the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ ray up -y config.yaml
|
||||
|
||||
Run the application in the cloud
|
||||
--------------------------------
|
||||
|
||||
We are now ready to execute the application across multiple machines on our Ray cluster in the cloud.
|
||||
``ray.init()`` will now automatically connect to the newly created cluster.
|
||||
|
||||
Next, run the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ ray submit config.yaml script.py
|
||||
|
||||
The output should now look similar to the following:
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
Connecting to existing Ray cluster at address: <IP address>...
|
||||
|
||||
This cluster consists of
|
||||
3 nodes in total
|
||||
6.0 CPU resources in total
|
||||
|
||||
Tasks executed
|
||||
3425 tasks on xxx.xxx.xxx.xxx
|
||||
3834 tasks on xxx.xxx.xxx.xxx
|
||||
2741 tasks on xxx.xxx.xxx.xxx
|
||||
|
||||
In this sample output, 3 nodes were started. If the output only shows 1 node, you may want to increase the ``secs`` in ``time.sleep(secs)`` to give Ray more time to start additional nodes.
|
||||
|
||||
The Ray CLI offers additional functionality. For example, you can monitor the Ray cluster status with ``ray monitor config.yaml``, and you can connect to the cluster (ssh into the head node) with ``ray attach config.yaml``. For a full reference on the Ray CLI, please refer to :ref:`the cluster commands reference <cluster-commands>`.
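In practice, these commands look as follows:

.. code-block:: shell

    # Monitor the autoscaling status of the cluster.
    $ ray monitor config.yaml

    # Open an SSH session to the head node.
    $ ray attach config.yaml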
|
||||
|
||||
To finish, don't forget to shut down the cluster. Run the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ ray down -y config.yaml
|
|
@ -1,278 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _ray-client:
|
||||
|
||||
Ray Client: Interactive Development
|
||||
===================================
|
||||
|
||||
**What is the Ray Client?**
|
||||
|
||||
The Ray Client is an API that connects a Python script to a **remote** Ray cluster. Effectively, it allows you to leverage a remote Ray cluster just like you would with Ray running on your local machine.
|
||||
|
||||
By changing ``ray.init()`` to ``ray.init("ray://<head_node_host>:<port>")``, you can connect from your laptop (or anywhere) directly to a remote cluster and scale-out your Ray code, while maintaining the ability to develop interactively in a Python shell. **This will only work with Ray 1.5+.**
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# You can run this code outside of the Ray cluster!
|
||||
import ray
|
||||
|
||||
# Starting the Ray client. This connects to a remote Ray cluster.
|
||||
ray.init("ray://<head_node_host>:10001")
|
||||
|
||||
# Normal Ray code follows
|
||||
@ray.remote
|
||||
def do_work(x):
|
||||
return x ** x
|
||||
|
||||
do_work.remote(2)
|
||||
#....
|
||||
|
||||
Client arguments
|
||||
----------------
|
||||
|
||||
Ray Client is used when the address passed into ``ray.init`` is prefixed with ``ray://``. Besides the address, Client mode currently accepts two other arguments:
|
||||
|
||||
- ``namespace`` (optional): Sets the namespace for the session.
|
||||
- ``runtime_env`` (optional): Sets the `runtime environment <../ray-core/handling-dependencies.html#runtime-environments>`_ for the session, allowing you to dynamically specify environment variables, packages, local files, and more.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Connects to an existing cluster at 1.2.3.4 listening on port 10001, using
|
||||
# the namespace "my_namespace". The Ray workers will run inside a cluster-side
|
||||
# copy of the local directory "files/my_project", in a Python environment with
|
||||
# `toolz` and `requests` installed.
|
||||
ray.init(
|
||||
"ray://1.2.3.4:10001",
|
||||
namespace="my_namespace",
|
||||
runtime_env={"working_dir": "files/my_project", "pip": ["toolz", "requests"]},
|
||||
)
|
||||
#....
|
||||
|
||||
When to use Ray Client
|
||||
----------------------
|
||||
|
||||
Ray Client should be used when you want to connect a script or an interactive shell session to a **remote** cluster.
|
||||
|
||||
* Use ``ray.init("ray://<head_node_host>:10001")`` (Ray Client) if you've set up a remote cluster at ``<head_node_host>`` and you want to do interactive work. This will connect your local script or shell to the cluster. See the section on :ref:`using Ray Client<how-do-you-use-the-ray-client>` for more details on setting up your cluster.
|
||||
* Use ``ray.init("localhost:<port>")`` (non-client connection, local address) if you're developing locally or on the head node of your cluster and you have already started the cluster (i.e. ``ray start --head`` has already been run)
|
||||
* Use ``ray.init()`` (non-client connection, no address specified) if you're developing locally and want to automatically create a local cluster and attach directly to it OR if you are using Ray Job submission.
|
||||
|
||||
.. _how-do-you-use-the-ray-client:
|
||||
|
||||
How do you use the Ray Client?
|
||||
------------------------------
|
||||
|
||||
Step 1: Set up your Ray cluster
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you have a running Ray cluster (version >= 1.5), Ray Client server is likely already running on port ``10001`` of the head node by default. Otherwise, you'll want to create a Ray cluster. To start a Ray cluster locally, you can run
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ray start --head
|
||||
|
||||
To start a Ray cluster remotely, you can follow the directions in :ref:`ref-cluster-quick-start`.
|
||||
|
||||
If necessary, you can modify the Ray Client server port to be other than ``10001``, by specifying ``--ray-client-server-port=...`` to the ``ray start`` :ref:`command <ray-start-doc>`.
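For example, a sketch using an arbitrary alternative port:

.. code-block:: shell

    # Serve Ray Client on port 23456 instead of the default 10001.
    $ ray start --head --ray-client-server-port=23456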
|
||||
|
||||
Step 2: Check ports
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Ensure that the Ray Client port on the head node is reachable from your local machine.
|
||||
This means opening that port up by configuring security groups or other access controls (on `EC2 <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/authorizing-access-to-an-instance.html>`_)
|
||||
or proxying from your local machine to the cluster (on `K8s <https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/#forward-a-local-port-to-a-port-on-the-pod>`_).
|
||||
|
||||
.. tabbed:: AWS
|
||||
|
||||
With the Ray cluster launcher, you can configure the security group
|
||||
to allow inbound access by defining :ref:`cluster-configuration-security-group`
|
||||
in your `cluster.yaml`.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: minimal_security_group
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
security_group:
|
||||
GroupName: ray_client_security_group
|
||||
IpPermissions:
|
||||
- FromPort: 10001
|
||||
ToPort: 10001
|
||||
IpProtocol: TCP
|
||||
IpRanges:
|
||||
# This will enable inbound access from ALL IPv4 addresses.
|
||||
- CidrIp: 0.0.0.0/0
|
||||
|
||||
Step 3: Run Ray code
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Now, connect to the Ray Cluster with the following and then use Ray like you normally would:
|
||||
|
||||
..
|
||||
.. code-block:: python
|
||||
|
||||
import ray
|
||||
|
||||
# replace with the appropriate host and port
|
||||
ray.init("ray://<head_node_host>:10001")
|
||||
|
||||
# Normal Ray code follows
|
||||
@ray.remote
|
||||
def do_work(x):
|
||||
return x ** x
|
||||
|
||||
do_work.remote(2)
|
||||
|
||||
#....
|
||||
|
||||
Alternative Approach: SSH Port Forwarding
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
As an alternative to configuring inbound traffic rules, you can also set up
|
||||
Ray Client via port forwarding. While this approach does require an open SSH
|
||||
connection, it can be useful in a test environment where the
|
||||
``head_node_host`` often changes.
|
||||
|
||||
First, open up an SSH connection with your Ray cluster and forward the
|
||||
listening port (``10001``).
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ray up cluster.yaml
|
||||
$ ray attach cluster.yaml -p 10001
|
||||
|
||||
Then, you can connect to the Ray cluster **from another terminal** using ``localhost`` as the
|
||||
``head_node_host``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import ray
|
||||
|
||||
# This will connect to the cluster via the open SSH session.
|
||||
ray.init("ray://localhost:10001")
|
||||
|
||||
# Normal Ray code follows
|
||||
@ray.remote
|
||||
def do_work(x):
|
||||
return x ** x
|
||||
|
||||
do_work.remote(2)
|
||||
|
||||
#....
|
||||
|
||||
Connect to multiple Ray clusters (Experimental)
|
||||
-----------------------------------------------
|
||||
|
||||
Ray Client allows connecting to multiple Ray clusters in one Python process. To do this, just pass ``allow_multiple=True`` to ``ray.init``:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import ray
|
||||
# Create a default client.
|
||||
ray.init("ray://<head_node_host_cluster>:10001")
|
||||
|
||||
# Connect to other clusters.
|
||||
cli1 = ray.init("ray://<head_node_host_cluster_1>:10001", allow_multiple=True)
|
||||
cli2 = ray.init("ray://<head_node_host_cluster_2>:10001", allow_multiple=True)
|
||||
|
||||
# Data is put into the default cluster.
|
||||
obj = ray.put("obj")
|
||||
|
||||
with cli1:
|
||||
obj1 = ray.put("obj1")
|
||||
|
||||
with cli2:
|
||||
obj2 = ray.put("obj2")
|
||||
|
||||
with cli1:
|
||||
assert ray.get(obj1) == "obj1"
|
||||
try:
|
||||
ray.get(obj2) # Cross-cluster ops not allowed.
|
||||
except:
|
||||
print("Failed to get object which doesn't belong to this cluster")
|
||||
|
||||
with cli2:
|
||||
assert ray.get(obj2) == "obj2"
|
||||
try:
|
||||
ray.get(obj1) # Cross-cluster ops not allowed.
|
||||
except:
|
||||
print("Failed to get object which doesn't belong to this cluster")
|
||||
assert "obj" == ray.get(obj)
|
||||
cli1.disconnect()
|
||||
cli2.disconnect()
|
||||
|
||||
|
||||
When using Ray multi-client, there are some different behaviors to pay attention to:
|
||||
|
||||
* The client won't be disconnected automatically. Call ``disconnect`` explicitly to close the connection.
|
||||
* Object references can only be used by the client from which they were obtained.
|
||||
* ``ray.init`` without ``allow_multiple`` will create a default global Ray client.
|
||||
|
||||
Things to know
|
||||
--------------
|
||||
|
||||
Client disconnections
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
When the client disconnects, any object or actor references held by the server on behalf of the client are dropped, as if directly disconnecting from the cluster.
|
||||
|
||||
|
||||
Versioning requirements
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Generally, the client Ray version must match the server Ray version. An error will be raised if an incompatible version is used.
|
||||
|
||||
Similarly, the minor Python version (e.g., 3.6 vs. 3.7) must match between the client and server. An error will be raised if this is not the case.
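A quick way to check both requirements is to compare the Ray and Python versions on the client machine and on the head node; the one-liner below is just one way to do this.

.. code-block:: shell

    # Run this on the client machine and on the Ray head node, then
    # compare the output: the Ray versions and the Python minor
    # versions should match.
    $ python -c "import ray, sys; print(ray.__version__, sys.version_info[:2])"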
|
||||
|
||||
Starting a connection on older Ray versions
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you encounter ``socket.gaierror: [Errno -2] Name or service not known`` when using ``ray.init("ray://...")`` then you may be on a version of Ray prior to 1.5 that does not support starting client connections through ``ray.init``.
|
||||
|
||||
Connection through the Ingress
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you encounter the following error message when connecting to the ``Ray Cluster`` using an ``Ingress``, it may be caused by the Ingress's configuration.
|
||||
|
||||
..
|
||||
.. code-block:: python
|
||||
|
||||
grpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
|
||||
status = StatusCode.INVALID_ARGUMENT
|
||||
details = ""
|
||||
debug_error_string = "{"created":"@1628668820.164591000","description":"Error received from peer ipv4:10.233.120.107:443","file":"src/core/lib/surface/call.cc","file_line":1062,"grpc_message":"","grpc_status":3}"
|
||||
>
|
||||
Got Error from logger channel -- shutting down: <_MultiThreadedRendezvous of RPC that terminated with:
|
||||
status = StatusCode.INVALID_ARGUMENT
|
||||
details = ""
|
||||
debug_error_string = "{"created":"@1628668820.164713000","description":"Error received from peer ipv4:10.233.120.107:443","file":"src/core/lib/surface/call.cc","file_line":1062,"grpc_message":"","grpc_status":3}"
|
||||
>
|
||||
|
||||
|
||||
If you are using the ``nginx-ingress-controller``, you may be able to resolve the issue by adding the following Ingress configuration.
|
||||
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
    metadata:
      annotations:
        nginx.ingress.kubernetes.io/server-snippet: |
          underscores_in_headers on;
          ignore_invalid_headers on;
|
||||
|
||||
Ray client logs
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Ray client logs can be found at ``/tmp/ray/session_latest/logs`` on the head node.
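For example, on the head node you could inspect them as follows (the exact file names vary between Ray versions):

.. code-block:: shell

    # List the session logs, then look at the most recent output.
    $ ls /tmp/ray/session_latest/logs/
    $ tail -n 50 /tmp/ray/session_latest/logs/*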
|
||||
|
||||
Uploads
|
||||
~~~~~~~
|
||||
|
||||
If a ``working_dir`` is specified in the runtime env, when running ``ray.init()`` the Ray client will upload the ``working_dir`` on the laptop to ``/tmp/ray/session_latest/runtime_resources/_ray_pkg_<hash of directory contents>``.
|
||||
|
||||
Ray workers are started in the ``/tmp/ray/session_latest/runtime_resources/_ray_pkg_<hash of directory contents>`` directory on the cluster. This means that relative paths in the remote tasks and actors in the code will work on the laptop and on the cluster without any code changes. For example, if the ``working_dir`` on the laptop contains ``data.txt`` and ``run.py``, inside the remote task definitions in ``run.py`` one can just use the relative path ``"data.txt"``. Then ``python run.py`` will work on the laptop and also on the cluster. As a side note, since relative paths can be used in the code, the absolute path is only useful for debugging purposes.
|
Binary file not shown.
Before Width: | Height: | Size: 55 KiB |
|
@ -1,13 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _cluster-reference:
|
||||
|
||||
Ray Cluster Config YAML and CLI
|
||||
===============================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
config.rst
|
||||
commands.rst
|
||||
sdk.rst
|
|
@ -1,285 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _ray-slurm-deploy:
|
||||
|
||||
Deploying on Slurm
|
||||
==================
|
||||
|
||||
Slurm usage with Ray can be a little bit unintuitive.
|
||||
|
||||
* SLURM requires that multiple copies of the same program be submitted to the same cluster to do cluster programming. This is particularly well-suited for MPI-based workloads.
|
||||
* Ray, on the other hand, expects a head-worker architecture with a single point of entry. That is, you'll need to start a Ray head node, multiple Ray worker nodes, and run your Ray script on the head node.
|
||||
|
||||
.. warning::
|
||||
|
||||
SLURM support is still a work in progress. SLURM users should be aware
|
||||
of current limitations regarding networking.
|
||||
See :ref:`here <slurm-network-ray>` for more explanations.
|
||||
|
||||
SLURM support is community-maintained. Maintainer GitHub handle: tupui.
|
||||
|
||||
This document aims to clarify how to run Ray on SLURM.
|
||||
|
||||
.. contents::
|
||||
:local:
|
||||
|
||||
|
||||
Walkthrough using Ray with SLURM
|
||||
--------------------------------
|
||||
|
||||
Many SLURM deployments require you to interact with slurm via ``sbatch``, which executes a batch script on SLURM.
|
||||
|
||||
To run a Ray job with ``sbatch``, you will want to start a Ray cluster in the sbatch job with multiple ``srun`` commands (tasks), and then execute your Python script that uses Ray. Each task will run on a separate node and start or connect to a Ray runtime.
|
||||
|
||||
The walkthrough below does the following:
|
||||
|
||||
1. Set the proper headers for the ``sbatch`` script.
|
||||
2. Load the proper environment/modules.
|
||||
3. Fetch a list of available computing nodes and their IP addresses.
|
||||
4. Launch a Ray head process on one of the nodes (called the head node).
|
||||
5. Launch Ray processes on the (n-1) worker nodes and connect them to the head node by providing the head node address.
|
||||
6. After the underlying Ray cluster is ready, submit the user-specified task.
|
||||
|
||||
See :ref:`slurm-basic.sh <slurm-basic>` for an end-to-end example.
|
||||
|
||||
.. _ray-slurm-headers:
|
||||
|
||||
sbatch directives
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
In your sbatch script, you'll want to add `directives to provide context <https://slurm.schedmd.com/sbatch.html>`__ for your job to SLURM.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=my-workload
|
||||
|
||||
You'll need to tell SLURM to allocate nodes specifically for Ray. Ray will then find and manage all resources on each node.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
### Modify this according to your Ray workload.
|
||||
#SBATCH --nodes=4
|
||||
#SBATCH --exclusive
|
||||
|
||||
Important: To ensure that each Ray worker runtime runs on a separate node, set ``tasks-per-node=1``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
#SBATCH --tasks-per-node=1
|
||||
|
||||
Since we've set ``tasks-per-node=1``, this guarantees that each Ray worker runtime will obtain the
|
||||
proper resources. In this example, we ask for at least 5 CPUs and 5 GB of memory per node.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
### Modify this according to your Ray workload.
|
||||
#SBATCH --cpus-per-task=5
|
||||
#SBATCH --mem-per-cpu=1GB
|
||||
### Similarly, you can also specify the number of GPUs per node.
|
||||
### Modify this according to your Ray workload. Sometimes this
|
||||
### should be 'gres' instead.
|
||||
#SBATCH --gpus-per-task=1
|
||||
|
||||
|
||||
You can also add other optional flags to your sbatch directives.
|
||||
|
||||
|
||||
Loading your environment
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
First, you'll often want to load modules or activate your own conda environment at the beginning of the script.
|
||||
|
||||
Note that this is an optional step, but it is often required for enabling the right set of dependencies.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Example: module load pytorch/v1.4.0-gpu
|
||||
# Example: conda activate my-env
|
||||
|
||||
conda activate my-env
|
||||
|
||||
Obtain the head IP address
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Next, we'll want to obtain a hostname and a node IP address for the head node. This way, when we start worker nodes, we'll be able to properly connect to the right head node.
|
||||
|
||||
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
|
||||
:language: bash
|
||||
:start-after: __doc_head_address_start__
|
||||
:end-before: __doc_head_address_end__
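
For reference, a minimal sketch of what the referenced snippet typically does is shown below, assuming standard SLURM environment variables such as ``SLURM_JOB_NODELIST``:

.. code-block:: bash

    # A minimal sketch, not the exact referenced file: pick the first allocated
    # node as the head node and look up its IP address.
    nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
    nodes_array=($nodes)

    head_node=${nodes_array[0]}
    head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)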
|
||||
|
||||
|
||||
|
||||
Starting the Ray head node
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
After detecting the head node hostname and head node IP, we'll want to create
|
||||
a Ray head node runtime. We'll do this by running ``srun`` in the background
|
||||
as a single task on a single node (recall that ``tasks-per-node=1``).
|
||||
|
||||
Below, you'll see that we explicitly specify the number of CPUs (``num-cpus``)
|
||||
and number of GPUs (``num-gpus``) to Ray, as this will prevent Ray from using
|
||||
more resources than allocated. We also need to explicitly
|
||||
indicate the ``node-ip-address`` for the Ray head runtime:
|
||||
|
||||
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
|
||||
:language: bash
|
||||
:start-after: __doc_head_ray_start__
|
||||
:end-before: __doc_head_ray_end__
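
For reference, a minimal sketch of this step under the same assumptions:

.. code-block:: bash

    # A minimal sketch, not the exact referenced file: start the head node as a
    # single backgrounded srun task on the chosen node.
    port=6379
    srun --nodes=1 --ntasks=1 -w "$head_node" \
        ray start --head --node-ip-address="$head_node_ip" --port=$port \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
    sleep 10  # give the head node a moment to come up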
|
||||
|
||||
By backgrounding the above srun task, we can proceed to start the Ray worker runtimes.
|
||||
|
||||
Starting the Ray worker nodes
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Below, we do the same thing, but for each worker. Make sure the Ray head and Ray worker processes are not started on the same node.
|
||||
|
||||
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
|
||||
:language: bash
|
||||
:start-after: __doc_worker_ray_start__
|
||||
:end-before: __doc_worker_ray_end__
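
A minimal sketch of this step, with one backgrounded ``srun`` per remaining node:

.. code-block:: bash

    # A minimal sketch, not the exact referenced file: start one Ray worker per
    # remaining allocated node and point it at the head node.
    worker_num=$((SLURM_JOB_NUM_NODES - 1))
    for ((i = 1; i <= worker_num; i++)); do
        node_i=${nodes_array[$i]}
        srun --nodes=1 --ntasks=1 -w "$node_i" \
            ray start --address "$head_node_ip:$port" \
            --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
        sleep 5
    done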
|
||||
|
||||
Submitting your script
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Finally, you can invoke your Python script:
|
||||
|
||||
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
|
||||
:language: bash
|
||||
:start-after: __doc_script_start__
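
Conceptually, this final step just runs your entry point on the allocation; for example (``your_script.py`` is a placeholder):

.. code-block:: bash

    # Placeholder entry point; the script connects to the cluster started above.
    python -u your_script.py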
|
||||
|
||||
.. _slurm-network-ray:
|
||||
|
||||
SLURM networking caveats
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
There are two important networking aspects to keep in mind when working with
|
||||
SLURM and Ray:
|
||||
|
||||
1. Ports binding.
|
||||
2. IP binding.
|
||||
|
||||
One common use of a SLURM cluster is to have multiple users running concurrent
|
||||
jobs on the same infrastructure. This can easily conflict with Ray due to the
|
||||
way the head node communicates with its workers.
|
||||
|
||||
Considering 2 users, if they both schedule a SLURM job using Ray
|
||||
at the same time, they are both creating a head node. In the backend, Ray will
|
||||
assign some internal ports to a few services. The issue is that as soon as the
|
||||
first head node is created, it will bind some ports and prevent them from being
|
||||
used by another head node. To prevent any conflicts, users have to manually
|
||||
specify non-overlapping ranges of ports. The following ports need to be
|
||||
adjusted. For an explanation on ports, see :ref:`here <ray-ports>`::
|
||||
|
||||
# used for all nodes
|
||||
--node-manager-port
|
||||
--object-manager-port
|
||||
--min-worker-port
|
||||
--max-worker-port
|
||||
# used for the head node
|
||||
--port
|
||||
--ray-client-server-port
|
||||
--redis-shard-ports
|
||||
|
||||
For instance, again with 2 users, they would have to adapt the instructions
|
||||
seen above to:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# user 1
|
||||
# same as above
|
||||
...
|
||||
srun --nodes=1 --ntasks=1 -w "$head_node" \
|
||||
ray start --head --node-ip-address="$head_node_ip" \
|
||||
--port=6379 \
|
||||
--node-manager-port=6700 \
|
||||
--object-manager-port=6701 \
|
||||
--ray-client-server-port=10001 \
|
||||
--redis-shard-ports=6702 \
|
||||
--min-worker-port=10002 \
|
||||
--max-worker-port=19999 \
|
||||
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
|
||||
|
||||
# user 2
|
||||
# same as above
|
||||
...
|
||||
srun --nodes=1 --ntasks=1 -w "$head_node" \
|
||||
ray start --head --node-ip-address="$head_node_ip" \
|
||||
--port=6380 \
|
||||
--node-manager-port=6800 \
|
||||
--object-manager-port=6801 \
|
||||
--ray-client-server-port=20001 \
|
||||
--redis-shard-ports=6802 \
|
||||
--min-worker-port=20002 \
|
||||
--max-worker-port=29999 \
|
||||
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
|
||||
|
||||
As for the IP binding, on some cluster architectures the network interfaces
|
||||
do not allow using external IPs between nodes. Instead, there are internal
|
||||
network interfaces (`eth0`, `eth1`, etc.). Currently, it's difficult to
|
||||
set an internal IP
|
||||
(see the open `issue <https://github.com/ray-project/ray/issues/22732>`_).
|
||||
|
||||
Python-interface SLURM scripts
|
||||
------------------------------
|
||||
|
||||
[Contributed by @pengzhenghao] Below, we provide a helper utility (:ref:`slurm-launch.py <slurm-launch>`) to auto-generate and launch SLURM scripts.
|
||||
``slurm-launch.py`` uses an underlying template (:ref:`slurm-template.sh <slurm-template>`) and fills out placeholders given user input.
|
||||
|
||||
Feel free to copy both files into your cluster for use, and to open PRs with contributions that improve this script!
|
||||
|
||||
Usage example
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
If you want to utilize a multi-node cluster in SLURM:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python slurm-launch.py --exp-name test --command "python your_file.py" --num-nodes 3
|
||||
|
||||
If you want to specify the computing node(s), just use the same node name(s) in the same format as the output of the ``sinfo`` command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python slurm-launch.py --exp-name test --command "python your_file.py" --num-nodes 3 --node NODE_NAMES
|
||||
|
||||
|
||||
There are other options you can use when calling ``python slurm-launch.py``:
|
||||
|
||||
* ``--exp-name``: The experiment name. Will generate ``{exp-name}_{date}-{time}.sh`` and ``{exp-name}_{date}-{time}.log``.
|
||||
* ``--command``: The command you wish to run. For example: ``rllib train XXX`` or ``python XXX.py``.
|
||||
* ``--num-gpus``: The number of GPUs you wish to use in each computing node. Default: 0.
|
||||
* ``--node`` (``-w``): The specific nodes you wish to use, in the same form as the output of ``sinfo``. Nodes are automatically assigned if not specified.
|
||||
* ``--num-nodes`` (``-n``): The number of nodes you wish to use. Default: 1.
|
||||
* ``--partition`` (``-p``): The partition you wish to use. Default: "", which uses the user's default partition.
|
||||
* ``--load-env``: The command to setup your environment. For example: ``module load cuda/10.1``. Default: "".
|
||||
|
||||
Note that :ref:`slurm-template.sh <slurm-template>` is compatible with both IPv4 and IPv6 IP addresses of the computing nodes.
|
||||
|
||||
Implementation
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
Concretely, :ref:`slurm-launch.py <slurm-launch>` does the following:
|
||||
|
||||
1. It automatically writes your requirements, e.g., the number of CPUs and GPUs per node, the number of nodes, and so on, to an sbatch script named ``{exp-name}_{date}-{time}.sh``. Your command (``--command``) to launch your own job is also written into the sbatch script.
|
||||
2. Then it submits the sbatch script to the SLURM manager via a new process.
|
||||
3. Finally, the Python process terminates itself and leaves a log file named ``{exp-name}_{date}-{time}.log`` to record the progress of your submitted command. In the meantime, the Ray cluster and your job are running in the SLURM cluster.
|
||||
|
||||
|
||||
Examples and templates
|
||||
----------------------
|
||||
|
||||
Here are some community-contributed templates for using SLURM with Ray:
|
||||
|
||||
- `Ray sbatch submission scripts`_ used at `NERSC <https://www.nersc.gov/>`_, a US national lab.
|
||||
- `YASPI`_ (yet another slurm python interface) by @albanie. The goal of yaspi is to provide an interface to submitting slurm jobs, thereby obviating the joys of sbatch files. It does so through recipes - these are collections of templates and rules for generating sbatch scripts. Supports job submissions for Ray.
|
||||
|
||||
- `Convenient python interface`_ to launch a Ray cluster and submit tasks, by @pengzhenghao
|
||||
|
||||
.. _`Ray sbatch submission scripts`: https://github.com/NERSC/slurm-ray-cluster
|
||||
|
||||
.. _`YASPI`: https://github.com/albanie/yaspi
|
||||
|
||||
.. _`Convenient python interface`: https://github.com/pengzhenghao/use-ray-with-slurm
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
Usage Stats Data API
|
||||
====================
|
||||
|
||||
.. _ray-usage-stats-data-ref:
|
||||
|
||||
UsageStatsToReport
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: ray._private.usage.usage_lib.UsageStatsToReport
|
||||
:members:
|
|
@ -1,21 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _ref-deployment-guide:
|
||||
|
||||
Deployment Guide
|
||||
================
|
||||
|
||||
This section explains how to set up a distributed Ray cluster and run your workloads on it.
|
||||
|
||||
To set up your cluster, check out the :ref:`Ray Cluster Overview <cluster-index>`, or jump to the :ref:`Ray Cluster Quick Start <ref-cluster-quick-start>`.
|
||||
|
||||
To trigger a Ray workload from your local machine, a CI system, or a third-party job scheduler/orchestrator via a command line interface or API call, try :ref:`Ray Job Submission <jobs-overview>`.
|
||||
|
||||
To run an interactive Ray workload and see the output in real time in a client of your choice (e.g. your local machine, SageMaker Studio, or Google Colab), you can use :ref:`Ray Client <ray-client>`.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
guide.rst
|
||||
job-submission.rst
|
||||
ray-client.rst
|
|
@ -1,8 +0,0 @@
|
|||
.. Comment this out for now.
|
||||
|
||||
..
|
||||
.. admonition:: We're hiring!
|
||||
|
||||
`Anyscale Inc. <https://anyscale.com>`__, the company behind Ray, is hiring interns and full-time **software engineers** to help advance and maintain Ray autoscaler, cluster launcher, cloud providers, the Kubernetes operator, and Ray Client.
|
||||
If you have a background in distributed computing/cluster orchestration/Kubernetes and are interested in making Ray **the** industry-leading open-source platform for distributed computing, `apply here today <https://jobs.lever.co/anyscale/814c0d0e-08f5-419a-bdd8-0819b8b8df24>`__.
|
||||
We'd be thrilled to welcome you on the team!
|
|
@ -1,196 +0,0 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _ray-yarn-deploy:
|
||||
|
||||
Deploying on YARN
|
||||
=================
|
||||
|
||||
.. warning::
|
||||
|
||||
Running Ray on YARN is still a work in progress. If you have a
|
||||
suggestion for how to improve this documentation or want to request
|
||||
a missing feature, please feel free to create a pull request or get in touch
|
||||
using one of the channels in the `Questions or Issues?`_ section below.
|
||||
|
||||
This document assumes that you have access to a YARN cluster and will walk
|
||||
you through using `Skein`_ to deploy a YARN job that starts a Ray cluster and
|
||||
runs an example script on it.
|
||||
|
||||
Skein uses a declarative specification (either written as a yaml file or using the Python API) and allows users to launch jobs and scale applications without the need to write Java code.
|
||||
|
||||
You will first need to install Skein: ``pip install skein``.
|
||||
|
||||
The Skein ``yaml`` file and example Ray program used here are provided in the
|
||||
`Ray repository`_ to get you started. Refer to the provided ``yaml``
|
||||
files to be sure that you maintain important configuration options for Ray to
|
||||
function properly.
|
||||
|
||||
.. _`Ray repository`: https://github.com/ray-project/ray/tree/master/doc/yarn
|
||||
|
||||
Skein Configuration
|
||||
-------------------
|
||||
|
||||
A Ray job is configured to run as two `Skein services`:
|
||||
|
||||
1. The ``ray-head`` service that starts the Ray head node and then runs the
|
||||
application.
|
||||
2. The ``ray-worker`` service that starts worker nodes that join the Ray cluster.
|
||||
You can change the number of instances in this configuration or at runtime
|
||||
using ``skein container scale`` to scale the cluster up/down.
|
||||
|
||||
The specification for each service consists of necessary files and commands that will be run to start the service.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
services:
|
||||
ray-head:
|
||||
# There should only be one instance of the head node per cluster.
|
||||
instances: 1
|
||||
resources:
|
||||
# The resources for the worker node.
|
||||
vcores: 1
|
||||
memory: 2048
|
||||
files:
|
||||
...
|
||||
script:
|
||||
...
|
||||
ray-worker:
|
||||
# Number of ray worker nodes to start initially.
|
||||
# This can be scaled using 'skein container scale'.
|
||||
instances: 3
|
||||
resources:
|
||||
# The resources for the worker node.
|
||||
vcores: 1
|
||||
memory: 2048
|
||||
files:
|
||||
...
|
||||
script:
|
||||
...
|
||||
|
||||
Packaging Dependencies
|
||||
----------------------
|
||||
|
||||
Use the ``files`` option to specify files that will be copied into the YARN container for the application to use. See `the Skein file distribution page <https://jcrist.github.io/skein/distributing-files.html>`_ for more information.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
services:
|
||||
ray-head:
|
||||
# There should only be one instance of the head node per cluster.
|
||||
instances: 1
|
||||
resources:
|
||||
# The resources for the head node.
|
||||
vcores: 1
|
||||
memory: 2048
|
||||
files:
|
||||
# ray/doc/yarn/example.py
|
||||
example.py: example.py
|
||||
# # A packaged python environment using `conda-pack`. Note that Skein
|
||||
# # doesn't require any specific way of distributing files, but this
|
||||
# # is a good one for python projects. This is optional.
|
||||
# # See https://jcrist.github.io/skein/distributing-files.html
|
||||
# environment: environment.tar.gz
|
||||
|
||||
Ray Setup in YARN
|
||||
-----------------
|
||||
|
||||
Below is a walkthrough of the bash commands used to start the ``ray-head`` and ``ray-worker`` services. Note that this configuration will launch a new Ray cluster for each application, not reuse the same cluster.
|
||||
|
||||
Head node commands
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Start by activating a pre-existing environment for dependency management.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
source environment/bin/activate
|
||||
|
||||
Register the Ray head address needed by the workers in the Skein key-value store.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
skein kv put --key=RAY_HEAD_ADDRESS --value=$(hostname -i) current
|
||||
|
||||
Start all the processes needed on the Ray head node. By default, we set object store memory
|
||||
and heap memory to roughly 200 MB. This is conservative and should be set according to application needs.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ray start --head --port=6379 --object-store-memory=200000000 --memory 200000000 --num-cpus=1
|
||||
|
||||
Execute the user script containing the Ray program.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python example.py
|
||||
|
||||
Clean up all started processes even if the application fails or is killed.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ray stop
|
||||
skein application shutdown current
|
||||
|
||||
Putting things together, we have:
|
||||
|
||||
.. literalinclude:: /../yarn/ray-skein.yaml
|
||||
:language: yaml
|
||||
:start-after: # Head service
|
||||
:end-before: # Worker service
|
||||
|
||||
|
||||
Worker node commands
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Fetch the address of the head node from the Skein key-value store.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
RAY_HEAD_ADDRESS=$(skein kv get current --key=RAY_HEAD_ADDRESS)
|
||||
|
||||
Start all of the processes needed on a Ray worker node, blocking until killed by Skein/YARN via SIGTERM. After receiving SIGTERM, all started processes are also stopped (``ray stop``).
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ray start --object-store-memory=200000000 --memory 200000000 --num-cpus=1 --address=$RAY_HEAD_ADDRESS:6379 --block; ray stop
|
||||
|
||||
Putting things together, we have:
|
||||
|
||||
.. literalinclude:: /../yarn/ray-skein.yaml
|
||||
:language: yaml
|
||||
:start-after: # Worker service
|
||||
|
||||
Running a Job
|
||||
-------------
|
||||
|
||||
Within your Ray script, use the following to connect to the started Ray cluster:
|
||||
|
||||
.. literalinclude:: /../yarn/example.py
|
||||
:language: python
|
||||
:start-after: if __name__ == "__main__"
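
A minimal sketch of such a script, assuming it is executed by the ``ray-head`` service after ``ray start --head`` has run in the same container:

.. code-block:: python

    # A minimal sketch; connects to the Ray runtime started by the ray-head service.
    import ray

    if __name__ == "__main__":
        ray.init(address="auto")

        @ray.remote
        def square(x):
            return x * x

        print(ray.get([square.remote(i) for i in range(4)]))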
|
||||
|
||||
You can use the following command to launch the application as specified by the Skein YAML file.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
skein application submit [TEST.YAML]
|
||||
|
||||
Once it has been submitted, you can see the job running on the YARN dashboard.
|
||||
|
||||
.. image:: /images/yarn-job.png
|
||||
|
||||
Cleaning Up
|
||||
-----------
|
||||
|
||||
To clean up a running job, run the following command, substituting the application ID:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
skein application shutdown $appid
|
||||
|
||||
Questions or Issues?
|
||||
--------------------
|
||||
|
||||
.. include:: /_includes/_help.rst
|
||||
|
||||
.. _`Skein`: https://jcrist.github.io/skein/
|
|
@ -1,36 +1,29 @@
|
|||
.. include:: /_includes/clusters/announcement.rst
|
||||
|
||||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _cluster-index-under-construction:
|
||||
|
||||
.. _cluster-index:
|
||||
|
||||
Ray Clusters Overview
|
||||
=====================
|
||||
|
||||
What is a Ray cluster?
|
||||
----------------------
|
||||
Ray enables seamless scaling of workloads from a laptop to a large cluster. While Ray
|
||||
works out of the box on single machines with just a call to ``ray.init``, to run Ray
|
||||
applications on multiple nodes you must first *deploy a Ray cluster*.
|
||||
|
||||
One of Ray's strengths is the ability to leverage multiple machines for
|
||||
distributed execution. Ray is great for multiprocessing on a single machine.
|
||||
However, the real power of Ray is the ability to seamlessly scale to a cluster
|
||||
of machines.
|
||||
A Ray cluster is a set of worker nodes connected to a common :ref:`Ray head node <cluster-head-node>`.
|
||||
Ray clusters can be fixed-size, or they may :ref:`autoscale up and down <cluster-autoscaler>` according
|
||||
to the resources requested by applications running on the cluster.
|
||||
|
||||
A Ray cluster is a set of one or more nodes that are running Ray and share the same :ref:`head node<cluster-head-node-under-construction>`.
|
||||
Ray clusters can either be a fixed-size number of nodes or :ref:`can autoscale<cluster-autoscaler-under-construction>` (i.e., automatically provision or deprovision the number of nodes in a cluster) according to the demand of the Ray workload.
|
||||
Where can I deploy Ray clusters?
|
||||
--------------------------------
|
||||
|
||||
How can I use Ray clusters?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Ray clusters are officially supported on the following technology stacks:
|
||||
Ray provides native cluster deployment support on the following technology stacks:
|
||||
|
||||
* The :ref:`Ray cluster launcher on AWS and GCP<ref-cluster-quick-start-vms-under-construction>`. Community-supported Azure and Aliyun integrations also exist.
|
||||
* :ref:`KubeRay, the official way to run Ray on Kubernetes<kuberay-index>`.
|
||||
* On :ref:`AWS and GCP <cloud-vm-index>`. Community-supported Azure and Aliyun integrations also exist.
|
||||
* On :ref:`Kubernetes <kuberay-index>`, via the officially supported KubeRay project.
|
||||
|
||||
Advanced users may want to :ref:`deploy Ray clusters on-premise <on-prem>`
|
||||
or onto infrastructure platforms not listed here by :ref:`providing a custom node provider <ref-cluster-setup-under-construction>`.
|
||||
Advanced users may want to :ref:`deploy Ray manually <on-prem>`
|
||||
or onto :ref:`platforms not listed here <ref-cluster-setup>`.
|
||||
|
||||
Where to go from here?
|
||||
----------------------
|
||||
What's next?
|
||||
------------
|
||||
|
||||
.. panels::
|
||||
:container: text-center
|
||||
|
@ -42,34 +35,47 @@ Where to go from here?
|
|||
Understand the key concepts and main ways of interacting with a Ray cluster.
|
||||
|
||||
+++
|
||||
.. link-button:: cluster-key-concepts-under-construction
|
||||
.. link-button:: cluster-key-concepts
|
||||
:type: ref
|
||||
:text: Learn Key Concepts
|
||||
:classes: btn-outline-info btn-block
|
||||
|
||||
---
|
||||
|
||||
**I want to run Ray on Kubernetes**
|
||||
^^^
|
||||
Deploy a Ray application to a Kubernetes cluster. You can run the tutorial on a
|
||||
Kubernetes cluster or on your laptop via KinD.
|
||||
|
||||
+++
|
||||
.. link-button:: kuberay-quickstart
|
||||
:type: ref
|
||||
:text: Get Started with Ray on Kubernetes
|
||||
:classes: btn-outline-info btn-block
|
||||
|
||||
---
|
||||
|
||||
**I want to run Ray on a cloud provider**
|
||||
^^^
|
||||
Take a sample application designed to run on a laptop and scale it up in the
|
||||
cloud. Access to an AWS or GCP account is required.
|
||||
|
||||
+++
|
||||
.. link-button:: ref-cluster-quick-start-vms-under-construction
|
||||
.. link-button:: vm-cluster-quick-start
|
||||
:type: ref
|
||||
:text: Getting Started with Ray Clusters on VMs
|
||||
:text: Get Started with Ray on VMs
|
||||
:classes: btn-outline-info btn-block
|
||||
|
||||
---
|
||||
|
||||
**I want to run Ray on Kubernetes**
|
||||
**I want to run my application on an existing Ray cluster**
|
||||
^^^
|
||||
Deploy a Ray application to a Kubernetes cluster. You can run the tutorial on a
|
||||
remote Kubernetes cluster or on your laptop via KinD.
|
||||
Guide to submitting applications as Jobs to existing Ray clusters.
|
||||
|
||||
+++
|
||||
.. link-button:: kuberay-quickstart
|
||||
.. link-button:: jobs-quickstart
|
||||
:type: ref
|
||||
:text: Getting Started with Ray on Kubernetes
|
||||
:text: Job Submission
|
||||
:classes: btn-outline-info btn-block
|
||||
|
||||
.. include:: /_includes/clusters/announcement_bottom.rst
|
||||
|
|
|
@ -1,23 +1,17 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
Key Concepts
|
||||
============
|
||||
|
||||
.. _cluster-key-concepts-under-construction:
|
||||
.. _cluster-key-concepts:
|
||||
|
||||
|
||||
This page introduces the following key concepts concerning Ray clusters:
|
||||
This page introduces key concepts for Ray clusters:
|
||||
|
||||
.. contents::
|
||||
:local:
|
||||
|
||||
Ray cluster
|
||||
------------
|
||||
A **Ray cluster** is comprised of a :ref:`head node<cluster-head-node-under-construction>`
|
||||
and any number of :ref:`worker nodes<cluster-worker-nodes-under-construction>`.
|
||||
Ray Cluster
|
||||
-----------
|
||||
A Ray cluster consists of a single :ref:`head node <cluster-head-node>`
|
||||
and any number of connected :ref:`worker nodes <cluster-worker-nodes>`:
|
||||
|
||||
.. figure:: images/ray-cluster.svg
|
||||
:align: center
|
||||
|
@ -25,53 +19,48 @@ and any number of :ref:`worker nodes<cluster-worker-nodes-under-construction>`.
|
|||
|
||||
*A Ray cluster with two worker nodes. Each node runs Ray helper processes to
|
||||
facilitate distributed scheduling and memory management. The head node runs
|
||||
additional control processes, which are highlighted.*
|
||||
additional control processes (highlighted in blue).*
|
||||
|
||||
The number of worker nodes in a cluster may change with application demand, according
|
||||
to your Ray cluster configuration. This is known as *autoscaling*. The head node runs
|
||||
the :ref:`autoscaler<cluster-autoscaler-under-construction>`.
|
||||
The number of worker nodes may be *autoscaled* with application demand as specified
|
||||
by your Ray cluster configuration. The head node runs the :ref:`autoscaler <cluster-autoscaler>`.
|
||||
|
||||
.. note::
|
||||
Ray nodes are implemented as pods when :ref:`running on Kubernetes<kuberay-index>`.
|
||||
Ray nodes are implemented as pods when :ref:`running on Kubernetes <kuberay-index>`.
|
||||
|
||||
Users can submit jobs for execution on the Ray cluster, or can interactively use the
|
||||
cluster by connecting to the head node and running `ray.init`. See
|
||||
:ref:`Clients and Jobs<cluster-clients-and-jobs-under-construction>` for more information.
|
||||
:ref:`Ray Jobs <jobs-quickstart>` for more information.
|
||||
|
||||
.. _cluster-worker-nodes-under-construction:
|
||||
.. _cluster-head-node:
|
||||
|
||||
Worker nodes
|
||||
~~~~~~~~~~~~
|
||||
**Worker nodes** execute a Ray application by executing tasks and actors and storing Ray objects. Each worker node runs helper processes which
|
||||
implement distributed scheduling and :ref:`memory management<memory>`.
|
||||
|
||||
.. _cluster-head-node-under-construction:
|
||||
|
||||
Head node
|
||||
~~~~~~~~~
|
||||
Every Ray cluster has one node which is designated as the **head node** of the cluster.
|
||||
Head Node
|
||||
---------
|
||||
Every Ray cluster has one node which is designated as the *head node* of the cluster.
|
||||
The head node is identical to other worker nodes, except that it also runs singleton processes responsible for cluster management such as the
|
||||
:ref:`autoscaler<cluster-autoscaler-under-construction>` and the Ray driver processes
|
||||
:ref:`which run Ray jobs<cluster-clients-and-jobs-under-construction>`. Ray may schedule
|
||||
:ref:`autoscaler <cluster-autoscaler>` and the Ray driver processes
|
||||
:ref:`which run Ray jobs <cluster-clients-and-jobs>`. Ray may schedule
|
||||
tasks and actors on the head node just like any other worker node, unless configured otherwise.
|
||||
|
||||
.. _cluster-autoscaler-under-construction:
|
||||
.. _cluster-worker-nodes:
|
||||
|
||||
Autoscaler
|
||||
----------
|
||||
Worker Node
|
||||
------------
|
||||
*Worker nodes* do not run any head node management processes, and serve only to run user code in Ray tasks and actors. They participate in distributed scheduling, as well as the storage and distribution of Ray objects in :ref:`cluster memory <memory>`.
|
||||
|
||||
The **autoscaler** is a process that runs on the :ref:`head node<cluster-head-node-under-construction>` (or as a sidecar container in the head pod if :ref:`using Kubernetes<kuberay-index>`).
|
||||
It is responsible for provisioning or deprovisioning :ref:`worker nodes<cluster-worker-nodes-under-construction>`
|
||||
to meet the needs of the Ray workload. In particular, if the resource demands of the Ray workload exceed the
|
||||
current capacity of the cluster, the autoscaler will attempt to add more nodes. Conversely, if
|
||||
a node is idle for long enough, the autoscaler will remove it from the cluster.
|
||||
.. _cluster-autoscaler:
|
||||
|
||||
To learn more about the autoscaler and how to configure it, refer to the following user guides:
|
||||
Autoscaling
|
||||
-----------
|
||||
|
||||
* :ref:`Configuring Autoscaling on VMs<deployment-guide-autoscaler-under-construction>`.
|
||||
* :ref:`Autoscaling on Kubernetes<kuberay-autoscaler-discussion>`.
|
||||
The *Ray autoscaler* is a process that runs on the :ref:`head node <cluster-head-node>` (or as a sidecar container in the head pod if :ref:`using Kubernetes <kuberay-index>`).
|
||||
When the resource demands of the Ray workload exceed the
|
||||
current capacity of the cluster, the autoscaler will try to increase the number of worker nodes. When worker nodes
|
||||
sit idle, the autoscaler will remove worker nodes from the cluster.
|
||||
|
||||
.. _cluster-clients-and-jobs-under-construction:
|
||||
It is important to understand that the autoscaler only reacts to task and actor resource requests, and not application metrics or physical resource utilization.
|
||||
To learn more about autoscaling, refer to the user guides for Ray clusters on :ref:`VMs <cloud-vm-index>` and :ref:`Kubernetes <kuberay-index>`.
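
For example, a hypothetical task declared with logical resources like the one below creates demand that the autoscaler tries to satisfy; the actual CPU utilization inside the function body plays no role in scaling decisions:

.. code-block:: python

    import ray

    ray.init()  # or ray.init(address="auto") when run on a cluster node

    # The declared num_cpus is what the autoscaler reacts to,
    # not the physical CPU usage of the function body.
    @ray.remote(num_cpus=4)
    def heavy_step(x):
        return x * 2

    # Submitting many such tasks queues resource demand that can
    # trigger the autoscaler to add worker nodes.
    futures = [heavy_step.remote(i) for i in range(100)]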
|
||||
|
||||
.. _cluster-clients-and-jobs:
|
||||
|
||||
Ray Jobs
|
||||
--------
|
||||
|
@ -81,4 +70,9 @@ Ray Jobs enable users to submit locally developed-and-tested applications to a
|
|||
remote Ray cluster. Ray Job Submission simplifies the experience of packaging,
|
||||
deploying, and managing a Ray application.
|
||||
|
||||
To learn how to run workloads on a Ray Cluster, refer to the :ref:`Ray Jobs guide<ray-jobs-under-construction>`.
|
||||
For interactive development, the following additional methods are available:
|
||||
|
||||
* Directly running a script or notebook on any head or worker node.
|
||||
* Using the Ray Client to connect remotely to the cluster.
|
||||
|
||||
To learn how to run workloads on a Ray cluster, refer to the :ref:`Ray Jobs guide <jobs-overview>`.
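
For example, a hypothetical job submission from a local machine might look like the following, assuming the cluster's dashboard is reachable on port 8265:

.. code-block:: bash

    # Submit a local script as a Ray Job (address and script name are placeholders).
    ray job submit --address http://127.0.0.1:8265 --working-dir . -- python my_script.py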
|
||||
|
|
|
@ -4,10 +4,10 @@
|
|||
|
||||
:::{note}
|
||||
To learn the basics of Ray on Kubernetes, we recommend taking a look
|
||||
at the {ref}`introductory guide<kuberay-quickstart>` first.
|
||||
at the {ref}`introductory guide <kuberay-quickstart>` first.
|
||||
|
||||
If you are new to your cloud provider's Kubernetes service, we recommend
|
||||
taking a look at the documentation links collected {ref}`here<kuberay-k8s-setup>`
|
||||
taking a look at the documentation links collected {ref}`here <kuberay-k8s-setup>`
|
||||
:::
|
||||
|
||||
This section presents example Ray workloads to try out on your Kubernetes cluster.
|
||||
|
|
|
@ -4,27 +4,20 @@
|
|||
|
||||
:::{note}
|
||||
To learn the basics of Ray on Kubernetes, we recommend taking a look
|
||||
at the {ref}`introductory guide<kuberay-quickstart>` first.
|
||||
at the {ref}`introductory guide <kuberay-quickstart>` first.
|
||||
:::
|
||||
|
||||
|
||||
In this guide, we show you how to run a sample Ray machine learning
|
||||
workload on Kubernetes infrastructure.
|
||||
|
||||
We will run Ray's {ref}`XGBoost training benchmark<xgboost-benchmark>` with a 100 gigabyte training set.
|
||||
We will run Ray's {ref}`XGBoost training benchmark <xgboost-benchmark>` with a 100 gigabyte training set.
|
||||
To learn more about using Ray's XGBoostTrainer, check out {ref}`the XGBoostTrainer documentation <train-gbdt-guide>`.
|
||||
|
||||
```{admonition} Optional: Autoscaling
|
||||
This guide includes notes on how to deploy the XGBoost benchmark with optional Ray Autoscaler support.
|
||||
In this guide's example, we know that we need 1 Ray head and 9 Ray workers,
|
||||
so autoscaling is not strictly required. Read {ref}`this discussion<autoscaler-pro-con>` for guidance
|
||||
on whether to use autoscaling.
|
||||
```
|
||||
|
||||
## Kubernetes infrastructure setup
|
||||
|
||||
If you are new to Kubernetes and you are planning to deploy Ray workloads on a managed
|
||||
Kubernetes service, we recommend taking a look at this {ref}`introductory guide<kuberay-k8s-setup>`
|
||||
Kubernetes service, we recommend taking a look at this {ref}`introductory guide <kuberay-k8s-setup>`
|
||||
first.
|
||||
|
||||
For the workload in this guide, it is recommended to use a pool or group of Kubernetes nodes
|
||||
|
@ -47,7 +40,7 @@ scale up to accommodate Ray worker pods. These nodes will scale back down after
|
|||
## Deploy the KubeRay operator
|
||||
|
||||
Once you have set up your Kubernetes cluster, deploy the KubeRay operator.
|
||||
Refer to the {ref}`Getting Started guide<kuberay-operator-deploy>`
|
||||
Refer to the {ref}`Getting Started guide <kuberay-operator-deploy>`
|
||||
for instructions on this step.
|
||||
|
||||
## Deploy a Ray cluster
|
||||
|
@ -88,7 +81,7 @@ watch -n 1 kubectl get pod
|
|||
```
|
||||
|
||||
Once the Ray head pod enters `Running` state, we are ready to execute the XGBoost workload.
|
||||
We will use {ref}`Ray Job Submission<jobs-overview>` to kick off the workload.
|
||||
We will use {ref}`Ray Job Submission <jobs-overview>` to kick off the workload.
|
||||
|
||||
### Connect to the cluster.
|
||||
|
||||
|
@ -100,7 +93,7 @@ kubectl port-forward service/raycluster-xgboost-benchmark-head-svc 8265:8265
|
|||
|
||||
### Submit the workload.
|
||||
|
||||
We'll use the {ref}`Ray Job Python SDK<ray-job-sdk>` to submit the XGBoost workload.
|
||||
We'll use the {ref}`Ray Job Python SDK <ray-job-sdk>` to submit the XGBoost workload.
|
||||
|
||||
```{literalinclude} /cluster/doc_code/xgboost_submit.py
|
||||
:language: python
|
||||
|
@ -170,7 +163,7 @@ Results: {'training_time': 1338.488839321999, 'prediction_time': 403.36653568099
|
|||
```
|
||||
|
||||
The performance of the benchmark is sensitive to the underlying cloud infrastructure --
|
||||
you might not match {ref}`the numbers quoted in the benchmark docs<xgboost-benchmark>`.
|
||||
you might not match {ref}`the numbers quoted in the benchmark docs <xgboost-benchmark>`.
|
||||
|
||||
#### Model parameters
|
||||
The file `model.json` in the Ray head pod contains the parameters for the trained model.
|
||||
|
|
|
@ -290,7 +290,7 @@
|
|||
"id": "fa9c6e9d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"While this can be useful for ad-hoc execution on the Ray Cluster, the recommended way to execute an application on a Ray Cluster is to use [Ray Jobs](ray-jobs-under-construction).\n",
|
||||
"While this can be useful for ad-hoc execution on the Ray Cluster, the recommended way to execute an application on a Ray Cluster is to use [Ray Jobs](jobs-quickstart).\n",
|
||||
"\n",
|
||||
"(kuberay-job)=\n",
|
||||
"### Ray Job submission\n",
|
||||
|
@ -367,7 +367,7 @@
|
|||
"id": "f8453b2a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For a more detailed guide on using Ray Jobs to run applications on a Ray Cluster, check out the [quickstart guide](jobs-quickstart-under-construction)"
|
||||
"For a more detailed guide on using Ray Jobs to run applications on a Ray Cluster, check out the [quickstart guide](jobs-quickstart)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -20,7 +20,7 @@ different Ray versions in the same Kubernetes cluster.
|
|||
```
|
||||
|
||||
|
||||
Concretely you will learn how to:
|
||||
Concretely, you will learn how to:
|
||||
|
||||
- Set up and configure Ray on a Kubernetes cluster
|
||||
- Deploy and monitor Ray applications
|
||||
|
@ -80,7 +80,7 @@ The Ray docs present all the information you need to start running Ray workloads
|
|||
:text: Check API references
|
||||
:classes: btn-outline-info btn-block
|
||||
```
|
||||
## The KubeRay project
|
||||
## About KubeRay
|
||||
|
||||
Ray's Kubernetes support is developed at the [KubeRay GitHub repository](https://github.com/ray-project/kuberay), under the broader [Ray project](https://github.com/ray-project/). KubeRay is used by several companies to run production Ray deployments.
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# API Reference
|
||||
|
||||
To learn about RayCluster configuration, we recommend taking a look at
|
||||
the {ref}`configuration guide<kuberay-config>`.
|
||||
the {ref}`configuration guide <kuberay-config>`.
|
||||
|
||||
For comprehensive coverage of all supported RayCluster fields,
|
||||
refer to the [Golang structs][RayClusterDef] used to generate the RayCluster CRD.
|
||||
|
|
|
@ -3,16 +3,13 @@
|
|||
|
||||
:::{note}
|
||||
To learn the basics of Ray on Kubernetes, we recommend taking a look
|
||||
at the {ref}`introductory guide<kuberay-quickstart>` first.
|
||||
at the {ref}`introductory guide <kuberay-quickstart>` first.
|
||||
:::
|
||||
|
||||
In these guides, we go into further depth on several topics related to
|
||||
deployments of Ray on Kubernetes.
|
||||
* {ref}`kuberay-k8s-setup`
|
||||
* {ref}`kuberay-config`
|
||||
* {ref}`kuberay-autoscaler-discussion`
|
||||
* {ref}`kuberay-config`
|
||||
* {ref}`kuberay-autoscaler-discussion`
|
||||
* {ref}`kuberay-logging`
|
||||
* {ref}`kuberay-autoscaling`
|
||||
* {ref}`kuberay-gpu`
|
||||
* {ref}`kuberay-vs-legacy`
|
||||
* {ref}`kuberay-logging`
|
||||
|
|
|
@ -89,6 +89,7 @@ spec:
|
|||
```
|
||||
|
||||
The rest of this guide will discuss the `RayCluster` CR's config fields.
|
||||
See also the [guide](kuberay-autoscaling-config) on configuring Ray autoscaling with KubeRay.
|
||||
|
||||
(kuberay-config-ray-version)=
|
||||
## The Ray Version
|
||||
|
@ -175,15 +176,15 @@ In any case, do make sure that all Ray images in your CR carry the same Ray vers
|
|||
Python version.
|
||||
To distribute custom code dependencies across your cluster, you can build a custom container image,
|
||||
using one of the [official Ray images](https://hub.docker.com/r/rayproject/ray>) as the base.
|
||||
See {ref}`this guide<docker-images>` to learn more about the official Ray images.
|
||||
See {ref}`this guide <docker-images>` to learn more about the official Ray images.
|
||||
For dynamic dependency management geared towards iteration and development,
|
||||
you can also use {ref}`Runtime Environments<runtime-environments>`.
|
||||
you can also use {ref}`Runtime Environments <runtime-environments>`.
|
||||
|
||||
(rayStartParams)=
|
||||
## Ray Start Parameters
|
||||
The ``rayStartParams`` field of each group spec is a string-string map of arguments to the Ray
|
||||
container’s `ray start` entrypoint. For the full list of arguments, refer to
|
||||
the documentation for {ref}`ray start<ray-start-doc>`. We make special note of the following arguments:
|
||||
the documentation for {ref}`ray start <ray-start-doc>`. We make special note of the following arguments:
|
||||
|
||||
### block
|
||||
For most use-cases, this field should be set to "true" for all Ray pods. The container's Ray
|
||||
|
@ -210,7 +211,7 @@ must be supplied as **strings**.
|
|||
### num-gpus
|
||||
This optional field specifies the number of GPUs available to the Ray container.
|
||||
In KubeRay versions since 0.3.0, the number of GPUs can be auto-detected from Ray container resource limits.
|
||||
For certain advanced use-cases, you may wish to use `num-gpus` to set an {ref}`override<kuberay-gpu-override>`.
|
||||
For certain advanced use-cases, you may wish to use `num-gpus` to set an {ref}`override <kuberay-gpu-override>`.
|
||||
Note that the values of all Ray start parameters, including `num-gpus`,
|
||||
must be supplied as **strings**.
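
For instance, a hypothetical group spec fragment overriding the GPU count might look like this; note the quoted values:

```yaml
# Hypothetical fragment; all rayStartParams values must be strings.
rayStartParams:
  num-cpus: "14"
  num-gpus: "2"
```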
|
||||
|
||||
|
@ -313,134 +314,6 @@ rayStartParams:
|
|||
ray-client-server-port: "10002"
|
||||
...
|
||||
```
|
||||
|
||||
|
||||
(kuberay-autoscaling-config)=
|
||||
## Autoscaler Configuration
|
||||
```{note}
|
||||
If you are deciding whether to use autoscaling for a particular Ray application,
|
||||
check out this {ref}`discussion<autoscaler-pro-con>`. Note that autoscaling is
|
||||
supported only with Ray versions at least as new as Ray 1.11.0.
|
||||
```
|
||||
To enable the optional Ray Autoscaler support, set `enableInTreeAutoscaling:true`.
|
||||
The KubeRay operator will then automatically configure an autoscaling sidecar container
|
||||
for the Ray head pod. The autoscaler container collects resource metrics from the Ray cluster
|
||||
and automatically adjusts the `replicas` field of each `workerGroupSpec` as needed to fulfill
|
||||
the requirements of your Ray application.
|
||||
|
||||
Use the fields `minReplicas` and `maxReplicas` to constrain the number of `replicas` of an autoscaling
|
||||
`workerGroup`. When deploying an autoscaling cluster, one typically sets `replicas` and `minReplicas`
|
||||
to the same value.
|
||||
The Ray autoscaler will then take over and modify the `replicas` field as needed by
|
||||
the Ray application.
|
||||
|
||||
### Autoscaler operation
|
||||
We describe how the autoscaler interacts with the `RayCluster` CR.
|
||||
|
||||
#### Scale up
|
||||
The autoscaler scales worker pods up to accommodate the load of logical resources
|
||||
from your Ray application. For example, suppose you submit a task requesting 2 GPUs:
|
||||
```python
|
||||
@ray.remote(num_gpus=2)
|
||||
...
|
||||
```
|
||||
If your Ray cluster does not currently have any GPU worker pods, and if your configuration
|
||||
specifies a worker type with at least 2 units of GPU capacity, a GPU pod will be
|
||||
upscaled.
|
||||
|
||||
The autoscaler scales Ray worker pods up by editing the `replicas` field of the relevant `workerGroupSpec`.
|
||||
|
||||
#### Scale down
|
||||
The autoscaler scales a worker pod down when the pod has not been using any logical resources
|
||||
for a {ref}`set period of time<kuberay-idle-timeout>`. In this context, "resources" are the logical Ray resources
|
||||
(such as CPU, GPU, memory, and custom resources) specified in Ray task and actor annotations.
|
||||
Usage of the Ray Object Store also marks a Ray worker pod as active and prevents downscaling.
|
||||
|
||||
To scale down Ray pods of a given `workerGroup`, the autoscaler
|
||||
adds the Ray pods' names to the relevant `workerGroupSpec`'s
|
||||
`scaleStrategy.workersToDelete` list and decrements the `replicas` field.
|
||||
|
||||
#### Manually scaling
|
||||
You may manually adjust a `RayCluster`'s scale by editing the `replicas` or `workersToDelete` fields.
|
||||
(It is also possible to implement custom scaling logic that adjusts scale on your behalf.)
|
||||
It is, however, not recommended to manually edit `replicas` or `workersToDelete` for a `RayCluster` with
|
||||
autoscaling enabled.
|
||||
|
||||
### autoscalerOptions
|
||||
To enable Ray autoscaler support, it is enough to set `enableInTreeAutoscaling:true`.
|
||||
Should you need to adjust autoscaling behavior or change the autoscaler container's configuration,
|
||||
you can use the `RayCluster` CR's `autoscalerOptions` field. The `autoscalerOptions` field
|
||||
carries the following subfields:
|
||||
|
||||
#### upscalingMode
|
||||
The `upscalingMode` field can be used to control the rate of Ray pod upscaling.
|
||||
|
||||
UpscalingMode is `Conservative`, `Default`, or `Aggressive`.
|
||||
- `Conservative`: Upscaling is rate-limited; the number of pending worker pods is at most the number
|
||||
of worker pods connected to the Ray cluster.
|
||||
- `Default`: Upscaling is not rate-limited.
|
||||
- `Aggressive`: An alias for Default; upscaling is not rate-limited.
|
||||
|
||||
You may wish to use `Conservative` upscaling if you plan to submit many short-lived tasks
|
||||
to your Ray cluster. In this situation, `Default` upscaling may trigger the _thrashing_ behavior:
|
||||
- The autoscaler sees resource demands from the submitted short-lived tasks.
|
||||
- The autoscaler immediately creates Ray pods to accommodate the demand.
|
||||
- By the time the additional Ray pods are provisioned, the tasks have already run to completion.
|
||||
- The additional Ray pods are unused and scale down after a period of idleness.
|
||||
|
||||
Note, however, that it is generally not recommended to over-parallelize with Ray.
|
||||
Since running a Ray task incurs scheduling overhead, it is usually preferable to use
|
||||
a few long-running tasks over many short-running tasks. Ensuring that each task has
|
||||
a non-trivial amount of work to do will also help prevent the autoscaler from over-provisioning
|
||||
Ray pods.
|
||||
|
||||
(kuberay-idle-timeout)=
|
||||
#### idleTimeoutSeconds
|
||||
`idleTimeoutSeconds` is the number of seconds to wait before scaling down a worker pod
|
||||
which is not using resources. In this context, "resources" are the logical Ray resources
|
||||
(such as CPU, GPU, memory, and custom resources) specified in Ray task and actor annotations.
|
||||
Usage of the Ray Object Store also marks a Ray worker pod as active and prevents downscaling.
|
||||
|
||||
`idleTimeoutSeconds` defaults to 60 seconds.
|
||||
|
||||
#### resources
|
||||
The `resources` subfield of `autoscalerOptions` sets optional resource overrides
|
||||
for the autoscaler sidecar container. These overrides
|
||||
should be specified in the standard [container resource
|
||||
spec format](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#resources).
|
||||
The default values are as indicated below:
|
||||
```
|
||||
resources:
|
||||
limits:
|
||||
cpu: "500m"
|
||||
memory: "512Mi"
|
||||
requests:
|
||||
cpu: "500m"
|
||||
memory: "512Mi"
|
||||
```
|
||||
These defaults should be suitable for most use-cases.
|
||||
However, we do recommend monitoring autoscaler container resource usage and adjusting as needed.
|
||||
|
||||
#### image and imagePullPolicy
|
||||
The `image` subfield of `autoscalerOptions` optionally overrides the autoscaler container image.
|
||||
If your `RayCluster`'s `spec.RayVersion` is at least `2.0.0`, the autoscaler will default to using
|
||||
**the same image** as the Ray container. (Ray autoscaler code is bundled with the rest of Ray.)
|
||||
For older Ray versions, the autoscaler will default to the image `rayproject/ray:2.0.0`.
|
||||
|
||||
The `imagePullPolicy` subfield of `autoscalerOptions` optionally overrides the autoscaler container's
|
||||
image pull policy. The default is `Always`.
|
||||
|
||||
The `image` and `imagePullPolicy` overrides are provided primarily for the purposes of autoscaler testing and
|
||||
development.
|
||||
|
||||
#### env and envFrom
|
||||
|
||||
The `env` and `envFrom` fields specify autoscaler container
|
||||
environment variables, for debugging and development purposes.
|
||||
These fields should be formatted following the
|
||||
[Kubernetes API](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#environment-variables)
|
||||
for container environment variables.
|
||||
|
||||
(kuberay-config-miscellaneous)=
|
||||
## Pod and container lifecyle: preStop hooks and initContainers
|
||||
There are two pieces of pod configuration that should always be included
|
||||
|
|
|
@ -1,36 +1,44 @@
|
|||
(kuberay-autoscaler-discussion)=
|
||||
# Autoscaling
|
||||
This page discusses autoscaling in the context of Ray on Kubernetes.
|
||||
For details on autoscaler configuration see the {ref}`configuration guide<kuberay-autoscaling-config>`.
|
||||
(kuberay-autoscaling)=
|
||||
|
||||
:::{note}
|
||||
Autoscaling is supported only with Ray versions at least
|
||||
as new as 1.11.0.
|
||||
:::
|
||||
# KubeRay Autoscaling
|
||||
|
||||
(autoscaler-pro-con)=
|
||||
## Should I enable autoscaling?
|
||||
Ray Autoscaler support is optional.
|
||||
Here are some considerations to keep in mind when choosing whether to use autoscaling.
|
||||
This guide explains how to configure the Ray autoscaler on Kubernetes.
|
||||
The Ray autoscaler is a Ray cluster process that automatically scales a cluster up and down based on resource demand.
|
||||
The autoscaler does this by adjusting the number of nodes (Ray pods) in the cluster based on the resources required by tasks, actors or placement groups.
|
||||
|
||||
### Autoscaling: Pros
|
||||
**Cope with unknown resource requirements.** If you don't know how much compute your Ray
|
||||
workload will require, autoscaling can adjust your Ray cluster to the right size.
|
||||
Note that the autoscaler only considers logical resource requests for scaling (i.e., those specified in ``@ray.remote`` and displayed in `ray status`), not physical machine utilization. If a user tries to launch an actor, task, or placement group but there are insufficient resources, the request will be queued. The autoscaler adds nodes to satisfy resource demands in this queue.
|
||||
The autoscaler also removes nodes after they become idle for some time.
|
||||
A node is considered idle if it has no active tasks, actors, or objects.
|
||||
|
||||
**Save on costs.** Idle compute is automatically scaled down, potentially leading to cost savings,
|
||||
especially when you are using expensive resources like GPUs.
|
||||
<!-- TODO(ekl): probably should change the default kuberay examples to not use autoscaling -->
|
||||
```{admonition} When to use Autoscaling?
|
||||
Autoscaling can reduce workload costs, but adds node launch overheads and can be tricky to configure.
|
||||
We recommend starting with non-autoscaling clusters if you're new to Ray.
|
||||
```
|
||||
|
||||
### Autoscaling: Cons
|
||||
**Less predictable when resource requirements are known.** If you already know exactly
|
||||
how much compute your workload requires, it could make sense to provision a static Ray cluster
|
||||
of the appropriate fixed size.
|
||||
## Overview
|
||||
The following diagram illustrates the integration of the Ray Autoscaler
|
||||
with the KubeRay operator.
|
||||
|
||||
**Longer end-to-end runtime.** Autoscaling entails provisioning compute for Ray workers
|
||||
while the Ray application is running. On the other hand, if you pre-provision a fixed
|
||||
number of Ray workers, all of the Ray workers can be started in parallel, potentially reducing your application's
|
||||
runtime.
|
||||
```{eval-rst}
|
||||
.. image:: ../images/AutoscalerOperator.svg
|
||||
:align: center
|
||||
..
|
||||
Find the source document here (https://docs.google.com/drawings/d/1LdOg9JQuN5AOII-vDpSaFBsTeg0JGWcsbyNNLP1yovg/edit)
|
||||
```
|
||||
|
||||
### Getting Started with Autoscaling
|
||||
Worker pod upscaling occurs through the following sequence of events:
|
||||
1. The user submits a Ray workload.
|
||||
2. Workload resource requirements are aggregated by the Ray head container
|
||||
and communicated to the Ray autoscaler sidecar.
|
||||
3. The autoscaler determines that a Ray worker pod must be added to satisfy the workload's resource requirement.
|
||||
4. The autoscaler requests an additional worker pod by incrementing the RayCluster CR's `replicas` field.
|
||||
5. The KubeRay operator creates a Ray worker pod to match the new `replicas` specification.
|
||||
6. The Ray scheduler places the user's workload on the new worker pod.
|
||||
|
||||
See also the operator architecture diagram in the [KubeRay documentation](https://ray-project.github.io/kuberay/components/operator/).
|
||||
|
||||
## Quickstart
|
||||
|
||||
First, follow the [quickstart guide](kuberay-quickstart) to create an autoscaling cluster. The commands to create the KubeRay operator and deploy an autoscaling cluster are summarized here:
|
||||
|
||||
|
@ -45,7 +53,7 @@ $ kubectl create -k kuberay/ray-operator/config/default
|
|||
$ kubectl apply -f kuberay/ray-operator/config/samples/ray-cluster.autoscaler.yaml
|
||||
```
|
||||
|
||||
Now, we can run a Ray program on the head pod that asks the autoscaler to scale the cluster to a total of 3 CPUs. The head and worker in our example cluster each have a capacity of 1 CPU, so the request should trigger upscaling of an additional worker pod.
|
||||
Now, we can run a Ray program on the head pod that uses [``request_resources``](ref-autoscaler-sdk) to scale the cluster to a total of 3 CPUs. The head and worker pods in our [example cluster config](https://github.com/ray-project/kuberay/blob/master/ray-operator/config/samples/ray-cluster.autoscaler.yaml) each have a capacity of 1 CPU, and we specified a minimum of 1 worker pod. Thus, the request should trigger upscaling of one additional worker pod.
|
||||
|
||||
Note that in real-life scenarios, you will want to use larger Ray pods. In fact, it is advantageous to size each Ray pod to take up an entire Kubernetes node. See the [configuration guide](kuberay-config) for more details.
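
A minimal sketch of such a program, run from inside the head pod (for example via `kubectl exec`):

```python
# A minimal sketch; run inside the head pod of the autoscaling RayCluster.
import ray
from ray.autoscaler.sdk import request_resources

ray.init(address="auto")

# Ask the autoscaler to scale the cluster to a total of 3 CPUs.
request_resources(num_cpus=3)
```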
|
||||
|
||||
|
@ -88,7 +96,73 @@ $ kubectl logs raycluster-autoscaler-head-xxxxx -c autoscaler | tail -n 20
|
|||
# ...
|
||||
```
|
||||
|
||||
## Ray Autoscaler vs. other autoscalers
(kuberay-autoscaling-config)=
## KubeRay Config Parameters

There are two steps to enabling Ray autoscaling in the KubeRay `RayCluster` custom resource (CR) config:

1. Set `enableInTreeAutoscaling:true`. The KubeRay operator will then automatically configure an autoscaling sidecar container
   for the Ray head pod. The autoscaler container collects resource metrics from the Ray cluster
   and automatically adjusts the `replicas` field of each `workerGroupSpec` as needed to fulfill
   the requirements of your Ray application.

2. Set the fields `minReplicas` and `maxReplicas` to constrain the number of `replicas` of an autoscaling
   `workerGroup`. When deploying an autoscaling cluster, one typically sets `replicas` and `minReplicas`
   to the same value.
   The Ray autoscaler will then take over and modify the `replicas` field as pods are added to or removed from the cluster.

For an example, check out the [config file](https://github.com/ray-project/kuberay/blob/master/ray-operator/config/samples/ray-cluster.autoscaler.yaml) that we used in the above quickstart guide.

### Upscaling and downscaling speed

If needed, you can also control the rate at which nodes should be added to or removed from the cluster. For applications with many short-lived tasks, you may wish to adjust the upscaling and downscaling speed to be more conservative.

Use the `RayCluster` CR's `autoscalerOptions` field to do so. The `autoscalerOptions` field
carries the following subfields:

**`upscalingMode`**: This controls the rate of Ray pod upscaling. The valid values are:
- `Conservative`: Upscaling is rate-limited; the number of pending worker pods is at most the number
  of worker pods connected to the Ray cluster.
- `Default`: Upscaling is not rate-limited.
- `Aggressive`: An alias for Default; upscaling is not rate-limited.

**`idleTimeoutSeconds`** (default 60s): This is the number of seconds to wait before scaling down an idle worker pod. Worker nodes are considered idle when they hold no active tasks, actors, or referenced objects (either in-memory or spilled to disk).

### Configuring the autoscaler sidecar container

The `autoscalerOptions` field also provides options for configuring the autoscaler container. Usually, it is not necessary to specify these options.

**`resources`**: The `resources` subfield of `autoscalerOptions` sets optional resource overrides
for the autoscaler sidecar container. These overrides
should be specified in the standard [container resource
spec format](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#resources).
The default values are indicated below:
```
resources:
  limits:
    cpu: "500m"
    memory: "512Mi"
  requests:
    cpu: "500m"
    memory: "512Mi"
```

The following `autoscalerOptions` suboptions are also available for testing and development of the autoscaler itself.

**`image`**: This field overrides the autoscaler container image.
If your `RayCluster`'s `spec.RayVersion` is at least `2.0.0`, the autoscaler will default to using
**the same image** as the Ray container. (Ray autoscaler code is bundled with the rest of Ray.)
For older Ray versions, the autoscaler will default to the image `rayproject/ray:2.0.0`.

**`imagePullPolicy`**: This field overrides the autoscaler container's
image pull policy. The default is `Always`.

**`env`** and **`envFrom`**: These fields specify autoscaler container
environment variables. These fields should be formatted following the
[Kubernetes API](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#environment-variables)
for container environment variables.

## Understanding the Ray Autoscaler in the Context of Kubernetes
We describe the relationship between the Ray autoscaler and other autoscalers in the Kubernetes
ecosystem.

@@ -155,25 +229,3 @@ One method is to schedule fewer tasks/actors per node by increasing the resource
requirements specified in the `ray.remote` annotation.
For example, changing `@ray.remote(num_cpus=2)` to `@ray.remote(num_cpus=4)`
will halve the quantity of that task or actor that can fit in a given Ray pod.

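To make this concrete, here is a minimal sketch; the task body and the assumption of a 4-CPU Ray pod are illustrative only:

```python
import ray

ray.init(address="auto")

# On a Ray pod advertising 4 CPUs, two copies of this task can run at once.
# Changing num_cpus=2 to num_cpus=4 halves that to a single copy, so the same
# workload spreads over more pods (and may trigger additional upscaling).
@ray.remote(num_cpus=2)
def heavy_task():
    return sum(range(10**6))

print(ray.get([heavy_task.remote() for _ in range(4)]))
```
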
## Autoscaling architecture
The following diagram illustrates the integration of the Ray Autoscaler
with the KubeRay operator.

```{eval-rst}
.. image:: ../images/AutoscalerOperator.svg
    :align: center
..
    Find the source document here (https://docs.google.com/drawings/d/1LdOg9JQuN5AOII-vDpSaFBsTeg0JGWcsbyNNLP1yovg/edit)
```

Worker pod upscaling occurs through the following sequence of events:
1. The user submits a Ray workload.
2. Workload resource requirements are aggregated by the Ray head container
   and communicated to the Ray autoscaler sidecar.
3. The autoscaler determines that a Ray worker pod must be added to satisfy the workload's resource requirement.
4. The autoscaler requests an additional worker pod by incrementing the RayCluster CR's `replicas` field.
5. The KubeRay operator creates a Ray worker pod to match the new `replicas` specification.
6. The Ray scheduler places the user's workload on the new worker pod.

See also the operator architecture diagram in the [KubeRay documentation](https://ray-project.github.io/kuberay/components/operator/).
@ -13,12 +13,12 @@ ___________________________________________
|
|||
The `Ray Docker Hub <https://hub.docker.com/r/rayproject/>`_ hosts CUDA-based container images packaged
|
||||
with Ray and certain machine learning libraries.
|
||||
For example, the image ``rayproject/ray-ml:2.0.0-gpu`` is ideal for running GPU-based ML workloads with Ray 2.0.0.
|
||||
The Ray ML images are packaged with dependencies (such as TensorFlow and PyTorch) needed to use the :ref:`Ray AI Runtime<air>`
|
||||
The Ray ML images are packaged with dependencies (such as TensorFlow and PyTorch) needed to use the :ref:`Ray AI Runtime <air>`
|
||||
and the Ray Libraries covered in these docs.
|
||||
To add custom dependencies, we recommend one, or both, of the following methods:
|
||||
|
||||
* Building a docker image using one of the official :ref:`Ray docker images<docker-images>` as base.
|
||||
* Using :ref:`Ray Runtime environments<runtime-environments>`.
|
||||
* Building a docker image using one of the official :ref:`Ray docker images <docker-images>` as base.
|
||||
* Using :ref:`Ray Runtime environments <runtime-environments>`.
|
||||
|
||||
|
||||
Configuring Ray pods for GPU usage
|
||||
|
@ -64,7 +64,7 @@ Each of the Ray pods in the group can be scheduled on an AWS `p2.xlarge` instanc
|
|||
as demonstrated with the `minReplicas:0` and `maxReplicas:5` settings above.
|
||||
To enable autoscaling, remember also to set `enableInTreeAutoscaling:True` in your RayCluster's `spec`
|
||||
Finally, make sure your group or pool of GPU Kubernetes nodes are configured to autoscale.
|
||||
Refer to your :ref:`cloud provider's documentation<kuberay-k8s-setup>` for details on autoscaling node pools.
|
||||
Refer to your :ref:`cloud provider's documentation <kuberay-k8s-setup>` for details on autoscaling node pools.
|
||||
|
||||
GPUs and Ray
|
||||
____________
|
||||
|
@ -119,7 +119,7 @@ nodes will be scaled down as well.
|
|||
|
||||
Requesting GPUs
|
||||
~~~~~~~~~~~~~~~
|
||||
You can also make a :ref:`direct request to the autoscaler<ref-autoscaler-sdk-request-resources>` to scale up GPU resources.
|
||||
You can also make a :ref:`direct request to the autoscaler <ref-autoscaler-sdk-request-resources>` to scale up GPU resources.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
|
|
@ -1,195 +0,0 @@
|
|||
(kuberay-vs-legacy)=
|
||||
|
||||
# KubeRay vs. the Legacy Ray Operator
|
||||
|
||||
Using the [KubeRay operator](https://ray-project.github.io/kuberay/components/operator/)
|
||||
is the preferred way to deploy Ray on Kubernetes.
|
||||
This page compares the KubeRay operator to the {ref}`legacy Ray Operator<ray-k8s-deploy>` hosted in the Ray repo.
|
||||
This page also provides migration notes for users switching to the KubeRay operator.
|
||||
|
||||
## KubeRay vs. Legacy Ray Operator: Similarities
|
||||
|
||||
### Purpose
|
||||
The two operators have the same purpose: managing clusters of Ray pods deployed on Kubernetes.
|
||||
|
||||
### High-level interface structure
|
||||
Both operators rely on a user-specified custom resource specifying Ray pod configuration and
|
||||
Ray worker pod quantities.
|
||||
|
||||
## KubeRay vs. Legacy Ray Operator: Differences
|
||||
|
||||
The two operators differ primarily in internal design and implementation.
|
||||
There are also some differences in configuration details.
|
||||
|
||||
### Implementation and architecture
|
||||
**Legacy Ray Operator** The legacy Ray Operator is implemented in Python.
|
||||
Kubernetes event handling is implemented using the [Kopf](https://kopf.readthedocs.io/en/stable/) framework.
|
||||
The operator invokes Ray cluster launcher and autoscaler code to manage Ray clusters.
|
||||
The operator forks an autoscaler subprocess for each Ray cluster it manages.
|
||||
The Ray autoscaler subprocesses create and delete Ray pods directly.
|
||||
|
||||
**KubeRay Operator** The KubeRay operator is implemented in Golang using standard tools
|
||||
for building Kubernetes operators, including the [KubeBuilder](https://github.com/kubernetes-sigs/kubebuilder)
|
||||
operator framework
|
||||
and the [client-go](https://github.com/kubernetes/client-go) client library.
|
||||
The KubeRay operator is structurally simpler than the legacy Ray Operator;
|
||||
rather than running many Ray autoscalers in subprocesses, the KubeRay operator implements a simple
|
||||
reconciliation loop. The reconciliation loop creates and deletes Ray pods to match the desired
|
||||
state expressed in each RayCluster CR.
|
||||
Each Ray cluster runs its own autoscaler as a sidecar to the Ray head pod.
|
||||
The Ray autoscaler communicates desired scale to the KubeRay operator by writing to the RayCluster
|
||||
custom resource.
|
||||
|
||||
### Scalability
|
||||
The KubeRay operator is more scalable than the legacy Ray Operator. Specifically, the
|
||||
KubeRay operator can simultaneously manage more Ray clusters.
|
||||
|
||||
**Legacy Ray Operator** Each Ray autoscaler consumes nontrivial memory and CPU resources.
|
||||
Since the legacy Ray Operator runs many autoscalers in one pod, it cannot manage many Ray clusters.
|
||||
|
||||
**KubeRay Operator** The KubeRay operator does not run Ray autoscaler processes.
|
||||
Each Ray autoscaler runs as a sidecar to the Ray head. Since managing each Ray cluster is cheap,
|
||||
the KubeRay operator can manage many Ray clusters.
|
||||
|
||||
### Ray version compatibility
|
||||
|
||||
**Legacy Ray Operator**
|
||||
It is recommended to use the same Ray version in the legacy Ray operator
|
||||
as in all of the Ray pods managed by the operator.
|
||||
Matching Ray versions is required to maintain compatibility between autoscaler code
|
||||
running in the operator pod and Ray code running in the Ray cluster.
|
||||
|
||||
**KubeRay Operator**
|
||||
The KubeRay operator is compatible with many Ray versions.
|
||||
Compatibility of KubeRay v0.3.0 with Ray versions 1.11, 1.12, 1.13, and 2.0 is tested explicitly.
|
||||
|
||||
Note however that autoscaling with KubeRay is supported only with Ray versions
|
||||
at least as new as 1.11.0.
|
||||
|
||||
### Configuration details
|
||||
Some details of Ray cluster configuration are different with KubeRay; see the next section
|
||||
for migration notes. Refer to the {ref}`configuration guide<kuberay-config>` for comprehensive
|
||||
information.
|
||||
|
||||
## Migration Notes
|
||||
|
||||
Take note of the following configuration differences when switching to KubeRay
|
||||
deployment.
|
||||
|
||||
### Autoscaling is optional
|
||||
Ray Autoscaler support is optional with KubeRay. Set `spec.enableInTreeAutoscaling:true`
|
||||
in the RayCluster CR to enable autoscaling. The KubeRay operator will then automatically
|
||||
configure a Ray Autoscaler sidecar for the Ray head pod.
|
||||
The autoscaler container requests 500m CPU and 512Mi memory by default.
|
||||
Autoscaler container configuration is accessible via `spec.autoscalerOptions`.
|
||||
Note that autoscaling with KubeRay is supported only with Ray versions at least as new as 1.11.
|
||||
|
||||
### No need to specify /dev/shm
|
||||
The KubeRay operator automatically configures a `/dev/shm` volume for each Ray pod's object store.
|
||||
There is no need to specify this volume in the RayCluster CR.
|
||||
|
||||
### Namespace-scoped operation.
|
||||
Similar to the legacy Ray Operator, it is possible to run the KubeRay operator at single-namespace scope.
|
||||
See the [KubeRay documentation][KubeRaySingleNamespace] for details.
|
||||
|
||||
Note that the KubeRay operator can manage many Ray clusters running at different Ray versions.
|
||||
Thus, from a scalability and compatibility perspective, there is no need to run
|
||||
one KubeRay operator per Kubernetes namespace. Run a namespace-scoped KubeRay operator
|
||||
only if necessary, e.g. to accommodate permissions constraints in your Kubernetes cluster.
|
||||
|
||||
### Specifying resource quantities.
|
||||
Ray pod CPU, GPU, and memory capacities are detected from container resource limits and advertised
|
||||
to Ray.
|
||||
|
||||
The interface for overriding the resource capacities advertised to Ray is different:
|
||||
Resource overrides must be specified in `rayStartParams`.
|
||||
For example, you may wish to prevent the Ray head pod
|
||||
from running Ray workloads by labelling the head as having 0 CPU capacity.
|
||||
To achieve this with KubeRay, include the following in the `headGroupSpec`'s configuration:
|
||||
```yaml
|
||||
rayStartParams:
|
||||
num-cpus: "0"
|
||||
```
|
||||
To advertise custom resource capacities to Ray, one uses the field `rayStartParams.resources`.
|
||||
See the {ref}`configuration guide<rayStartParams>` for details.
|
||||
|
||||
[KuberaySingleNamespace]: https://github.com/ray-project/kuberay#single-namespace-version
|
||||
|
||||
### Ray Version
|
||||
The Ray version (e.g. 2.0.0) should be supplied under the RayCluster CR's `spec.rayVersion`.
|
||||
See the {ref}`configuration guide<kuberay-config-ray-version>` for details.
|
||||
|
||||
### Init Containers and Pre-Stop hooks
|
||||
There are two pieces of configuration that should be included in all KubeRay RayCluster CRs.
|
||||
- Worker pods need an init container that awaits creation of the Ray head service.
|
||||
- Ray containers for the Ray head and worker should include a preStop hook with a `ray stop`
|
||||
command.
|
||||
While future versions of KubeRay may inject this configuration automatically,
|
||||
currently these elements must be included in all RayCluster CRs.
|
||||
See the {ref}`configuration guide<kuberay-config-miscellaneous>` for details.
|
||||
|
||||
## Migration: Example
|
||||
This section presents an example of the migration process.
|
||||
Specifically, we translate a Helm values.yaml configuration for the legacy Ray Operator into
|
||||
an example RayCluster CR for KubeRay.
|
||||
We also recommend taking a look at example RayCluster CRs in the [Ray docs][RayExamples]
|
||||
and in the [KubeRay docs][KubeRayExamples].
|
||||
|
||||
### Legacy Ray Operator values.yaml
|
||||
Here is a `values.yaml` for the legacy Ray Operator's Helm chart which specifies a Ray cluster
|
||||
with the following features
|
||||
- A head pod annotated with a `"CPU":0` override to prevent scheduling Ray workloads on the head.
|
||||
- A CPU worker group annotated with custom resource capacities.
|
||||
- A GPU worker group.
|
||||
```yaml
|
||||
image: rayproject/ray-ml:2.0.0-gpu
|
||||
headPodType: rayHeadType
|
||||
podTypes:
|
||||
rayHeadType:
|
||||
CPU: 14
|
||||
memory: 54Gi
|
||||
# Annotate the head pod as having 0 CPU
|
||||
# to prevent the head pod from scheduling Ray workloads.
|
||||
rayResources: {"CPU": 0}
|
||||
rayCPUWorkerType:
|
||||
# Start with 2 CPU workers. Allow scaling up to 3 CPU workers.
|
||||
minWorkers: 2
|
||||
maxWorkers: 3
|
||||
memory: 54Gi
|
||||
CPU: 14
|
||||
# Annotate the Ray worker pod as having 1 unit of Custom capacity and 5 units of "Custom2" capacity
|
||||
rayResources: {"Custom": 1, "Custom2": 5}
|
||||
rayGPUWorkerType:
|
||||
minWorkers: 0
|
||||
maxWorkers: 5
|
||||
CPU: 3
|
||||
GPU: 1
|
||||
memory: 50Gi
|
||||
|
||||
operatorImage: rayproject/ray:2.0.0
|
||||
```
|
||||
|
||||
### KubeRay RayCluster CR
|
||||
In this section, we show a KubeRay RayCluster CR equivalent to the above legacy Ray Operator Helm configuration.
|
||||
|
||||
```{note}
|
||||
The configuration below is more verbose, as it does not employ Helm.
|
||||
Helm support for KubeRay is in progress; to try it out, read KubeRay's [Helm docs][KubeRayHelm].
|
||||
KubeRay's Helm charts can be found on GitHub [here][KubeRayHelmCode].
|
||||
|
||||
Currently, we recommend directly deploying KubeRay RayCluster CRs without Helm.
|
||||
```
|
||||
|
||||
Here is a [link][ConfigLink] to the configuration shown below.
|
||||
|
||||
```{literalinclude} ../configs/migration-example.yaml
|
||||
:language: yaml
|
||||
```
|
||||
<!-- TODO: fix this -->
|
||||
<!-- [RayExamples]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/kubernetes/configs -->
|
||||
[RayExamples]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/
|
||||
[KubeRayExamples]: https://ray-project.github.io/kuberay/components/operator/#running-an-example-cluster
|
||||
[ConfigLink]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/
|
||||
<!-- [ConfigLink]: https://raw.githubusercontent.com/ray-project/ray/7aeb1ab9cf7adb58fd9418c0e08984ff0fe6d018/doc/source/cluster/ray-clusters-on-kubernetes/configs/migration-example.yaml -->
|
||||
[KubeRayHelm]: https://ray-project.github.io/kuberay/deploy/helm/
|
||||
[KubeRayHelmCode]: https://github.com/ray-project/kuberay/tree/master/helm-chart
|
|
@ -6,7 +6,7 @@ This page provides tips on how to collect logs from
|
|||
Ray clusters running on Kubernetes.
|
||||
|
||||
:::{tip}
|
||||
Skip to {ref}`the deployment instructions<kuberay-logging-tldr>`
|
||||
Skip to {ref}`the deployment instructions <kuberay-logging-tldr>`
|
||||
for a sample configuration showing how to extract logs from a Ray pod.
|
||||
:::
|
||||
|
||||
|
@ -26,7 +26,7 @@ We mention two strategies for collecting logs written to a pod's filesystem,
|
|||
patterns in the [Kubernetes documentation][KubDoc].
|
||||
|
||||
### Sidecar containers
|
||||
We will provide an {ref}`example<kuberay-fluentbit>` of the sidecar strategy in this guide.
|
||||
We will provide an {ref}`example <kuberay-fluentbit>` of the sidecar strategy in this guide.
|
||||
You can process logs by configuring a log-processing sidecar
|
||||
for each Ray pod. Ray containers should be configured to share the `/tmp/ray`
|
||||
directory with the logging sidecar via a volume mount.
|
||||
|
@@ -116,7 +116,7 @@ for a single-pod RayCluster with a log-processing sidecar.
|
|||
Now, we will see how to deploy the configuration described above.
|
||||
|
||||
Deploy the KubeRay Operator if you haven't yet.
|
||||
Refer to the {ref}`Getting Started guide<kuberay-operator-deploy>`
|
||||
Refer to the {ref}`Getting Started guide <kuberay-operator-deploy>`
|
||||
for instructions on this step.
|
||||
|
||||
Now, run the following commands to deploy the Fluent Bit ConfigMap and a single-pod RayCluster with
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _ref-autoscaler-sdk-under-construction:
|
||||
|
||||
Autoscaler SDK
|
||||
==============
|
||||
|
||||
.. _ref-autoscaler-sdk-request-resources-under-construction:
|
||||
|
||||
ray.autoscaler.sdk.request_resources
|
||||
------------------------------------
|
||||
|
||||
Within a Ray program, you can command the autoscaler to scale the cluster up to a desired size with ``request_resources()`` call. The cluster will immediately attempt to scale to accommodate the requested resources, bypassing normal upscaling speed constraints.
|
||||
|
||||
.. autofunction:: ray.autoscaler.sdk.request_resources
|
||||
:noindex:
|
||||
|
||||
TODO: cade@ remove the noindex above.
|
|
@ -1,13 +0,0 @@
|
|||
# API References
|
||||
|
||||
The following pages provide reference documentation for using Ray Clusters. It is applicable for KubeRay users or users of Ray Clusters on VMs.
|
||||
Refer to the {ref}`KubeRay documentation<kuberay-api-reference>` or {ref}`Ray Clusters on VM documentation<ray-clusters-vms-reference>` for APIs specific to those deployment types.
|
||||
|
||||
```{toctree}
|
||||
:caption: "Reference documentation for Ray Clusters:"
|
||||
:maxdepth: '2'
|
||||
:name: ray-clusters-reference
|
||||
|
||||
autoscaler-sdk-api
|
||||
```
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
# Autoscaling
|
||||
:::{warning}
|
||||
This page is under construction!
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
See the {ref}`Kubernetes page on autoscaling<kuberay-autoscaler-discussion>` and the {ref}`VM page on autoscaling<deployment-guide-autoscaler-under-construction>`.
|
||||
:::
|
|
@ -1,9 +1,7 @@
|
|||
.. include:: we_are_hiring.rst
|
||||
|
||||
.. _ref-autoscaler-sdk:
|
||||
|
||||
Autoscaler SDK
|
||||
==============
|
||||
Programmatic Cluster Scaling
|
||||
============================
|
||||
|
||||
.. _ref-autoscaler-sdk-request-resources:
|
||||
|
||||
|
@ -13,3 +11,4 @@ ray.autoscaler.sdk.request_resources
|
|||
Within a Ray program, you can command the autoscaler to scale the cluster up to a desired size with a ``request_resources()`` call. The cluster will immediately attempt to scale to accommodate the requested resources, bypassing normal upscaling speed constraints.
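
For example, here is a minimal sketch of requesting capacity for three 1-CPU bundles; the resource shapes are illustrative only:

.. code-block:: python

    import ray
    from ray.autoscaler.sdk import request_resources

    ray.init(address="auto")

    # Ask the autoscaler to provision capacity for three 1-CPU bundles.
    # A subsequent call to request_resources() overrides this request.
    request_resources(bundles=[{"CPU": 1}] * 3)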
|
||||
|
||||
.. autofunction:: ray.autoscaler.sdk.request_resources
|
||||
:noindex:
|
|
@ -1,15 +1,12 @@
|
|||
# Index
|
||||
:::{warning}
|
||||
This page is under construction!
|
||||
:::
|
||||
|
||||
This section introduces the main differences in running a Ray application on your laptop vs on a Ray Cluster.
|
||||
To get started, check out the [job submissions](jobs-quickstart-under-construction) page.
|
||||
To get started, check out the [job submissions](jobs-quickstart) page.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: '2'
|
||||
|
||||
job-submission/index
|
||||
autoscaling
|
||||
monitoring-and-observability
|
||||
autoscaling/reference
|
||||
```
|
||||
|
|
|
@ -1,20 +1,13 @@
|
|||
(ray-jobs-under-construction)=
|
||||
(jobs-overview)=
|
||||
|
||||
# Ray Jobs
|
||||
:::{warning}
|
||||
This page is under construction!
|
||||
:::
|
||||
|
||||
Once you have deployed a Ray cluster (on [VMs](ref-cluster-quick-start-vms-under-construction) or [Kubernetes](kuberay-quickstart)), you are ready to run a Ray application!
|
||||
Once you have deployed a Ray cluster (on [VMs](vm-cluster-quick-start) or [Kubernetes](kuberay-quickstart)), you are ready to run a Ray application!
|
||||
|
||||
The recommended way to run a Ray application on a Ray Cluster is to use *Ray Jobs*.
|
||||
Ray Jobs allow you to submit locally developed applications to a remote Ray Cluster for execution.
|
||||
It simplifies the experience of packaging, deploying, and managing a Ray application.
|
||||
|
||||
:::{note}
|
||||
This component is in **beta**. APIs may change before becoming stable.
|
||||
:::
|
||||
|
||||
A Ray Job consists of:
|
||||
1. An entrypoint command, like `python my_script.py`.
|
||||
2. A [runtime environment](runtime-environments), which specifies the application's file and package dependencies.
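
For example, here is a hedged sketch of how these two pieces are supplied together when submitting a job with the Python SDK; the address, script name, and pinned package below are placeholders:

```python
from ray.job_submission import JobSubmissionClient

# Address of the Ray cluster's dashboard/job server (placeholder).
client = JobSubmissionClient("http://127.0.0.1:8265")

job_id = client.submit_job(
    # 1. The entrypoint command.
    entrypoint="python my_script.py",
    # 2. The runtime environment: files and package dependencies.
    runtime_env={"working_dir": "./", "pip": ["requests==2.26.0"]},
)
print(job_id)
```
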
|
||||
|
@ -26,11 +19,11 @@ After a Ray Job is submitted, it runs once to completion or failure, regardless
|
|||
Retries or different runs with different parameters should be handled by the submitter.
|
||||
Jobs are bound to the lifetime of a Ray cluster, so if the cluster goes down, all running jobs on that cluster will be terminated.
|
||||
|
||||
To get started with Ray Jobs, check out the [quickstart](jobs-quickstart-under-construction) guide, which walks you through the CLI tools for submitting and interacting with a Ray Job.
|
||||
To get started with Ray Jobs, check out the [quickstart](jobs-quickstart) guide, which walks you through the CLI tools for submitting and interacting with a Ray Job.
|
||||
This is suitable for any client that can communicate over HTTP to the Ray Cluster.
|
||||
If needed, Ray Jobs also provides APIs for [programmatic job submission](ray-job-sdk-under-construction) and [job submission using REST](ray-job-rest-api-under-construction).
|
||||
If needed, Ray Jobs also provides APIs for [programmatic job submission](ray-job-sdk) and [job submission using REST](ray-job-rest-api).
|
||||
|
||||
Finally, if you would like to run an application *interactively* and see the output in real time, you can use [Ray Client](ray-client-under-construction). This tool can be useful during development.
|
||||
Finally, if you would like to run an application *interactively* and see the output in real time, you can use [Ray Client](ray-client-ref). This tool can be useful during development.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: '1'
|
||||
|
|
|
@ -1,18 +1,15 @@
|
|||
.. _ray-job-submission-api-ref-under-construction:
|
||||
|
||||
.. TODO: Remove :noindex: tags below.
|
||||
.. _ray-job-submission-api-ref:
|
||||
|
||||
Ray Job Submission API Reference
|
||||
================================
|
||||
|
||||
For an overview with examples see :ref:`Ray Job Submission<jobs-overview>`.
|
||||
For an overview with examples see :ref:`Ray Jobs <jobs-overview>`.
|
||||
|
||||
.. _ray-job-submission-cli-ref-under-construction:
|
||||
.. _ray-job-submission-cli-ref:
|
||||
|
||||
Job Submission CLI
|
||||
------------------
|
||||
|
||||
.. _ray-job-submit-doc-under-construction:
|
||||
.. _ray-job-submit-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:submit
|
||||
:prog: ray job submit
|
||||
|
@ -23,36 +20,36 @@ Job Submission CLI
|
|||
``ray job submit --working_dir="." -- python script.py`` instead of ``ray job submit --working_dir="." -- "python script.py"``.
|
||||
Otherwise you may encounter the error ``/bin/sh: 1: python script.py: not found``.
|
||||
|
||||
.. _ray-job-status-doc-under-construction:
|
||||
.. _ray-job-status-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:status
|
||||
:prog: ray job status
|
||||
:show-nested:
|
||||
|
||||
.. _ray-job-stop-doc-under-construction:
|
||||
.. _ray-job-stop-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:stop
|
||||
:prog: ray job stop
|
||||
:show-nested:
|
||||
|
||||
.. _ray-job-logs-doc-under-construction:
|
||||
.. _ray-job-logs-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:logs
|
||||
:prog: ray job logs
|
||||
:show-nested:
|
||||
|
||||
.. _ray-job-list-doc-under-construction:
|
||||
.. _ray-job-list-doc:
|
||||
|
||||
.. click:: ray.dashboard.modules.job.cli:list
|
||||
:prog: ray job list
|
||||
:show-nested:
|
||||
|
||||
.. _ray-job-submission-sdk-ref-under-construction:
|
||||
.. _ray-job-submission-sdk-ref:
|
||||
|
||||
Job Submission SDK
|
||||
------------------
|
||||
|
||||
.. _job-submission-client-ref-under-construction:
|
||||
.. _job-submission-client-ref:
|
||||
|
||||
JobSubmissionClient
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -61,7 +58,7 @@ JobSubmissionClient
|
|||
:members:
|
||||
:noindex:
|
||||
|
||||
.. _job-status-ref-under-construction:
|
||||
.. _job-status-ref:
|
||||
|
||||
JobStatus
|
||||
~~~~~~~~~
|
||||
|
@ -70,7 +67,7 @@ JobStatus
|
|||
:members:
|
||||
:noindex:
|
||||
|
||||
.. _job-info-ref-under-construction:
|
||||
.. _job-info-ref:
|
||||
|
||||
JobInfo
|
||||
~~~~~~~
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. _jobs-quickstart-under-construction:
|
||||
.. _jobs-quickstart:
|
||||
|
||||
=================================
|
||||
Quickstart Using the Ray Jobs CLI
|
||||
|
@ -10,7 +7,8 @@ Quickstart Using the Ray Jobs CLI
|
|||
In this guide, we will walk through the Ray Jobs CLIs available for submitting and interacting with a Ray Job.
|
||||
|
||||
.. note::
|
||||
This component is in **beta**. APIs may change before becoming stable.
|
||||
|
||||
The Jobs API is in beta and may change before becoming stable.
|
||||
|
||||
Setup
|
||||
-----
|
||||
|
@ -35,7 +33,7 @@ For convenience, this guide will assume that you are using a local Ray Cluster,
|
|||
|
||||
This will create a Ray head node on our local machine that we can use for development purposes.
|
||||
Note the Ray Dashboard URL that is printed when starting or connecting to a Ray Cluster; we will use this URL later to submit a Ray Job.
|
||||
For more details on production deployment scenarios, check out the guides for deploying Ray on :ref:`VMs <ref-cluster-quick-start-vms-under-construction>` and :ref:`Kubernetes <kuberay-quickstart>`.
|
||||
For more details on production deployment scenarios, check out the guides for deploying Ray on :ref:`VMs <vm-cluster-quick-start>` and :ref:`Kubernetes <kuberay-quickstart>`.
|
||||
|
||||
|
||||
Submitting a Ray Job
|
||||
|
@ -190,7 +188,7 @@ Dependency Management
|
|||
To run a distributed application, we need to make sure that all workers run in the same environment.
|
||||
This can be challenging if multiple applications in the same Ray Cluster have different and conflicting dependencies.
|
||||
|
||||
To avoid dependency conflicts, Ray provides a mechanism called :ref:`runtime environments<runtime-environments>`. Runtime environments allow an application to override the default environment on the Ray Cluster and run in an isolated environment, similar to virtual environments in single-node Python. Dependencies can include both files and Python packages.
|
||||
To avoid dependency conflicts, Ray provides a mechanism called :ref:`runtime environments <runtime-environments>`. Runtime environments allow an application to override the default environment on the Ray Cluster and run in an isolated environment, similar to virtual environments in single-node Python. Dependencies can include both files and Python packages.
|
||||
|
||||
Ray Jobs provides an option to specify the runtime environment when submitting a job. On the Ray Cluster, Ray will then install the runtime environment across the workers and ensure that tasks in that job run in the same environment. To see how this works, we'll use a Python script that prints the current version of the ``requests`` module in a Ray task.
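
A sketch of the kind of script described above is shown below; the actual example used in this guide may differ in details:

.. code-block:: python

    import ray

    @ray.remote
    def get_requests_version() -> str:
        # Import inside the task so the version reflects the worker's environment.
        import requests
        return requests.__version__

    ray.init()
    print("requests version:", ray.get(get_requests_version.remote()))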
|
||||
|
||||
|
@ -258,5 +256,5 @@ Now let's try it with a runtime environment that pins the version of the ``reque
|
|||
# Job 'raysubmit_vGGV4MiP9rYkYUnb' succeeded
|
||||
# ------------------------------------------
|
||||
|
||||
The full API reference for the Ray Jobs CLI can be found :ref:`here <ray-job-submission-api-ref-under-construction>`.
|
||||
For more information on other ways to submit Ray Jobs, check out the guides for :ref:`programmatic job submission <ray-job-sdk-under-construction>` and :ref:`job submission using REST <ray-job-rest-api-under-construction>`.
|
||||
The full API reference for the Ray Jobs CLI can be found :ref:`here <ray-job-submission-api-ref>`.
|
||||
For more information on other ways to submit Ray Jobs, check out the guides for :ref:`programmatic job submission <ray-job-sdk>` and :ref:`job submission using REST <ray-job-rest-api>`.
|
||||
|
|
|
@ -1,9 +1,4 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _ray-client-under-construction:
|
||||
.. _ray-client-ref:
|
||||
|
||||
Ray Client: Interactive Development
|
||||
===================================
|
||||
|
@ -57,11 +52,11 @@ When to use Ray Client
|
|||
|
||||
Ray Client should be used when you want to connect a script or an interactive shell session to a **remote** cluster.
|
||||
|
||||
* Use ``ray.init("ray://<head_node_host>:10001")`` (Ray Client) if you've set up a remote cluster at ``<head_node_host>`` and you want to do interactive work. This will connect your local script or shell to the cluster. See the section on :ref:`using Ray Client<how-do-you-use-the-ray-client>` for more details on setting up your cluster.
|
||||
* Use ``ray.init("ray://<head_node_host>:10001")`` (Ray Client) if you've set up a remote cluster at ``<head_node_host>`` and you want to do interactive work. This will connect your local script or shell to the cluster. See the section on :ref:`using Ray Client <how-do-you-use-the-ray-client>` for more details on setting up your cluster.
|
||||
* Use ``ray.init("localhost:<port>")`` (non-client connection, local address) if you're developing locally or on the head node of your cluster and you have already started the cluster (i.e. ``ray start --head`` has already been run)
|
||||
* Use ``ray.init()`` (non-client connection, no address specified) if you're developing locally and want to automatically create a local cluster and attach directly to it OR if you are using Ray Job submission.
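
For instance, a minimal sketch of the first pattern in the list above, where ``<head_node_host>`` is a placeholder for your head node's address:

.. code-block:: python

    import ray

    # Connect via Ray Client; 10001 is the default Ray Client server port.
    ray.init("ray://<head_node_host>:10001")

    @ray.remote
    def hello() -> str:
        return "hello from the cluster"

    print(ray.get(hello.remote()))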
|
||||
|
||||
.. _how-do-you-use-the-ray-client-under-construction:
|
||||
.. _how-do-you-use-the-ray-client:
|
||||
|
||||
How do you use the Ray Client?
|
||||
------------------------------
|
||||
|
@ -75,7 +70,7 @@ If you have a running Ray cluster (version >= 1.5), Ray Client server is likely
|
|||
|
||||
ray start --head
|
||||
|
||||
To start a Ray cluster remotely, you can follow the directions in :ref:`ref-cluster-quick-start`.
|
||||
To start a Ray cluster remotely, you can follow the directions in :ref:`vm-cluster-quick-start`.
|
||||
|
||||
If necessary, you can modify the Ray Client server port to be other than ``10001``, by specifying ``--ray-client-server-port=...`` to the ``ray start`` :ref:`command <ray-start-doc>`.
|
||||
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. _ray-job-rest-api-under-construction:
|
||||
.. _ray-job-rest-api:
|
||||
|
||||
REST API
|
||||
^^^^^^^^
|
||||
|
|
|
@ -1,15 +1,9 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. _ray-job-sdk-under-construction:
|
||||
.. _ray-job-sdk:
|
||||
|
||||
Python SDK
|
||||
^^^^^^^^^^
|
||||
|
||||
The Job Submission Python SDK is the recommended way to submit jobs programmatically. Jump to the :ref:`API Reference<ray-job-submission-sdk-ref>`, or continue reading for a quick overview.
|
||||
|
||||
.. note::
|
||||
This component is in **beta**. APIs may change before becoming stable.
|
||||
The Job Submission Python SDK is the recommended way to submit jobs programmatically. Jump to the :ref:`API Reference <ray-job-submission-sdk-ref>`, or continue reading for a quick overview.
|
||||
|
||||
Setup
|
||||
-----
|
||||
|
@ -34,7 +28,7 @@ For convenience, this guide will assume that you are using a local Ray Cluster,
|
|||
|
||||
This will create a Ray head node on our local machine that we can use for development purposes.
|
||||
Note the Ray Dashboard URL that is printed when starting or connecting to a Ray Cluster; we will use this URL later to submit a Ray Job.
|
||||
For more details on production deployment scenarios, check out the guides for deploying Ray on :ref:`VMs <ref-cluster-quick-start-vms-under-construction>` and :ref:`Kubernetes <kuberay-quickstart>`.
|
||||
For more details on production deployment scenarios, check out the guides for deploying Ray on :ref:`VMs <vm-cluster-quick-start>` and :ref:`Kubernetes <kuberay-quickstart>`.
|
||||
|
||||
Submitting a Ray Job
|
||||
--------------------
|
||||
|
@ -123,7 +117,7 @@ To get information about all jobs, call ``client.list_jobs()``. This returns a
|
|||
Dependency Management
|
||||
---------------------
|
||||
|
||||
Similar to the :ref:`Jobs CLI <jobs-quickstart-under-construction>`, we can also package our application's dependencies by using a Ray :ref:`runtime environment <runtime-environments>`.
|
||||
Similar to the :ref:`Jobs CLI <jobs-quickstart>`, we can also package our application's dependencies by using a Ray :ref:`runtime environment <runtime-environments>`.
|
||||
Using the Python SDK, the syntax looks something like this:
|
||||
|
||||
.. code-block:: python
|
||||
|
@ -143,4 +137,4 @@ Using the Python SDK, the syntax looks something like this:
|
|||
Instead of a local directory (``"./"`` in this example), you can also specify remote URIs for your job's working directory, such as S3 buckets or Git repositories. See :ref:`remote-uris` for details.
|
||||
|
||||
|
||||
For full details, see the :ref:`API Reference<ray-job-submission-sdk-ref>`.
|
||||
For full details, see the :ref:`API Reference <ray-job-submission-sdk-ref>`.
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
Monitoring and observability
|
||||
----------------------------
|
||||
Cluster Monitoring
|
||||
------------------
|
||||
|
||||
Ray ships with the following observability features:
|
||||
|
||||
|
@ -9,7 +7,7 @@ Ray ships with the following observability features:
|
|||
2. CLI tools such as the :ref:`Ray state APIs <state-api-overview-ref>` and :ref:`ray status <monitor-cluster>`, for checking application and cluster status.
|
||||
3. :ref:`Prometheus metrics <multi-node-metrics>` for internal and custom user-defined metrics.
|
||||
|
||||
For more information on these tools, check out the more comprehensive :ref:`Observability guide<observability>`.
|
||||
For more information on these tools, check out the more comprehensive :ref:`Observability guide <observability>`.
|
||||
|
||||
The rest of this page will focus on how to access these services when running a Ray Cluster.
|
||||
|
||||
|
@ -18,7 +16,7 @@ Monitoring the cluster via the dashboard
|
|||
|
||||
:ref:`The dashboard <ray-dashboard>` provides detailed information about the state of the cluster,
|
||||
including the running jobs, actors, workers, nodes, etc.
|
||||
By default, the :ref:`cluster launcher <ref-cluster-quick-start-vms-under-construction>` and :ref:`KubeRay operator <kuberay-quickstart>` will launch the dashboard, but will
|
||||
By default, the :ref:`cluster launcher <vm-cluster-quick-start>` and :ref:`KubeRay operator <kuberay-quickstart>` will launch the dashboard, but will
|
||||
not publicly expose the port.
|
||||
|
||||
.. tabbed:: If using the VM cluster launcher
|
||||
|
@ -104,12 +102,12 @@ Auto-discovering metrics endpoints
|
|||
##################################
|
||||
|
||||
You can allow Prometheus to dynamically find endpoints it should scrape by using Prometheus' `file based service discovery <https://prometheus.io/docs/guides/file-sd/#installing-configuring-and-running-prometheus>`_.
|
||||
This is the recommended way to export Prometheus metrics when using the Ray :ref:`cluster launcher <ref-cluster-quick-start-vms-under-construction>`, as node IP addresses can often change as the cluster scales up and down.
|
||||
This is the recommended way to export Prometheus metrics when using the Ray :ref:`cluster launcher <vm-cluster-quick-start>`, as node IP addresses can often change as the cluster scales up and down.
|
||||
|
||||
Ray auto-generates a Prometheus `service discovery file <https://prometheus.io/docs/guides/file-sd/#installing-configuring-and-running-prometheus>`_ on the head node to facilitate metrics agents' service discovery.
|
||||
This allows you to scrape all metrics in the cluster without knowing their IPs. Let's walk through how to achieve this.
|
||||
|
||||
The service discovery file is generated on the :ref:`head node <cluster-head-node-under-construction>`. On this node, look for ``/tmp/ray/prom_metrics_service_discovery.json`` (or the equivalent file if using a custom Ray ``temp_dir``).
|
||||
The service discovery file is generated on the :ref:`head node <cluster-head-node>`. On this node, look for ``/tmp/ray/prom_metrics_service_discovery.json`` (or the equivalent file if using a custom Ray ``temp_dir``).
|
||||
Ray will periodically update this file with the addresses of all metrics agents in the cluster.
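
As a quick sanity check, you can inspect the file's contents; it follows Prometheus' standard ``file_sd`` format (a list of entries with ``targets`` and ``labels``). The snippet below is a sketch, assuming the default ``temp_dir``:

.. code-block:: python

    import json

    # Service discovery file written by Ray on the head node (default temp_dir).
    with open("/tmp/ray/prom_metrics_service_discovery.json") as f:
        service_discovery = json.load(f)

    for entry in service_discovery:
        print(entry.get("targets"))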
|
||||
|
||||
Now, on the same node, modify a Prometheus config to scrape the file for service discovery.
|
||||
|
|
|
@ -15,7 +15,7 @@ Here are the guiding principles of our collection policy:
|
|||
- We will **not** collect any personally identifiable data or proprietary code/data
|
||||
- We will **not** sell data or buy data about you.
|
||||
|
||||
You will always be able to :ref:`disable the usage stats collection<usage-disable>`.
|
||||
You will always be able to :ref:`disable the usage stats collection <usage-disable>`.
|
||||
|
||||
For more context, please refer to this `RFC <https://github.com/ray-project/ray/issues/20857>`_.
|
||||
|
||||
|
@ -23,7 +23,7 @@ What data is collected?
|
|||
-----------------------
|
||||
|
||||
We collect non-sensitive data that helps us understand how Ray is used (e.g., which Ray libraries are used).
|
||||
**Personally identifiable data will never be collected.** Please check :ref:`UsageStatsToReport <ray-usage-stats-data-ref>` to see the data we collect.
|
||||
**Personally identifiable data will never be collected.** Please check the UsageStatsToReport class to see the data we collect.
|
||||
|
||||
.. _usage-disable:
|
||||
|
|
@ -1,4 +1,14 @@
|
|||
(vm-cluster-examples)=
|
||||
|
||||
# Examples
|
||||
:::{warning}
|
||||
This page is under construction!
|
||||
|
||||
:::{note}
|
||||
To learn the basics of Ray on Cloud VMs, we recommend taking a look
|
||||
at the {ref}`introductory guide <vm-cluster-quick-start>` first.
|
||||
:::
|
||||
|
||||
This section presents example Ray workloads to try out on your cloud cluster.
|
||||
|
||||
More examples will be added in the future. Running the distributed XGBoost example below is a
|
||||
great way to start experimenting with production Ray workloads in the cloud.
|
||||
- {ref}`clusters-vm-ml-example`
|
||||
|
|
|
@ -4,23 +4,16 @@
|
|||
|
||||
:::{note}
|
||||
To learn the basics of Ray on VMs, we recommend taking a look
|
||||
at the {ref}`introductory guide<ref-cluster-quick-start-vms-under-construction>` first.
|
||||
at the {ref}`introductory guide <vm-cluster-quick-start>` first.
|
||||
:::
|
||||
|
||||
|
||||
In this guide, we show you how to run a sample Ray machine learning
|
||||
workload on AWS. Similar steps can be used to deploy on GCP or Azure as well.
|
||||
|
||||
We will run Ray's {ref}`XGBoost training benchmark<xgboost-benchmark>` with a 100 gigabyte training set.
|
||||
We will run Ray's {ref}`XGBoost training benchmark <xgboost-benchmark>` with a 100 gigabyte training set.
|
||||
To learn more about using Ray's XGBoostTrainer, check out {ref}`the XGBoostTrainer documentation <train-gbdt-guide>`.
|
||||
|
||||
```{admonition} Optional: Autoscaling
|
||||
This guide includes notes on how to deploy the XGBoost benchmark with optional Ray Autoscaler support.
|
||||
In this guide's example, we know that we need 1 Ray head and 9 Ray workers,
|
||||
so autoscaling is not strictly required. Read {ref}`this discussion<autoscaler-pro-con>` for guidance
|
||||
on whether to use autoscaling.
|
||||
```
|
||||
|
||||
## VM cluster setup
|
||||
|
||||
For the workload in this guide, it is recommended to use the following setup:
|
||||
|
@ -57,7 +50,7 @@ A Ray head node and 9 Ray worker nodes will be created.
|
|||
|
||||
## Run the workload
|
||||
|
||||
We will use {ref}`Ray Job Submission<jobs-overview>` to kick off the workload.
|
||||
We will use {ref}`Ray Job Submission <jobs-overview>` to kick off the workload.
|
||||
|
||||
### Connect to the cluster
|
||||
|
||||
|
@ -70,7 +63,7 @@ This will forward remote port 8265 to port 8265 on localhost.
|
|||
|
||||
### Submit the workload
|
||||
|
||||
We'll use the {ref}`Ray Job Python SDK<ray-job-sdk>` to submit the XGBoost workload.
|
||||
We'll use the {ref}`Ray Job Python SDK <ray-job-sdk>` to submit the XGBoost workload.
|
||||
|
||||
```{literalinclude} /cluster/doc_code/xgboost_submit.py
|
||||
:language: python
|
||||
|
@ -121,12 +114,12 @@ Results: {'training_time': 1338.488839321999, 'prediction_time': 403.36653568099
|
|||
```
|
||||
|
||||
The performance of the benchmark is sensitive to the underlying cloud infrastructure --
|
||||
you might not match {ref}`the numbers quoted in the benchmark docs<xgboost-benchmark>`.
|
||||
you might not match {ref}`the numbers quoted in the benchmark docs <xgboost-benchmark>`.
|
||||
|
||||
#### Model parameters
|
||||
The file `model.json` in the Ray head node contains the parameters for the trained model.
|
||||
Other result data will be available in the directory `ray_results` in the head node.
|
||||
Refer to the {ref}`XGBoostTrainer documentation<train-gbdt-guide>` for details.
|
||||
Refer to the {ref}`XGBoostTrainer documentation <train-gbdt-guide>` for details.
|
||||
|
||||
```{admonition} Scale-down
|
||||
If autoscaling is enabled, Ray worker nodes will scale down after the specified idle timeout.
|
||||
|
|
|
@ -1,14 +1,9 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. include:: /_includes/clusters/announcement.rst
|
||||
|
||||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
.. _vm-cluster-quick-start:
|
||||
|
||||
.. _ref-cluster-quick-start-vms-under-construction:
|
||||
|
||||
Ray Clusters Quick Start
|
||||
========================
|
||||
Getting Started
|
||||
===============
|
||||
|
||||
This quick start demonstrates the capabilities of the Ray cluster. Using the Ray cluster, we'll take a sample application designed to run on a laptop and scale it up in the cloud. Ray will launch clusters and scale Python with just a few commands.
|
||||
|
||||
|
@ -89,7 +84,7 @@ We will write a simple Python application that tracks the IP addresses of the ma
|
|||
|
||||
Save this application as ``script.py`` and execute it by running the command ``python script.py``. The application should take 10 seconds to run and output something similar to ``Counter({'127.0.0.1': 10000})``.
|
||||
|
||||
With some small changes, we can make this application run on Ray (for more information on how to do this, refer to :ref:`the Ray Core Walkthrough<core-walkthrough>`):
|
||||
With some small changes, we can make this application run on Ray (for more information on how to do this, refer to :ref:`the Ray Core Walkthrough <core-walkthrough>`):
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -239,7 +234,7 @@ You can also optionally get a remote shell using ``ray attach`` and run commands
|
|||
|
||||
For a full reference on the Ray Cluster CLI tools, please refer to :ref:`the cluster commands reference <cluster-commands>`.
|
||||
|
||||
While these tools are useful for ad-hoc execution on the Ray Cluster, the recommended way to execute an application on a Ray Cluster is to use :ref:`Ray Jobs <ray-jobs-under-construction>`. Check out the :ref:`quickstart guide <jobs-quickstart-under-construction>` to get started!
|
||||
While these tools are useful for ad-hoc execution on the Ray Cluster, the recommended way to execute an application on a Ray Cluster is to use :ref:`Ray Jobs <jobs-quickstart>`. Check out the :ref:`quickstart guide <jobs-quickstart>` to get started!
|
||||
|
||||
Deleting a Ray Cluster
|
||||
----------------------
|
||||
|
|
|
@ -1,4 +1,71 @@
|
|||
# Index
|
||||
:::{warning}
|
||||
This page is under construction!
|
||||
:::
|
||||
# Ray on Cloud VMs
|
||||
(cloud-vm-index)=
|
||||
|
||||
## Overview
|
||||
|
||||
In this section we cover how to launch Ray clusters on Cloud VMs. Ray ships with built-in support
|
||||
for launching AWS and GCP clusters, and also has community-maintained integrations for Azure and Aliyun.
|
||||
Each Ray cluster consists of a head node and a collection of worker nodes. Optional
|
||||
[autoscaling](vms-autoscaling) support allows the Ray cluster to be sized according to the
|
||||
requirements of your Ray workload, adding and removing worker nodes as needed. Ray supports
|
||||
clusters composed of multiple heterogeneous compute nodes (including GPU nodes).
|
||||
|
||||
Concretely, you will learn how to:
|
||||
|
||||
- Set up and configure Ray in public clouds
|
||||
- Deploy applications and monitor your cluster
|
||||
|
||||
## Learn More
|
||||
|
||||
The Ray docs present all the information you need to start running Ray workloads on VMs.
|
||||
|
||||
```{eval-rst}
|
||||
.. panels::
|
||||
:container: text-center
|
||||
:column: col-lg-12 p-2
|
||||
:card:
|
||||
|
||||
**Getting Started**
|
||||
^^^
|
||||
|
||||
Learn how to start a Ray cluster and deploy Ray applications in the cloud.
|
||||
|
||||
+++
|
||||
.. link-button:: vm-cluster-quick-start
|
||||
:type: ref
|
||||
:text: Get Started with Ray on Cloud VMs
|
||||
:classes: btn-outline-info btn-block
|
||||
---
|
||||
**Examples**
|
||||
^^^
|
||||
|
||||
Try example Ray workloads in the Cloud
|
||||
|
||||
+++
|
||||
.. link-button:: vm-cluster-examples
|
||||
:type: ref
|
||||
:text: Try example workloads
|
||||
:classes: btn-outline-info btn-block
|
||||
---
|
||||
**User Guides**
|
||||
^^^
|
||||
|
||||
Learn best practices for configuring cloud clusters
|
||||
|
||||
+++
|
||||
.. link-button:: vm-cluster-guides
|
||||
:type: ref
|
||||
:text: Read the User Guides
|
||||
:classes: btn-outline-info btn-block
|
||||
---
|
||||
**API Reference**
|
||||
^^^
|
||||
|
||||
Find API references for cloud clusters
|
||||
|
||||
+++
|
||||
.. link-button:: vm-cluster-api-references
|
||||
:type: ref
|
||||
:text: Check API references
|
||||
:classes: btn-outline-info btn-block
|
||||
```
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
(vm-cluster-api-references)=
|
||||
|
||||
# API References
|
||||
|
||||
The following pages provide reference documentation for using Ray Clusters on virtual machines.
|
||||
For reference documentation that is common to both Ray Clusters on VMs and Kubernetes, see {ref}`Ray Clusters References<ray-clusters-reference>`.
|
||||
|
||||
```{toctree}
|
||||
:caption: "Reference documentation for Ray Clusters on VMs:"
|
||||
|
|
|
@ -1,9 +1,4 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _cluster-commands-under-construction:
|
||||
.. _cluster-commands:
|
||||
|
||||
Cluster Launcher Commands
|
||||
=========================
|
||||
|
@ -160,7 +155,7 @@ run ``ray attach --help``.
|
|||
# Attach to tmux session on cluster (creates a new one if none available)
|
||||
$ ray attach cluster.yaml --tmux
|
||||
|
||||
.. _ray-rsync-under-construction:
|
||||
.. _ray-rsync:
|
||||
|
||||
Synchronizing files from the cluster (``ray rsync-up/down``)
|
||||
------------------------------------------------------------
|
||||
|
@ -173,7 +168,7 @@ To download or upload files to the cluster head node, use ``ray rsync_down`` or
|
|||
$ ray rsync_down cluster.yaml '/path/on/cluster' '/local/path'
|
||||
$ ray rsync_up cluster.yaml '/local/path' '/path/on/cluster'
|
||||
|
||||
.. _monitor-cluster-under-construction:
|
||||
.. _monitor-cluster:
|
||||
|
||||
Monitoring cluster status (``ray dashboard/status``)
|
||||
-----------------------------------------------------
|
||||
|
|
|
@ -1,9 +1,4 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _cluster-config-under-construction:
|
||||
.. _cluster-config:
|
||||
|
||||
Cluster YAML Configuration Options
|
||||
==================================
|
||||
|
@ -52,7 +47,7 @@ Syntax
|
|||
Custom types
|
||||
------------
|
||||
|
||||
.. _cluster-configuration-docker-type-under-construction:
|
||||
.. _cluster-configuration-docker-type:
|
||||
|
||||
Docker
|
||||
~~~~~~
|
||||
|
@ -72,7 +67,7 @@ Docker
|
|||
:ref:`disable_automatic_runtime_detection <cluster-configuration-disable-automatic-runtime-detection>`: bool
|
||||
:ref:`disable_shm_size_detection <cluster-configuration-disable-shm-size-detection>`: bool
|
||||
|
||||
.. _cluster-configuration-auth-type-under-construction:
|
||||
.. _cluster-configuration-auth-type:
|
||||
|
||||
Auth
|
||||
~~~~
|
||||
|
@ -99,7 +94,7 @@ Auth
|
|||
:ref:`ssh_user <cluster-configuration-ssh-user>`: str
|
||||
:ref:`ssh_private_key <cluster-configuration-ssh-private-key>`: str
|
||||
|
||||
.. _cluster-configuration-provider-type-under-construction:
|
||||
.. _cluster-configuration-provider-type:
|
||||
|
||||
Provider
|
||||
~~~~~~~~
|
||||
|
@ -135,7 +130,7 @@ Provider
|
|||
:ref:`project_id <cluster-configuration-project-id>`: str
|
||||
:ref:`cache_stopped_nodes <cluster-configuration-cache-stopped-nodes>`: bool
|
||||
|
||||
.. _cluster-configuration-security-group-type-under-construction:
|
||||
.. _cluster-configuration-security-group-type:
|
||||
|
||||
Security Group
|
||||
~~~~~~~~~~~~~~
|
||||
|
@ -148,14 +143,14 @@ Security Group
|
|||
:ref:`IpPermissions <cluster-configuration-ip-permissions>`:
|
||||
- `IpPermission <https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_IpPermission.html>`_
|
||||
|
||||
.. _cluster-configuration-node-types-type-under-construction:
|
||||
.. _cluster-configuration-node-types-type:
|
||||
|
||||
Node types
|
||||
~~~~~~~~~~
|
||||
|
||||
The ``available_nodes_types`` object's keys represent the names of the different node types.
|
||||
|
||||
Deleting a node type from ``available_node_types`` and updating with :ref:`ray up<ray-up-doc>` will cause the autoscaler to scale down all nodes of that type.
|
||||
Deleting a node type from ``available_node_types`` and updating with :ref:`ray up <ray-up-doc>` will cause the autoscaler to scale down all nodes of that type.
|
||||
In particular, changing the key of a node type object will
|
||||
result in removal of nodes corresponding to the old key; nodes with the new key name will then be
|
||||
created according to cluster configuration and Ray resource demands.
|
||||
|
@ -176,14 +171,14 @@ created according to cluster configuration and Ray resource demands.
|
|||
...
|
||||
...
|
||||
|
||||
.. _cluster-configuration-node-config-type-under-construction:
|
||||
.. _cluster-configuration-node-config-type:
|
||||
|
||||
Node config
|
||||
~~~~~~~~~~~
|
||||
|
||||
Cloud-specific configuration for nodes of a given node type.
|
||||
|
||||
Modifying the ``node_config`` and updating with :ref:`ray up<ray-up-doc>` will cause the autoscaler to scale down all existing nodes of the node type;
|
||||
Modifying the ``node_config`` and updating with :ref:`ray up <ray-up-doc>` will cause the autoscaler to scale down all existing nodes of the node type;
|
||||
nodes with the newly applied ``node_config`` will then be created according to cluster configuration and Ray resource demands.
|
||||
|
||||
.. tabbed:: AWS
|
||||
|
@ -198,7 +193,7 @@ nodes with the newly applied ``node_config`` will then be created according to c
|
|||
|
||||
A YAML object as defined in `the GCP docs <https://cloud.google.com/compute/docs/reference/rest/v1/instances>`_.
|
||||
|
||||
.. _cluster-configuration-node-docker-type-under-construction:
|
||||
.. _cluster-configuration-node-docker-type:
|
||||
|
||||
Node Docker
|
||||
~~~~~~~~~~~
|
||||
|
@ -212,7 +207,7 @@ Node Docker
|
|||
:ref:`disable_automatic_runtime_detection <cluster-configuration-disable-automatic-runtime-detection>`: bool
|
||||
:ref:`disable_shm_size_detection <cluster-configuration-disable-shm-size-detection>`: bool
|
||||
|
||||
.. _cluster-configuration-resources-type-under-construction:
|
||||
.. _cluster-configuration-resources-type:
|
||||
|
||||
Resources
|
||||
~~~~~~~~~
|
||||
|
@ -227,7 +222,7 @@ Resources
|
|||
<custom_resource2>: int
|
||||
...
|
||||
|
||||
.. _cluster-configuration-file-mounts-type-under-construction:
|
||||
.. _cluster-configuration-file-mounts-type:
|
||||
|
||||
File mounts
|
||||
~~~~~~~~~~~
|
||||
|
@ -240,7 +235,7 @@ File mounts
|
|||
Properties and Definitions
|
||||
--------------------------
|
||||
|
||||
.. _cluster-configuration-cluster-name-under-construction:
|
||||
.. _cluster-configuration-cluster-name:
|
||||
|
||||
``cluster_name``
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
@ -253,7 +248,7 @@ The name of the cluster. This is the namespace of the cluster.
|
|||
* **Default:** "default"
|
||||
* **Pattern:** ``[a-zA-Z0-9_]+``
|
||||
|
||||
.. _cluster-configuration-max-workers-under-construction:
|
||||
.. _cluster-configuration-max-workers:
|
||||
|
||||
``max_workers``
|
||||
~~~~~~~~~~~~~~~
|
||||
|
@ -267,7 +262,7 @@ The maximum number of workers the cluster will have at any given time.
|
|||
* **Minimum:** ``0``
|
||||
* **Maximum:** Unbounded
|
||||
|
||||
.. _cluster-configuration-upscaling-speed-under-construction:
|
||||
.. _cluster-configuration-upscaling-speed:
|
||||
|
||||
``upscaling_speed``
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -281,7 +276,7 @@ The number of nodes allowed to be pending as a multiple of the current number of
|
|||
* **Minimum:** ``0.0``
|
||||
* **Maximum:** Unbounded
|
||||
|
||||
.. _cluster-configuration-idle-timeout-minutes-under-construction:
|
||||
.. _cluster-configuration-idle-timeout-minutes:
|
||||
|
||||
``idle_timeout_minutes``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -295,7 +290,7 @@ The number of minutes that need to pass before an idle worker node is removed by
|
|||
* **Minimum:** ``0``
|
||||
* **Maximum:** Unbounded
|
||||
|
||||
.. _cluster-configuration-docker-under-construction:
|
||||
.. _cluster-configuration-docker:
|
||||
|
||||
``docker``
|
||||
~~~~~~~~~~
|
||||
|
@ -317,7 +312,7 @@ In rare cases when Docker is not available on the system by default (e.g., bad A
|
|||
- sudo usermod -aG docker $USER
|
||||
- sudo systemctl restart docker -f
|
||||
|
||||
.. _cluster-configuration-provider-under-construction:
|
||||
.. _cluster-configuration-provider:
|
||||
|
||||
``provider``
|
||||
~~~~~~~~~~~~
|
||||
|
@ -328,7 +323,7 @@ The cloud provider-specific configuration properties.
|
|||
* **Importance:** High
|
||||
* **Type:** :ref:`Provider <cluster-configuration-provider-type>`
|
||||
|
||||
.. _cluster-configuration-auth-under-construction:
|
||||
.. _cluster-configuration-auth:
|
||||
|
||||
``auth``
|
||||
~~~~~~~~
|
||||
|
@ -339,7 +334,7 @@ Authentication credentials that Ray will use to launch nodes.
|
|||
* **Importance:** High
|
||||
* **Type:** :ref:`Auth <cluster-configuration-auth-type>`
|
||||
|
||||
.. _cluster-configuration-available-node-types-under-construction:
|
||||
.. _cluster-configuration-available-node-types:
|
||||
|
||||
``available_node_types``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -373,14 +368,14 @@ Each node type is identified by a user-specified key.
|
|||
resources: {"CPU": 2}
|
||||
min_workers: 0
|
||||
|
||||
.. _cluster-configuration-head-node-type-under-construction:
|
||||
.. _cluster-configuration-head-node-type:
|
||||
|
||||
``head_node_type``
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The key for one of the node types in :ref:`available_node_types <cluster-configuration-available-node-types>`. This node type will be used to launch the head node.
|
||||
|
||||
If the field ``head_node_type`` is changed and an update is executed with :ref:`ray up<ray-up-doc>`, the currently running head node will
|
||||
If the field ``head_node_type`` is changed and an update is executed with :ref:`ray up <ray-up-doc>`, the currently running head node will
|
||||
be considered outdated. The user will receive a prompt asking to confirm scale-down of the outdated head node, and the cluster will restart with a new
|
||||
head node. Changing the :ref:`node_config<cluster-configuration-node-config>` of the :ref:`node_type<cluster-configuration-node-types-type>` with key ``head_node_type`` will also result in cluster restart after a user prompt.
|
||||
|
||||
|
@ -391,7 +386,7 @@ head node. Changing the :ref:`node_config<cluster-configuration-node-config>` of
|
|||
* **Type:** String
|
||||
* **Pattern:** ``[a-zA-Z0-9_]+``
|
||||
|
||||
.. _cluster-configuration-file-mounts-under-construction:
|
||||
.. _cluster-configuration-file-mounts:
|
||||
|
||||
``file_mounts``
|
||||
~~~~~~~~~~~~~~~
|
||||
|
@ -403,7 +398,7 @@ The files or directories to copy to the head and worker nodes.
|
|||
* **Type:** :ref:`File mounts <cluster-configuration-file-mounts-type>`
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-cluster-synced-files-under-construction:
|
||||
.. _cluster-configuration-cluster-synced-files:
|
||||
|
||||
``cluster_synced_files``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -415,7 +410,7 @@ A list of paths to the files or directories to copy from the head node to the wo
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-rsync-exclude-under-construction:
|
||||
.. _cluster-configuration-rsync-exclude:
|
||||
|
||||
``rsync_exclude``
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
@ -429,7 +424,7 @@ Example for a pattern in the list: ``**/.git/**``.
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-rsync-filter-under-construction:
|
||||
.. _cluster-configuration-rsync-filter:
|
||||
|
||||
``rsync_filter``
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
@ -443,7 +438,7 @@ Example for a pattern in the list: ``.gitignore``.
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-initialization-commands-under-construction:
|
||||
.. _cluster-configuration-initialization-commands:
|
||||
|
||||
``initialization_commands``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -455,7 +450,7 @@ A list of commands that will be run before the :ref:`setup commands <cluster-con
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-setup-commands-under-construction:
|
||||
.. _cluster-configuration-setup-commands:
|
||||
|
||||
``setup_commands``
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
@ -491,7 +486,7 @@ A list of commands to run to set up nodes. These commands will always run on the
|
|||
- sudo pkill -9 dpkg || true
|
||||
- sudo dpkg --configure -a
|
||||
|
||||
.. _cluster-configuration-head-setup-commands-under-construction:
|
||||
.. _cluster-configuration-head-setup-commands:
|
||||
|
||||
``head_setup_commands``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -503,7 +498,7 @@ A list of commands to run to set up the head node. These commands will be merged
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-worker-setup-commands-under-construction:
|
||||
.. _cluster-configuration-worker-setup-commands:
|
||||
|
||||
``worker_setup_commands``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -515,7 +510,7 @@ A list of commands to run to set up the worker nodes. These commands will be mer
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-head-start-ray-commands-under-construction:
|
||||
.. _cluster-configuration-head-start-ray-commands:
|
||||
|
||||
``head_start_ray_commands``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -535,7 +530,7 @@ Commands to start ray on the head node. You don't need to change this.
|
|||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
.. _cluster-configuration-worker-start-ray-commands-under-construction:
|
||||
.. _cluster-configuration-worker-start-ray-commands:
|
||||
|
||||
``worker_start_ray_commands``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -555,7 +550,7 @@ Command to start ray on worker nodes. You don't need to change this.
|
|||
- ray stop
|
||||
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
|
||||
.. _cluster-configuration-image-under-construction:
|
||||
.. _cluster-configuration-image:
|
||||
|
||||
``docker.image``
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
@ -573,7 +568,7 @@ The Ray project provides Docker images on `DockerHub <https://hub.docker.com/u/r
|
|||
* ``rayproject/ray-ml:latest``: No CUDA support, includes ML dependencies.
|
||||
* ``rayproject/ray:latest``: No CUDA support, no ML dependencies.
|
||||
|
||||
.. _cluster-configuration-head-image-under-construction:
|
||||
.. _cluster-configuration-head-image:
|
||||
|
||||
``docker.head_image``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -583,7 +578,7 @@ Docker image for the head node to override the default :ref:`docker image <clust
|
|||
* **Importance:** Low
|
||||
* **Type:** String
|
||||
|
||||
.. _cluster-configuration-worker-image-under-construction:
|
||||
.. _cluster-configuration-worker-image:
|
||||
|
||||
``docker.worker_image``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -593,7 +588,7 @@ Docker image for the worker nodes to override the default :ref:`docker image <cl
|
|||
* **Importance:** Low
|
||||
* **Type:** String
|
||||
|
||||
.. _cluster-configuration-container-name-under-construction:
|
||||
.. _cluster-configuration-container-name:
|
||||
|
||||
``docker.container_name``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -605,7 +600,7 @@ The name to use when starting the Docker container.
|
|||
* **Type:** String
|
||||
* **Default:** ray_container
|
||||
|
||||
.. _cluster-configuration-pull-before-run-under-construction:
|
||||
.. _cluster-configuration-pull-before-run:
|
||||
|
||||
``docker.pull_before_run``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -617,7 +612,7 @@ If enabled, the latest version of image will be pulled when starting Docker. If
|
|||
* **Type:** Boolean
|
||||
* **Default:** ``True``
|
||||
|
||||
.. _cluster-configuration-run-options-under-construction:
|
||||
.. _cluster-configuration-run-options:
|
||||
|
||||
``docker.run_options``
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -629,7 +624,7 @@ The extra options to pass to ``docker run``.
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-head-run-options-under-construction:
|
||||
.. _cluster-configuration-head-run-options:
|
||||
|
||||
``docker.head_run_options``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -641,7 +636,7 @@ The extra options to pass to ``docker run`` for head node only.
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-worker-run-options-under-construction:
|
||||
.. _cluster-configuration-worker-run-options:
|
||||
|
||||
``docker.worker_run_options``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -653,7 +648,7 @@ The extra options to pass to ``docker run`` for worker nodes only.
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-disable-automatic-runtime-detection-under-construction:
|
||||
.. _cluster-configuration-disable-automatic-runtime-detection:
|
||||
|
||||
``docker.disable_automatic_runtime_detection``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -666,7 +661,7 @@ If enabled, Ray will not try to use the NVIDIA Container Runtime if GPUs are pre
|
|||
* **Default:** ``False``
|
||||
|
||||
|
||||
.. _cluster-configuration-disable-shm-size-detection-under-construction:
|
||||
.. _cluster-configuration-disable-shm-size-detection:
|
||||
|
||||
``docker.disable_shm_size_detection``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -680,7 +675,7 @@ If ``--shm-size=<>`` is manually added to ``run_options``, this is *automaticall
|
|||
* **Default:** ``False``
|
||||
|
||||
|
||||
.. _cluster-configuration-ssh-user-under-construction:
|
||||
.. _cluster-configuration-ssh-user:
|
||||
|
||||
``auth.ssh_user``
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
@ -691,7 +686,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
* **Importance:** High
|
||||
* **Type:** String
|
||||
|
||||
.. _cluster-configuration-ssh-private-key-under-construction:
|
||||
.. _cluster-configuration-ssh-private-key:
|
||||
|
||||
``auth.ssh_private_key``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -722,7 +717,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
* **Importance:** Low
|
||||
* **Type:** String
|
||||
|
||||
.. _cluster-configuration-ssh-public-key-under-construction:
|
||||
.. _cluster-configuration-ssh-public-key:
|
||||
|
||||
``auth.ssh_public_key``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -743,7 +738,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
|
||||
Not available.
|
||||
|
||||
.. _cluster-configuration-type-under-construction:
|
||||
.. _cluster-configuration-type:
|
||||
|
||||
``provider.type``
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
@ -772,7 +767,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
* **Importance:** High
|
||||
* **Type:** String
|
||||
|
||||
.. _cluster-configuration-region-under-construction:
|
||||
.. _cluster-configuration-region:
|
||||
|
||||
``provider.region``
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -799,7 +794,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
* **Type:** String
|
||||
* **Default:** us-west1
|
||||
|
||||
.. _cluster-configuration-availability-zone-under-construction:
|
||||
.. _cluster-configuration-availability-zone:
|
||||
|
||||
``provider.availability_zone``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -828,7 +823,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
* **Type:** String
|
||||
* **Default:** us-west1-a
|
||||
|
||||
.. _cluster-configuration-location-under-construction:
|
||||
.. _cluster-configuration-location:
|
||||
|
||||
``provider.location``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -850,7 +845,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
|
||||
Not available.
|
||||
|
||||
.. _cluster-configuration-resource-group-under-construction:
|
||||
.. _cluster-configuration-resource-group:
|
||||
|
||||
``provider.resource_group``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -872,7 +867,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
|
||||
Not available.
|
||||
|
||||
.. _cluster-configuration-subscription-id-under-construction:
|
||||
.. _cluster-configuration-subscription-id:
|
||||
|
||||
``provider.subscription_id``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -894,7 +889,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
|
||||
Not available.
|
||||
|
||||
.. _cluster-configuration-project-id-under-construction:
|
||||
.. _cluster-configuration-project-id:
|
||||
|
||||
``provider.project_id``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -916,7 +911,7 @@ The user that Ray will authenticate with when launching new nodes.
|
|||
* **Type:** String
|
||||
* **Default:** ``null``
|
||||
|
||||
.. _cluster-configuration-cache-stopped-nodes-under-construction:
|
||||
.. _cluster-configuration-cache-stopped-nodes:
|
||||
|
||||
``provider.cache_stopped_nodes``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -929,7 +924,7 @@ If enabled, nodes will be *stopped* when the cluster scales down. If disabled, n
|
|||
* **Type:** Boolean
|
||||
* **Default:** ``True``
|
||||
|
||||
.. _cluster-configuration-security-group-under-construction:
|
||||
.. _cluster-configuration-security-group:
|
||||
|
||||
``provider.security_group``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -951,7 +946,7 @@ If enabled, nodes will be *stopped* when the cluster scales down. If disabled, n
|
|||
Not available.
|
||||
|
||||
|
||||
.. _cluster-configuration-group-name-under-construction:
|
||||
.. _cluster-configuration-group-name:
|
||||
|
||||
``security_group.GroupName``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -963,7 +958,7 @@ The name of the security group. This name must be unique within the VPC.
|
|||
* **Type:** String
|
||||
* **Default:** ``"ray-autoscaler-{cluster-name}"``
|
||||
|
||||
.. _cluster-configuration-ip-permissions-under-construction:
|
||||
.. _cluster-configuration-ip-permissions:
|
||||
|
||||
``security_group.IpPermissions``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -974,7 +969,7 @@ The inbound rules associated with the security group.
|
|||
* **Importance:** Medium
|
||||
* **Type:** `IpPermission <https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_IpPermission.html>`_
|
||||
|
||||
.. _cluster-configuration-node-config-under-construction:
|
||||
.. _cluster-configuration-node-config:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.node_config``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -985,7 +980,7 @@ The configuration to be used to launch the nodes on the cloud service provider.
|
|||
* **Importance:** High
|
||||
* **Type:** :ref:`Node config <cluster-configuration-node-config-type>`
|
||||
|
||||
.. _cluster-configuration-resources-under-construction:
|
||||
.. _cluster-configuration-resources:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.resources``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -999,7 +994,7 @@ The resources that a node type provides, which enables the autoscaler to automat
|
|||
|
||||
In some cases, it may be desirable to add special nodes without any resources. Such a node can be used as a driver that connects to the cluster to launch jobs. In order to manually add a node to an autoscaled cluster, the *ray-cluster-name* tag should be set and the *ray-node-type* tag should be set to unmanaged. Unmanaged nodes can be created by setting the resources to ``{}`` and the :ref:`maximum workers <cluster-configuration-node-max-workers>` to 0. The autoscaler will not attempt to start, stop, or update unmanaged nodes. The user is responsible for properly setting up and cleaning up unmanaged nodes.
|
||||
|
||||
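To illustrate how these logical resources are used (a sketch, not part of the configuration schema above): a task or actor that requests a resource declared by a node type will only be scheduled on, and will trigger upscaling of, nodes of that type. The ``"custom"`` resource name below is hypothetical.

.. code-block:: python

    # A minimal sketch, assuming a node type whose resources include
    # {"CPU": 2, "custom": 1}; the "custom" resource name is hypothetical.
    import ray

    ray.init(address="auto")

    @ray.remote(num_cpus=1, resources={"custom": 1})
    def f():
        return "scheduled on a node that advertises the 'custom' resource"

    # If no such node is running yet, this pending demand is what the
    # autoscaler reacts to when deciding which node type to launch.
    print(ray.get(f.remote()))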
.. _cluster-configuration-node-min-workers-under-construction:
|
||||
.. _cluster-configuration-node-min-workers:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.min_workers``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -1013,7 +1008,7 @@ The minimum number of workers to maintain for this node type regardless of utili
|
|||
* **Minimum:** ``0``
|
||||
* **Maximum:** Unbounded
|
||||
|
||||
.. _cluster-configuration-node-max-workers-under-construction:
|
||||
.. _cluster-configuration-node-max-workers:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.max_workers``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -1029,7 +1024,7 @@ Note, for the nodes of type ``head_node_type`` the default number of max workers
|
|||
* **Minimum:** ``0``
|
||||
* **Maximum:** cluster-wide :ref:`max_workers <cluster-configuration-max-workers>`
|
||||
|
||||
.. _cluster-configuration-node-type-worker-setup-commands-under-construction:
|
||||
.. _cluster-configuration-node-type-worker-setup-commands:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.worker_setup_commands``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -1041,7 +1036,7 @@ A list of commands to run to set up worker nodes of this type. These commands wi
|
|||
* **Type:** List of String
|
||||
* **Default:** ``[]``
|
||||
|
||||
.. _cluster-configuration-cpu-under-construction:
|
||||
.. _cluster-configuration-cpu:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.resources.CPU``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -1071,7 +1066,7 @@ A list of commands to run to set up worker nodes of this type. These commands wi
|
|||
* **Type:** Integer
|
||||
|
||||
|
||||
.. _cluster-configuration-gpu-under-construction:
|
||||
.. _cluster-configuration-gpu:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.resources.GPU``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -1100,7 +1095,7 @@ A list of commands to run to set up worker nodes of this type. These commands wi
|
|||
* **Importance:** High
|
||||
* **Type:** Integer
|
||||
|
||||
.. _cluster-configuration-memory-under-construction:
|
||||
.. _cluster-configuration-memory:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.resources.memory``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -1129,7 +1124,7 @@ A list of commands to run to set up worker nodes of this type. These commands wi
|
|||
* **Importance:** High
|
||||
* **Type:** Integer
|
||||
|
||||
.. _cluster-configuration-object-store-memory-under-construction:
|
||||
.. _cluster-configuration-object-store-memory:
|
||||
|
||||
``available_node_types.<node_type_name>.node_type.resources.object-store-memory``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -1158,7 +1153,7 @@ A list of commands to run to set up worker nodes of this type. These commands wi
|
|||
* **Importance:** High
|
||||
* **Type:** Integer
|
||||
|
||||
.. _cluster-configuration-node-docker-under-construction:
|
||||
.. _cluster-configuration-node-docker:
|
||||
|
||||
``available_node_types.<node_type_name>.docker``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
|
@ -1,13 +1,11 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _ref-cluster-setup-under-construction:
|
||||
.. _ref-cluster-setup:
|
||||
|
||||
Community Supported Cluster Managers
|
||||
====================================
|
||||
|
||||
.. note::
|
||||
|
||||
If you're using AWS, Azure or GCP you can use the :ref:`Ray cluster launcher <cluster-cloud>` to simplify the cluster setup process.
|
||||
If you're using AWS, Azure, or GCP, you can use the :ref:`Ray cluster launcher <cluster-index>` to simplify the cluster setup process.
|
||||
|
||||
The following is a list of community-supported cluster managers.
|
||||
|
||||
|
@ -18,7 +16,7 @@ The following is a list of community supported cluster managers.
|
|||
slurm.rst
|
||||
lsf.rst
|
||||
|
||||
.. _ref-additional-cloud-providers-under-construction:
|
||||
.. _ref-additional-cloud-providers:
|
||||
|
||||
Using a custom cloud or cluster manager
|
||||
=======================================
|
||||
|
|
|
@ -1,11 +1,9 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _ray-LSF-deploy-under-construction:
|
||||
.. _ray-LSF-deploy:
|
||||
|
||||
Deploying on LSF
|
||||
================
|
||||
|
||||
This document describes a couple high-level steps to run ray cluster on LSF.
|
||||
This document describes a couple of high-level steps to run Ray clusters on LSF.
|
||||
|
||||
1) Obtain the desired nodes from the LSF scheduler using bsub directives.
|
||||
2) Obtain free ports on the desired nodes to start Ray services like the dashboard, GCS, etc.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
.. _slurm-basic-under-construction:
|
||||
.. _slurm-basic:
|
||||
|
||||
slurm-basic.sh
|
||||
~~~~~~~~~~~~~~
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
.. _slurm-launch-under-construction:
|
||||
.. _slurm-launch:
|
||||
|
||||
slurm-launch.py
|
||||
~~~~~~~~~~~~~~~
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
.. _slurm-template-under-construction:
|
||||
.. _slurm-template:
|
||||
|
||||
slurm-template.sh
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _ray-slurm-deploy-under-construction:
|
||||
.. _ray-slurm-deploy:
|
||||
|
||||
Deploying on Slurm
|
||||
==================
|
||||
|
@ -14,7 +12,7 @@ Slurm usage with Ray can be a little bit unintuitive.
|
|||
|
||||
SLURM support is still a work in progress. SLURM users should be aware
|
||||
of current limitations regarding networking.
|
||||
See :ref:`here <slurm-network-ray-under-construction>` for more explanations.
|
||||
See :ref:`here <slurm-network-ray>` for more explanations.
|
||||
|
||||
SLURM support is community-maintained. Maintainer GitHub handle: tupui.
|
||||
|
||||
|
@ -40,9 +38,9 @@ The below walkthrough will do the following:
|
|||
5. Launch Ray processes on the (n-1) worker nodes and connect them to the head node by providing the head node address.
|
||||
6. After the underlying Ray cluster is ready, submit the user-specified task.
|
||||
|
||||
See :ref:`slurm-basic.sh <slurm-basic-under-construction>` for an end-to-end example.
|
||||
See :ref:`slurm-basic.sh <slurm-basic>` for an end-to-end example.
|
||||
|
||||
.. _ray-slurm-headers-under-construction:
|
||||
.. _ray-slurm-headers:
|
||||
|
||||
sbatch directives
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
@ -149,7 +147,7 @@ Finally, you can invoke your Python script:
|
|||
:language: bash
|
||||
:start-after: __doc_script_start__
|
||||
|
||||
.. _slurm-network-ray-under-construction:
|
||||
.. _slurm-network-ray:
|
||||
|
||||
SLURM networking caveats
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -224,8 +222,8 @@ set an internal IP
|
|||
Python-interface SLURM scripts
|
||||
------------------------------
|
||||
|
||||
[Contributed by @pengzhenghao] Below, we provide a helper utility (:ref:`slurm-launch.py <slurm-launch-under-construction>`) to auto-generate SLURM scripts and launch.
|
||||
``slurm-launch.py`` uses an underlying template (:ref:`slurm-template.sh <slurm-template-under-construction>`) and fills out placeholders given user input.
|
||||
[Contributed by @pengzhenghao] Below, we provide a helper utility (:ref:`slurm-launch.py <slurm-launch>`) to auto-generate SLURM scripts and launch them.
|
||||
``slurm-launch.py`` uses an underlying template (:ref:`slurm-template.sh <slurm-template>`) and fills out placeholders given user input.
|
||||
|
||||
Feel free to copy both files into your cluster for use, and to open PRs with contributions to improve this script!
|
||||
|
||||
|
@ -255,12 +253,12 @@ There are other options you can use when calling ``python slurm-launch.py``:
|
|||
* ``--partition`` (``-p``): The partition you wish to use. Default: "", will use user's default partition.
|
||||
* ``--load-env``: The command to setup your environment. For example: ``module load cuda/10.1``. Default: "".
|
||||
|
||||
Note that the :ref:`slurm-template.sh <slurm-template-under-construction>` is compatible with both IPV4 and IPV6 ip address of the computing nodes.
|
||||
Note that :ref:`slurm-template.sh <slurm-template>` is compatible with both IPv4 and IPv6 addresses of the compute nodes.
|
||||
|
||||
Implementation
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
Concretely, the (:ref:`slurm-launch.py <slurm-launch-under-construction>`) does the following things:
|
||||
Concretely, :ref:`slurm-launch.py <slurm-launch>` does the following things:
|
||||
|
||||
1. It automatically writes your requirements, e.g., the number of CPUs, GPUs per node, the number of nodes, and so on, to an sbatch script named ``{exp-name}_{date}-{time}.sh``. Your command (``--command``) to launch your own job is also written into the sbatch script.
|
||||
2. It then submits the sbatch script to the SLURM manager via a new process, as sketched below.
|
||||
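The following is a rough sketch of those two steps, not the actual ``slurm-launch.py``; the template placeholder names used here are hypothetical, and the real template defines its own markers.

.. code-block:: python

    import subprocess
    from datetime import datetime

    # Step 1: fill placeholders in the template with the user's requirements.
    with open("slurm-template.sh") as f:
        template = f.read()
    script = (
        template.replace("{{JOB_NAME}}", "my-exp")
        .replace("{{NUM_NODES}}", "2")
        .replace("{{COMMAND}}", "python my_script.py")
    )

    filename = "my-exp_{}.sh".format(datetime.now().strftime("%Y%m%d-%H%M"))
    with open(filename, "w") as f:
        f.write(script)

    # Step 2: submit the generated sbatch script in a new process.
    subprocess.run(["sbatch", filename], check=True)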
|
|
|
@ -1,6 +1,4 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _ray-yarn-deploy-under-construction:
|
||||
.. _ray-yarn-deploy:
|
||||
|
||||
Deploying on YARN
|
||||
=================
|
||||
|
|
|
@ -1,51 +1,56 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
.. _deployment-guide-autoscaler-under-construction:
|
||||
.. _vms-autoscaling:
|
||||
|
||||
Configuring Autoscaling
|
||||
=======================
|
||||
|
||||
This guide explains how to configure the Ray autoscaler. The Ray autoscaler adjusts
|
||||
the number of nodes in the cluster based on the resources required by tasks, actors or
|
||||
placement groups.
|
||||
This guide explains how to configure the Ray autoscaler using the Ray cluster launcher.
|
||||
The Ray autoscaler is a Ray cluster process that automatically scales a cluster up and down based on resource demand.
|
||||
The autoscaler does this by adjusting the number of nodes in the cluster based on the resources required by tasks, actors or placement groups.
|
||||
|
||||
Note that the autoscaler only considers logical resource requests for scaling (i.e., those specified
|
||||
in ``@ray.remote`` and displayed in ``ray status``), not machine utilization.
|
||||
If a user tries to launch an actor, task, or placement group but there are insufficient resources, the request will be queued.
|
||||
The autoscaler adds nodes to satisfy resource demands in the queue, and removes nodes when they become idle.
|
||||
Note that the autoscaler only considers logical resource requests for scaling (i.e., those specified in ``@ray.remote`` and displayed in ``ray status``), not physical machine utilization. If a user tries to launch an actor, task, or placement group but there are insufficient resources, the request will be queued. The autoscaler adds nodes to satisfy resource demands in this queue.
|
||||
The autoscaler also removes nodes after they become idle for some time.
|
||||
A node is considered idle if it has no active tasks, actors, or objects.
|
||||
|
||||
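As a concrete illustration of this demand-driven behavior, the following sketch (assuming a running autoscaling cluster reachable at the default address) submits more tasks than the current nodes can run at once; the excess requests are queued, and the autoscaler launches workers to drain the queue.

.. code-block:: python

    import time

    import ray

    ray.init(address="auto")  # connect to the head node of the cluster

    @ray.remote(num_cpus=2)
    def busy():
        time.sleep(60)
        return "done"

    # Each call logically requests 2 CPUs. Requests that cannot be satisfied
    # are queued, the autoscaler adds worker nodes to satisfy the queue, and
    # idle nodes are removed again after idle_timeout_minutes.
    results = ray.get([busy.remote() for _ in range(50)])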
Parameters
|
||||
==========
|
||||
.. tip::
|
||||
**When to use Autoscaling?**
|
||||
|
||||
The following are the autoscaling parameters that are specified with cluster launch. They can also be modified at runtime by
|
||||
updating the cluster config.
|
||||
Autoscaling can reduce workload costs, but adds node launch overheads and can be tricky to configure.
|
||||
We recommend starting with non-autoscaling clusters if you're new to Ray.
|
||||
|
||||
**max_workers[default_value=2, min_value=0]**: Specify the max number of cluster worker nodes. Note that this excludes the head node.
|
||||
Cluster Config Parameters
|
||||
-------------------------
|
||||
|
||||
The following options are available in your cluster config file.
|
||||
It is recommended that you set these before launching your cluster, but you can also modify them at runtime by updating the cluster config.
|
||||
|
||||
`max_workers[default_value=2, min_value=0]`: The max number of cluster worker nodes to launch. Note that this does not include the head node.
|
||||
|
||||
`min_workers[default_value=0, min_value=0]`: The min number of cluster worker nodes to launch, regardless of utilization. Note that this does not include the head node. This number must be less than the ``max_workers``.
|
||||
|
||||
.. note::
|
||||
|
||||
If this value is modified at runtime, the autoscaler will immediately remove nodes until this constraint
|
||||
If `max_workers` is modified at runtime, the autoscaler will immediately remove nodes until this constraint
|
||||
is satisfied. This may disrupt running workloads.
|
||||
|
||||
**upscaling_speed[default_value=1.0, min_value=1.0]**: The number of nodes allowed to be pending as a multiple of the current number of nodes.
|
||||
For example, if this is set to 1.0, the cluster can grow in size by at most 100% at any time, so if the cluster currently has 20 nodes, at most 20 pending
|
||||
If you are using more than one node type, you can also set min and max workers for each individual type:
|
||||
|
||||
`available_node_types.<node_type_name>.max_workers[default_value=cluster max_workers, min_value=0]`: The maximum number of worker nodes of a given type to launch. This number must be less than or equal to the `max_workers` for the cluster.
|
||||
|
||||
|
||||
`available_node_types.<node_type_name>.min_workers[default_value=0, min_value=0]`: The minimum number of worker nodes of a given type to launch, regardless of utilization. The sum of `min_workers` across all node types must be less than or equal to the `max_workers` for the cluster.
|
||||
|
||||
Upscaling and downscaling speed
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If needed, you can also control the rate at which nodes should be added to or removed from the cluster. For applications with many short-lived tasks, you may wish to adjust the upscaling and downscaling speed to be more conservative.
|
||||
|
||||
`upscaling_speed[default_value=1.0, min_value=1.0]`: The number of nodes allowed to be pending as a multiple of the current number of nodes. The higher the value, the more aggressive upscaling will be. For example, if this is set to 1.0, the cluster can grow in size by at most 100% at any time, so if the cluster currently has 20 nodes, at most 20 pending
|
||||
launches are allowed. The minimum number of pending launches is 5 regardless of this setting.
|
||||
|
||||
**idle_timeout_minutes[default_value=5, min_value=0]**: The number of minutes that need to pass before an idle worker node is removed by the
|
||||
autoscaler. Worker nodes are idle when they hold no active tasks, actors, or referenced objects (either in-memory or spilled to disk). This parameter does not affect the head node.
|
||||
`idle_timeout_minutes[default_value=5, min_value=0]`: The number of minutes that need to pass before an idle worker node is removed by the
|
||||
autoscaler. The smaller the value, the more aggressive downscaling will be. Worker nodes are considered idle when they hold no active tasks, actors, or referenced objects (either in-memory or spilled to disk). This parameter does not affect the head node.
|
||||
|
||||
**available_node_types.<node_type_name>.min_workers[default_value=0, min_value=0]**: The minimum number of worker nodes of a given type to launch. If this number
|
||||
is set to greater than zero, the autoscaler will maintain the number of nodes regardless of utilization. The sum of the min worker of all the node types
|
||||
must be less than or equal to the max_workers for the cluster.
|
||||
Programmatic Scaling
|
||||
--------------------
|
||||
|
||||
**available_node_types.<node_type_name>.max_workers[default_value=cluster max_workers, min_value=0]**: The maximum number of worker nodes of a given type to launch. This must be
|
||||
greater than or equal to available_node_types.<node_type_name>.min_workers. It must be less than or equal to the max_workers for the cluster.
|
||||
|
||||
.. note::
|
||||
If this value is modified at runtime, the autoscaler will immediately remove nodes until this constraint
|
||||
is satisfied. This may disrupt running workloads.
|
||||
|
||||
Autoscaler SDK
|
||||
==============
|
||||
|
||||
For more information on programmatic access to the autoscaler, see :ref:`Autoscaler SDK<ref-autoscaler-sdk-under-construction>`.
|
||||
For more information on programmatic access to the autoscaler, see the :ref:`Programmatic Cluster Scaling Guide <ref-autoscaler-sdk>`.
|
||||
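As a brief illustration of that programmatic access (a sketch run from a driver connected to the cluster), the autoscaler SDK can request a resource floor directly, independent of task and actor demand:

.. code-block:: python

    import ray
    from ray.autoscaler.sdk import request_resources

    ray.init(address="auto")

    # Ask the autoscaler to scale to at least 16 CPUs right away.
    request_resources(num_cpus=16)

    # Clearing the request lets the cluster scale back down when idle.
    request_resources(num_cpus=0)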
|
|
|
@ -1,4 +1,15 @@
|
|||
(vm-cluster-guides)=
|
||||
|
||||
# User Guides
|
||||
:::{warning}
|
||||
This page is under construction!
|
||||
|
||||
:::{note}
|
||||
To learn the basics of Ray on Cloud VMs, we recommend taking a look
|
||||
at the {ref}`introductory guide <vm-cluster-quick-start>` first.
|
||||
:::
|
||||
|
||||
In these guides, we go into further depth on several topics related to
|
||||
deployments of Ray on Cloud VMs.
|
||||
* {ref}`launching-vm-clusters`
|
||||
* {ref}`vms-large-cluster`
|
||||
* {ref}`vms-autoscaling`
|
||||
* {ref}`ref-cluster-setup`
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
.. _vms-large-cluster:
|
||||
|
||||
Best practices for deploying large clusters
|
||||
-------------------------------------------
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
|
||||
Monitor Ray using Amazon CloudWatch
|
||||
===================================
|
||||
|
||||
|
@ -237,4 +235,4 @@ You can find Ray Prometheus metrics in the ``{cluster_name}-ray-prometheus`` met
|
|||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You can apply changes to the CloudWatch Logs, Metrics, Dashboard, and Alarms for your cluster by simply modifying the CloudWatch config files referenced by your Ray cluster config YAML and re-running ``ray up example-cloudwatch.yaml``.
|
||||
The Unified CloudWatch Agent will be automatically restarted on all cluster nodes, and your config changes will be applied.
|
||||
The Unified CloudWatch Agent will be automatically restarted on all cluster nodes, and your config changes will be applied.
|
||||
|
|
|
@ -67,6 +67,8 @@ If you want to learn more about the Ray cluster launcher, see this blog post for
|
|||
|
||||
## AWS Configurations
|
||||
|
||||
(aws-cluster-efs)=
|
||||
|
||||
### Using Amazon EFS
|
||||
|
||||
To utilize Amazon EFS in the Ray cluster, you will need to install some additional utilities and mount the EFS in `setup_commands`. Note that these instructions only work if you are using the Ray cluster launcher on AWS.
|
||||
|
@ -92,6 +94,8 @@ setup_commands:
|
|||
sudo chmod 777 efs;
|
||||
```
|
||||
|
||||
(aws-cluster-s3)=
|
||||
|
||||
### Accessing S3
|
||||
|
||||
In various scenarios, worker nodes may need write access to an S3 bucket, e.g., Ray Tune has an option to write checkpoints to S3 instead of syncing them directly back to the driver.
|
||||
|
@ -123,4 +127,4 @@ secret_key ****************YYYY iam-role
|
|||
region <not set> None None
|
||||
```
|
||||
|
||||
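As a quick sanity check (a sketch, not part of the original instructions; `my-example-bucket` is a placeholder for a bucket you own), you can confirm from inside a Ray task that a worker node can write to S3:

```python
import boto3
import ray

ray.init(address="auto")

@ray.remote
def check_s3_write(bucket: str) -> str:
    s3 = boto3.client("s3")
    s3.put_object(Bucket=bucket, Key="ray-write-test.txt", Body=b"ok")
    return f"wrote ray-write-test.txt to {bucket}"

print(ray.get(check_s3_write.remote("my-example-bucket")))
```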
Please refer to this [discussion](https://github.com/ray-project/ray/issues/9327) for more details on ???.
|
||||
Please refer to this [discussion](https://github.com/ray-project/ray/issues/9327) for more details on ???.
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
.. warning::
|
||||
This page is under construction!
|
||||
|
||||
.. include:: /_includes/clusters/we_are_hiring.rst
|
||||
.. _launching-vm-clusters:
|
||||
|
||||
Launching Ray Clusters
|
||||
======================
|
||||
|
@ -18,4 +15,4 @@ Table of Contents
|
|||
aws-cloud-watch.rst
|
||||
gcp.md
|
||||
azure.md
|
||||
on-premises.md
|
||||
on-premises.md
|
||||
|
|
|
@ -127,14 +127,7 @@ def mock_modules():
|
|||
|
||||
# Add doc files from external repositories to be downloaded during build here
|
||||
# (repo, ref, path to get, path to save on disk)
|
||||
EXTERNAL_MARKDOWN_FILES = [
|
||||
(
|
||||
"ray-project/ray_lightning",
|
||||
"6aed848f757a03c03166c1a9bddfeea5153e7b90",
|
||||
"README.md",
|
||||
"ray-more-libs/ray-lightning.md",
|
||||
),
|
||||
]
|
||||
EXTERNAL_MARKDOWN_FILES = []
|
||||
|
||||
|
||||
class DownloadAndPreprocessEcosystemDocs:
|
||||
|
|
|
@ -78,13 +78,13 @@ Here's an example:
|
|||
`Dask.distributed <https://distributed.dask.org/en/latest/quickstart.html>`__
|
||||
client; simply use plain Dask and its collections, and pass ``ray_dask_get``
|
||||
to ``.compute()`` calls, set the scheduler in one of the other ways detailed `here <https://docs.dask.org/en/latest/scheduling.html#configuration>`__, or use our ``enable_dask_on_ray`` configuration helper. Follow the instructions for
|
||||
:ref:`using Ray on a cluster <using-ray-on-a-cluster>` to modify the
|
||||
:ref:`using Ray on a cluster <cluster-index>` to modify the
|
||||
``ray.init()`` call.
|
||||
|
||||
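For reference, here is a small self-contained sketch of the two ways to select the Ray scheduler described above (assuming ``dask`` and ``pandas`` are installed alongside Ray):

.. code-block:: python

    import dask.dataframe as dd
    import pandas as pd

    import ray
    from ray.util.dask import enable_dask_on_ray, ray_dask_get

    ray.init()  # on a cluster, use ray.init(address="auto") instead

    df = dd.from_pandas(pd.DataFrame({"a": range(100)}), npartitions=4)

    # Option 1: pass the Ray scheduler explicitly to .compute().
    print(df.a.sum().compute(scheduler=ray_dask_get))

    # Option 2: set it globally with the configuration helper.
    enable_dask_on_ray()
    print(df.a.mean().compute())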
Why use Dask on Ray?
|
||||
|
||||
1. To take advantage of Ray-specific features such as the
|
||||
:ref:`launching cloud clusters <cluster-cloud>` and
|
||||
:ref:`launching cloud clusters <cluster-index>` and
|
||||
:ref:`shared-memory store <memory>`.
|
||||
2. If you'd like to use Dask and Ray libraries in the same application without having two different clusters.
|
||||
3. If you'd like to create data analyses using the familiar NumPy and Pandas APIs provided by Dask and execute them on a fast, fault-tolerant distributed task execution system geared towards production, like Ray.
|
||||
|
|
|
@ -20,7 +20,7 @@ and are compatible with a variety of file formats, data sources, and distributed
|
|||
Here's an overview of the integrations with other processing frameworks, file formats, and supported operations,
|
||||
as well as a glimpse at the Ray Datasets API.
|
||||
|
||||
Check our :ref:`compatibility matrix<data-compatibility>` to see if your favorite format
|
||||
Check our :ref:`compatibility matrix <data-compatibility>` to see if your favorite format
|
||||
is already supported.
|
||||
|
||||
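As a brief glimpse at the API (a sketch; the exact row format returned by ``take`` may differ between Ray versions):

.. code-block:: python

    import ray

    # Create a small dataset, transform it in parallel, and consume it.
    ds = ray.data.range(1000)
    doubled = ds.map(lambda x: x * 2)
    print(doubled.take(5))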
.. image:: images/dataset.svg
|
||||
|
@ -85,7 +85,7 @@ Advanced users can refer directly to the Ray Datasets :ref:`API reference <data-
|
|||
^^^
|
||||
|
||||
Understand the key concepts behind Ray Datasets.
|
||||
Learn what :ref:`Datasets<dataset_concept>` and :ref:`Dataset Pipelines<dataset_pipeline_concept>` are
|
||||
Learn what :ref:`Datasets <dataset_concept>` and :ref:`Dataset Pipelines <dataset_pipeline_concept>` are
|
||||
and how they get executed in Ray Datasets.
|
||||
|
||||
+++
|
||||
|
@ -98,11 +98,11 @@ Advanced users can refer directly to the Ray Datasets :ref:`API reference <data-
|
|||
**User Guides**
|
||||
^^^
|
||||
|
||||
Learn how to :ref:`create datasets<creating_datasets>`, :ref:`save
|
||||
datasets<saving_datasets>`, :ref:`transform datasets<transforming_datasets>`,
|
||||
:ref:`access and exchange datasets<consuming_datasets>`, :ref:`pipeline
|
||||
transformations<pipelining_datasets>`, :ref:`load and process data for ML<datasets-ml-preprocessing>`,
|
||||
work with :ref:`tensor data<datasets_tensor_support>`, or :ref:`use pipelines<data_pipeline_usage>`.
|
||||
Learn how to :ref:`create datasets <creating_datasets>`, :ref:`save
|
||||
datasets <saving_datasets>`, :ref:`transform datasets <transforming_datasets>`,
|
||||
:ref:`access and exchange datasets <consuming_datasets>`, :ref:`pipeline
|
||||
transformations <pipelining_datasets>`, :ref:`load and process data for ML <datasets-ml-preprocessing>`,
|
||||
work with :ref:`tensor data <datasets_tensor_support>`, or :ref:`use pipelines <data_pipeline_usage>`.
|
||||
|
||||
+++
|
||||
.. link-button:: data_user_guide
|
||||
|
|
|
@ -45,7 +45,7 @@ will be able to connect to and use the Ray cluster.
|
|||
Modin with the Ray Client
|
||||
-------------------------
|
||||
|
||||
When using Modin with the :ref:`Ray Client <ray-client>`, it is important to ensure that the
|
||||
When using Modin with the :ref:`Ray Client <ray-client-ref>`, it is important to ensure that the
|
||||
cluster has all dependencies installed.
|
||||
|
||||
.. code-block:: python
|
||||
|
|
|
@ -66,15 +66,15 @@ You can often [parallelize](ray-core/walkthrough.rst) single machine code with l
|
|||
**Deploy large-scale workloads with**\
|
||||
**<img src="ray-overview/images/ray_svg_logo.svg" alt="ray" width="50px">Clusters**
|
||||
^^^
|
||||
With a Ray cluster you can deploy your workloads on [AWS, GCP, Azure](cluster-deprecated/quickstart) or
|
||||
[on premise](cluster/cloud.html#cluster-private-setup).
|
||||
You can also use [Ray Cluster Managers](cluster-deprecated/deploy) to run Ray on your existing
|
||||
[Kubernetes](cluster-deprecated/kubernetes),
|
||||
[YARN](cluster-deprecated/yarn),
|
||||
or [Slurm](cluster-deprecated/slurm) clusters.
|
||||
With a Ray cluster you can deploy your workloads on [AWS, GCP, Azure](cluster/getting-started) or
|
||||
[on premises](cluster/vms/user-guides/launching-clusters/on-premises).
|
||||
You can also use Ray cluster managers to run Ray on your existing
|
||||
[Kubernetes](cluster/kubernetes/index),
|
||||
[YARN](cluster/vms/user-guides/community/yarn),
|
||||
or [Slurm](cluster/vms/user-guides/community/slurm) clusters.
|
||||
+++
|
||||
|
||||
```{link-button} cluster-deprecated/quickstart
|
||||
```{link-button} cluster/getting-started
|
||||
:type: ref
|
||||
:text: Get Started
|
||||
:classes: btn-outline-info btn-block
|
||||
|
|
|
@ -33,7 +33,7 @@ This scenario describes most usages of Ray libraries today.
|
|||
In the above diagram:
|
||||
|
||||
* Only one library is used -- showing that you can pick and choose and do not need to replace all of your ML infrastructure to use Ray AIR.
|
||||
* You can use one of :ref:`Ray's many deployment modes <ref-deployment-guide>` to launch and manage Ray clusters and Ray applications.
|
||||
* You can use one of :ref:`Ray's many deployment modes <jobs-overview>` to launch and manage Ray clusters and Ray applications.
|
||||
* AIR libraries can read data from external storage systems such as Amazon S3 / Google Cloud Storage, as well as store results there.
|
||||
|
||||
|
||||
|
|
|
@ -388,7 +388,7 @@
|
|||
"id": "1df4faa9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To scale your training script, create a [Ray Cluster](deployment-guide) and increase the number of workers. If your cluster contains GPUs, add `\"use_gpu\": True` to your scaling config.\n",
|
||||
"To scale your training script, create a [Ray Cluster](cluster-index) and increase the number of workers. If your cluster contains GPUs, add `\"use_gpu\": True` to your scaling config.\n",
|
||||
"\n",
|
||||
"```{code-block} python\n",
|
||||
"scaling_config=ScalingConfig(num_workers=8, \"use_gpu=True)\n",
|
||||
|
|
|
@ -16,14 +16,11 @@ Datasets
|
|||
|
||||
|
||||
Preprocessors
|
||||
~~~~~~~~~~~~~
|
||||
-------------
|
||||
|
||||
Preprocessors are primitives that can be used to transform input data into features.
|
||||
|
||||
A preprocessor can be fitted during Training, and applied at runtime in both Training and Serving on data batches in the same way. AIR comes with a collection of built-in preprocessors, and you can also define your own with simple templates.
|
||||
|
||||
Preprocessors operate on :ref:`Datasets <datasets>`, which makes them scalable and compatible with a variety of datasources and dataframe libraries.
|
||||
Preprocessors are primitives that can be used to transform input data into features. Preprocessors operate on :ref:`Datasets <datasets>`, which makes them scalable and compatible with a variety of data sources and dataframe libraries.
|
||||
|
||||
A Preprocessor is fitted during Training, and applied at runtime in both Training and Serving on data batches in the same way. AIR comes with a collection of built-in preprocessors, and you can also define your own with simple templates.
|
||||
|
||||
.. literalinclude:: doc_code/air_key_concepts.py
|
||||
:language: python
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
.. _air-preprocessors:
|
||||
|
||||
Using preprocessors
|
||||
Using Preprocessors
|
||||
===================
|
||||
|
||||
Data preprocessing is a common technique for transforming raw data into features for a machine learning model.
|
||||
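As a short sketch of that fit/transform pattern (using the built-in ``StandardScaler`` from ``ray.data.preprocessors`` as an example; the column name is arbitrary):

.. code-block:: python

    import pandas as pd

    import ray
    from ray.data.preprocessors import StandardScaler

    ds = ray.data.from_pandas(pd.DataFrame({"value": [1.0, 2.0, 3.0, 4.0]}))

    # Fit on the training dataset, then apply the same transform at runtime.
    scaler = StandardScaler(columns=["value"])
    scaler.fit(ds)
    transformed = scaler.transform(ds)
    print(transformed.take(2))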
|
|
|
@ -1,499 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "79724cfa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# XGBoost-Ray with Dask\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"This notebook includes an example workflow using\n",
|
||||
"[XGBoost-Ray](https://github.com/ray-project/xgboost_ray) and\n",
|
||||
"[Dask](https://docs.dask.org/en/latest/) for distributed model training,\n",
|
||||
"hyperparameter optimization, and prediction.\n",
|
||||
"\n",
|
||||
"## Cluster Setup\n",
|
||||
"\n",
|
||||
"First, we'll set up our Ray Cluster. The provided [dask_xgboost.yaml](https://raw.githubusercontent.com/ray-project/ray/master/doc/source/ray-core/examples/dask_xgboost/dask_xgboost.yaml)\n",
|
||||
"cluster config can be used to set up an AWS cluster with 64 CPUs.\n",
|
||||
"\n",
|
||||
"The following steps assume you are in a directory with both\n",
|
||||
"``dask_xgboost.yaml`` and this file saved as ``dask_xgboost.ipynb``.\n",
|
||||
"\n",
|
||||
"**Step 1:** Bring up the Ray cluster.\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"pip install ray boto3\n",
|
||||
"ray up dask_xgboost.yaml\n",
|
||||
"```\n",
|
||||
"**Step 2:** Move ``dask_xgboost.ipynb`` to the cluster and start Jupyter.\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"ray rsync_up dask_xgboost.yaml \"./dask_xgboost.ipynb\" \\\n",
|
||||
" \"~/dask_xgboost.ipynb\"\n",
|
||||
"ray exec dask_xgboost.yaml --port-forward=9999 \"jupyter notebook \\\n",
|
||||
" --port=9999\"\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"You can then access this notebook at the URL that is output:\n",
|
||||
"``http://localhost:9999/?token=<token>``\n",
|
||||
"\n",
|
||||
"## Python Setup\n",
|
||||
"\n",
|
||||
"First, we'll import all the libraries we'll be using. This step also helps us\n",
|
||||
"verify that the environment is configured correctly. If any of the imports\n",
|
||||
"are missing, an exception will be raised."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "166268b8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import argparse\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"import dask\n",
|
||||
"import dask.dataframe as dd\n",
|
||||
"from xgboost_ray import RayDMatrix, RayParams, train, predict\n",
|
||||
"\n",
|
||||
"import ray\n",
|
||||
"from ray import air, tune\n",
|
||||
"from ray.util.dask import ray_dask_get"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "05d4da07",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, let's parse some arguments. This will be used for executing the ``.py``\n",
|
||||
"file, but not for the ``.ipynb``. If you are using the interactive notebook,\n",
|
||||
"you can directly override the arguments manually."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "77186e19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"parser = argparse.ArgumentParser()\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--address\", type=str, default=\"auto\", help=\"The address to use for Ray.\"\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--smoke-test\",\n",
|
||||
" action=\"store_true\",\n",
|
||||
" help=\"Read a smaller dataset for quick testing purposes.\",\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--num-actors\", type=int, default=4, help=\"Sets number of actors for training.\"\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--cpus-per-actor\",\n",
|
||||
" type=int,\n",
|
||||
" default=6,\n",
|
||||
" help=\"The number of CPUs per actor for training.\",\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--num-actors-inference\",\n",
|
||||
" type=int,\n",
|
||||
" default=16,\n",
|
||||
" help=\"Sets number of actors for inference.\",\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--cpus-per-actor-inference\",\n",
|
||||
" type=int,\n",
|
||||
" default=2,\n",
|
||||
" help=\"The number of CPUs per actor for inference.\",\n",
|
||||
")\n",
|
||||
"# Ignore -f from ipykernel_launcher\n",
|
||||
"args, _ = parser.parse_known_args()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1c07e34c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Override these arguments as needed:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a35a5f1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"address = args.address\n",
|
||||
"smoke_test = args.smoke_test\n",
|
||||
"num_actors = args.num_actors\n",
|
||||
"cpus_per_actor = args.cpus_per_actor\n",
|
||||
"num_actors_inference = args.num_actors_inference\n",
|
||||
"cpus_per_actor_inference = args.cpus_per_actor_inference"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e4077845",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Connecting to the Ray cluster\n",
|
||||
"\n",
|
||||
"Now, let's connect our Python script to this newly deployed Ray cluster!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "57fc42c7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if not ray.is_initialized():\n",
|
||||
" ray.init(address=address)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2efcf435",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data Preparation\n",
|
||||
"\n",
|
||||
"We will use the `HIGGS dataset from the UCI Machine Learning dataset\n",
|
||||
"repository <https://archive.ics.uci.edu/ml/datasets/HIGGS>`_. The HIGGS\n",
|
||||
"dataset consists of 11,000,000 samples and 28 attributes, which is large\n",
|
||||
"enough size to show the benefits of distributed computation.\n",
|
||||
"\n",
|
||||
"We set the Dask scheduler to ``ray_dask_get`` to use `Dask on Ray\n",
|
||||
"<https://docs.ray.io/en/latest/data/dask-on-ray.html>`_ backend."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e3e8e177",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"LABEL_COLUMN = \"label\"\n",
|
||||
"if smoke_test:\n",
|
||||
" # Test dataset with only 10,000 records.\n",
|
||||
" FILE_URL = \"https://ray-ci-higgs.s3.us-west-2.amazonaws.com/simpleHIGGS\" \".csv\"\n",
|
||||
"else:\n",
|
||||
" # Full dataset. This may take a couple of minutes to load.\n",
|
||||
" FILE_URL = (\n",
|
||||
" \"https://archive.ics.uci.edu/ml/machine-learning-databases\"\n",
|
||||
" \"/00280/HIGGS.csv.gz\"\n",
|
||||
" )\n",
|
||||
"colnames = [LABEL_COLUMN] + [\"feature-%02d\" % i for i in range(1, 29)]\n",
|
||||
"dask.config.set(scheduler=ray_dask_get)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b2bdf8e7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_data_start_time = time.time()\n",
|
||||
"\n",
|
||||
"data = dd.read_csv(FILE_URL, names=colnames)\n",
|
||||
"data = data[sorted(colnames)]\n",
|
||||
"data = data.persist()\n",
|
||||
"\n",
|
||||
"load_data_end_time = time.time()\n",
|
||||
"load_data_duration = load_data_end_time - load_data_start_time\n",
|
||||
"print(f\"Dataset loaded in {load_data_duration} seconds.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "be214015",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"With the connection established, we can now create the Dask dataframe.\n",
|
||||
"\n",
|
||||
"We will split the data into a training set and a evaluation set using a 80-20\n",
|
||||
"proportion."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f84b5aca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_df, eval_df = data.random_split([0.8, 0.2])\n",
|
||||
"train_df, eval_df = train_df.persist(), eval_df.persist()\n",
|
||||
"print(train_df, eval_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "657e7f56",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Distributed Training\n",
|
||||
"\n",
|
||||
"The ``train_xgboost`` function contains all of the logic necessary for\n",
|
||||
"training using XGBoost-Ray.\n",
|
||||
"\n",
|
||||
"Distributed training can not only speed up the process, but also allow you\n",
|
||||
"to use datasets that are to large to fit in memory of a single node. With\n",
|
||||
"distributed training, the dataset is sharded across different actors\n",
|
||||
"running on separate nodes. Those actors communicate with each other to\n",
|
||||
"create the final model.\n",
|
||||
"\n",
|
||||
"First, the dataframes are wrapped in ``RayDMatrix`` objects, which handle\n",
|
||||
"data sharding across the cluster. Then, the ``train`` function is called.\n",
|
||||
"The evaluation scores will be saved to ``evals_result`` dictionary. The\n",
|
||||
"function returns a tuple of the trained model (booster) and the evaluation\n",
|
||||
"scores.\n",
|
||||
"\n",
|
||||
"The ``ray_params`` variable expects a ``RayParams`` object that contains\n",
|
||||
"Ray-specific settings, such as the number of workers."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3b8957b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_xgboost(config, train_df, test_df, target_column, ray_params):\n",
|
||||
" train_set = RayDMatrix(train_df, target_column)\n",
|
||||
" test_set = RayDMatrix(test_df, target_column)\n",
|
||||
"\n",
|
||||
" evals_result = {}\n",
|
||||
"\n",
|
||||
" train_start_time = time.time()\n",
|
||||
"\n",
|
||||
" # Train the classifier\n",
|
||||
" bst = train(\n",
|
||||
" params=config,\n",
|
||||
" dtrain=train_set,\n",
|
||||
" evals=[(test_set, \"eval\")],\n",
|
||||
" evals_result=evals_result,\n",
|
||||
" ray_params=ray_params,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" train_end_time = time.time()\n",
|
||||
" train_duration = train_end_time - train_start_time\n",
|
||||
" print(f\"Total time taken: {train_duration} seconds.\")\n",
|
||||
"\n",
|
||||
" model_path = \"model.xgb\"\n",
|
||||
" bst.save_model(model_path)\n",
|
||||
" print(\"Final validation error: {:.4f}\".format(evals_result[\"eval\"][\"error\"][-1]))\n",
|
||||
"\n",
|
||||
" return bst, evals_result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8edbeab5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can now pass our Dask dataframes and run the function. We will use\n",
|
||||
"``RayParams`` to specify that the number of actors and CPUs to train with.\n",
|
||||
"\n",
|
||||
"The dataset has to be downloaded onto the cluster, which may take a few\n",
|
||||
"minutes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3b48ea01",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# standard XGBoost config for classification\n",
|
||||
"config = {\n",
|
||||
" \"tree_method\": \"approx\",\n",
|
||||
" \"objective\": \"binary:logistic\",\n",
|
||||
" \"eval_metric\": [\"logloss\", \"error\"],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"bst, evals_result = train_xgboost(\n",
|
||||
" config,\n",
|
||||
" train_df,\n",
|
||||
" eval_df,\n",
|
||||
" LABEL_COLUMN,\n",
|
||||
" RayParams(cpus_per_actor=cpus_per_actor, num_actors=num_actors),\n",
|
||||
")\n",
|
||||
"print(f\"Results: {evals_result}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0c04c4e3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Hyperparameter optimization\n",
|
||||
"\n",
|
||||
"If we are not content with the results obtained with default XGBoost\n",
|
||||
"parameters, we can use [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) for cutting-edge\n",
|
||||
"distributed hyperparameter tuning. XGBoost-Ray automatically integrates\n",
|
||||
"with Ray Tune, meaning we can use the same training function as before.\n",
|
||||
"\n",
|
||||
"In this workflow, we will tune three hyperparameters - ``eta``, ``subsample``\n",
|
||||
"and ``max_depth``. We are using [Tune's samplers to define the search\n",
|
||||
"space](https://docs.ray.io/en/latest/tune/user-guide.html#search-space-grid-random).\n",
|
||||
"\n",
|
||||
"The experiment configuration is done through ``Tuner()``. We set the amount\n",
|
||||
"of resources each trial (hyperparameter combination) requires by using the\n",
|
||||
"``get_tune_resources`` method of ``RayParams``. The ``num_samples`` argument\n",
|
||||
"controls how many trials will be ran in total. In the end, the best\n",
|
||||
"combination of hyperparameters evaluated during the experiment will be\n",
|
||||
"returned.\n",
|
||||
"\n",
|
||||
"By default, Tune will use simple random search. However, Tune also\n",
|
||||
"provides various [search algorithms](https://docs.ray.io/en/latest/tune/api_docs/suggestion.html) and\n",
|
||||
"[schedulers](https://docs.ray.io/en/latest/tune/api_docs/schedulers.html)\n",
|
||||
"to further improve the optimization process."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bde33d2a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tune_xgboost(train_df, test_df, target_column):\n",
|
||||
" # Set XGBoost config.\n",
|
||||
" config = {\n",
|
||||
" \"tree_method\": \"approx\",\n",
|
||||
" \"objective\": \"binary:logistic\",\n",
|
||||
" \"eval_metric\": [\"logloss\", \"error\"],\n",
|
||||
" \"eta\": tune.loguniform(1e-4, 1e-1),\n",
|
||||
" \"subsample\": tune.uniform(0.5, 1.0),\n",
|
||||
" \"max_depth\": tune.randint(1, 9),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" ray_params = RayParams(\n",
|
||||
" max_actor_restarts=1, cpus_per_actor=cpus_per_actor, num_actors=num_actors\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" tune_start_time = time.time()\n",
|
||||
" \n",
|
||||
" tuner = tune.Tuner(\n",
|
||||
" tune.with_resources(\n",
|
||||
" tune.with_parameters(\n",
|
||||
" train_xgboost,\n",
|
||||
" train_df=train_df,\n",
|
||||
" test_df=test_df,\n",
|
||||
" target_column=target_column,\n",
|
||||
" ray_params=ray_params,\n",
|
||||
" ),\n",
|
||||
" resources=ray_params.get_tune_resources()\n",
|
||||
" ),\n",
|
||||
" tune_config=tune.TuneConfig(\n",
|
||||
" num_samples=10,\n",
|
||||
" metric=\"eval-error\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" ),\n",
|
||||
" param_space=config\n",
|
||||
" )\n",
|
||||
" results = tuner.fit()\n",
|
||||
" \n",
|
||||
" best_result = results.get_best_result()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" tune_end_time = time.time()\n",
|
||||
" tune_duration = tune_end_time - tune_start_time\n",
|
||||
" print(f\"Total time taken: {tune_duration} seconds.\")\n",
|
||||
"\n",
|
||||
" accuracy = 1.0 - best_result.metrics[\"eval-error\"]\n",
|
||||
" print(f\"Best model parameters: {best_result.config}\")\n",
|
||||
" print(f\"Best model total accuracy: {accuracy:.4f}\")\n",
|
||||
"\n",
|
||||
" return best_result.config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0f52fbc0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Hyperparameter optimization may take some time to complete."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8836cc07",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tune_xgboost(train_df, eval_df, LABEL_COLUMN)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "82a1e0c8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prediction\n",
|
||||
"\n",
|
||||
"With the model trained, we can now predict on unseen data. For the\n",
|
||||
"purposes of this example, we will use the same dataset for prediction as\n",
|
||||
"for training.\n",
|
||||
"\n",
|
||||
"Since prediction is naively parallelizable, distributing it over multiple\n",
|
||||
"actors can measurably reduce the amount of time needed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cb1f0689",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"inference_df = RayDMatrix(data, ignore=[LABEL_COLUMN, \"partition\"])\n",
|
||||
"results = predict(\n",
|
||||
" bst,\n",
|
||||
" inference_df,\n",
|
||||
" ray_params=RayParams(\n",
|
||||
" cpus_per_actor=cpus_per_actor_inference, num_actors=num_actors_inference\n",
|
||||
" ),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(results)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,340 +0,0 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: light
|
||||
# format_version: '1.5'
|
||||
# jupytext_version: 1.13.6
|
||||
# kernelspec:
|
||||
# display_name: Python 3
|
||||
# language: python
|
||||
# name: python3
|
||||
# ---
|
||||
|
||||
# # XGBoost-Ray with Dask
|
||||
#
|
||||
#
|
||||
# This notebook includes an example workflow using
|
||||
# [XGBoost-Ray](https://github.com/ray-project/xgboost_ray) and
|
||||
# [Dask](https://docs.dask.org/en/latest/) for distributed model training,
|
||||
# hyperparameter optimization, and prediction.
|
||||
#
|
||||
# ## Cluster Setup
|
||||
#
|
||||
# First, we'll set up our Ray Cluster. The provided `dask_xgboost.yaml`
|
||||
# cluster config can be used to set up an AWS cluster with 64 CPUs.
|
||||
#
|
||||
# The following steps assume you are in a directory with both
|
||||
# ``dask_xgboost.yaml`` and this file saved as ``dask_xgboost.ipynb``.
|
||||
#
|
||||
# **Step 1:** Bring up the Ray cluster.
|
||||
#
|
||||
# ```bash
|
||||
# pip install ray boto3
|
||||
# ray up dask_xgboost.yaml
|
||||
# ```
|
||||
# **Step 2:** Move ``dask_xgboost.ipynb`` to the cluster and start Jupyter.
|
||||
#
|
||||
# ```bash
|
||||
# ray rsync_up dask_xgboost.yaml "./dask_xgboost.ipynb" \
|
||||
# "~/dask_xgboost.ipynb"
|
||||
# ray exec dask_xgboost.yaml --port-forward=9999 "jupyter notebook \
|
||||
# --port=9999"
|
||||
# ```
|
||||
#
|
||||
# You can then access this notebook at the URL that is output:
|
||||
# ``http://localhost:9999/?token=<token>``
|
||||
#
|
||||
# ## Python Setup
|
||||
#
|
||||
# First, we'll import all the libraries we'll be using. This step also helps us
|
||||
# verify that the environment is configured correctly. If any of the imports
|
||||
# are missing, an exception will be raised.
|
||||
|
||||
# +
|
||||
import argparse
|
||||
import time
|
||||
|
||||
import dask
|
||||
import dask.dataframe as dd
|
||||
from xgboost_ray import RayDMatrix, RayParams, predict, train
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.util.dask import ray_dask_get
|
||||
|
||||
# -
|
||||
|
||||
# Next, let's parse some arguments. This will be used for executing the ``.py``
|
||||
# file, but not for the ``.ipynb``. If you are using the interactive notebook,
|
||||
# you can directly override the arguments manually.
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--address", type=str, default="auto", help="The address to use for Ray."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--smoke-test",
|
||||
action="store_true",
|
||||
help="Read a smaller dataset for quick testing purposes.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-actors", type=int, default=4, help="Sets number of actors for training."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cpus-per-actor",
|
||||
type=int,
|
||||
default=6,
|
||||
help="The number of CPUs per actor for training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-actors-inference",
|
||||
type=int,
|
||||
default=16,
|
||||
help="Sets number of actors for inference.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cpus-per-actor-inference",
|
||||
type=int,
|
||||
default=2,
|
||||
help="The number of CPUs per actor for inference.",
|
||||
)
|
||||
# Ignore -f from ipykernel_launcher
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
# Override these arguments as needed:
|
||||
|
||||
address = args.address
|
||||
smoke_test = args.smoke_test
|
||||
num_actors = args.num_actors
|
||||
cpus_per_actor = args.cpus_per_actor
|
||||
num_actors_inference = args.num_actors_inference
|
||||
cpus_per_actor_inference = args.cpus_per_actor_inference
|
||||
|
||||
# ## Connecting to the Ray cluster
|
||||
#
|
||||
# Now, let's connect our Python script to this newly deployed Ray cluster!
|
||||
|
||||
if not ray.is_initialized():
|
||||
ray.init(address=address)
|
||||
|
||||
# ## Data Preparation
|
||||
#
|
||||
# We will use the [HIGGS dataset from the UCI Machine Learning dataset
|
||||
# repository](https://archive.ics.uci.edu/ml/datasets/HIGGS). The HIGGS
|
||||
# dataset consists of 11,000,000 samples and 28 attributes, which is large
|
||||
# enough to show the benefits of distributed computation.
|
||||
#
|
||||
# We set the Dask scheduler to ``ray_dask_get`` to use the
|
||||
# [Dask on Ray](https://docs.ray.io/en/latest/data/dask-on-ray.html) backend.
|
||||
|
||||
LABEL_COLUMN = "label"
|
||||
if smoke_test:
|
||||
# Test dataset with only 10,000 records.
|
||||
FILE_URL = "https://ray-ci-higgs.s3.us-west-2.amazonaws.com/simpleHIGGS" ".csv"
|
||||
else:
|
||||
# Full dataset. This may take a couple of minutes to load.
|
||||
FILE_URL = (
|
||||
"https://archive.ics.uci.edu/ml/machine-learning-databases"
|
||||
"/00280/HIGGS.csv.gz"
|
||||
)
|
||||
colnames = [LABEL_COLUMN] + ["feature-%02d" % i for i in range(1, 29)]
|
||||
dask.config.set(scheduler=ray_dask_get)
|
||||
|
||||
# +
|
||||
load_data_start_time = time.time()
|
||||
|
||||
data = dd.read_csv(FILE_URL, names=colnames)
|
||||
data = data[sorted(colnames)]
|
||||
data = data.persist()
|
||||
|
||||
load_data_end_time = time.time()
|
||||
load_data_duration = load_data_end_time - load_data_start_time
|
||||
print(f"Dataset loaded in {load_data_duration} seconds.")
|
||||
# -
|
||||
|
||||
# The Dask dataframe is now loaded and persisted on the cluster.
|
||||
#
|
||||
# We will split the data into a training set and an evaluation set using an 80-20
|
||||
# proportion.
|
||||
|
||||
train_df, eval_df = data.random_split([0.8, 0.2])
|
||||
train_df, eval_df = train_df.persist(), eval_df.persist()
|
||||
print(train_df, eval_df)
|
||||
|
||||
|
||||
# ## Distributed Training
|
||||
#
|
||||
# The ``train_xgboost`` function contains all of the logic necessary for
|
||||
# training using XGBoost-Ray.
|
||||
#
|
||||
# Distributed training can not only speed up the process, but also allow you
|
||||
# to use datasets that are too large to fit in the memory of a single node. With
|
||||
# distributed training, the dataset is sharded across different actors
|
||||
# running on separate nodes. Those actors communicate with each other to
|
||||
# create the final model.
|
||||
#
|
||||
# First, the dataframes are wrapped in ``RayDMatrix`` objects, which handle
|
||||
# data sharding across the cluster. Then, the ``train`` function is called.
|
||||
# The evaluation scores will be saved to the ``evals_result`` dictionary. The
|
||||
# function returns a tuple of the trained model (booster) and the evaluation
|
||||
# scores.
|
||||
#
|
||||
# The ``ray_params`` argument expects a ``RayParams`` object that contains
|
||||
# Ray-specific settings, such as the number of workers.
|
||||
|
||||
|
||||
def train_xgboost(config, train_df, test_df, target_column, ray_params):
|
||||
train_set = RayDMatrix(train_df, target_column)
|
||||
test_set = RayDMatrix(test_df, target_column)
|
||||
|
||||
evals_result = {}
|
||||
|
||||
train_start_time = time.time()
|
||||
|
||||
# Train the classifier
|
||||
bst = train(
|
||||
params=config,
|
||||
dtrain=train_set,
|
||||
evals=[(test_set, "eval")],
|
||||
evals_result=evals_result,
|
||||
ray_params=ray_params,
|
||||
)
|
||||
|
||||
train_end_time = time.time()
|
||||
train_duration = train_end_time - train_start_time
|
||||
print(f"Total time taken: {train_duration} seconds.")
|
||||
|
||||
model_path = "model.xgb"
|
||||
bst.save_model(model_path)
|
||||
print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1]))
|
||||
|
||||
return bst, evals_result
|
||||
|
||||
|
||||
# We can now pass our Dask dataframes and run the function. We will use
|
||||
# ``RayParams`` to specify the number of actors and CPUs to train with.
|
||||
#
|
||||
# The dataset has to be downloaded onto the cluster, which may take a few
|
||||
# minutes.
|
||||
|
||||
# +
|
||||
# standard XGBoost config for classification
|
||||
config = {
|
||||
"tree_method": "approx",
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": ["logloss", "error"],
|
||||
}
|
||||
|
||||
bst, evals_result = train_xgboost(
|
||||
config,
|
||||
train_df,
|
||||
eval_df,
|
||||
LABEL_COLUMN,
|
||||
RayParams(cpus_per_actor=cpus_per_actor, num_actors=num_actors),
|
||||
)
|
||||
print(f"Results: {evals_result}")
|
||||
|
||||
|
||||
# -
|
||||
|
||||
# ## Hyperparameter optimization
|
||||
#
|
||||
# If we are not content with the results obtained with default XGBoost
|
||||
# parameters, we can use [Ray Tune](https://docs.ray.io/en/latest/tune/index.html)
|
||||
# for cutting-edge
|
||||
# distributed hyperparameter tuning. XGBoost-Ray automatically integrates
|
||||
# with Ray Tune, meaning we can use the same training function as before.
|
||||
#
|
||||
# In this workflow, we will tune three hyperparameters: ``eta``, ``subsample``,
|
||||
# and ``max_depth``. We are using [Tune's samplers to define the search
|
||||
# space](https://docs.ray.io/en/latest/tune/user-guide.html#search-space-grid-random).
|
||||
#
|
||||
# The experiment configuration is done through ``Tuner()``. We set the amount
|
||||
# of resources each trial (hyperparameter combination) requires by using the
|
||||
# ``get_tune_resources`` method of ``RayParams``. The ``num_samples`` argument
|
||||
# controls how many trials will be run in total. In the end, the best
|
||||
# combination of hyperparameters evaluated during the experiment will be
|
||||
# returned.
|
||||
#
|
||||
# By default, Tune will use simple random search. However, Tune also
|
||||
# provides various
|
||||
# [search algorithms](https://docs.ray.io/en/latest/tune/api_docs/suggestion.html) and
|
||||
# [schedulers](https://docs.ray.io/en/latest/tune/api_docs/schedulers.html)
|
||||
# to further improve the optimization process.
|
||||
|
||||
|
||||
def tune_xgboost(train_df, test_df, target_column):
|
||||
# Set XGBoost config.
|
||||
config = {
|
||||
"tree_method": "approx",
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": ["logloss", "error"],
|
||||
"eta": tune.loguniform(1e-4, 1e-1),
|
||||
"subsample": tune.uniform(0.5, 1.0),
|
||||
"max_depth": tune.randint(1, 9),
|
||||
}
|
||||
|
||||
ray_params = RayParams(
|
||||
max_actor_restarts=1, cpus_per_actor=cpus_per_actor, num_actors=num_actors
|
||||
)
|
||||
|
||||
tune_start_time = time.time()
|
||||
|
||||
tuner = tune.Tuner(
|
||||
tune.with_resources(
|
||||
tune.with_parameters(
|
||||
train_xgboost,
|
||||
train_df=train_df,
|
||||
test_df=test_df,
|
||||
target_column=target_column,
|
||||
ray_params=ray_params,
|
||||
),
|
||||
resources=ray_params.get_tune_resources(),
|
||||
),
|
||||
tune_config=tune.TuneConfig(
|
||||
num_samples=10,
|
||||
metric="eval-error",
|
||||
mode="min",
|
||||
),
|
||||
param_space=config,
|
||||
)
|
||||
results = tuner.fit()
|
||||
|
||||
tune_end_time = time.time()
|
||||
tune_duration = tune_end_time - tune_start_time
|
||||
print(f"Total time taken: {tune_duration} seconds.")
|
||||
|
||||
best_result = results.get_best_result()
|
||||
accuracy = 1.0 - best_result.metrics["eval-error"]
|
||||
print(f"Best model parameters: {best_result.config}")
|
||||
print(f"Best model total accuracy: {accuracy:.4f}")
|
||||
|
||||
return best_result.config
|
||||
|
||||
|
||||
# Hyperparameter optimization may take some time to complete.
|
||||
|
||||
tune_xgboost(train_df, eval_df, LABEL_COLUMN)
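# By default, this example relies on Tune's random search. As a hedged sketch
# (not part of the original example), an early-stopping scheduler such as ASHA
# could be plugged into the same ``Tuner`` setup through ``tune.TuneConfig``;
# the ``ASHAScheduler`` arguments below are illustrative assumptions, not
# values taken from this example.

# +
from ray.tune.schedulers import ASHAScheduler

# A possible alternative to the ``TuneConfig`` built inside ``tune_xgboost``:
# stop clearly underperforming trials early instead of running every trial to
# completion.
asha_tune_config = tune.TuneConfig(
    num_samples=10,
    metric="eval-error",
    mode="min",
    scheduler=ASHAScheduler(max_t=100, grace_period=10),
)
# -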
|
||||
|
||||
# ## Prediction
|
||||
#
|
||||
# With the model trained, we can now predict on unseen data. For the
|
||||
# purposes of this example, we will use the same dataset for prediction as
|
||||
# for training.
|
||||
#
|
||||
# Since prediction is naively parallelizable, distributing it over multiple
|
||||
# actors can measurably reduce the amount of time needed.
|
||||
|
||||
# +
|
||||
inference_df = RayDMatrix(data, ignore=[LABEL_COLUMN, "partition"])
|
||||
results = predict(
|
||||
bst,
|
||||
inference_df,
|
||||
ray_params=RayParams(
|
||||
cpus_per_actor=cpus_per_actor_inference, num_actors=num_actors_inference
|
||||
),
|
||||
)
|
||||
|
||||
print(results)
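# As a hedged sketch (not part of the original example), the predicted
# probabilities could be compared against the label column to get a rough
# accuracy on this data. This assumes ``predict`` returns per-row
# probabilities for the ``binary:logistic`` objective and that the output
# order matches the row order of ``data``.

# +
# Pull the labels back to the driver and threshold the predictions at 0.5.
labels = data[LABEL_COLUMN].compute().to_numpy()
accuracy = float(((results > 0.5) == (labels > 0.5)).mean())
print(f"Accuracy on the prediction dataset: {accuracy:.4f}")
# -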
|
|
@ -1,24 +0,0 @@
|
|||
cluster_name: dask_xgboost
|
||||
|
||||
max_workers: 3
|
||||
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-1
|
||||
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
available_node_types:
|
||||
16_cpu_node:
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
node_config:
|
||||
InstanceType: m5.4xlarge
|
||||
ImageId: latest_dlami
|
||||
resources: { }
|
||||
|
||||
head_node_type: 16_cpu_node
|
||||
|
||||
setup_commands:
|
||||
- pip install -U jupyter ray[tune] xgboost_ray dask pandas
|
Binary file not shown.
Binary file not shown.
|
@ -1,410 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "66806cbf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# XGBoost-Ray with Modin\n",
|
||||
"\n",
|
||||
"This notebook includes an example workflow using\n",
|
||||
"[XGBoost-Ray](https://github.com/ray-project/xgboost_ray) and\n",
|
||||
"[Modin](https://modin.readthedocs.io/en/latest/) for distributed model\n",
|
||||
"training and prediction.\n",
|
||||
"\n",
|
||||
"## Cluster Setup\n",
|
||||
"\n",
|
||||
"First, we'll set up our Ray Cluster. The provided [modin_xgboost.yaml](https://raw.githubusercontent.com/ray-project/ray/master/doc/source/ray-core/examples/modin_xgboost/modin_xgboost.yaml)\n",
|
||||
"cluster config can be used to set up an AWS cluster with 64 CPUs.\n",
|
||||
"\n",
|
||||
"The following steps assume you are in a directory with both\n",
|
||||
"``modin_xgboost.yaml`` and this file saved as ``modin_xgboost.ipynb``.\n",
|
||||
"\n",
|
||||
"**Step 1:** Bring up the Ray cluster.\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"pip install ray boto3\n",
|
||||
"ray up modin_xgboost.yaml\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Step 2:** Move ``modin_xgboost.ipynb`` to the cluster and start Jupyter.\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"ray rsync_up modin_xgboost.yaml \"./modin_xgboost.ipynb\" \\\n",
|
||||
" \"~/modin_xgboost.ipynb\"\n",
|
||||
"ray exec modin_xgboost.yaml --port-forward=9999 \"jupyter notebook \\\n",
|
||||
" --port=9999\"\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"You can then access this notebook at the URL that is output:\n",
|
||||
"``http://localhost:9999/?token=<token>``\n",
|
||||
"\n",
|
||||
"## Python Setup\n",
|
||||
"\n",
|
||||
"First, we'll import all the libraries we'll be using. This step also helps us\n",
|
||||
"verify that the environment is configured correctly. If any of the imports\n",
|
||||
"are missing, an exception will be raised."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b7da4af7",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import argparse\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"import modin.pandas as pd\n",
|
||||
"from modin.experimental.sklearn.model_selection import train_test_split\n",
|
||||
"from xgboost_ray import RayDMatrix, RayParams, train, predict\n",
|
||||
"\n",
|
||||
"import ray"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6b07fdf9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, let's parse some arguments. This will be used for executing the ``.py``\n",
|
||||
"file, but not for the ``.ipynb``. If you are using the interactive notebook,\n",
|
||||
"you can directly override the arguments manually."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b2303e09",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"parser = argparse.ArgumentParser()\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--address\", type=str, default=\"auto\", help=\"The address to use for Ray.\"\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--smoke-test\",\n",
|
||||
" action=\"store_true\",\n",
|
||||
" help=\"Read a smaller dataset for quick testing purposes.\",\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--num-actors\", type=int, default=4, help=\"Sets number of actors for training.\"\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--cpus-per-actor\",\n",
|
||||
" type=int,\n",
|
||||
" default=8,\n",
|
||||
" help=\"The number of CPUs per actor for training.\",\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--num-actors-inference\",\n",
|
||||
" type=int,\n",
|
||||
" default=16,\n",
|
||||
" help=\"Sets number of actors for inference.\",\n",
|
||||
")\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--cpus-per-actor-inference\",\n",
|
||||
" type=int,\n",
|
||||
" default=2,\n",
|
||||
" help=\"The number of CPUs per actor for inference.\",\n",
|
||||
")\n",
|
||||
"# Ignore -f from ipykernel_launcher\n",
|
||||
"args, _ = parser.parse_known_args()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "aa2d56be",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
" Override these arguments as needed:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3a19350e",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"address = args.address\n",
|
||||
"smoke_test = args.smoke_test\n",
|
||||
"num_actors = args.num_actors\n",
|
||||
"cpus_per_actor = args.cpus_per_actor\n",
|
||||
"num_actors_inference = args.num_actors_inference\n",
|
||||
"cpus_per_actor_inference = args.cpus_per_actor_inference"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6c8aaf46",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Connecting to the Ray cluster\n",
|
||||
"\n",
|
||||
"Now, let's connect our Python script to this newly deployed Ray cluster!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cc6f836d",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if not ray.is_initialized():\n",
|
||||
" ray.init(address=address)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "29910376",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data Preparation\n",
|
||||
"\n",
|
||||
"We will use the [HIGGS dataset from the UCI Machine Learning dataset\n",
|
||||
"repository](https://archive.ics.uci.edu/ml/datasets/HIGGS). The HIGGS\n",
|
||||
"dataset consists of 11,000,000 samples and 28 attributes, which is large\n",
|
||||
"enough size to show the benefits of distributed computation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "69b46f57",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"LABEL_COLUMN = \"label\"\n",
|
||||
"if smoke_test:\n",
|
||||
" # Test dataset with only 10,000 records.\n",
|
||||
" FILE_URL = \"https://ray-ci-higgs.s3.us-west-2.amazonaws.com/simpleHIGGS\" \".csv\"\n",
|
||||
"else:\n",
|
||||
" # Full dataset. This may take a couple of minutes to load.\n",
|
||||
" FILE_URL = (\n",
|
||||
" \"https://archive.ics.uci.edu/ml/machine-learning-databases\"\n",
|
||||
" \"/00280/HIGGS.csv.gz\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"colnames = [LABEL_COLUMN] + [\"feature-%02d\" % i for i in range(1, 29)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c6151f1",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_data_start_time = time.time()\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(FILE_URL, names=colnames)\n",
|
||||
"\n",
|
||||
"load_data_end_time = time.time()\n",
|
||||
"load_data_duration = load_data_end_time - load_data_start_time\n",
|
||||
"print(f\"Dataset loaded in {load_data_duration} seconds.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "347d0479",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Split data into training and validation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "875dff40",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_train, df_validation = train_test_split(df)\n",
|
||||
"print(df_train, df_validation)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fe601729",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Distributed Training\n",
|
||||
"\n",
|
||||
"The ``train_xgboost`` function contains all the logic necessary for\n",
|
||||
"training using XGBoost-Ray.\n",
|
||||
"\n",
|
||||
"Distributed training can not only speed up the process, but also allow you\n",
|
||||
"to use datasets that are too large to fit in memory of a single node. With\n",
|
||||
"distributed training, the dataset is sharded across different actors\n",
|
||||
"running on separate nodes. Those actors communicate with each other to\n",
|
||||
"create the final model.\n",
|
||||
"\n",
|
||||
"First, the dataframes are wrapped in ``RayDMatrix`` objects, which handle\n",
|
||||
"data sharding across the cluster. Then, the ``train`` function is called.\n",
|
||||
"The evaluation scores will be saved to ``evals_result`` dictionary. The\n",
|
||||
"function returns a tuple of the trained model (booster) and the evaluation\n",
|
||||
"scores.\n",
|
||||
"\n",
|
||||
"The ``ray_params`` variable expects a ``RayParams`` object that contains\n",
|
||||
"Ray-specific settings, such as the number of workers."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7c13aff1",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_xgboost(config, train_df, test_df, target_column, ray_params):\n",
|
||||
" train_set = RayDMatrix(train_df, target_column)\n",
|
||||
" test_set = RayDMatrix(test_df, target_column)\n",
|
||||
"\n",
|
||||
" evals_result = {}\n",
|
||||
"\n",
|
||||
" train_start_time = time.time()\n",
|
||||
"\n",
|
||||
" # Train the classifier\n",
|
||||
" bst = train(\n",
|
||||
" params=config,\n",
|
||||
" dtrain=train_set,\n",
|
||||
" evals=[(test_set, \"eval\")],\n",
|
||||
" evals_result=evals_result,\n",
|
||||
" verbose_eval=False,\n",
|
||||
" num_boost_round=100,\n",
|
||||
" ray_params=ray_params,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" train_end_time = time.time()\n",
|
||||
" train_duration = train_end_time - train_start_time\n",
|
||||
" print(f\"Total time taken: {train_duration} seconds.\")\n",
|
||||
"\n",
|
||||
" model_path = \"model.xgb\"\n",
|
||||
" bst.save_model(model_path)\n",
|
||||
" print(\"Final validation error: {:.4f}\".format(evals_result[\"eval\"][\"error\"][-1]))\n",
|
||||
"\n",
|
||||
" return bst, evals_result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0a499531",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can now pass our Modin dataframes and run the function. We will use\n",
|
||||
"``RayParams`` to specify that the number of actors and CPUs to train with."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "10e0d9af",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# standard XGBoost config for classification\n",
|
||||
"config = {\n",
|
||||
" \"tree_method\": \"approx\",\n",
|
||||
" \"objective\": \"binary:logistic\",\n",
|
||||
" \"eval_metric\": [\"logloss\", \"error\"],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"bst, evals_result = train_xgboost(\n",
|
||||
" config,\n",
|
||||
" df_train,\n",
|
||||
" df_validation,\n",
|
||||
" LABEL_COLUMN,\n",
|
||||
" RayParams(cpus_per_actor=cpus_per_actor, num_actors=num_actors),\n",
|
||||
")\n",
|
||||
"print(f\"Results: {evals_result}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "556575be",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prediction\n",
|
||||
"\n",
|
||||
"With the model trained, we can now predict on unseen data. For the\n",
|
||||
"purposes of this example, we will use the same dataset for prediction as\n",
|
||||
"for training.\n",
|
||||
"\n",
|
||||
"Since prediction is naively parallelizable, distributing it over multiple\n",
|
||||
"actors can measurably reduce the amount of time needed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0170516b",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "python"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"inference_df = RayDMatrix(df, ignore=[LABEL_COLUMN, \"partition\"])\n",
|
||||
"results = predict(\n",
|
||||
" bst,\n",
|
||||
" inference_df,\n",
|
||||
" ray_params=RayParams(\n",
|
||||
" cpus_per_actor=cpus_per_actor_inference, num_actors=num_actors_inference\n",
|
||||
" ),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(results)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,248 +0,0 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: light
|
||||
# format_version: '1.5'
|
||||
# jupytext_version: 1.13.6
|
||||
# kernelspec:
|
||||
# display_name: Python 3
|
||||
# language: python
|
||||
# name: python3
|
||||
# ---
|
||||
|
||||
# # XGBoost-Ray with Modin
|
||||
#
|
||||
# This notebook includes an example workflow using
|
||||
# [XGBoost-Ray](https://github.com/ray-project/xgboost_ray) and
|
||||
# [Modin](https://modin.readthedocs.io/en/latest/) for distributed model
|
||||
# training and prediction.
|
||||
#
|
||||
# ## Cluster Setup
|
||||
#
|
||||
# First, we'll set up our Ray Cluster. The provided ``modin_xgboost.yaml``
|
||||
# cluster config can be used to set up an AWS cluster with 64 CPUs.
|
||||
#
|
||||
# The following steps assume you are in a directory with both
|
||||
# ``modin_xgboost.yaml`` and this file saved as ``modin_xgboost.ipynb``.
|
||||
#
|
||||
# **Step 1:** Bring up the Ray cluster.
|
||||
#
|
||||
# ```bash
|
||||
# pip install ray boto3
|
||||
# ray up modin_xgboost.yaml
|
||||
# ```
|
||||
#
|
||||
# **Step 2:** Move ``modin_xgboost.ipynb`` to the cluster and start Jupyter.
|
||||
#
|
||||
# ```bash
|
||||
# ray rsync_up modin_xgboost.yaml "./modin_xgboost.ipynb" \
|
||||
# "~/modin_xgboost.ipynb"
|
||||
# ray exec modin_xgboost.yaml --port-forward=9999 "jupyter notebook \
|
||||
# --port=9999"
|
||||
# ```
|
||||
#
|
||||
# You can then access this notebook at the URL that is output:
|
||||
# ``http://localhost:9999/?token=<token>``
|
||||
#
|
||||
# ## Python Setup
|
||||
#
|
||||
# First, we'll import all the libraries we'll be using. This step also helps us
|
||||
# verify that the environment is configured correctly. If any of the imports
|
||||
# are missing, an exception will be raised.
|
||||
|
||||
# +
|
||||
import argparse
|
||||
import time
|
||||
|
||||
import modin.pandas as pd
|
||||
from modin.experimental.sklearn.model_selection import train_test_split
|
||||
from xgboost_ray import RayDMatrix, RayParams, predict, train
|
||||
|
||||
import ray
|
||||
|
||||
# -
|
||||
|
||||
# Next, let's parse some arguments. This will be used for executing the ``.py``
|
||||
# file, but not for the ``.ipynb``. If you are using the interactive notebook,
|
||||
# you can directly override the arguments manually.
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--address", type=str, default="auto", help="The address to use for Ray."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--smoke-test",
|
||||
action="store_true",
|
||||
help="Read a smaller dataset for quick testing purposes.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-actors", type=int, default=4, help="Sets number of actors for training."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cpus-per-actor",
|
||||
type=int,
|
||||
default=8,
|
||||
help="The number of CPUs per actor for training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-actors-inference",
|
||||
type=int,
|
||||
default=16,
|
||||
help="Sets number of actors for inference.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cpus-per-actor-inference",
|
||||
type=int,
|
||||
default=2,
|
||||
help="The number of CPUs per actor for inference.",
|
||||
)
|
||||
# Ignore -f from ipykernel_launcher
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
# Override these arguments as needed:
|
||||
|
||||
address = args.address
|
||||
smoke_test = args.smoke_test
|
||||
num_actors = args.num_actors
|
||||
cpus_per_actor = args.cpus_per_actor
|
||||
num_actors_inference = args.num_actors_inference
|
||||
cpus_per_actor_inference = args.cpus_per_actor_inference
|
||||
|
||||
# ## Connecting to the Ray cluster
|
||||
#
|
||||
# Now, let's connect our Python script to this newly deployed Ray cluster!
|
||||
|
||||
if not ray.is_initialized():
|
||||
ray.init(address=address)
|
||||
|
||||
# ## Data Preparation
|
||||
#
|
||||
# We will use the [HIGGS dataset from the UCI Machine Learning dataset
|
||||
# repository](https://archive.ics.uci.edu/ml/datasets/HIGGS). The HIGGS
|
||||
# dataset consists of 11,000,000 samples and 28 attributes, which is large
|
||||
# enough to show the benefits of distributed computation.
|
||||
|
||||
# +
|
||||
LABEL_COLUMN = "label"
|
||||
if smoke_test:
|
||||
# Test dataset with only 10,000 records.
|
||||
FILE_URL = "https://ray-ci-higgs.s3.us-west-2.amazonaws.com/simpleHIGGS" ".csv"
|
||||
else:
|
||||
# Full dataset. This may take a couple of minutes to load.
|
||||
FILE_URL = (
|
||||
"https://archive.ics.uci.edu/ml/machine-learning-databases"
|
||||
"/00280/HIGGS.csv.gz"
|
||||
)
|
||||
|
||||
colnames = [LABEL_COLUMN] + ["feature-%02d" % i for i in range(1, 29)]
|
||||
|
||||
# +
|
||||
load_data_start_time = time.time()
|
||||
|
||||
df = pd.read_csv(FILE_URL, names=colnames)
|
||||
|
||||
load_data_end_time = time.time()
|
||||
load_data_duration = load_data_end_time - load_data_start_time
|
||||
print(f"Dataset loaded in {load_data_duration} seconds.")
|
||||
# -
|
||||
|
||||
# Split data into training and validation.
|
||||
|
||||
df_train, df_validation = train_test_split(df)
|
||||
print(df_train, df_validation)
|
||||
|
||||
|
||||
# ## Distributed Training
|
||||
#
|
||||
# The ``train_xgboost`` function contains all the logic necessary for
|
||||
# training using XGBoost-Ray.
|
||||
#
|
||||
# Distributed training can not only speed up the process, but also allow you
|
||||
# to use datasets that are too large to fit in the memory of a single node. With
|
||||
# distributed training, the dataset is sharded across different actors
|
||||
# running on separate nodes. Those actors communicate with each other to
|
||||
# create the final model.
|
||||
#
|
||||
# First, the dataframes are wrapped in ``RayDMatrix`` objects, which handle
|
||||
# data sharding across the cluster. Then, the ``train`` function is called.
|
||||
# The evaluation scores will be saved to the ``evals_result`` dictionary. The
|
||||
# function returns a tuple of the trained model (booster) and the evaluation
|
||||
# scores.
|
||||
#
|
||||
# The ``ray_params`` argument expects a ``RayParams`` object that contains
|
||||
# Ray-specific settings, such as the number of workers.
|
||||
|
||||
|
||||
def train_xgboost(config, train_df, test_df, target_column, ray_params):
|
||||
train_set = RayDMatrix(train_df, target_column)
|
||||
test_set = RayDMatrix(test_df, target_column)
|
||||
|
||||
evals_result = {}
|
||||
|
||||
train_start_time = time.time()
|
||||
|
||||
# Train the classifier
|
||||
bst = train(
|
||||
params=config,
|
||||
dtrain=train_set,
|
||||
evals=[(test_set, "eval")],
|
||||
evals_result=evals_result,
|
||||
verbose_eval=False,
|
||||
num_boost_round=100,
|
||||
ray_params=ray_params,
|
||||
)
|
||||
|
||||
train_end_time = time.time()
|
||||
train_duration = train_end_time - train_start_time
|
||||
print(f"Total time taken: {train_duration} seconds.")
|
||||
|
||||
model_path = "model.xgb"
|
||||
bst.save_model(model_path)
|
||||
print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1]))
|
||||
|
||||
return bst, evals_result
|
||||
|
||||
|
||||
# We can now pass our Modin dataframes and run the function. We will use
|
||||
# ``RayParams`` to specify the number of actors and CPUs to train with.
|
||||
|
||||
# +
|
||||
# standard XGBoost config for classification
|
||||
config = {
|
||||
"tree_method": "approx",
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": ["logloss", "error"],
|
||||
}
|
||||
|
||||
bst, evals_result = train_xgboost(
|
||||
config,
|
||||
df_train,
|
||||
df_validation,
|
||||
LABEL_COLUMN,
|
||||
RayParams(cpus_per_actor=cpus_per_actor, num_actors=num_actors),
|
||||
)
|
||||
print(f"Results: {evals_result}")
|
||||
# -
|
||||
|
||||
# ## Prediction
|
||||
#
|
||||
# With the model trained, we can now predict on unseen data. For the
|
||||
# purposes of this example, we will use the same dataset for prediction as
|
||||
# for training.
|
||||
#
|
||||
# Since prediction is naively parallelizable, distributing it over multiple
|
||||
# actors can measurably reduce the amount of time needed.
|
||||
|
||||
# +
|
||||
inference_df = RayDMatrix(df, ignore=[LABEL_COLUMN, "partition"])
|
||||
results = predict(
|
||||
bst,
|
||||
inference_df,
|
||||
ray_params=RayParams(
|
||||
cpus_per_actor=cpus_per_actor_inference, num_actors=num_actors_inference
|
||||
),
|
||||
)
|
||||
|
||||
print(results)
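# ``train_xgboost`` above saved the booster to ``model.xgb``. As a hedged
# sketch (not part of the original example), the saved file could be reloaded
# later with the standard XGBoost ``Booster`` API and reused for the same kind
# of distributed prediction; the ``loaded_bst`` name is ours, not part of the
# original script.

# +
import xgboost as xgb

# Reload the booster from disk and run distributed prediction again.
loaded_bst = xgb.Booster()
loaded_bst.load_model("model.xgb")

loaded_results = predict(
    loaded_bst,
    inference_df,
    ray_params=RayParams(
        cpus_per_actor=cpus_per_actor_inference, num_actors=num_actors_inference
    ),
)
print(loaded_results)
# -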
|
|
@ -1,24 +0,0 @@
|
|||
cluster_name: modin_xgboost
|
||||
|
||||
max_workers: 3
|
||||
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-1
|
||||
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
available_node_types:
|
||||
16_cpu_node:
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
node_config:
|
||||
InstanceType: m5.4xlarge
|
||||
ImageId: latest_dlami
|
||||
resources: { }
|
||||
|
||||
head_node_type: 16_cpu_node
|
||||
|
||||
setup_commands:
|
||||
- pip install -U jupyter ray xgboost_ray modin pandas
|
|
@ -16,7 +16,7 @@ You can view the `code for this example`_.
|
|||
.. _`code for this example`: https://github.com/ray-project/ray/tree/master/doc/source/ray-core/examples/lm
|
||||
|
||||
|
||||
To use Ray cluster launcher on AWS, install boto (``pip install boto3``) and configure your AWS credentials in ``~/.aws/credentials`` as described on the :ref:`Automatic Cluster Setup page <cluster-cloud>`.
|
||||
To use Ray cluster launcher on AWS, install boto (``pip install boto3``) and configure your AWS credentials in ``~/.aws/credentials`` as described on the :ref:`Automatic Cluster Setup page <cluster-index>`.
|
||||
We provide an `example config file <https://github.com/ray-project/ray/tree/master/doc/source/ray-core/examples/lm/lm-cluster.yaml>`__ (``lm-cluster.yaml``).
|
||||
|
||||
In the example config file, we use an ``m5.xlarge`` on-demand instance as the head node, and use ``p3.2xlarge`` GPU spot instances as the worker nodes. We set the minimal number of workers to 1 and maximum workers to 2 in the config, which can be modified according to your own demand.
|
||||
|
|
Some files were not shown because too many files have changed in this diff.