[cluster doc] Promote new doc and deprecate the old (#27759)
Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
@@ -251,63 +251,64 @@ parts:
- caption: Ray Clusters
chapters:
- file: cluster/index
- file: cluster/quickstart
- file: cluster/getting-started
title: Getting Started
- file: cluster/key-concepts
- file: cluster/user-guide
- file: cluster/cloud
- file: cluster/deploy
- file: cluster/api
- file: cluster/usage-stats
- file: cluster/cluster_under_construction
title: Ray Clusters (under construction)
title: Key Concepts
- file: cluster/kubernetes/index
title: Deploying a Ray Cluster on Kubernetes
sections:
- file: cluster/cluster_under_construction/getting-started
title: Getting Started
- file: cluster/cluster_under_construction/key-concepts
title: Key Concepts
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/index
title: Deploying a Ray Cluster on Kubernetes
- file: cluster/kubernetes/getting-started
- file: cluster/kubernetes/user-guides
sections:
- file: cluster/kubernetes/user-guides/k8s-cluster-setup.md
- file: cluster/kubernetes/user-guides/config.md
- file: cluster/kubernetes/user-guides/autoscaling.md
- file: cluster/kubernetes/user-guides/logging.md
- file: cluster/kubernetes/user-guides/gpu.md
- file: cluster/kubernetes/user-guides/kuberay-vs-legacy.md
- file: cluster/kubernetes/examples
sections:
- file: cluster/kubernetes/examples/ml-example.md
- file: cluster/kubernetes/references
- file: cluster/vms/index
title: Deploying a Ray Cluster on VMs
sections:
- file: cluster/vms/getting-started
- file: cluster/vms/user-guides/index
title: User Guides
sections:
- file: cluster/vms/user-guides/launching-clusters/index
title: Launching Clusters
- file: cluster/vms/user-guides/large-cluster-best-practices
- file: cluster/vms/user-guides/configuring-autoscaling
- file: cluster/vms/user-guides/community/index
title: Community-supported Cluster Managers
sections:
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/getting-started
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/user-guides
sections:
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/user-guides/k8s-cluster-setup.md
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/user-guides/config.md
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/user-guides/autoscaling.md
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/user-guides/logging.md
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/user-guides/gpu.md
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/user-guides/kuberay-vs-legacy.md
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/examples
sections:
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/examples/ml-example.md
- file: cluster/cluster_under_construction/ray-clusters-on-kubernetes/references
- file: cluster/cluster_under_construction/ray-clusters-on-vms/index
title: Deploying a Ray Cluster on VMs
sections:
- file: cluster/cluster_under_construction/ray-clusters-on-vms/getting-started
- file: cluster/cluster_under_construction/ray-clusters-on-vms/user-guides/index
title: User Guides
sections:
- file: cluster/cluster_under_construction/ray-clusters-on-vms/user-guides/launching-clusters/index
title: Launching Clusters
- file: cluster/cluster_under_construction/ray-clusters-on-vms/user-guides/large-cluster-best-practices
- file: cluster/cluster_under_construction/ray-clusters-on-vms/user-guides/configuring-autoscaling
- file: cluster/cluster_under_construction/ray-clusters-on-vms/user-guides/community-supported-cluster-manager/index
title: Community-supported Cluster Managers
sections:
- file: cluster/cluster_under_construction/ray-clusters-on-vms/user-guides/community-supported-cluster-manager/yarn
- file: cluster/cluster_under_construction/ray-clusters-on-vms/user-guides/community-supported-cluster-manager/slurm
- file: cluster/cluster_under_construction/ray-clusters-on-vms/user-guides/community-supported-cluster-manager/lsf
- file: cluster/cluster_under_construction/ray-clusters-on-vms/examples/index
title: Examples
sections:
- file: cluster/cluster_under_construction/ray-clusters-on-vms/examples/ml-example
- file: cluster/cluster_under_construction/ray-clusters-on-vms/references/index
- file: cluster/cluster_under_construction/running-applications-on-ray-clusters/index
title: Running Applications on Ray Clusters
- file: cluster/cluster_under_construction/references/index
title: References
- file: cluster/vms/user-guides/community/yarn
- file: cluster/vms/user-guides/community/slurm
- file: cluster/vms/user-guides/community/lsf
- file: cluster/vms/examples/index
title: Examples
sections:
- file: cluster/vms/examples/ml-example
- file: cluster/vms/references/index
- file: cluster/running-applications/index
title: Running Applications on Ray Clusters
- file: cluster/references/index
title: References

- file: cluster-deprecated/index
title: Deprecated Ray Clusters Docs
sections:
- file: cluster-deprecated/key-concepts
- file: cluster-deprecated/cloud
- file: cluster-deprecated/quickstart
- file: cluster-deprecated/usage-stats
- file: cluster-deprecated/user-guide
- file: cluster-deprecated/cluster_under_construction.md
- file: cluster-deprecated/deploy
- file: cluster-deprecated/api

- caption: References
chapters:
@@ -7,5 +7,5 @@ Ray Cluster API
   :maxdepth: 2
   :caption: Ray Cluster API References

   ../cluster/reference.rst
   ../cluster/jobs-package-ref.rst
   ../cluster-deprecated/reference.rst
   ../cluster-deprecated/jobs-package-ref.rst
@@ -21,7 +21,7 @@ Ray with cloud providers
.. toctree::
   :hidden:

   /cluster/aws-tips.rst
   /cluster-deprecated/aws-tips.rst

.. tabbed:: AWS
doc/source/cluster-deprecated/examples/slurm-basic.rst (new file)
@@ -0,0 +1,8 @@
:orphan:

.. _slurm-basic:

slurm-basic.sh
~~~~~~~~~~~~~~

.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
@@ -61,5 +61,5 @@ done
# __doc_worker_ray_end__

# __doc_script_start__
# ray/doc/source/cluster/examples/simple-trainer.py
# ray/doc/source/cluster-deprecated/examples/simple-trainer.py
python -u simple-trainer.py "$SLURM_CPUS_PER_TASK"
doc/source/cluster-deprecated/examples/slurm-launch.rst (new file)
@@ -0,0 +1,8 @@
:orphan:

.. _slurm-launch:

slurm-launch.py
~~~~~~~~~~~~~~~

.. literalinclude:: /cluster-deprecated/examples/slurm-launch.py
@@ -5,5 +5,5 @@
slurm-template.sh
~~~~~~~~~~~~~~~~~

.. literalinclude:: /cluster/examples/slurm-template.sh
.. literalinclude:: /cluster-deprecated/examples/slurm-template.sh
    :language: bash
doc/source/cluster-deprecated/key-concepts.rst (new file)
@@ -0,0 +1,107 @@
.. include:: we_are_hiring.rst

.. _cluster-key-concepts:

Key Concepts
============

Cluster
-------

A Ray cluster is a set of one or more nodes that are running Ray and share the
same :ref:`head node<cluster-node-types>`.

.. _cluster-node-types:

Node types
----------

A Ray cluster consists of a :ref:`head node<cluster-head-node>` and a set of
:ref:`worker nodes<cluster-worker-node>`.

.. image:: ray-cluster.jpg
    :align: center
    :width: 600px

.. _cluster-head-node:

Head node
~~~~~~~~~

The head node is the first node started by the
:ref:`Ray cluster launcher<cluster-launcher>` when launching a Ray
cluster. Among other things, the head node holds the :ref:`Global Control Store
(GCS)<memory>` and runs the :ref:`autoscaler<cluster-autoscaler>`. Once the head
node is started, it is responsible for launching any additional
:ref:`worker nodes<cluster-worker-node>`. The head node itself also executes
tasks and actors to utilize its capacity.

.. _cluster-worker-node:

Worker node
~~~~~~~~~~~

A worker node is any node in the Ray cluster that is not functioning as the head node.
Worker nodes are therefore responsible only for executing tasks and actors.
When a worker node is launched, it is given the address of the head node so that it
can join the cluster.

.. _cluster-launcher:

Cluster launcher
----------------

The cluster launcher is a process responsible for bootstrapping the Ray cluster
by launching the :ref:`head node<cluster-head-node>`. For more information on how
to use the cluster launcher, refer to the
:ref:`cluster launcher CLI commands documentation<cluster-commands>` and the
corresponding :ref:`documentation for the configuration file<cluster-config>`.
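For illustration, a typical launcher session looks roughly like the following
sketch, where ``my-cluster.yaml`` is a placeholder for your own
:ref:`cluster config file<cluster-config>`:

.. code-block:: bash

    # Create or update the cluster described by the config file.
    ray up my-cluster.yaml
    # Open a shell on the head node.
    ray attach my-cluster.yaml
    # Tear the cluster down when you are done.
    ray down my-cluster.yaml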
.. _cluster-autoscaler:

Autoscaler
----------

The autoscaler is a process that runs on the :ref:`head node<cluster-head-node>`
and is responsible for adding or removing :ref:`worker nodes<cluster-worker-node>`
to meet the needs of the Ray workload while matching the specification in the
:ref:`cluster config file<cluster-config>`. In particular, if the resource
demands of the Ray workload exceed the current capacity of the cluster, the
autoscaler tries to add nodes. Conversely, if a node is idle for long enough,
the autoscaler removes it from the cluster. To learn more about autoscaling,
refer to the :ref:`Ray cluster deployment guide<deployment-guide-autoscaler>`.
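As a rough sketch, the autoscaling-related fields of a cluster config file look
like the following; the node type name and limits are illustrative, not defaults:

.. code-block:: yaml

    # Upper bound on the total number of worker nodes.
    max_workers: 10
    # Remove a worker node after it has been idle for this long.
    idle_timeout_minutes: 5
    available_node_types:
        cpu_worker:  # An illustrative node type name.
            min_workers: 0
            max_workers: 10
            resources: {"CPU": 4}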
Ray Client
----------
The Ray Client is an API that connects a Python script to a remote Ray cluster.
To learn more, refer to the :ref:`Ray Client documentation<ray-client>`.
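For instance, a script can connect to a running cluster's Ray Client server
(port 10001 by default) as in this sketch; the address is a placeholder:

.. code-block:: python

    import ray

    # Connect to the remote cluster instead of starting a local Ray instance.
    ray.init("ray://<head-node-ip>:10001")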
Job submission
--------------

Ray Job submission is a mechanism to submit locally developed and tested applications
to a remote Ray cluster. It simplifies the experience of packaging, deploying,
and managing a Ray application. To learn more about Ray jobs, refer to the
:ref:`documentation<ray-job-submission-api-ref>`.
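As a quick sketch, submitting a local script with the jobs CLI looks like this;
the address and script name are placeholders:

.. code-block:: bash

    # Submit my_script.py, along with its working directory, to the cluster
    # reachable at the given dashboard address.
    ray job submit --address http://127.0.0.1:8265 --working-dir . -- python my_script.py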
Cloud clusters
--------------

If you’re using AWS, GCP, Azure (community-maintained), or Aliyun (community-maintained), you can use the
:ref:`Ray cluster launcher<cluster-launcher>` to launch cloud clusters, which
greatly simplifies the cluster setup process.

Cluster managers
----------------

You can simplify the process of managing Ray clusters using a number of popular
cluster managers, including :ref:`Kubernetes<kuberay-index>`,
:ref:`YARN<ray-yarn-deploy>`, :ref:`Slurm<ray-slurm-deploy>`, and :ref:`LSF<ray-LSF-deploy>`.

Kubernetes (K8s) operator
-------------------------

Deployments of Ray on Kubernetes are managed by the Ray Kubernetes Operator. The
Ray Operator makes it easy to deploy clusters of Ray pods within a Kubernetes
cluster. To learn more about the K8s operator, refer to
the :ref:`documentation<kuberay-index>`.
@@ -104,7 +104,7 @@ Obtain the head IP address
Next, we'll want to obtain a hostname and a node IP address for the head node. This way, when we start worker nodes, we'll be able to properly connect to the right head node.

.. literalinclude:: /cluster/examples/slurm-basic.sh
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
    :language: bash
    :start-after: __doc_head_address_start__
    :end-before: __doc_head_address_end__

@@ -123,7 +123,7 @@ and number of GPUs (``num-gpus``) to Ray, as this will prevent Ray from using
more resources than allocated. We also need to explicitly
indicate the ``node-ip-address`` for the Ray head runtime:

.. literalinclude:: /cluster/examples/slurm-basic.sh
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
    :language: bash
    :start-after: __doc_head_ray_start__
    :end-before: __doc_head_ray_end__

@@ -135,7 +135,7 @@ Starting the Ray worker nodes

Below, we do the same thing, but for each worker. Make sure the Ray head and Ray worker processes are not started on the same node.

.. literalinclude:: /cluster/examples/slurm-basic.sh
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
    :language: bash
    :start-after: __doc_worker_ray_start__
    :end-before: __doc_worker_ray_end__

@@ -145,7 +145,7 @@ Submitting your script

Finally, you can invoke your Python script:

.. literalinclude:: /cluster/examples/slurm-basic.sh
.. literalinclude:: /cluster-deprecated/examples/slurm-basic.sh
    :language: bash
    :start-after: __doc_script_start__
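For orientation, the included ``slurm-basic.sh`` boils down to the following
sketch; flags are abbreviated, and the real script wraps these commands in
``srun`` calls across the allocated nodes:

.. code-block:: bash

    # __doc_head_ray_start__: start Ray on the head node.
    ray start --head --node-ip-address="$head_node_ip" --port=6379 --block &
    # __doc_worker_ray_start__: start Ray on each worker, joining the head.
    ray start --address "$head_node_ip:6379" --block &
    # __doc_script_start__: run the application on the head node.
    python -u simple-trainer.py "$SLURM_CPUS_PER_TASK"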
@@ -1,98 +0,0 @@
.. warning::
    This page is under construction!

.. include:: /_includes/clusters/we_are_hiring.rst

Key Concepts
============

.. _cluster-key-concepts-under-construction:

This page introduces the following key concepts concerning Ray clusters:

.. contents::
    :local:

Ray cluster
------------
A **Ray cluster** is composed of a :ref:`head node<cluster-head-node-under-construction>`
and any number of :ref:`worker nodes<cluster-worker-nodes-under-construction>`.

.. figure:: images/ray-cluster.svg
    :align: center
    :width: 600px

    *A Ray cluster with two worker nodes. Each node runs Ray helper processes to
    facilitate distributed scheduling and memory management. The head node runs
    additional control processes, which are highlighted.*

The number of worker nodes in a cluster may change with application demand, according
to your Ray cluster configuration. This is known as *autoscaling*. The head node runs
the :ref:`autoscaler<cluster-autoscaler-under-construction>`.

.. note::
    Ray nodes are implemented as pods when :ref:`running on Kubernetes<kuberay-index>`.

Users can submit jobs for execution on the Ray cluster, or can interactively use the
cluster by connecting to the head node and running `ray.init`. See
:ref:`Clients and Jobs<cluster-clients-and-jobs-under-construction>` for more information.

.. _cluster-worker-nodes-under-construction:

Worker nodes
~~~~~~~~~~~~
**Worker nodes** execute a Ray application by executing tasks and actors and storing Ray objects. Each worker node runs helper processes which
implement distributed scheduling and :ref:`memory management<memory>`.

.. _cluster-head-node-under-construction:

Head node
~~~~~~~~~
Every Ray cluster has one node which is designated as the **head node** of the cluster.
The head node is identical to other worker nodes, except that it also runs singleton processes responsible for cluster management, such as the
:ref:`autoscaler<cluster-autoscaler-under-construction>` and the Ray driver processes
:ref:`which run Ray jobs<cluster-clients-and-jobs-under-construction>`. Ray may schedule
tasks and actors on the head node just like any other worker node, unless configured otherwise.

.. _cluster-autoscaler-under-construction:

Autoscaler
----------

The **autoscaler** is a process that runs on the :ref:`head node<cluster-head-node-under-construction>` (or as a sidecar container in the head pod if :ref:`using Kubernetes<kuberay-index>`).
It is responsible for provisioning or deprovisioning :ref:`worker nodes<cluster-worker-nodes-under-construction>`
to meet the needs of the Ray workload. In particular, if the resource demands of the Ray workload exceed the
current capacity of the cluster, the autoscaler will attempt to add more nodes. Conversely, if
a node is idle for long enough, the autoscaler will remove it from the cluster.

To learn more about the autoscaler and how to configure it, refer to the following user guides:

* :ref:`Configuring Autoscaling on VMs<deployment-guide-autoscaler-under-construction>`.
* :ref:`Autoscaling on Kubernetes<kuberay-autoscaler-discussion>`.

.. _cluster-clients-and-jobs-under-construction:

..

Clients and Jobs
----------------
TODO
~~~~
TODO: Update the following section so that we recommend the best tool for first-time users:
See https://anyscaleteam.slack.com/archives/C01CLKUN38V/p1659990371608629?thread_ts=1659981502.811539&cid=C01CLKUN38V

Clients and Jobs
~~~~~~~~~~~~~~~~
Ray provides two methods for running workloads on a Ray Cluster: the Ray Client and Ray Job Submission.

* **The Ray Client** enables interactive development by connecting a local Python script or shell to the cluster.
  Developers can scale out their local programs on the cloud as if they were running on their laptop. The Ray Client is used
  by specifying the :ref:`head node<cluster-head-node-under-construction>` address as an argument to `ray.init`.
* **Ray Job Submission** enables users to submit locally developed and tested applications to a remote Ray
  Cluster. Ray Job Submission simplifies the experience of packaging, deploying, and managing a Ray application.

To learn how to run workloads on a Ray Cluster, refer to the following user guides:

* :ref:`Running Ray workloads on VMs<ref-deployment-guide-under-construction>`.
* The :ref:`Ray Job Submission<kuberay-job>` and :ref:`Ray Client<kuberay-client>` sections in :ref:`Getting Started with Ray on Kubernetes<kuberay-quickstart>`.
@@ -1,8 +0,0 @@
:orphan:

.. _slurm-basic-under-construction:

slurm-basic.sh
~~~~~~~~~~~~~~

.. literalinclude:: /cluster/cluster_under_construction/doc_code/slurm-basic.sh
@@ -61,5 +61,5 @@ done
# __doc_worker_ray_end__

# __doc_script_start__
# ray/doc/source/cluster/cluster_under_construction/doc_code/simple-trainer.py
# ray/doc/source/cluster/doc_code/simple-trainer.py
python -u simple-trainer.py "$SLURM_CPUS_PER_TASK"
@@ -10,7 +10,7 @@ services:
vcores: 1
memory: 2048
files:
  # ray/doc/source/cluster/cluster_under_construction/doc_code/yarn/example.py
  # ray/doc/source/cluster/doc_code/yarn/example.py
  example.py: example.py
  # # A packaged python environment using `conda-pack`. Note that Skein
  # # doesn't require any specific way of distributing files, but this
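# [For orientation: the surrounding Skein spec has roughly the following
#  shape. This is a sketch only; the service name, script, and resource
#  values are illustrative rather than copied from ray-skein.yaml.]
#
#   services:
#     ray-head:
#       instances: 1
#       resources:
#         vcores: 1
#         memory: 2048
#       files:
#         example.py: example.py
#       script: |
#         ray start --head --port=6379 --block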
@@ -1,8 +0,0 @@
:orphan:

.. _slurm-basic:

slurm-basic.sh
~~~~~~~~~~~~~~

.. literalinclude:: /cluster/examples/slurm-basic.sh
@@ -1,8 +0,0 @@
:orphan:

.. _slurm-launch:

slurm-launch.py
~~~~~~~~~~~~~~~

.. literalinclude:: /cluster/examples/slurm-launch.py
@@ -1,107 +1,98 @@
.. include:: we_are_hiring.rst
.. warning::
    This page is under construction!

.. _cluster-key-concepts:
.. include:: /_includes/clusters/we_are_hiring.rst

Key Concepts
============

Cluster
-------
.. _cluster-key-concepts-under-construction:

A Ray cluster is a set of one or more nodes that are running Ray and share the
same :ref:`head node<cluster-node-types>`.

.. _cluster-node-types:
This page introduces the following key concepts concerning Ray clusters:

Node types
----------
.. contents::
    :local:

A Ray cluster consists of a :ref:`head node<cluster-head-node>` and a set of
:ref:`worker nodes<cluster-worker-node>`.
Ray cluster
------------
A **Ray cluster** is composed of a :ref:`head node<cluster-head-node-under-construction>`
and any number of :ref:`worker nodes<cluster-worker-nodes-under-construction>`.

.. image:: ray-cluster.jpg
.. figure:: images/ray-cluster.svg
    :align: center
    :width: 600px

    *A Ray cluster with two worker nodes. Each node runs Ray helper processes to
    facilitate distributed scheduling and memory management. The head node runs
    additional control processes, which are highlighted.*

.. _cluster-head-node:
The number of worker nodes in a cluster may change with application demand, according
to your Ray cluster configuration. This is known as *autoscaling*. The head node runs
the :ref:`autoscaler<cluster-autoscaler-under-construction>`.

.. note::
    Ray nodes are implemented as pods when :ref:`running on Kubernetes<kuberay-index>`.

Users can submit jobs for execution on the Ray cluster, or can interactively use the
cluster by connecting to the head node and running `ray.init`. See
:ref:`Clients and Jobs<cluster-clients-and-jobs-under-construction>` for more information.

.. _cluster-worker-nodes-under-construction:

Worker nodes
~~~~~~~~~~~~
**Worker nodes** execute a Ray application by executing tasks and actors and storing Ray objects. Each worker node runs helper processes which
implement distributed scheduling and :ref:`memory management<memory>`.

.. _cluster-head-node-under-construction:

Head node
~~~~~~~~~
Every Ray cluster has one node which is designated as the **head node** of the cluster.
The head node is identical to other worker nodes, except that it also runs singleton processes responsible for cluster management, such as the
:ref:`autoscaler<cluster-autoscaler-under-construction>` and the Ray driver processes
:ref:`which run Ray jobs<cluster-clients-and-jobs-under-construction>`. Ray may schedule
tasks and actors on the head node just like any other worker node, unless configured otherwise.

The head node is the first node started by the
:ref:`Ray cluster launcher<cluster-launcher>` when launching a Ray
cluster. Among other things, the head node holds the :ref:`Global Control Store
(GCS)<memory>` and runs the :ref:`autoscaler<cluster-autoscaler>`. Once the head
node is started, it is responsible for launching any additional
:ref:`worker nodes<cluster-worker-node>`. The head node itself also executes
tasks and actors to utilize its capacity.

.. _cluster-worker-node:

Worker node
~~~~~~~~~~~

A worker node is any node in the Ray cluster that is not functioning as the head node.
Worker nodes are therefore responsible only for executing tasks and actors.
When a worker node is launched, it is given the address of the head node so that it
can join the cluster.

.. _cluster-launcher:

Cluster launcher
----------------

The cluster launcher is a process responsible for bootstrapping the Ray cluster
by launching the :ref:`head node<cluster-head-node>`. For more information on how
to use the cluster launcher, refer to the
:ref:`cluster launcher CLI commands documentation<cluster-commands>` and the
corresponding :ref:`documentation for the configuration file<cluster-config>`.

.. _cluster-autoscaler:
.. _cluster-autoscaler-under-construction:

Autoscaler
----------

The autoscaler is a process that runs on the :ref:`head node<cluster-head-node>`
and is responsible for adding or removing :ref:`worker nodes<cluster-worker-node>`
to meet the needs of the Ray workload while matching the specification in the
:ref:`cluster config file<cluster-config>`. In particular, if the resource
demands of the Ray workload exceed the current capacity of the cluster, the
autoscaler tries to add nodes. Conversely, if a node is idle for long enough,
the autoscaler removes it from the cluster. To learn more about autoscaling,
refer to the :ref:`Ray cluster deployment guide<deployment-guide-autoscaler>`.
The **autoscaler** is a process that runs on the :ref:`head node<cluster-head-node-under-construction>` (or as a sidecar container in the head pod if :ref:`using Kubernetes<kuberay-index>`).
It is responsible for provisioning or deprovisioning :ref:`worker nodes<cluster-worker-nodes-under-construction>`
to meet the needs of the Ray workload. In particular, if the resource demands of the Ray workload exceed the
current capacity of the cluster, the autoscaler will attempt to add more nodes. Conversely, if
a node is idle for long enough, the autoscaler will remove it from the cluster.

Ray Client
----------
The Ray Client is an API that connects a Python script to a remote Ray cluster.
To learn more, refer to the :ref:`Ray Client documentation<ray-client>`.
To learn more about the autoscaler and how to configure it, refer to the following user guides:

Job submission
--------------
* :ref:`Configuring Autoscaling on VMs<deployment-guide-autoscaler-under-construction>`.
* :ref:`Autoscaling on Kubernetes<kuberay-autoscaler-discussion>`.

Ray Job submission is a mechanism to submit locally developed and tested applications
to a remote Ray cluster. It simplifies the experience of packaging, deploying,
and managing a Ray application. To learn more about Ray jobs, refer to the
:ref:`documentation<ray-job-submission-api-ref>`.
.. _cluster-clients-and-jobs-under-construction:

Cloud clusters
--------------
..

If you’re using AWS, GCP, Azure (community-maintained), or Aliyun (community-maintained), you can use the
:ref:`Ray cluster launcher<cluster-launcher>` to launch cloud clusters, which
greatly simplifies the cluster setup process.

Cluster managers
Clients and Jobs
----------------
TODO
~~~~
TODO: Update the following section so that we recommend the best tool for first-time users:
See https://anyscaleteam.slack.com/archives/C01CLKUN38V/p1659990371608629?thread_ts=1659981502.811539&cid=C01CLKUN38V

You can simplify the process of managing Ray clusters using a number of popular
cluster managers, including :ref:`Kubernetes<kuberay-index>`,
:ref:`YARN<ray-yarn-deploy>`, :ref:`Slurm<ray-slurm-deploy>`, and :ref:`LSF<ray-LSF-deploy>`.
Clients and Jobs
~~~~~~~~~~~~~~~~
Ray provides two methods for running workloads on a Ray Cluster: the Ray Client and Ray Job Submission.

Kubernetes (K8s) operator
-------------------------
* **The Ray Client** enables interactive development by connecting a local Python script or shell to the cluster.
  Developers can scale out their local programs on the cloud as if they were running on their laptop. The Ray Client is used
  by specifying the :ref:`head node<cluster-head-node-under-construction>` address as an argument to `ray.init`.
* **Ray Job Submission** enables users to submit locally developed and tested applications to a remote Ray
  Cluster. Ray Job Submission simplifies the experience of packaging, deploying, and managing a Ray application.

Deployments of Ray on Kubernetes are managed by the Ray Kubernetes Operator. The
Ray Operator makes it easy to deploy clusters of Ray pods within a Kubernetes
cluster. To learn more about the K8s operator, refer to
the :ref:`documentation<kuberay-index>`.
To learn how to run workloads on a Ray Cluster, refer to the following user guides:

* :ref:`Running Ray workloads on VMs<ref-deployment-guide-under-construction>`.
* The :ref:`Ray Job Submission<kuberay-job>` and :ref:`Ray Client<kuberay-client>` sections in :ref:`Getting Started with Ray on Kubernetes<kuberay-quickstart>`.
@@ -63,7 +63,7 @@ Broadly speaking, it is more efficient to use a few large Ray pods than many sma
We recommend taking a look at the [config file][ConfigLink] applied in the following command.
```shell
# Starting from the parent directory of cloned Ray master,
pushd ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs/
pushd ray/doc/source/cluster/kubernetes/configs/
kubectl apply -f xgboost-benchmark.yaml
popd
```
@@ -102,7 +102,7 @@ kubectl port-forward service/raycluster-xgboost-benchmark-head-svc 8265:8265
|
||||
We'll use the {ref}`Ray Job Python SDK<ray-job-sdk>` to submit the XGBoost workload.
|
||||
|
||||
```{literalinclude} ../doc_code/xgboost_submit.py
|
||||
```{literalinclude} /cluster/doc_code/xgboost_submit.py
|
||||
:language: python
|
||||
```
|
||||
|
||||
|
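For reference, the submission script uses the Ray Job Python SDK along these
lines. This is a sketch only; the entrypoint and address are placeholders, and
the real logic lives in `xgboost_submit.py`:

```python
from ray.job_submission import JobSubmissionClient

# Point the client at the dashboard address we port-forwarded above.
client = JobSubmissionClient("http://127.0.0.1:8265")
job_id = client.submit_job(
    # Hypothetical entrypoint; see xgboost_submit.py for the real one.
    entrypoint="python xgboost_benchmark.py",
    runtime_env={"working_dir": "."},
)
print(job_id)
```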
@@ -111,7 +111,7 @@ The script is available in the Ray repository.
```shell
# From the parent directory of cloned Ray master.
pushd ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/doc_code/
pushd ray/doc/source/cluster/doc_code/
python xgboost_submit.py
popd
```
@@ -191,4 +191,6 @@ kubectl delete raycluster raycluster-xgboost-benchmark
If you're on a public cloud, don't forget to clean up the underlying
node group and/or Kubernetes cluster.

[ConfigLink]: https://raw.githubusercontent.com/ray-project/ray/291bba69fb90ee5e8401540ef55b7b74dd13f5c5/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs/xgboost-benchmark-autoscaler.yaml
<!-- TODO: Fix this -->
<!-- [ConfigLink]: https://raw.githubusercontent.com/ray-project/ray/291bba69fb90ee5e8401540ef55b7b74dd13f5c5/doc/source/cluster/ray-clusters-on-kubernetes/configs/xgboost-benchmark-autoscaler.yaml -->
[ConfigLink]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/
@@ -13,7 +13,7 @@ heterogenous compute nodes (including GPUs) as well as running multiple Ray clus
different Ray versions in the same Kubernetes cluster.

```{eval-rst}
.. image:: /cluster/cluster_under_construction/ray-clusters-on-kubernetes/images/ray_on_kubernetes.png
.. image:: images/ray_on_kubernetes.png
    :align: center
..
    Find source document here: https://docs.google.com/drawings/d/1E3FQgWWLuj8y2zPdKXjoWKrfwgYXw6RV_FWRwK8dVlg/edit
@@ -101,8 +101,9 @@ will halve the quantity of that task or actor that can fit in a given Ray pod.
## Autoscaling architecture
The following diagram illustrates the integration of the Ray Autoscaler
with the KubeRay operator.

```{eval-rst}
.. image:: /cluster/cluster_under_construction/ray-clusters-on-kubernetes/images/AutoscalerOperator.svg
.. image:: ../images/AutoscalerOperator.svg
    :align: center
..
    Find the source document here (https://docs.google.com/drawings/d/1LdOg9JQuN5AOII-vDpSaFBsTeg0JGWcsbyNNLP1yovg/edit)
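To enable this integration, the RayCluster resource sets a single flag, roughly
as in the following sketch; the metadata name is illustrative:

```yaml
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  name: raycluster-autoscaler   # illustrative name
spec:
  # Tells the KubeRay operator to run the autoscaler as a sidecar
  # container in the head pod.
  enableInTreeAutoscaling: true
```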
@@ -185,9 +185,11 @@ Here is a [link][ConfigLink] to the configuration shown below.
```{literalinclude} ../configs/migration-example.yaml
:language: yaml
```

[RayExamples]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs
<!-- TODO: fix this -->
<!-- [RayExamples]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/kubernetes/configs -->
[RayExamples]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/
[KubeRayExamples]: https://ray-project.github.io/kuberay/components/operator/#running-an-example-cluster
[ConfigLink]: https://raw.githubusercontent.com/ray-project/ray/7aeb1ab9cf7adb58fd9418c0e08984ff0fe6d018/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs/migration-example.yaml
[ConfigLink]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/
<!-- [ConfigLink]: https://raw.githubusercontent.com/ray-project/ray/7aeb1ab9cf7adb58fd9418c0e08984ff0fe6d018/doc/source/cluster/ray-clusters-on-kubernetes/configs/migration-example.yaml -->
[KubeRayHelm]: https://ray-project.github.io/kuberay/deploy/helm/
[KubeRayHelmCode]: https://github.com/ray-project/kuberay/tree/master/helm-chart
@@ -123,7 +123,7 @@ Now, run the following commands to deploy the Fluent Bit ConfigMap and a single-
a Fluent Bit sidecar.
```shell
# Starting from the parent of cloned Ray master.
pushd ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs/
pushd ray/doc/source/cluster/kubernetes/configs/
kubectl apply -f ray-cluster.log.yaml
popd
```
@@ -145,4 +145,6 @@ kubectl logs raycluster-complete-logs-head-xxxxx -c fluentbit
[Fluentd]: https://docs.fluentd.org/
[Promtail]: https://grafana.com/docs/loki/latest/clients/promtail/
[KubDoc]: https://kubernetes.io/docs/concepts/cluster-administration/logging/
[ConfigLink]: https://raw.githubusercontent.com/ray-project/ray/779e9f7c5733ef9a471ad2bb61723158ff942e92/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs/ray-cluster.log.yaml
<!-- TODO: fix this -->
[ConfigLink]: https://github.com/ray-project/ray/tree/master/doc/source/cluster/
<!-- [ConfigLink]: https://raw.githubusercontent.com/ray-project/ray/779e9f7c5733ef9a471ad2bb61723158ff942e92/doc/source/cluster/ray-clusters-on-kubernetes/configs/ray-cluster.log.yaml -->
@@ -72,7 +72,7 @@ This will forward remote port 8265 to port 8265 on localhost.
|
||||
We'll use the {ref}`Ray Job Python SDK<ray-job-sdk>` to submit the XGBoost workload.
|
||||
|
||||
```{literalinclude} ../../ray-clusters-on-kubernetes/doc_code/xgboost_submit.py
|
||||
```{literalinclude} /cluster/doc_code/xgboost_submit.py
|
||||
:language: python
|
||||
```
|
||||
|
||||
|
@@ -81,7 +81,7 @@ The script is also available in the Ray repository.
```shell
# From the parent directory of cloned Ray master.
pushd ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/doc_code/
pushd ray/doc/source/cluster/doc_code/
python xgboost_submit.py
popd
```
@@ -1178,17 +1178,17 @@ Minimal configuration
.. tabbed:: AWS

    .. literalinclude:: ../../../../../../python/ray/autoscaler/aws/example-minimal.yaml
    .. literalinclude:: ../../../../../python/ray/autoscaler/aws/example-minimal.yaml
        :language: yaml

.. tabbed:: Azure

    .. literalinclude:: ../../../../../../python/ray/autoscaler/azure/example-minimal.yaml
    .. literalinclude:: ../../../../../python/ray/autoscaler/azure/example-minimal.yaml
        :language: yaml

.. tabbed:: GCP

    .. literalinclude:: ../../../../../../python/ray/autoscaler/gcp/example-minimal.yaml
    .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/example-minimal.yaml
        :language: yaml
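For a sense of scale, a minimal config along the lines of the AWS example above
amounts to just a few lines; the values here are illustrative, not a verbatim
copy of that file:

.. code-block:: yaml

    # A uniquely identifying name for the cluster.
    cluster_name: minimal
    # The cloud provider to launch nodes with.
    provider:
        type: aws
        region: us-west-2
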
Full configuration
@@ -1196,17 +1196,17 @@ Full configuration
.. tabbed:: AWS

    .. literalinclude:: ../../../../../../python/ray/autoscaler/aws/example-full.yaml
    .. literalinclude:: ../../../../../python/ray/autoscaler/aws/example-full.yaml
        :language: yaml

.. tabbed:: Azure

    .. literalinclude:: ../../../../../../python/ray/autoscaler/azure/example-full.yaml
    .. literalinclude:: ../../../../../python/ray/autoscaler/azure/example-full.yaml
        :language: yaml

.. tabbed:: GCP

    .. literalinclude:: ../../../../../../python/ray/autoscaler/gcp/example-full.yaml
    .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/example-full.yaml
        :language: yaml

TPU Configuration
@@ -1218,5 +1218,5 @@ Before using a config with TPUs, ensure that the `TPU API is enabled for your GC
.. tabbed:: GCP

    .. literalinclude:: ../../../../../../python/ray/autoscaler/gcp/tpu.yaml
    .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/tpu.yaml
        :language: yaml
@@ -0,0 +1,8 @@
:orphan:

.. _slurm-basic-under-construction:

slurm-basic.sh
~~~~~~~~~~~~~~

.. literalinclude:: /cluster/doc_code/slurm-basic.sh
@@ -5,4 +5,4 @@
slurm-launch.py
~~~~~~~~~~~~~~~

.. literalinclude:: /cluster/cluster_under_construction/doc_code/slurm-launch.py
.. literalinclude:: /cluster/doc_code/slurm-launch.py
@@ -5,5 +5,5 @@
slurm-template.sh
~~~~~~~~~~~~~~~~~

.. literalinclude:: /cluster/cluster_under_construction/doc_code/slurm-template.sh
.. literalinclude:: /cluster/doc_code/slurm-template.sh
    :language: bash
@@ -104,7 +104,7 @@ Obtain the head IP address
Next, we'll want to obtain a hostname and a node IP address for the head node. This way, when we start worker nodes, we'll be able to properly connect to the right head node.

.. literalinclude:: /cluster/cluster_under_construction/doc_code/slurm-basic.sh
.. literalinclude:: /cluster/doc_code/slurm-basic.sh
    :language: bash
    :start-after: __doc_head_address_start__
    :end-before: __doc_head_address_end__

@@ -123,7 +123,7 @@ and number of GPUs (``num-gpus``) to Ray, as this will prevent Ray from using
more resources than allocated. We also need to explicitly
indicate the ``node-ip-address`` for the Ray head runtime:

.. literalinclude:: /cluster/cluster_under_construction/doc_code/slurm-basic.sh
.. literalinclude:: /cluster/doc_code/slurm-basic.sh
    :language: bash
    :start-after: __doc_head_ray_start__
    :end-before: __doc_head_ray_end__

@@ -135,7 +135,7 @@ Starting the Ray worker nodes

Below, we do the same thing, but for each worker. Make sure the Ray head and Ray worker processes are not started on the same node.

.. literalinclude:: /cluster/cluster_under_construction/doc_code/slurm-basic.sh
.. literalinclude:: /cluster/doc_code/slurm-basic.sh
    :language: bash
    :start-after: __doc_worker_ray_start__
    :end-before: __doc_worker_ray_end__

@@ -145,7 +145,7 @@ Submitting your script

Finally, you can invoke your Python script:

.. literalinclude:: /cluster/cluster_under_construction/doc_code/slurm-basic.sh
.. literalinclude:: /cluster/doc_code/slurm-basic.sh
    :language: bash
    :start-after: __doc_script_start__
@@ -133,7 +133,7 @@ Clean up all started processes even if the application fails or is killed.
Putting things together, we have:

.. literalinclude:: /cluster/cluster_under_construction/doc_code/yarn/ray-skein.yaml
.. literalinclude:: /cluster/doc_code/yarn/ray-skein.yaml
    :language: yaml
    :start-after: # Head service
    :end-before: # Worker service
@@ -156,7 +156,7 @@ Start all of the processes needed on a ray worker node, blocking until killed by
Putting things together, we have:

.. literalinclude:: /cluster/cluster_under_construction/doc_code/yarn/ray-skein.yaml
.. literalinclude:: /cluster/doc_code/yarn/ray-skein.yaml
    :language: yaml
    :start-after: # Worker service
@@ -165,7 +165,7 @@ Running a Job
Within your Ray script, use the following to connect to the started Ray cluster:

.. literalinclude:: /cluster/cluster_under_construction/doc_code/yarn/example.py
.. literalinclude:: /cluster/doc_code/yarn/example.py
    :language: python
    :start-after: if __name__ == "__main__"
@@ -177,7 +177,7 @@ You can use the following command to launch the application as specified by the
Once it has been submitted, you can see the job running on the YARN dashboard.

.. image:: /cluster/cluster_under_construction/images/yarn-job.png
.. image:: /cluster/images/yarn-job.png

Cleaning Up
-----------