Merge remote-tracking branch 'upstream/master' into grpc-channel-reconnect

This commit is contained in:
Yi Cheng 2022-05-11 05:52:18 +00:00
commit 707329a170
52 changed files with 3005 additions and 2776 deletions

View file

@ -26,6 +26,7 @@ epilogue_commands: &epilogue_commands |-
steps:
- label: ":mac: :apple: Wheels and Jars"
<<: *common
conditions: ["RAY_CI_MACOS_WHEELS_AFFECTED", "RAY_CI_PYTHON_DEPENDENCIES_AFFECTED"]
commands:
# Cleanup environments
- ./ci/build/upload_build_info.sh
@ -62,6 +63,7 @@ steps:
- label: ":mac: :apple: Ray C++, Java and Libraries"
<<: *common
conditions: ["RAY_CI_SERVE_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_CPP_AFFECTED", "RAY_CI_JAVA_AFFECTED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DASHBOARD_AFFECTED"]
commands:
- export RAY_INSTALL_JAVA=1
- *prelude_commands
@ -73,17 +75,13 @@ steps:
# clang-format is needed by java/test.sh
- pip install clang-format==12.0.1
- ./java/test.sh
- ./ci/ci.sh test_cpp
- *epilogue_commands
- label: ":mac: :apple: Worker"
<<: *common
commands:
- *prelude_commands
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- ./ci/ci.sh test_cpp
- label: ":mac: :apple: Small & Client"
<<: *common
conditions: ["RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED"]
commands:
- *prelude_commands
- bazel test $(./ci/run/bazel_export_options) --config=ci
@ -96,6 +94,7 @@ steps:
- label: ":mac: :apple: Large"
<<: *common
parallelism: 3
conditions: ["RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED"]
commands:
- *prelude_commands
- . ./ci/ci.sh test_large
@ -103,6 +102,7 @@ steps:
- label: ":mac: :apple: Medium A-J"
<<: *common
conditions: ["RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED"]
commands:
- *prelude_commands
- bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CI
@ -112,6 +112,7 @@ steps:
- label: ":mac: :apple: Medium K-Z"
<<: *common
conditions: ["RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED"]
commands:
- *prelude_commands
- bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CI

View file

@ -7,7 +7,6 @@ import argparse
import json
import os
from pprint import pformat
import py_dep_analysis as pda
import re
import subprocess
import sys
@ -102,6 +101,8 @@ if __name__ == "__main__":
# Dry run py_dep_analysis.py to see which tests we would have run.
try:
import py_dep_analysis as pda
graph = pda.build_dep_graph()
rllib_tests = pda.list_rllib_tests()
print("Total # of RLlib tests: ", len(rllib_tests), file=sys.stderr)

View file

@ -147,11 +147,11 @@ const Dashboard: React.FC = () => {
run the following command: `ray disable-usage-stats` before starting
the cluster. See{" "}
<a
href="https://github.com/ray-project/ray/issues/20857"
href="https://docs.ray.io/en/master/cluster/usage-stats.html"
target="_blank"
rel="noreferrer"
>
https://github.com/ray-project/ray/issues/20857
https://docs.ray.io/en/master/cluster/usage-stats.html
</a>{" "}
for more details.
</span>

View file

@ -132,6 +132,7 @@ parts:
- file: cluster/cloud
- file: cluster/deploy
- file: cluster/api
- file: cluster/usage-stats
- caption: References
chapters:

View file

@ -0,0 +1,10 @@
Usage Stats Data API
====================
.. _ray-usage-stats-data-ref:
UsageStatsToReport
~~~~~~~~~~~~~~~~~~
.. autoclass:: ray._private.usage.usage_lib.UsageStatsToReport
:members:

View file

@ -0,0 +1,84 @@
.. _ref-usage-stats:
Usage Stats Collection
======================
Starting in Ray 1.13, Ray collects usage stats data by default (guarded by an opt-out prompt).
This data will be used by the open-source Ray engineering team to better understand how to improve our libraries and core APIs, and how to prioritize bug fixes and enhancements.
Here are the guiding principles of our collection policy:
- **No surprises**: you will be notified before we begin collecting data, and of any changes to the data being collected or how it is used.
- **Easy opt-out**: you will be able to easily opt out of data collection.
- **Transparency**: you will be able to review all data that is sent to us.
- **Control**: you will have control over your data, and we will honor requests to delete your data.
- We will **not** collect any personally identifiable data or proprietary code/data.
- We will **not** sell data or buy data about you.
You will always be able to :ref:`disable the usage stats collection<usage-disable>`.
For more context, please refer to this `RFC <https://github.com/ray-project/ray/issues/20857>`_.
What data is collected?
-----------------------
We collect non-sensitive data that helps us understand how Ray is used (e.g., which Ray libraries are used).
**Personally identifiable data will never be collected.** Please check :ref:`UsageStatsToReport <ray-usage-stats-data-ref>` to see the data we collect.
.. _usage-disable:
How to disable it
-----------------
There are multiple ways to disable usage stats collection before starting a cluster:
#. Add the ``--disable-usage-stats`` option to the command that starts the Ray cluster (e.g., the ``ray start --head --disable-usage-stats`` :ref:`command <ray-start-doc>`).
#. Run :ref:`ray disable-usage-stats <ray-disable-usage-stats-doc>` to disable collection for all future clusters. This won't affect currently running clusters. Under the hood, this command writes ``{"usage_stats": false}`` to the global config file ``~/.ray/config.json``.
#. Set the environment variable ``RAY_USAGE_STATS_ENABLED`` to 0 (e.g., ``RAY_USAGE_STATS_ENABLED=0 ray start --head`` :ref:`command <ray-start-doc>`).
Currently there is no way to enable or disable collection for a running cluster; you have to stop and restart the cluster.
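As a minimal sketch of the second option, the config-file toggle amounts to the following; it only touches ``~/.ray/config.json`` and does not affect clusters that are already running:

.. code-block:: python

    import json
    import os

    config_path = os.path.expanduser("~/.ray/config.json")

    # Read the existing config (if any) and turn the usage_stats flag off.
    config = {}
    if os.path.exists(config_path):
        with open(config_path) as f:
            config = json.load(f)
    config["usage_stats"] = False  # False disables collection for future clusters.

    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    with open(config_path, "w") as f:
        json.dump(config, f)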
How does it work?
-----------------
When a Ray cluster is started via :ref:`ray start --head <ray-start-doc>`, :ref:`ray up <ray-up-doc>`, :ref:`ray submit --start <ray-submit-doc>` or :ref:`ray exec --start <ray-exec-doc>`,
Ray will decide whether usage stats collection should be enabled or not by considering the following factors in order:
#. It checks whether the environment variable ``RAY_USAGE_STATS_ENABLED`` is set: 1 means enabled and 0 means disabled.
#. If the environment variable is not set, it reads the value of key ``usage_stats`` in the global config file ``~/.ray/config.json``: true means enabled and false means disabled.
#. If neither is set and the console is interactive, then the user will be prompted to enable or disable the collection. If the console is non-interactive, usage stats collection will be enabled by default. The decision will be saved to ``~/.ray/config.json``, so the prompt is only shown once.
Note: usage stats collection is not enabled when using local dev clusters started via ``ray.init()``. This means that Ray will never collect data from third-party library users not using Ray directly.
If usage stats collection is enabled, a background process on the head node will collect the usage stats
and report to ``https://usage-stats.ray.io/`` every hour. The reported usage stats will also be saved to
``/tmp/ray/session_xxx/usage_stats.json`` on the head node for inspection. You can check the existence of this file to see if collection is enabled.
Usage stats collection is very lightweight and should have no impact on your workload in any way.
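If you want to inspect what is being reported, a small sketch along these lines works (it assumes the default ``/tmp/ray`` temp directory; the concrete ``session_xxx`` directory name varies per run):

.. code-block:: python

    import glob
    import json

    # Pick the most recent session directory that has a usage stats file.
    paths = sorted(glob.glob("/tmp/ray/session_*/usage_stats.json"))
    if paths:
        with open(paths[-1]) as f:
            stats = json.load(f)
        print(stats.get("session_id"))  # Handy when requesting data deletion.
    else:
        print("No usage_stats.json found; collection is likely disabled.")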
Requesting removal of collected data
------------------------------------
To request removal of collected data, please email us at ``usage_stats@ray.io`` with the ``session_id`` that you can find in ``/tmp/ray/session_xxx/usage_stats.json``.
Frequently Asked Questions (FAQ)
--------------------------------
**Does the session_id map to personal data?**
No, the uuid is a Ray session/job-specific random ID that cannot be used to identify a specific person or machine. It does not live beyond the lifetime of your Ray session and is captured primarily so that deletion requests can be honored.
**Could an enterprise easily configure an additional endpoint or substitute a different endpoint?**
We definitely see this use case and would love to chat with you to make this work -- email ``usage_stats@ray.io``.
Contact us
----------
If you have any feedback regarding usage stats collection, please email us at ``usage_stats@ray.io``.

View file

@ -127,7 +127,7 @@ Transformations are executed *eagerly* and block until the operation is finished
.. code-block:: python
def transform_batch(df: pandas.DataFrame) -> pandas.DataFrame:
def transform_batch(df: pandas.DataFrame) -> pd.DataFrame:
return df.applymap(lambda x: x * 2)
ds = ray.data.range_arrow(10000)

View file

@ -0,0 +1,16 @@
import ray
# Put the values (1, 2, 3) into Ray's object store.
a, b, c = ray.put(1), ray.put(2), ray.put(3)
@ray.remote
def print_via_capture():
"""This function prints the values of (a, b, c) to stdout."""
print(ray.get([a, b, c]))
# Passing object references via closure-capture. Inside the `print_via_capture`
# function, the global object refs (a, b, c) can be retrieved and printed.
print_via_capture.remote()
# -> prints [1, 2, 3]

View file

@ -0,0 +1,18 @@
import ray
@ray.remote
def echo_and_get(x_list): # List[ObjectRef]
"""This function prints its input values to stdout."""
print("args:", x_list)
print("values:", ray.get(x_list))
# Put the values (1, 2, 3) into Ray's object store.
a, b, c = ray.put(1), ray.put(2), ray.put(3)
# Passing an object as a nested argument to `echo_and_get`. Ray does not
# de-reference nested args, so `echo_and_get` sees the references.
echo_and_get.remote([a, b, c])
# -> prints args: [ObjectRef(...), ObjectRef(...), ObjectRef(...)]
# values: [1, 2, 3]

View file

@ -0,0 +1,20 @@
import ray
@ray.remote
def echo(a: int, b: int, c: int):
"""This function prints its input values to stdout."""
print(a, b, c)
# Passing the literal values (1, 2, 3) to `echo`.
echo.remote(1, 2, 3)
# -> prints "1 2 3"
# Put the values (1, 2, 3) into Ray's object store.
a, b, c = ray.put(1), ray.put(2), ray.put(3)
# Passing an object as a top-level argument to `echo`. Ray will de-reference top-level
# arguments, so `echo` will see the literal values (1, 2, 3) in this case as well.
echo.remote(a, b, c)
# -> prints "1 2 3"

View file

@ -128,41 +128,51 @@ If the current node's object store does not contain the object, the object is do
assert(*results[1] == 1);
assert(*results[2] == 2);
Passing Objects by Reference
----------------------------
Passing Object Arguments
------------------------
Ray object references can be freely passed around a Ray application. This means that they can be passed as arguments to tasks, actor methods, and even stored in other objects. Objects are tracked via *distributed reference counting*, and their data is automatically freed once all references to the object are deleted.
There are two different ways one can pass an object to a Ray task or method. Depending on the way an object is passed, Ray will decide whether to *de-reference* the object prior to task execution.
**Passing an object as a top-level argument**: When an object is passed directly as a top-level argument to a task, Ray will de-reference the object. This means that Ray will fetch the underlying data for all top-level object reference arguments, and will not execute the task until the object data is fully available.
.. literalinclude:: doc_code/obj_val.py
**Passing an object as a nested argument**: When an object is passed within a nested object, for example, within a Python list, Ray will *not* de-reference it. This means that the task will need to call ``ray.get()`` on the reference to fetch the concrete value. However, if the task never calls ``ray.get()``, then the object value never needs to be transferred to the machine the task is running on. We recommend passing objects as top-level arguments where possible, but nested arguments can be useful for passing objects on to other tasks without needing to see the data.
.. literalinclude:: doc_code/obj_ref.py
The top-level vs. nested passing convention also applies to actor constructors and actor method calls:
.. code-block:: python
@ray.remote
def echo(x):
print(x)
# Examples of passing objects to actor constructors.
actor_handle = Actor.remote(obj) # by-value
actor_handle = Actor.remote([obj]) # by-reference
# Put an object in Ray's object store.
object_ref = ray.put(1)
# Examples of passing objects to actor method calls.
actor_handle.method.remote(obj) # by-value
actor_handle.method.remote([obj]) # by-reference
# Pass-by-value: send the object to a task as a top-level argument.
# The object will be de-referenced, so the task only sees its value.
echo.remote(object_ref)
# -> prints "1"
Closure Capture of Objects
--------------------------
# Pass-by-reference: when passed inside a Python list or other data structure,
# the object ref is preserved. The object data is not transferred to the worker
# when it is passed by reference, until ray.get() is called on the reference.
echo.remote({"obj": object_ref})
# -> prints "{"obj": ObjectRef(...)}"
You can also pass objects to tasks via *closure-capture*. This can be convenient when you have a large object that you want to share verbatim between many tasks or actors, and don't want to pass it repeatedly as an argument. Be aware however that defining a task that closes over an object ref will pin the object via reference-counting, so the object will not be evicted until the job completes.
.. literalinclude:: doc_code/obj_capture.py
Nested Objects
--------------
Ray also supports nested object references. This allows you to build composite objects that themselves hold references to further sub-objects.
.. code-block:: python
# Objects can be nested within each other. Ray will keep the inner object
# alive via reference counting until all outer object references are deleted.
object_ref_2 = ray.put([object_ref])
# Examples of passing objects to actors.
actor_handle = Actor.remote(obj) # by-value
actor_handle = Actor.remote([obj]) # by-reference
actor_handle.method.remote(obj) # by-value
actor_handle.method.remote([obj]) # by-reference
More about Ray Objects
----------------------

View file

@ -314,3 +314,15 @@ The Ray Command Line API
.. click:: ray.scripts.scripts:debug
:prog: ray debug
:show-nested:
.. _ray-disable-usage-stats-doc:
.. click:: ray.scripts.scripts:disable_usage_stats
:prog: ray disable-usage-stats
:show-nested:
.. _ray-enable-usage-stats-doc:
.. click:: ray.scripts.scripts:enable_usage_stats
:prog: ray enable-usage-stats
:show-nested:

View file

@ -1,3 +1,5 @@
.. _ray-logging:
Logging
=======
This document will explain Ray's logging system and its best practices.

View file

@ -13,4 +13,5 @@ API References
../workflows/package-ref.rst
../ray-core/package-ref.rst
../cluster/reference.rst
../cluster/jobs-package-ref.rst
../cluster/jobs-package-ref.rst
../cluster/usage-stats-data-ref.rst

View file

@ -1,22 +1,22 @@
.. _serve-architecture:
(serve-architecture)=
# Serve Architecture
Serve Architecture
==================
This section should help you:
- understand an overview of how each component in Serve works
- understand the different types of actors that make up a Serve instance
.. Figure source: https://docs.google.com/drawings/d/1jSuBN5dkSj2s9-0eGzlU_ldsRa3TsswQUZM-cMQ29a0/edit?usp=sharing
% Figure source: https://docs.google.com/drawings/d/1jSuBN5dkSj2s9-0eGzlU_ldsRa3TsswQUZM-cMQ29a0/edit?usp=sharing
.. image:: architecture.svg
:align: center
:width: 600px
```{image} architecture.svg
:align: center
:width: 600px
```
High Level View
---------------
## High Level View
Serve runs on Ray and utilizes :ref:`Ray actors<actor-guide>`.
Serve runs on Ray and utilizes [Ray actors](actor-guide).
There are three kinds of actors that are created to make up a Serve instance:
@ -24,17 +24,16 @@ There are three kinds of actors that are created to make up a Serve instance:
the control plane. The Controller is responsible for creating, updating, and
destroying other actors. Serve API calls like creating or getting a deployment
make remote calls to the Controller.
- Router: There is one router per node. Each router is a `Uvicorn <https://www.uvicorn.org/>`_ HTTP
- Router: There is one router per node. Each router is a [Uvicorn](https://www.uvicorn.org/) HTTP
server that accepts incoming requests, forwards them to replicas, and
responds once they are completed.
- Worker Replica: Worker replicas actually execute the code in response to a
request. For example, they may contain an instantiation of an ML model. Each
replica processes individual requests from the routers (they may be batched
by the replica using ``@serve.batch``, see the :ref:`batching<serve-batching>` docs).
by the replica using `@serve.batch`, see the [batching](serve-batching) docs).
## Lifetime of a Request
Lifetime of a Request
---------------------
When an HTTP request is sent to the router, the following things happen:
- The HTTP request is received and parsed.
@ -42,36 +41,33 @@ When an HTTP request is sent to the router, the follow things happen:
request is placed on a queue.
- For each request in a deployment queue, an available replica is looked up
and the request is sent to it. If there are no available replicas (there
are more than ``max_concurrent_queries`` requests outstanding), the request
are more than `max_concurrent_queries` requests outstanding), the request
is left in the queue until an outstanding request is finished.
Each replica maintains a queue of requests and executes one at a time, possibly
using asyncio to process them concurrently. If the handler (the function for the
deployment or ``__call__``) is ``async``, the replica will not wait for the
deployment or `__call__`) is `async`, the replica will not wait for the
handler to run; otherwise, the replica will block until the handler returns.
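For illustration, here is a minimal sketch of an `async` handler (the deployment class and the sleep are made up for the example, and it assumes Serve has already been started):

```python
import asyncio
from ray import serve

@serve.deployment
class AsyncEcho:
    async def __call__(self, request):
        # While this coroutine awaits, the replica can begin processing
        # other queued requests instead of blocking on this one.
        await asyncio.sleep(0.1)
        return "done"

AsyncEcho.deploy()
```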
FAQ
---
## FAQ
.. _serve-ft-detail:
(serve-ft-detail)=
How does Serve handle fault tolerance?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### How does Serve handle fault tolerance?
Application errors like exceptions in your model evaluation code are caught and
wrapped. A 500 status code will be returned with the traceback information. The
replica will be able to continue to handle requests.
Machine errors and faults will be handled by Ray. Serve utilizes the :ref:`actor
reconstruction <actor-fault-tolerance>` capability. For example, when a machine hosting any of the
Machine errors and faults will be handled by Ray. Serve utilizes the [actor
reconstruction](actor-fault-tolerance) capability. For example, when a machine hosting any of the
actors crashes, those actors will be automatically restarted on another
available machine. All data in the Controller (routing policies, deployment
configurations, etc.) is checkpointed to Ray. Transient data in the
router and the replica (like network connections and internal request
queues) will be lost upon failure.
How does Serve ensure horizontal scalability and availability?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### How does Serve ensure horizontal scalability and availability?
Serve starts one router per node. Each router binds to the same port. You
should be able to reach Serve and send requests to any model via any of the
@ -81,20 +77,17 @@ This architecture ensures horizontal scalability for Serve. You can scale the
router by adding more nodes and scale the model by increasing the number
of replicas.
How do ServeHandles work?
^^^^^^^^^^^^^^^^^^^^^^^^^
### How do ServeHandles work?
:mod:`ServeHandles <ray.serve.handle.RayServeHandle>` wrap a handle to the router actor on the same node. When a
{mod}`ServeHandles <ray.serve.handle.RayServeHandle>` wrap a handle to the router actor on the same node. When a
request is sent from one replica to another via the handle, the
requests go through the same data path as incoming HTTP requests. This enables
the same deployment selection and batching procedures to happen. ServeHandles are
often used to implement :ref:`model composition <serve-model-composition>`.
often used to implement [model composition](serve-model-composition).
### What happens to large requests?
What happens to large requests?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Serve utilizes Ray's :ref:`shared memory object store <plasma-store>` and in-process memory
Serve utilizes Ray's [shared memory object store](plasma-store) and in-process memory
store. Small request objects are sent directly between actors via a network
call. Larger request objects (100KiB+) are written to a distributed shared
memory store and the replica can read them via zero-copy read.

View file

@ -0,0 +1,364 @@
# Core API: Deployments
This section should help you:
- create, query, update and configure deployments
- configure resources of your deployments
- specify different Python dependencies across different deployments using Runtime Environments
:::{tip}
Get in touch with us if you're using or considering using [Ray Serve](https://docs.google.com/forms/d/1l8HT35jXMPtxVUtQPeGoe09VGp5jcvSv0TqPgyz6lGU).
:::
```{contents}
```
## Creating a Deployment
Deployments are the central concept in Ray Serve.
They allow you to define and update your business logic or models that will handle incoming requests as well as how this is exposed over HTTP or in Python.
A deployment is defined using {mod}`@serve.deployment <ray.serve.api.deployment>` on a Python class (or function for simple use cases).
You can specify arguments to be passed to the constructor when you call `Deployment.deploy()`, shown below.
A deployment consists of a number of *replicas*, which are individual copies of the function or class that are started in separate Ray Actors (processes).
```python
@serve.deployment
class MyFirstDeployment:
# Take the message to return as an argument to the constructor.
def __init__(self, msg):
self.msg = msg
def __call__(self, request):
return self.msg
def other_method(self, arg):
return self.msg
MyFirstDeployment.deploy("Hello world!")
```
Deployments can be exposed in two ways: over HTTP or in Python via the {ref}`servehandle-api`.
By default, HTTP requests will be forwarded to the `__call__` method of the class (or the function) and a `Starlette Request` object will be the sole argument.
You can also define a deployment that wraps a FastAPI app for more flexible handling of HTTP requests. See {ref}`serve-fastapi-http` for details.
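As a rough sketch of the FastAPI-wrapping pattern (the class name and route are illustrative; see {ref}`serve-fastapi-http` for the authoritative version):

```python
from fastapi import FastAPI
from ray import serve

app = FastAPI()

@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class FastAPIWrapper:
    @app.get("/")
    def say_hello(self):
        return {"message": "Hello world!"}

FastAPIWrapper.deploy()
```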
To serve multiple deployments defined by the same class, use the `name` option:
```python
MyFirstDeployment.options(name="hello_service").deploy("Hello!")
MyFirstDeployment.options(name="hi_service").deploy("Hi!")
```
You can also list all available deployments and dynamically get references to them:
```python
>> serve.list_deployments()
{'A': Deployment(name=A,version=None,route_prefix=/A)}
{'MyFirstDeployment': Deployment(name=MyFirstDeployment,version=None,route_prefix=/MyFirstDeployment)}
# Returns the same object as the original MyFirstDeployment object.
# This can be used to redeploy, get a handle, etc.
deployment = serve.get_deployment("MyFirstDeployment")
```
## Exposing a Deployment
By default, deployments are exposed over HTTP at `http://localhost:8000/<deployment_name>`.
The HTTP path that the deployment is available at can be changed using the `route_prefix` option.
All requests to `/{route_prefix}` and any subpaths will be routed to the deployment (using a longest-prefix match for overlapping route prefixes).
Here's an example:
```python
@serve.deployment(name="http_deployment", route_prefix="/api")
class HTTPDeployment:
def __call__(self, request):
return "Hello world!"
```
Once the deployment is created, it is exposed by the HTTP server and handles requests using the specified class.
We can query the model to verify that it's working.
```python
import requests
print(requests.get("http://127.0.0.1:8000/api").text)
```
We can also query the deployment using the {mod}`ServeHandle <ray.serve.handle.RayServeHandle>` interface.
```python
# To get a handle from the same script, use the Deployment object directly:
handle = HTTPDeployment.get_handle()
# To get a handle from a different script, reference it by name:
handle = serve.get_deployment("http_deployment").get_handle()
print(ray.get(handle.remote()))
```
As noted above, there are two ways to expose deployments. The first is the {mod}`ServeHandle <ray.serve.handle.RayServeHandle>`
interface, which lets you access deployments from a Python script or other code, making it convenient for a
Python developer. The second is over HTTP, which allows access to deployments from a web client application.
Let's look at a simple end-to-end example using both ways to expose and access deployments. Your output may
vary due to the random nature of how the prediction is computed; however, the example illustrates two things:
1\) how to expose and use deployments, and 2) how to use replicas, to which requests are sent. Note that each pid
is a separate replica associated with each deployment name, `rep-1` and `rep-2` respectively.
```{literalinclude} _examples/doc_code/create_deployment.py
:end-before: __serve_example_end__
:language: python
:start-after: __serve_example_begin__
```
```python
# Output:
# {'rep-1': Deployment(name=rep-1,version=None,route_prefix=/rep-1),
# 'rep-2': Deployment(name=rep-2,version=None,route_prefix=/rep-2)}
#
# ServerHandle API responses: ----------
# handle name : rep-1
# prediction : (pid: 62636); path: /model/rep-1.pkl; data: 0.600; prediction: 1.292
# --
# handle name : rep-2
# prediction : (pid: 62635); path: /model/rep-2.pkl; data: 0.075; prediction: 0.075
# --
# handle name : rep-1
# prediction : (pid: 62634); path: /model/rep-1.pkl; data: 0.186; prediction: 0.186
# --
# handle name : rep-2
# prediction : (pid: 62637); path: /model/rep-2.pkl; data: 0.751; prediction: 1.444
# --
# HTTP responses: ----------
# handle name : rep-1
# prediction : (pid: 62636); path: /model/rep-1.pkl; data: 0.582; prediction: 1.481
# handle name : rep-2
# prediction : (pid: 62637); path: /model/rep-2.pkl; data: 0.778; prediction: 1.678
# handle name : rep-1
# prediction : (pid: 62634); path: /model/rep-1.pkl; data: 0.139; prediction: 0.139
# handle name : rep-2
# prediction : (pid: 62635); path: /model/rep-2.pkl; data: 0.569; prediction: 1.262
```
## Updating a Deployment
Often you want to be able to update your code or configuration options for a deployment over time.
Deployments can be updated simply by updating the code or configuration options and calling `deploy()` again.
```python
@serve.deployment(name="my_deployment", num_replicas=1)
class SimpleDeployment:
pass
# Creates one initial replica.
SimpleDeployment.deploy()
# Re-deploys, creating an additional replica.
# This could be the SAME Python script, modified and re-run.
@serve.deployment(name="my_deployment", num_replicas=2)
class SimpleDeployment:
pass
SimpleDeployment.deploy()
# You can also use Deployment.options() to change options without redefining
# the class. This is useful for programmatically updating deployments.
SimpleDeployment.options(num_replicas=2).deploy()
```
By default, each call to `.deploy()` will cause a redeployment, even if the underlying code and options didn't change.
This could be detrimental if you have many deployments in a script and only want to update one: if you re-run the script, all of the deployments will be redeployed, not just the one you updated.
To prevent this, you may provide a `version` string for the deployment as a keyword argument in the decorator or `Deployment.options()`.
If provided, the replicas will only be updated if the value of `version` is updated; if the value of `version` is unchanged, the call to `.deploy()` will be a no-op.
When a redeployment happens, Serve will perform a rolling update, bringing down at most 20% of the replicas at any given time.
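A hypothetical sketch of this behavior (the deployment name and version strings are made up):

```python
from ray import serve

@serve.deployment(name="versioned_deployment", version="v1")
def handler(request):
    return "response from v1"

handler.deploy()  # First call: creates the replicas.
handler.deploy()  # Same code, same version: this call is a no-op.

# Bumping the version triggers a rolling update of the replicas.
handler.options(version="v2").deploy()
```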
(configuring-a-deployment)=
## Configuring a Deployment
There are a number of things you'll likely want to do with your serving application including
scaling out or configuring the maximum number of in-flight requests for a deployment.
All of these options can be specified either in {mod}`@serve.deployment <ray.serve.api.deployment>` or in `Deployment.options()`.
To update the config options for a running deployment, simply redeploy it with the new options set.
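For example, a running deployment's options can be updated by redeploying with the new values (the option value here is illustrative):

```python
from ray import serve

@serve.deployment(max_concurrent_queries=5)
def handler(request):
    return "ok"

handler.deploy()

# Update a config option on the running deployment by redeploying with new options.
handler.options(max_concurrent_queries=20).deploy()
```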
### Scaling Out
To scale out a deployment to many processes, simply configure the number of replicas.
```python
# Create with a single replica.
@serve.deployment(num_replicas=1)
def func(*args):
pass
func.deploy()
# Scale up to 10 replicas.
func.options(num_replicas=10).deploy()
# Scale back down to 1 replica.
func.options(num_replicas=1).deploy()
```
#### Autoscaling
Serve also has experimental support for a demand-based replica autoscaler.
It reacts to traffic spikes by observing queue sizes and making scaling decisions.
To configure it, you can set the `_autoscaling_config` field in deployment options.
:::{warning}
The API is experimental and subject to change. We welcome you to test it out
and leave us feedback through [Github Issues](https://github.com/ray-project/ray/issues) or our [discussion forum](https://discuss.ray.io/)!
:::
```python
@serve.deployment(
_autoscaling_config={
"min_replicas": 1,
"max_replicas": 5,
"target_num_ongoing_requests_per_replica": 10,
},
version="v1")
def func(_):
time.sleep(1)
return ""
func.deploy() # The func deployment will now autoscale based on request demand.
```
The `min_replicas` and `max_replicas` fields configure the range of replicas which the
Serve autoscaler chooses from. Deployments will start with `min_replicas` initially.
The `target_num_ongoing_requests_per_replica` configuration specifies how aggressively the
autoscaler should react to traffic. Serve will try to make sure that each replica has roughly that number
of requests being processed and waiting in the queue. For example, if your processing time is `10ms`
and the latency constraint is `100ms`, you can have at most `10` requests ongoing per replica so
the last requests can finish within the latency constraint. We recommend you benchmark your application
code and set this number based on your end-to-end latency objective.
:::{note}
The `version` field is required for autoscaling. We are actively working on removing
this limitation.
:::
:::{note}
The Ray Serve Autoscaler is an application-level autoscaler that sits on top of the [Ray Autoscaler](cluster-index).
Concretely, this means that the Ray Serve autoscaler asks Ray to start a number of replica actors based on the request demand.
If the Ray Autoscaler determines there aren't enough available CPUs to place these actors, it responds by adding more nodes.
Similarly, when Ray Serve scales down and terminates some replica actors, it may result in some nodes being empty, at which point the Ray autoscaler will remove those nodes.
:::
(serve-cpus-gpus)=
### Resource Management (CPUs, GPUs)
To assign hardware resources per replica, you can pass resource requirements to
`ray_actor_options`.
By default, each replica requires one CPU.
To learn about options to pass in, take a look at [Resources with Actor](actor-resource-guide) guide.
For example, to create a deployment where each replica uses a single GPU, you can do the
following:
```python
@serve.deployment(ray_actor_options={"num_gpus": 1})
def func(*args):
return do_something_with_my_gpu()
```
### Fractional Resources
The resources specified in `ray_actor_options` can also be *fractional*.
This allows you to flexibly share resources between replicas.
For example, if you have two models and each doesn't fully saturate a GPU, you might want to have them share a GPU by allocating 0.5 GPUs each.
The same could be done to multiplex over CPUs.
```python
@serve.deployment(name="deployment1", ray_actor_options={"num_gpus": 0.5})
def func(*args):
return do_something_with_my_gpu()
@serve.deployment(name="deployment2", ray_actor_options={"num_gpus": 0.5})
def func(*args):
return do_something_with_my_gpu()
```
### Configuring Parallelism with OMP_NUM_THREADS
Deep learning models like PyTorch and Tensorflow often use multithreading when performing inference.
The number of CPUs they use is controlled by the OMP_NUM_THREADS environment variable.
To [avoid contention](omp-num-thread-note), Ray sets `OMP_NUM_THREADS=1` by default because Ray workers and actors use a single CPU by default.
If you *do* want to enable this parallelism in your Serve deployment, just set OMP_NUM_THREADS to the desired value either when starting Ray or in your function/class definition:
```bash
OMP_NUM_THREADS=12 ray start --head
OMP_NUM_THREADS=12 ray start --address=$HEAD_NODE_ADDRESS
```
```python
@serve.deployment
class MyDeployment:
def __init__(self, parallelism):
        os.environ["OMP_NUM_THREADS"] = str(parallelism)
# Download model weights, initialize model, etc.
MyDeployment.deploy()
```
:::{note}
Some other libraries may not respect `OMP_NUM_THREADS` and have their own way to configure parallelism.
For example, if you're using OpenCV, you'll need to manually set the number of threads using `cv2.setNumThreads(num_threads)` (set to 0 to disable multi-threading).
You can check the configuration using `cv2.getNumThreads()` and `cv2.getNumberOfCPUs()`.
:::
### User Configuration (Experimental)
Suppose you want to update a parameter in your model without needing to restart
the replicas in your deployment. You can do this by writing a `reconfigure` method
for the class underlying your deployment. At runtime, you can then pass in your
new parameters by setting the `user_config` option.
The following simple example will make the usage clear:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_reconfigure.py
```
The `reconfigure` method is called when the class is created if `user_config`
is set. In particular, it's also called when new replicas are created in the
future if you scale up your deployment later. The `reconfigure` method is also called
each time `user_config` is updated.
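Since the linked snippet is not reproduced here, a rough sketch of the pattern (the class, the keys inside `user_config`, and the threshold values are illustrative):

```python
from ray import serve

@serve.deployment(user_config={"threshold": 0.5})
class Classifier:
    def __init__(self):
        self.threshold = None

    def reconfigure(self, config: dict):
        # Called once on replica creation and again whenever user_config changes.
        self.threshold = config["threshold"]

    def __call__(self, request):
        return f"current threshold: {self.threshold}"

Classifier.deploy()

# Later, update the parameter without restarting the replicas.
Classifier.options(user_config={"threshold": 0.8}).deploy()
```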
## Handling Dependencies
Ray Serve supports serving deployments with different (possibly conflicting)
Python dependencies. For example, you can simultaneously serve one deployment
that uses legacy Tensorflow 1 and another that uses Tensorflow 2.
This is supported on Mac OS and Linux using Ray's {ref}`runtime-environments` feature.
As with all other Ray actor options, pass the runtime environment in via `ray_actor_options` in
your deployment. Be sure to first run `pip install "ray[default]"` to ensure the
Runtime Environments feature is installed.
Example:
```{literalinclude} ../../../python/ray/serve/examples/doc/conda_env.py
```
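Since the linked example is not reproduced here, the following is only a sketch of passing a runtime environment through `ray_actor_options` (the pinned package is an assumption; match it to your own model code):

```python
from ray import serve

@serve.deployment(
    ray_actor_options={
        "runtime_env": {
            # Hypothetical per-deployment dependency.
            "pip": ["tensorflow==1.15.5"],
        }
    }
)
def tf1_model(request):
    import tensorflow as tf  # Resolved inside the replica's environment.
    return tf.__version__

tf1_model.deploy()
```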
:::{tip}
Avoid dynamically installing packages that install from source: these can be slow and
use up all resources while installing, leading to problems with the Ray cluster. Consider
precompiling such packages in a private repository or Docker image.
:::
The dependencies required in the deployment may be different than
the dependencies installed in the driver program (the one running Serve API
calls). In this case, you should use a delayed import within the class to avoid
importing unavailable packages in the driver. This applies even when not
using runtime environments.
Example:
```{literalinclude} ../../../python/ray/serve/examples/doc/delayed_import.py
```
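Again, the linked example is not shown here, so the following is only a sketch of the delayed-import pattern (the torch model is a stand-in):

```python
from ray import serve

@serve.deployment
class TorchPredictor:
    def __init__(self):
        # Delayed import: torch only needs to be installed in the replica's
        # environment, not in the driver running the Serve API calls.
        import torch
        self.model = torch.nn.Linear(4, 1)

    def __call__(self, request):
        import torch
        return self.model(torch.zeros(4)).item()

TorchPredictor.deploy()
```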

View file

@ -1,369 +0,0 @@
=====================
Core API: Deployments
=====================
This section should help you:
- create, query, update and configure deployments
- configure resources of your deployments
- specify different Python dependencies across different deployments using Runtime Environments
.. tip::
Get in touch with us if you're using or considering using `Ray Serve <https://docs.google.com/forms/d/1l8HT35jXMPtxVUtQPeGoe09VGp5jcvSv0TqPgyz6lGU>`_.
.. contents::
Creating a Deployment
=====================
Deployments are the central concept in Ray Serve.
They allow you to define and update your business logic or models that will handle incoming requests as well as how this is exposed over HTTP or in Python.
A deployment is defined using :mod:`@serve.deployment <ray.serve.api.deployment>` on a Python class (or function for simple use cases).
You can specify arguments to be passed to the constructor when you call ``Deployment.deploy()``, shown below.
A deployment consists of a number of *replicas*, which are individual copies of the function or class that are started in separate Ray Actors (processes).
.. code-block:: python
@serve.deployment
class MyFirstDeployment:
# Take the message to return as an argument to the constructor.
def __init__(self, msg):
self.msg = msg
def __call__(self, request):
return self.msg
def other_method(self, arg):
return self.msg
MyFirstDeployment.deploy("Hello world!")
Deployments can be exposed in two ways: over HTTP or in Python via the :ref:`servehandle-api`.
By default, HTTP requests will be forwarded to the ``__call__`` method of the class (or the function) and a ``Starlette Request`` object will be the sole argument.
You can also define a deployment that wraps a FastAPI app for more flexible handling of HTTP requests. See :ref:`serve-fastapi-http` for details.
To serve multiple deployments defined by the same class, use the ``name`` option:
.. code-block:: python
MyFirstDeployment.options(name="hello_service").deploy("Hello!")
MyFirstDeployment.options(name="hi_service").deploy("Hi!")
You can also list all available deployments and dynamically get references to them:
.. code-block:: python
>> serve.list_deployments()
{'A': Deployment(name=A,version=None,route_prefix=/A)}
{'MyFirstDeployment': Deployment(name=MyFirstDeployment,version=None,route_prefix=/MyFirstDeployment}
# Returns the same object as the original MyFirstDeployment object.
# This can be used to redeploy, get a handle, etc.
deployment = serve.get_deployment("MyFirstDeployment")
Exposing a Deployment
=====================
By default, deployments are exposed over HTTP at ``http://localhost:8000/<deployment_name>``.
The HTTP path that the deployment is available at can be changed using the ``route_prefix`` option.
All requests to ``/{route_prefix}`` and any subpaths will be routed to the deployment (using a longest-prefix match for overlapping route prefixes).
Here's an example:
.. code-block:: python
@serve.deployment(name="http_deployment", route_prefix="/api")
class HTTPDeployment:
def __call__(self, request):
return "Hello world!"
After creating the deployment, it is now exposed by the HTTP server and handles requests using the specified class.
We can query the model to verify that it's working.
.. code-block:: python
import requests
print(requests.get("http://127.0.0.1:8000/api").text)
We can also query the deployment using the :mod:`ServeHandle <ray.serve.handle.RayServeHandle>` interface.
.. code-block:: python
# To get a handle from the same script, use the Deployment object directly:
handle = HTTPDeployment.get_handle()
# To get a handle from a different script, reference it by name:
handle = serve.get_deployment("http_deployment").get_handle()
print(ray.get(handle.remote()))
As noted above, there are two ways to expose deployments. The first is by using the :mod:`ServeHandle <ray.serve.handle.RayServeHandle>`
interface. This method allows you to access deployments within a Python script or code, making it convenient for a
Python developer. And the second is by using the HTTP request, allowing access to deployments via a web client application.
Let's look at a simple end-to-end example using both ways to expose and access deployments. Your output may
vary due to random nature of how the prediction is computed; however, the example illustrates two things:
1) how to expose and use deployments and 2) how to use replicas, to which requests are sent. Note that each pid
is a separate replica associated with each deployment name, ``rep-1`` and ``rep-2`` respectively.
.. literalinclude:: _examples/doc_code/create_deployment.py
:language: python
:start-after: __serve_example_begin__
:end-before: __serve_example_end__
.. code-block:: python
# Output:
# {'rep-1': Deployment(name=rep-1,version=None,route_prefix=/rep-1),
# 'rep-2': Deployment(name=rep-2,version=None,route_prefix=/rep-2)}
#
# ServerHandle API responses: ----------
# handle name : rep-1
# prediction : (pid: 62636); path: /model/rep-1.pkl; data: 0.600; prediction: 1.292
# --
# handle name : rep-2
# prediction : (pid: 62635); path: /model/rep-2.pkl; data: 0.075; prediction: 0.075
# --
# handle name : rep-1
# prediction : (pid: 62634); path: /model/rep-1.pkl; data: 0.186; prediction: 0.186
# --
# handle name : rep-2
# prediction : (pid: 62637); path: /model/rep-2.pkl; data: 0.751; prediction: 1.444
# --
# HTTP responses: ----------
# handle name : rep-1
# prediction : (pid: 62636); path: /model/rep-1.pkl; data: 0.582; prediction: 1.481
# handle name : rep-2
# prediction : (pid: 62637); path: /model/rep-2.pkl; data: 0.778; prediction: 1.678
# handle name : rep-1
# prediction : (pid: 62634); path: /model/rep-1.pkl; data: 0.139; prediction: 0.139
# handle name : rep-2
# prediction : (pid: 62635); path: /model/rep-2.pkl; data: 0.569; prediction: 1.262
Updating a Deployment
=====================
Often you want to be able to update your code or configuration options for a deployment over time.
Deployments can be updated simply by updating the code or configuration options and calling ``deploy()`` again.
.. code-block:: python
@serve.deployment(name="my_deployment", num_replicas=1)
class SimpleDeployment:
pass
# Creates one initial replica.
SimpleDeployment.deploy()
# Re-deploys, creating an additional replica.
# This could be the SAME Python script, modified and re-run.
@serve.deployment(name="my_deployment", num_replicas=2)
class SimpleDeployment:
pass
SimpleDeployment.deploy()
# You can also use Deployment.options() to change options without redefining
# the class. This is useful for programmatically updating deployments.
SimpleDeployment.options(num_replicas=2).deploy()
By default, each call to ``.deploy()`` will cause a redeployment, even if the underlying code and options didn't change.
This could be detrimental if you have many deployments in a script and only want to update one: if you re-run the script, all of the deployments will be redeployed, not just the one you updated.
To prevent this, you may provide a ``version`` string for the deployment as a keyword argument in the decorator or ``Deployment.options()``.
If provided, the replicas will only be updated if the value of ``version`` is updated; if the value of ``version`` is unchanged, the call to ``.deploy()`` will be a no-op.
When a redeployment happens, Serve will perform a rolling update, bringing down at most 20% of the replicas at any given time.
.. _configuring-a-deployment:
Configuring a Deployment
========================
There are a number of things you'll likely want to do with your serving application including
scaling out or configuring the maximum number of in-flight requests for a deployment.
All of these options can be specified either in :mod:`@serve.deployment <ray.serve.api.deployment>` or in ``Deployment.options()``.
To update the config options for a running deployment, simply redeploy it with the new options set.
Scaling Out
-----------
To scale out a deployment to many processes, simply configure the number of replicas.
.. code-block:: python
# Create with a single replica.
@serve.deployment(num_replicas=1)
def func(*args):
pass
func.deploy()
# Scale up to 10 replicas.
func.options(num_replicas=10).deploy()
# Scale back down to 1 replica.
func.options(num_replicas=1).deploy()
Autoscaling
^^^^^^^^^^^
Serve also has experimental support for a demand-based replica autoscaler.
It reacts to traffic spikes via observing queue sizes and making scaling decisions.
To configure it, you can set the ``_autoscaling_config`` field in deployment options.
.. warning::
The API is experimental and subject to change. We welcome you to test it out
and leave us feedback through `Github Issues <https://github.com/ray-project/ray/issues>`_ or our `discussion forum <https://discuss.ray.io/>`_!
.. code-block:: python
@serve.deployment(
_autoscaling_config={
"min_replicas": 1,
"max_replicas": 5,
"target_num_ongoing_requests_per_replica": 10,
},
version="v1")
def func(_):
time.sleep(1)
return ""
func.deploy() # The func deployment will now autoscale based on requests demand.
The ``min_replicas`` and ``max_replicas`` fields configure the range of replicas which the
Serve autoscaler chooses from. Deployments will start with ``min_replicas`` initially.
The ``target_num_ongoing_requests_per_replica`` configuration specifies how aggressively the
autoscaler should react to traffic. Serve will try to make sure that each replica has roughly that number
of requests being processed and waiting in the queue. For example, if your processing time is ``10ms``
and the latency constraint is ``100ms``, you can have at most ``10`` requests ongoing per replica so
the last requests can finish within the latency constraint. We recommend you benchmark your application
code and set this number based on end to end latency objective.
.. note::
The ``version`` field is required for autoscaling. We are actively working on removing
this limitation.
.. note::
The Ray Serve Autoscaler is an application-level autoscaler that sits on top of the :ref:`Ray Autoscaler<cluster-index>`.
Concretely, this means that the Ray Serve autoscaler asks Ray to start a number of replica actors based on the request demand.
If the Ray Autoscaler determines there aren't enough available CPUs to place these actors, it responds by adding more nodes.
Similarly, when Ray Serve scales down and terminates some replica actors, it may result in some nodes being empty, at which point the Ray autoscaler will remove those nodes.
.. _`serve-cpus-gpus`:
Resource Management (CPUs, GPUs)
--------------------------------
To assign hardware resources per replica, you can pass resource requirements to
``ray_actor_options``.
By default, each replica requires one CPU.
To learn about options to pass in, take a look at :ref:`Resources with Actor<actor-resource-guide>` guide.
For example, to create a deployment where each replica uses a single GPU, you can do the
following:
.. code-block:: python
@serve.deployment(ray_actor_options={"num_gpus": 1})
def func(*args):
return do_something_with_my_gpu()
Fractional Resources
--------------------
The resources specified in ``ray_actor_options`` can also be *fractional*.
This allows you to flexibly share resources between replicas.
For example, if you have two models and each doesn't fully saturate a GPU, you might want to have them share a GPU by allocating 0.5 GPUs each.
The same could be done to multiplex over CPUs.
.. code-block:: python
@serve.deployment(name="deployment1", ray_actor_options={"num_gpus": 0.5})
def func(*args):
return do_something_with_my_gpu()
@serve.deployment(name="deployment2", ray_actor_options={"num_gpus": 0.5})
def func(*args):
return do_something_with_my_gpu()
Configuring Parallelism with OMP_NUM_THREADS
--------------------------------------------
Deep learning models like PyTorch and Tensorflow often use multithreading when performing inference.
The number of CPUs they use is controlled by the OMP_NUM_THREADS environment variable.
To :ref:`avoid contention<omp-num-thread-note>`, Ray sets ``OMP_NUM_THREADS=1`` by default because Ray workers and actors use a single CPU by default.
If you *do* want to enable this parallelism in your Serve deployment, just set OMP_NUM_THREADS to the desired value either when starting Ray or in your function/class definition:
.. code-block:: bash
OMP_NUM_THREADS=12 ray start --head
OMP_NUM_THREADS=12 ray start --address=$HEAD_NODE_ADDRESS
.. code-block:: python
@serve.deployment
class MyDeployment:
def __init__(self, parallelism):
os.environ["OMP_NUM_THREADS"] = parallelism
# Download model weights, initialize model, etc.
MyDeployment.deploy()
.. note::
Some other libraries may not respect ``OMP_NUM_THREADS`` and have their own way to configure parallelism.
For example, if you're using OpenCV, you'll need to manually set the number of threads using ``cv2.setNumThreads(num_threads)`` (set to 0 to disable multi-threading).
You can check the configuration using ``cv2.getNumThreads()`` and ``cv2.getNumberOfCPUs()``.
User Configuration (Experimental)
---------------------------------
Suppose you want to update a parameter in your model without needing to restart
the replicas in your deployment. You can do this by writing a `reconfigure` method
for the class underlying your deployment. At runtime, you can then pass in your
new parameters by setting the `user_config` option.
The following simple example will make the usage clear:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_reconfigure.py
The `reconfigure` method is called when the class is created if `user_config`
is set. In particular, it's also called when new replicas are created in the
future if you scale up your deployment later. The `reconfigure` method is also called
each time `user_config` is updated.
Handling Dependencies
=====================
Ray Serve supports serving deployments with different (possibly conflicting)
Python dependencies. For example, you can simultaneously serve one deployment
that uses legacy Tensorflow 1 and another that uses Tensorflow 2.
This is supported on Mac OS and Linux using Ray's :ref:`runtime-environments` feature.
As with all other Ray actor options, pass the runtime environment in via ``ray_actor_options`` in
your deployment. Be sure to first run ``pip install "ray[default]"`` to ensure the
Runtime Environments feature is installed.
Example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/conda_env.py
.. tip::
Avoid dynamically installing packages that install from source: these can be slow and
use up all resources while installing, leading to problems with the Ray cluster. Consider
precompiling such packages in a private repository or Docker image.
The dependencies required in the deployment may be different than
the dependencies installed in the driver program (the one running Serve API
calls). In this case, you should use a delayed import within the class to avoid
importing unavailable packages in the driver. This applies even when not
using runtime environments.
Example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/delayed_import.py

View file

@ -298,7 +298,7 @@ Serve provides a default DAGDriver implementation that accepts HTTP request and
You can configure how the DAGDriver converts HTTP request types. By default, we directly send in a [`starlette.requests.Request`](https://www.starlette.io/requests/) object to represent the whole request. You can also specify built-in adapters. In this example, we will use a `json_request` adapter that parses the HTTP body with a JSON parser.
```{tip}
There are several useful adapters, like ndarray JSON, image object, etc. You can check out {ref}`the list of adapters here <serve-http-adapters>`. You can also easily plug in your own adapter by passing it in via the `http_adapter` field.
There are several useful adapters, like ndarray JSON, image object, etc. You can check out [the list of adapters here](serve-http-adapters). You can also easily plug in your own adapter by passing it in via the `http_adapter` field.
```
+++

View file

@ -0,0 +1,299 @@
(serve-deploy-tutorial)=
# Deploying Ray Serve
This section should help you:
- understand how Ray Serve runs on a Ray cluster beyond the basics mentioned in {doc}`core-apis`
- deploy and update your Serve application over time
- monitor your Serve application using the Ray Dashboard and logging
```{contents} Deploying Ray Serve
```
(ray-serve-instance-lifetime)=
## Lifetime of a Ray Serve Instance
Ray Serve instances run on top of Ray clusters and are started using {mod}`serve.start <ray.serve.start>`.
Once {mod}`serve.start <ray.serve.start>` has been called, further API calls can be used to create and update the deployments that will be used to serve your Python code (including ML models).
The Serve instance will be torn down when the script exits.
When running on a long-lived Ray cluster (e.g., one started using `ray start` and connected
to using `ray.init(address="auto", namespace="serve")`), you can also deploy a Ray Serve instance as a long-running
service using `serve.start(detached=True)`. In this case, the Serve instance will continue to
run on the Ray cluster even after the script that calls it exits. If you want to update the Serve instance later,
you can run another script that connects to the same Ray cluster and makes further API calls (e.g., to create, update, or delete a deployment). Note that there can only be one detached Serve instance on each Ray cluster.
All non-detached Serve instances will be started in the current namespace that was specified when connecting to the cluster. If a namespace is specified for a detached Serve instance, it will be used. Otherwise if the current namespace is anonymous, the Serve instance will be started in the `serve` namespace.
If `serve.start()` is called again in a process in which there is already a running Serve instance, Serve will re-connect to the existing instance (regardless of whether the original instance was detached or not). To reconnect to a Serve instance that exists in the Ray cluster but not in the current process, connect to the cluster with the same namespace that was specified when starting the instance and run `serve.start()`.
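As a rough sketch of that workflow (the script names and the deployment itself are illustrative):

```python
# deploy_initial.py -- run once against a long-lived cluster.
import ray
from ray import serve

ray.init(address="auto", namespace="serve")
serve.start(detached=True)  # Keeps running after this script exits.

@serve.deployment
def greeter(request):
    return "hello"

greeter.deploy()
```

```python
# update_later.py -- run at any later time, from the same cluster and namespace.
import ray
from ray import serve

ray.init(address="auto", namespace="serve")
serve.start(detached=True)  # Reconnects to the existing detached Serve instance.

@serve.deployment(num_replicas=2)
def greeter(request):
    return "hello again"

greeter.deploy()
```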
## Deploying on a Single Node
While Ray Serve makes it easy to scale out on a multi-node Ray cluster, in some scenarios a single node may suit your needs.
There are two ways you can run Ray Serve on a single node, shown below.
In general, **Option 2 is recommended for most users** because it allows you to fully make use of Serve's ability to dynamically update running deployments.
1. Start Ray and deploy with Ray Serve all in a single Python file.
```python
import ray
from ray import serve
import time
# This will start Ray locally and start Serve on top of it.
serve.start()
@serve.deployment
def my_func(request):
return "hello"
my_func.deploy()
# Serve will be shut down once the script exits, so keep it alive manually.
while True:
time.sleep(5)
print(serve.list_deployments())
```
2. First run `ray start --head` on the machine, then connect to the running local Ray cluster using `ray.init(address="auto", namespace="serve")` in your Serve script(s) (this is the Ray namespace, not the Kubernetes namespace, and you can specify any namespace you like). You can run multiple scripts to update your deployments over time.
```bash
ray start --head # Start local Ray cluster.
serve start # Start Serve on the local Ray cluster.
```
```python
import ray
from ray import serve
# This will connect to the running Ray cluster.
ray.init(address="auto", namespace="serve")
@serve.deployment
def my_func(request):
return "hello"
my_func.deploy()
```
## Deploying on Kubernetes
In order to deploy Ray Serve on Kubernetes, we need to do the following:
1. Start a Ray cluster on Kubernetes.
2. Expose the head node of the cluster as a [Service].
3. Start Ray Serve on the cluster.
There are multiple ways to start a Ray cluster on Kubernetes, see {ref}`ray-k8s-deploy` for more information.
Here, we will be using the [Ray Cluster Launcher](cluster-cloud) tool, which has support for Kubernetes as a backend.
The cluster launcher takes in a yaml config file that describes the cluster.
Here, we'll be using the [Kubernetes default config] with a few small modifications.
First, we need to make sure that the head node of the cluster, where Ray Serve will run its HTTP server, is exposed as a Kubernetes [Service].
There is already a default head node service defined in the `services` field of the config, so we just need to make sure that it's exposing the right port: 8000, which Ray Serve binds on by default.
```yaml
# Service that maps to the head node of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
name: ray-head
spec:
# Must match the label in the head pod spec below.
selector:
component: ray-head
ports:
- protocol: TCP
# Port that this service will listen on.
port: 8000
# Port that requests will be sent to in pods backing the service.
targetPort: 8000
```
Then, we also need to make sure that the head node pod spec matches the selector defined here and exposes the same port:
```yaml
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
# Matches the selector in the service definition above.
labels:
component: ray-head
spec:
# ...
containers:
- name: ray-node
# ...
ports:
- containerPort: 8000 # Ray Serve default port.
# ...
```
The rest of the config remains unchanged for this example, though you may want to change the container image or the number of worker pods started by default when running your own deployment.
Now, we just need to start the cluster:
```shell
# Start the cluster.
$ ray up ray/python/ray/autoscaler/kubernetes/example-full.yaml
# Check the status of the service pointing to the head node. If configured
# properly, you should see the 'Endpoints' field populated with an IP
# address like below. If not, make sure the head node pod started
# successfully and the selector/labels match.
$ kubectl -n ray describe service ray-head
Name: ray-head
Namespace: ray
Labels: <none>
Annotations: <none>
Selector: component=ray-head
Type: ClusterIP
IP: 10.100.188.203
Port: <unset> 8000/TCP
TargetPort: 8000/TCP
Endpoints: 192.168.73.98:8000
Session Affinity: None
Events: <none>
```
With the cluster now running, we can run a simple script to start Ray Serve and deploy a "hello world" deployment:
> ```python
> import ray
> from ray import serve
>
> # Connect to the running Ray cluster.
> ray.init(address="auto", namespace="serve")
> # Bind on 0.0.0.0 to expose the HTTP server on external IPs.
> serve.start(detached=True, http_options={"host": "0.0.0.0"})
>
>
> @serve.deployment(route_prefix="/hello")
> def hello(request):
> return "hello world"
>
> hello.deploy()
> ```
Save this script locally as `deploy.py` and run it on the head node using `ray submit`:
> ```shell
> $ ray submit ray/python/ray/autoscaler/kubernetes/example-full.yaml deploy.py
> ```
Now we can try querying the service by sending an HTTP request to it from within the Kubernetes cluster.
> ```shell
> # Get a shell inside of the head node.
> $ ray attach ray/python/ray/autoscaler/kubernetes/example-full.yaml
>
> # Query the Ray Serve deployment. This can be run from anywhere in the
> # Kubernetes cluster.
> $ curl -X GET http://$RAY_HEAD_SERVICE_HOST:8000/hello
> hello world
> ```
In order to expose the Ray Serve deployment externally, we would need to deploy the Service we created here behind an [Ingress] or a [NodePort].
Please refer to the Kubernetes documentation for more information.
## Health Checking
By default, each actor making up a Serve deployment is health checked and restarted on failure.
:::{note}
User-defined health checks are experimental and may be subject to change before the interface is stabilized. If you have any feedback or run into any issues or unexpected behaviors, please file an issue on GitHub.
:::
You can customize this behavior to perform an application-level health check or to adjust the frequency/timeout.
To define a custom healthcheck, define a `check_health` method on your deployment class.
This method should take no arguments and return no result, raising an exception if the replica should be considered unhealthy.
You can also use the deployment options to customize how frequently the health check is run and the timeout after which a replica is marked unhealthy if it hasn't responded.
> ```python
> @serve.deployment(_health_check_period_s=10, _health_check_timeout_s=30)
> class MyDeployment:
> def __init__(self, db_addr: str):
> self._my_db_connection = connect_to_db(db_addr)
>
> def __call__(self, request):
> return self._do_something_cool()
>
> # Will be called by Serve to check the health of the replica.
> def check_health(self):
> if not self._my_db_connection.is_connected():
> # The specific type of exception is not important.
> raise RuntimeError("uh-oh, DB connection is broken.")
> ```
:::{tip}
You can use the Serve CLI command `serve status` to get status info
about your live deployments. The CLI was included with Serve when you did
`pip install "ray[serve]"`. If you're checking your deployments on a
remote Ray cluster, make sure to include the Ray cluster's dashboard address
in the command: `serve status --address [dashboard_address]`.
:::
## Failure Recovery
Ray Serve is resilient to any component failures within the Ray cluster out of the box.
You can check out the details of how process and worker node failures are handled at {ref}`serve-ft-detail`.
However, when the Ray head node goes down, you would need to recover the state by creating a new
Ray cluster and re-deploying all Serve deployments into that cluster.
:::{note}
Ray currently cannot survive head node failure, so we recommend using application-specific
failure recovery solutions. Although Ray is not currently highly available (HA), it is on
the long-term roadmap and is being actively worked on.
:::
Ray Serve provides an experimental feature to help recover this state.
This feature enables Serve to write all of your deployment configuration and code to a storage location.
After a Ray cluster failure and restart, you can simply call Serve to reconstruct the state.
Here is how to use it:
:::{warning}
The API is experimental and subject to change. We welcome you to test it out
and leave us feedback through GitHub issues or the discussion forum!
:::
You can specify it using either the `serve.start()` argument or the CLI:
```python
serve.start(_checkpoint_path=...)
```
or
```shell
serve start --checkpoint-path ...
```
The checkpoint path argument accepts the following format:
- `file://local_file_path`
- `s3://bucket/path`
- `gs://bucket/path`
- `custom://importable.custom_python.Class/path`
While we have native support for local disk, AWS S3, and Google Cloud Storage (GCS), there is no reason we cannot support more.
In a Kubernetes environment, we recommend using [Persistent Volumes] to create a disk and mount it into the Ray head node.
For example, you can provision an Azure Disk, AWS Elastic Block Store, or GCP Persistent Disk using the K8s [Persistent Volumes] API.
Alternatively, you can write directly to an object store like S3.
You can also plug in your own implementation by using the `custom://` path and inheriting from the [KVStoreBase] class.
Feel free to open new GitHub issues and contribute more storage backends!
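For illustration only, a custom backend registered via the `custom://` path might look roughly like the sketch below. The method names and signatures here are assumptions, so check the [KVStoreBase] source for the actual abstract interface before implementing one.
```python
from typing import Optional

# Hypothetical sketch of a checkpoint storage backend backed by Redis.
# In practice you would subclass KVStoreBase; the get/put/delete methods
# below are assumed, not verified against the real interface.
class RedisKVStore:
    def __init__(self, namespace: str, redis_url: str = "redis://localhost:6379"):
        import redis  # assumed third-party dependency for this sketch

        self._client = redis.Redis.from_url(redis_url)
        self._namespace = namespace

    def get(self, key: str) -> Optional[bytes]:
        return self._client.get(f"{self._namespace}:{key}")

    def put(self, key: str, val: bytes) -> bool:
        return bool(self._client.set(f"{self._namespace}:{key}", val))

    def delete(self, key: str) -> bool:
        return bool(self._client.delete(f"{self._namespace}:{key}"))
```
Such a class could then be referenced with a path like `custom://my_module.RedisKVStore/serve-checkpoints` (hypothetical module name).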
[ingress]: https://kubernetes.io/docs/concepts/services-networking/ingress/
[kubernetes default config]: https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/kubernetes/example-full.yaml
[kvstorebase]: https://github.com/ray-project/ray/blob/master/python/ray/serve/storage/kv_store_base.py
[nodeport]: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
[persistent volumes]: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
[service]: https://kubernetes.io/docs/concepts/services-networking/service/
View file
@ -1,308 +0,0 @@
.. _serve-deploy-tutorial:
===================
Deploying Ray Serve
===================
This section should help you:
- understand how Ray Serve runs on a Ray cluster beyond the basics mentioned in :doc:`core-apis`
- deploy and update your Serve application over time
- monitor your Serve application using the Ray Dashboard and logging
.. contents:: Deploying Ray Serve
.. _ray-serve-instance-lifetime:
Lifetime of a Ray Serve Instance
================================
Ray Serve instances run on top of Ray clusters and are started using :mod:`serve.start <ray.serve.start>`.
Once :mod:`serve.start <ray.serve.start>` has been called, further API calls can be used to create and update the deployments that will be used to serve your Python code (including ML models).
The Serve instance will be torn down when the script exits.
When running on a long-lived Ray cluster (e.g., one started using ``ray start`` and connected
to using ``ray.init(address="auto", namespace="serve")``, you can also deploy a Ray Serve instance as a long-running
service using ``serve.start(detached=True)``. In this case, the Serve instance will continue to
run on the Ray cluster even after the script that calls it exits. If you want to run another script
to update the Serve instance, you can run another script that connects to the same Ray cluster and makes further API calls (e.g., to create, update, or delete a deployment). Note that there can only be one detached Serve instance on each Ray cluster.
All non-detached Serve instances will be started in the current namespace that was specified when connecting to the cluster. If a namespace is specified for a detached Serve instance, it will be used. Otherwise if the current namespace is anonymous, the Serve instance will be started in the ``serve`` namespace.
If ``serve.start()`` is called again in a process in which there is already a running Serve instance, Serve will re-connect to the existing instance (regardless of whether the original instance was detached or not). To reconnect to a Serve instance that exists in the Ray cluster but not in the current process, connect to the cluster with the same namespace that was specified when starting the instance and run ``serve.start()``.
Deploying on a Single Node
==========================
While Ray Serve makes it easy to scale out on a multi-node Ray cluster, in some scenarios a single node may suit your needs.
There are two ways you can run Ray Serve on a single node, shown below.
In general, **Option 2 is recommended for most users** because it allows you to fully make use of Serve's ability to dynamically update running deployments.
1. Start Ray and deploy with Ray Serve all in a single Python file.
.. code-block:: python
import ray
from ray import serve
import time
# This will start Ray locally and start Serve on top of it.
serve.start()
@serve.deployment
def my_func(request):
return "hello"
my_func.deploy()
# Serve will be shut down once the script exits, so keep it alive manually.
while True:
time.sleep(5)
print(serve.list_deployments())
2. First running ``ray start --head`` on the machine, then connecting to the running local Ray cluster using ``ray.init(address="auto", namespace="serve")`` in your Serve script(s) (this is the Ray namespace, not Kubernetes namespace, and you can specify any namespace that you like). You can run multiple scripts to update your deployments over time.
.. code-block:: bash
ray start --head # Start local Ray cluster.
serve start # Start Serve on the local Ray cluster.
.. code-block:: python
import ray
from ray import serve
# This will connect to the running Ray cluster.
ray.init(address="auto", namespace="serve")
@serve.deployment
def my_func(request):
return "hello"
my_func.deploy()
Deploying on Kubernetes
=======================
In order to deploy Ray Serve on Kubernetes, we need to do the following:
1. Start a Ray cluster on Kubernetes.
2. Expose the head node of the cluster as a `Service`_.
3. Start Ray Serve on the cluster.
There are multiple ways to start a Ray cluster on Kubernetes, see :ref:`ray-k8s-deploy` for more information.
Here, we will be using the :ref:`Ray Cluster Launcher <cluster-cloud>` tool, which has support for Kubernetes as a backend.
The cluster launcher takes in a yaml config file that describes the cluster.
Here, we'll be using the `Kubernetes default config`_ with a few small modifications.
First, we need to make sure that the head node of the cluster, where Ray Serve will run its HTTP server, is exposed as a Kubernetes `Service`_.
There is already a default head node service defined in the ``services`` field of the config, so we just need to make sure that it's exposing the right port: 8000, which Ray Serve binds on by default.
.. code-block:: yaml
# Service that maps to the head node of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
name: ray-head
spec:
# Must match the label in the head pod spec below.
selector:
component: ray-head
ports:
- protocol: TCP
# Port that this service will listen on.
port: 8000
# Port that requests will be sent to in pods backing the service.
targetPort: 8000
Then, we also need to make sure that the head node pod spec matches the selector defined here and exposes the same port:
.. code-block:: yaml
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
# Matches the selector in the service definition above.
labels:
component: ray-head
spec:
# ...
containers:
- name: ray-node
# ...
ports:
- containerPort: 8000 # Ray Serve default port.
# ...
The rest of the config remains unchanged for this example, though you may want to change the container image or the number of worker pods started by default when running your own deployment.
Now, we just need to start the cluster:
.. code-block:: shell
# Start the cluster.
$ ray up ray/python/ray/autoscaler/kubernetes/example-full.yaml
# Check the status of the service pointing to the head node. If configured
# properly, you should see the 'Endpoints' field populated with an IP
# address like below. If not, make sure the head node pod started
# successfully and the selector/labels match.
$ kubectl -n ray describe service ray-head
Name: ray-head
Namespace: ray
Labels: <none>
Annotations: <none>
Selector: component=ray-head
Type: ClusterIP
IP: 10.100.188.203
Port: <unset> 8000/TCP
TargetPort: 8000/TCP
Endpoints: 192.168.73.98:8000
Session Affinity: None
Events: <none>
With the cluster now running, we can run a simple script to start Ray Serve and deploy a "hello world" deployment:
.. code-block:: python
import ray
from ray import serve
# Connect to the running Ray cluster.
ray.init(address="auto", namespace="serve")
# Bind on 0.0.0.0 to expose the HTTP server on external IPs.
serve.start(detached=True, http_options={"host": "0.0.0.0"})
@serve.deployment(route_prefix="/hello")
def hello(request):
return "hello world"
hello.deploy()
Save this script locally as ``deploy.py`` and run it on the head node using ``ray submit``:
.. code-block:: shell
$ ray submit ray/python/ray/autoscaler/kubernetes/example-full.yaml deploy.py
Now we can try querying the service by sending an HTTP request to the service from within the Kubernetes cluster.
.. code-block:: shell
# Get a shell inside of the head node.
$ ray attach ray/python/ray/autoscaler/kubernetes/example-full.yaml
# Query the Ray Serve deployment. This can be run from anywhere in the
# Kubernetes cluster.
$ curl -X GET http://$RAY_HEAD_SERVICE_HOST:8000/hello
hello world
In order to expose the Ray Serve deployment externally, we would need to deploy the Service we created here behind an `Ingress`_ or a `NodePort`_.
Please refer to the Kubernetes documentation for more information.
.. _`Kubernetes default config`: https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/kubernetes/example-full.yaml
.. _`Service`: https://kubernetes.io/docs/concepts/services-networking/service/
.. _`Ingress`: https://kubernetes.io/docs/concepts/services-networking/ingress/
.. _`NodePort`: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
Health Checking
===============
By default, each actor making up a Serve deployment is health checked and restarted on failure.
.. note::
User-defined health checks are experimental and may be subject to change before the interface is stabilized. If you have any feedback or run into any issues or unexpected behaviors, please file an issue on GitHub.
You can customize this behavior to perform an application-level health check or to adjust the frequency/timeout.
To define a custom healthcheck, define a ``check_health`` method on your deployment class.
This method should take no arguments and return no result, raising an exception if the replica should be considered unhealthy.
You can also customize how frequently the health check is run and the timeout when a replica will be deemed unhealthy if it hasn't responded in the deployment options.
.. code-block:: python
@serve.deployment(_health_check_period_s=10, _health_check_timeout_s=30)
class MyDeployment:
def __init__(self, db_addr: str):
self._my_db_connection = connect_to_db(db_addr)
def __call__(self, request):
return self._do_something_cool()
# Will be called by Serve to check the health of the replica.
def check_health(self):
if not self._my_db_connection.is_connected():
# The specific type of exception is not important.
raise RuntimeError("uh-oh, DB connection is broken.")
.. tip::
You can use the Serve CLI command ``serve status`` to get status info
about your live deployments. The CLI was included with Serve when you did
``pip install "ray[serve]"``. If you're checking your deployments on a
remote Ray cluster, make sure to include the Ray cluster's dashboard address
in the command: ``serve status --address [dashboard_address]``.
Failure Recovery
================
Ray Serve is resilient to any component failures within the Ray cluster out of the box.
You can checkout the detail of how process and worker node failure handled at :ref:`serve-ft-detail`.
However, when the Ray head node goes down, you would need to recover the state by creating a new
Ray cluster and re-deploys all Serve deployments into that cluster.
.. note::
Ray currently cannot survive head node failure and we recommend using application specific
failure recovery solutions. Although Ray is not currently highly available (HA), it is on
the long term roadmap and being actively worked on.
Ray Serve added an experimental feature to help recovering the state.
This features enables Serve to write all your deployment configuration and code into a storage location.
Upon Ray cluster failure and restarts, you can simply call Serve to reconstruct the state.
Here is how to use it:
.. warning::
The API is experimental and subject to change. We welcome you to test it out
and leave us feedback through github issues or discussion forum!
You can use both the start argument and the CLI to specify it:
.. code-block:: python
serve.start(_checkpoint_path=...)
or
.. code-block:: shell
serve start --checkpoint-path ...
The checkpoint path argument accepts the following format:
- ``file://local_file_path``
- ``s3://bucket/path``
- ``gs://bucket/path``
- ``custom://importable.custom_python.Class/path``
While we have native support for on disk, AWS S3, and Google Cloud Storage (GCS), there is no reason we cannot support more.
In Kubernetes environment, we recommend using `Persistent Volumes`_ to create a disk and mount it into the Ray head node.
For example, you can provision Azure Disk, AWS Elastic Block Store, or GCP Persistent Disk using the K8s `Persistent Volumes`_ API.
Alternatively, you can also directly write to object store like S3.
You can easily try to plug into your own implementation using the ``custom://`` path and inherit the `KVStoreBase`_ class.
Feel free to open new github issues and contribute more storage backends!
.. _`Persistent Volumes`: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
.. _`KVStoreBase`: https://github.com/ray-project/ray/blob/master/python/ray/serve/storage/kv_store_base.py
View file
@ -0,0 +1,397 @@
(end-to-end-tutorial)=
# End-to-End Tutorial
By the end of this tutorial you will have learned how to deploy a machine
learning model locally via Ray Serve.
First, install Ray Serve and all of its dependencies by running the following
command in your terminal:
```bash
$ pip install "ray[serve]"
```
For this tutorial, we'll use [HuggingFace's SummarizationPipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.SummarizationPipeline)
to access a model that summarizes text.
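Used standalone, the pipeline looks roughly like this (a quick sketch, assuming the `transformers` package is installed; the tutorial's own example file appears in the next section):
```python
from transformers import pipeline

# Downloads (on first use) and loads a small summarization model.
summarizer = pipeline("summarization", model="t5-small")

result = summarizer("Ray Serve is a scalable model serving library built on top of Ray.")
print(result[0]["summary_text"])
```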
## Example Model
Let's first take a look at how the model works, without using Ray Serve.
This is the code for the model:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_local.py
:end-before: __local_model_end__
:language: python
:linenos: true
:start-after: __local_model_start__
```
The Python file, called `local_model.py`, uses the `summarize` function to
generate summaries of text.
- The `summarizer` variable on line 7 inside `summarize` points to a
function that uses the [t5-small](https://huggingface.co/t5-small)
model to summarize text.
- When `summarizer` is called on a Python String, it returns summarized text
inside a dictionary formatted as `[{"summary_text": "...", ...}, ...]`.
- `summarize` then extracts the summarized text on line 13 by indexing into
the dictionary.
The file can be run locally by executing the Python script, which uses the
model to summarize an article about the Apollo 11 moon landing [^f1].
```bash
$ python local_model.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
```
Keep in mind that the `SummarizationPipeline` is an example machine learning
model for this tutorial. You can follow along using arbitrary models in any
framework that has a Python API. Check out our tutorials on scikit-learn,
PyTorch, and TensorFlow for more info and examples:
- {ref}`serve-sklearn-tutorial`
- {ref}`serve-pytorch-tutorial`
- {ref}`serve-tensorflow-tutorial`
## Converting to Ray Serve Deployment
This tutorial's goal is to deploy this model using Ray Serve, so it can be
scaled up and queried over HTTP. We'll start by converting the above Python
function into a Ray Serve deployment that can be launched locally on a laptop.
We start by opening a new Python file. First, we need to import `ray` and
`ray serve`, to use features in Ray Serve such as `deployments`, which
provide HTTP access to our model.
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __import_end__
:language: python
:start-after: __import_start__
```
After these imports, we can include our model code from above.
We won't call our `summarize` function just yet though!
We will soon add logic to handle HTTP requests, so the `summarize` function
can operate on article text sent via HTTP request.
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __local_model_end__
:language: python
:start-after: __local_model_start__
```
Ray Serve needs to run on top of a Ray cluster, so we connect to a local one.
See {ref}`serve-deploy-tutorial` to learn more about starting a Ray Serve
instance and deploying to a Ray cluster.
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __start_ray_cluster_end__
:language: python
:start-after: __start_ray_cluster_start__
```
The `address` parameter in `ray.init()` connects your Serve script to a
running local Ray cluster. Later, we'll discuss how to start a local Ray
cluster.
:::{note}
`ray.init()` connects to or starts a single-node Ray cluster on your
local machine, which allows you to use all your CPU cores to serve
requests in parallel. To start a multi-node cluster, see
{ref}`serve-deploy-tutorial`.
:::
Next, we start the Ray Serve runtime:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __start_serve_end__
:language: python
:start-after: __start_serve_start__
```
:::{note}
`detached=True` means Ray Serve will continue running even when the Python
script exits. If you would rather stop Ray Serve after the script exits, use
`serve.start()` instead (see {ref}`ray-serve-instance-lifetime` for
details).
:::
Now that we have defined our `summarize` function, connected to a Ray
Cluster, and started the Ray Serve runtime, we can define a function that
accepts HTTP requests and routes them to the `summarize` function. We
define a function called `router` that takes in a Starlette `request`
object [^f2]:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __router_end__
:language: python
:linenos: true
:start-after: __router_start__
```
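In rough outline, the included `router` definition looks like the sketch below (a reconstruction that assumes the imports and the `summarize` function defined earlier; see the example file for the canonical version):
```python
@serve.deployment
def router(request):
    txt = request.query_params["txt"]
    return summarize(txt)
```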
- In line 1, we add the decorator `@serve.deployment`
to the `router` function to turn the function into a Serve `Deployment`
object.
- In line 3, `router` uses the `"txt"` query parameter in the `request`
to get the article text to summarize.
- In line 4, it then passes this article text into the `summarize` function
and returns the value.
:::{note}
Lines 3 and 4 define our HTTP request schema. The HTTP requests sent to this
endpoint must have a `"txt"` query parameter that contains a string.
In general, you can accept HTTP data using query parameters or the
request body. Additionally, you can add other Serve deployments with
different names to create more endpoints that can accept different schemas.
For more complex validation, you can also use FastAPI (see
{ref}`serve-fastapi-http` for more info).
:::
:::{tip}
This routing function's name doesn't have to be `router`.
It can be any function name as long as the corresponding name is present in
the HTTP request. If you want the function name to be different than the name
in the HTTP request, you can add the `name` keyword parameter to the
`@serve.deployment` decorator to specify the name sent in the HTTP request.
For example, if the decorator is `@serve.deployment(name="responder")` and
the function signature is `def request_manager(request)`, the HTTP request
should use `responder`, not `request_manager`. If no `name` is passed
into `@serve.deployment`, the `request` uses the function's name by
default. For example, if the decorator is `@serve.deployment` and the
function's signature is `def manager(request)`, the HTTP request should use
`manager`.
:::
Since `@serve.deployment` makes `router` a `Deployment` object, it can be
deployed using `router.deploy()`:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __router_deploy_end__
:language: python
:start-after: __router_deploy_start__
```
Once we deploy `router`, we can query the model over HTTP.
With that, we can run our model on Ray Serve!
Here's the full Ray Serve deployment script that we built for our model:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment_full.py
:end-before: __deployment_full_end__
:language: python
:linenos: true
:start-after: __deployment_full_start__
```
To deploy `router`, we first start a local Ray cluster:
```bash
$ ray start --head
```
The Ray cluster that this command launches is the same Ray cluster that the
Python code connects to using `ray.init(address="auto", namespace="serve")`.
It is also the same Ray cluster that keeps Ray Serve (and any deployments on
it, such as `router`) alive even after the Python script exits as long as
`detached=True` inside `serve.start()`.
:::{tip}
To stop the Ray cluster, run the command `ray stop`.
:::
After starting the Ray cluster, we can run the Python file to deploy `router`
and begin accepting HTTP requests:
```bash
$ python model_on_ray_serve.py
```
## Testing the Ray Serve Deployment
We can now test our model over HTTP. The structure of our HTTP query is:
`http://127.0.0.1:8000/[Deployment Name]?[Parameter Name-1]=[Parameter Value-1]&[Parameter Name-2]=[Parameter Value-2]&...&[Parameter Name-n]=[Parameter Value-n]`
Since the cluster is deployed locally in this tutorial, the `127.0.0.1:8000`
refers to a localhost with port 8000. The `[Deployment Name]` refers to
either the name of the function that we called `.deploy()` on (in our case,
this is `router`), or the `name` keyword parameter's value in
`@serve.deployment` (see the Tip under the `router` function definition
above for more info).
Each `[Parameter Name]` refers to a field's name in the
request's `query_params` dictionary for our deployed function. In our
example, the only parameter we need to pass in is `txt`. This parameter is
referenced in the `txt = request.query_params["txt"]` line in the `router`
function. Each \[Parameter Name\] object has a corresponding \[Parameter Value\]
object. The `txt`'s \[Parameter Value\] is a string containing the article
text to summarize. We can chain together any number of the name-value pairs
using the `&` symbol in the request URL.
Now that the `summarize` function is deployed on Ray Serve, we can make HTTP
requests to it. Here's a client script that requests a summary from the same
article as the original Python script:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_router_client.py
:end-before: __client_function_end__
:language: python
:start-after: __client_function_start__
```
We can run this script while the model is deployed to get a response over HTTP:
```bash
$ python router_client.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
```
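For reference, a minimal client along these lines could be written with the `requests` library (a sketch; the actual `e2e_router_client.py` example may differ):
```python
import requests

article_text = "..."  # the full article text to summarize

# Send the article as the "txt" query parameter to the `router` deployment.
response = requests.get("http://127.0.0.1:8000/router", params={"txt": article_text})
print(response.text)
```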
## Using Classes in the Ray Serve Deployment
Our application is still a bit inefficient though. In particular, the
`summarize` function loads the model on each call when it sets the
`summarizer` variable. However, the model never changes, so it would be more
efficient to define `summarizer` only once and keep its value in memory
instead of reloading it for each HTTP query.
We can achieve this by converting our `summarize` function into a class:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_class_deployment.py
:end-before: __deployment_class_end__
:language: python
:linenos: true
:start-after: __deployment_class_start__
```
In this configuration, we can query the `Summarizer` class directly.
The `Summarizer` is initialized once (after calling `Summarizer.deploy()`).
In line 13, its `__init__` function loads and stores the model in
`self.summarize`. HTTP queries for the `Summarizer` class are routed to its
`__call__` method by default, which takes in the Starlette `request`
object. The `Summarizer` class can then take the request's `txt` data and
call the `self.summarize` function on it without loading the model on each
query.
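In rough outline, the class-based deployment looks like this sketch (the exact code is in the example file above):
```python
from ray import serve
from transformers import pipeline

# Assumes ray.init(...) and serve.start(...) have been called as earlier in the tutorial.

@serve.deployment
class Summarizer:
    def __init__(self):
        # Load the model once, when the replica starts, instead of on every request.
        self.summarize = pipeline("summarization", model="t5-small")

    def __call__(self, request):
        txt = request.query_params["txt"]
        summary_list = self.summarize(txt)
        return summary_list[0]["summary_text"]

Summarizer.deploy()
```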
:::{tip}
Instance variables can also store state. For example, to
count the number of requests served, a `@serve.deployment` class can define
a `self.counter` instance variable in its `__init__` function and set it
to 0. When the class is queried, it can increment the `self.counter`
variable inside of the function responding to the query. The `self.counter`
will keep track of the number of requests served across requests.
:::
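A minimal sketch of that pattern (note that each replica keeps its own count):
```python
from ray import serve

@serve.deployment
class Counter:
    def __init__(self):
        self.counter = 0  # per-replica request count

    def __call__(self, request):
        self.counter += 1
        return {"requests_served": self.counter}
```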
HTTP queries for the Ray Serve class deployments follow a similar format to Ray
Serve function deployments. Here's an example client script for the
`Summarizer` class. Notice that the only difference from the `router`'s
client script is that the URL uses the `Summarizer` path instead of
`router`.
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_summarizer_client.py
:end-before: __client_class_end__
:language: python
:start-after: __client_class_start__
```
We can deploy the class-based model on Serve without stopping the Ray cluster.
However, for the purposes of this tutorial, let's restart the cluster, deploy
the model, and query it over HTTP:
```bash
$ ray stop
$ ray start --head
$ python summarizer_on_ray_serve.py
$ python summarizer_client.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
```
## Adding Functionality with FastAPI
Now suppose we want to expose additional functionality in our model. In
particular, the `summarize` function also has `min_length` and
`max_length` parameters. Although we could expose these options as additional
parameters in the URL, Ray Serve also allows us to add more route options to the
URL itself and handle each route separately.
Because this logic can get complex, Serve integrates with
[FastAPI](https://fastapi.tiangolo.com/). This allows us to define a Serve
deployment by adding the `@serve.ingress` decorator to a FastAPI app. For
more info about FastAPI with Serve, please see {ref}`serve-fastapi-http`.
As an example of FastAPI, here's a modified version of our `Summarizer` class
with route options to request a minimum or maximum length of ten words in the
summaries:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_fastapi_deployment.py
:end-before: __fastapi_end__
:language: python
:linenos: true
:start-after: __fastapi_start__
```
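In rough outline, the FastAPI-integrated version looks like the sketch below (the exact code lives in the example file above; the `route_prefix` and method names here are assumptions):
```python
from fastapi import FastAPI
from ray import serve
from transformers import pipeline

app = FastAPI()

# Assumes ray.init(...) and serve.start(detached=True) have been called as before.

@serve.deployment(route_prefix="/Summarizer")
@serve.ingress(app)
class Summarizer:
    def __init__(self):
        self.summarize = pipeline("summarization", model="t5-small")

    @app.get("/")
    def summarize_default(self, txt: str) -> str:
        return self.summarize(txt)[0]["summary_text"]

    @app.get("/min10")
    def summarize_min10(self, txt: str) -> str:
        # Ask the pipeline for a longer summary.
        return self.summarize(txt, min_length=10)[0]["summary_text"]

    @app.get("/max10")
    def summarize_max10(self, txt: str) -> str:
        # Ask the pipeline for a shorter summary.
        return self.summarize(txt, max_length=10)[0]["summary_text"]

Summarizer.deploy()
```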
The class now exposes three routes:
- `/Summarizer`: As before, this route takes in article text and returns
a summary.
- `/Summarizer/min10`: This route takes in article text and returns a
summary with at least 10 words.
- `/Summarizer/max10`: This route takes in article text and returns a
summary with at most 10 words.
Notice that `Summarizer`'s methods no longer take in a Starlette `request`
object. Instead, they take in the URL's `txt` parameter directly with FastAPI's
[query parameter](https://fastapi.tiangolo.com/tutorial/query-params/)
feature.
Since we still deploy our model locally, the full URL still uses the
localhost IP. This means each of our three routes comes after the
`http://127.0.0.1:8000` IP and port address. As an example, we can make
requests to the `max10` route using this client script:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_fastapi_client.py
:end-before: __client_fastapi_end__
:language: python
:start-after: __client_fastapi_start__
```
```bash
$ ray stop
$ ray start --head
$ python serve_with_fastapi.py
$ python fastapi_client.py
"two astronauts steered their fragile lunar"
```
Congratulations! You just built and deployed a machine learning model on Ray
Serve! You should now have enough context to dive into the {doc}`core-apis` to
get a deeper understanding of Ray Serve.
To learn more about how to start a multi-node cluster for your Ray Serve
deployments, see {ref}`serve-deploy-tutorial`. For more interesting example
applications, including integrations with popular machine learning frameworks
and Python web servers, be sure to check out {doc}`tutorials/index`.
```{rubric} Footnotes
```
[^f1]: The article text comes from the New York Times article "Astronauts
Land on Plain; Collect Rocks, Plant Flag" archived
[here](https://archive.nytimes.com/www.nytimes.com/library/national/science/nasa/072169sci-nasa.html).
[^f2]: [Starlette](https://www.starlette.io/) is a web server framework
used by Ray Serve. Its [Request](https://www.starlette.io/requests/) class
provides a nice interface for incoming HTTP requests.
View file
@ -1,392 +0,0 @@
.. _end_to_end_tutorial:
===================
End-to-End Tutorial
===================
By the end of this tutorial you will have learned how to deploy a machine
learning model locally via Ray Serve.
First, install Ray Serve and all of its dependencies by running the following
command in your terminal:
.. code-block:: bash
$ pip install "ray[serve]"
For this tutorial, we'll use `HuggingFace's SummarizationPipeline <https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.SummarizationPipeline>`_
to access a model that summarizes text.
Example Model
=============
Let's first take a look at how the model works, without using Ray Serve.
This is the code for the model:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_local.py
:linenos:
:language: python
:start-after: __local_model_start__
:end-before: __local_model_end__
The Python file, called ``local_model.py`` uses the ``summarize`` function to
generate summaries of text.
- The ``summarizer`` variable on line 7 inside ``summarize`` points to a
function that uses the `t5-small <https://huggingface.co/t5-small>`_
model to summarize text.
- When ``summarizer`` is called on a Python String, it returns summarized text
inside a dictionary formatted as ``[{"summary_text": "...", ...}, ...]``.
- ``summarize`` then extracts the summarized text on line 13 by indexing into
the dictionary.
The file can be run locally by executing the Python script, which uses the
model to summarize an article about the Apollo 11 moon landing [#f1]_.
.. code-block:: bash
$ python local_model.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
Keep in mind that the ``SummarizationPipeline`` is an example machine learning
model for this tutorial. You can follow along using arbitrary models in any
framework that has a Python API. Check out our tutorials on sckit-learn,
PyTorch, and Tensorflow for more info and examples:
- :ref:`serve-sklearn-tutorial`
- :ref:`serve-pytorch-tutorial`
- :ref:`serve-tensorflow-tutorial`
Converting to Ray Serve Deployment
==================================
This tutorial's goal is to deploy this model using Ray Serve, so it can be
scaled up and queried over HTTP. We'll start by converting the above Python
function into a Ray Serve deployment that can be launched locally on a laptop.
We start by opening a new Python file. First, we need to import ``ray`` and
``ray serve``, to use features in Ray Serve such as ``deployments``, which
provide HTTP access to our model.
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __import_start__
:end-before: __import_end__
After these imports, we can include our model code from above.
We won't call our ``summarize`` function just yet though!
We will soon add logic to handle HTTP requests, so the ``summarize`` function
can operate on article text sent via HTTP request.
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __local_model_start__
:end-before: __local_model_end__
Ray Serve needs to run on top of a Ray cluster, so we connect to a local one.
See :ref:`serve-deploy-tutorial` to learn more about starting a Ray Serve
instance and deploying to a Ray cluster.
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __start_ray_cluster_start__
:end-before: __start_ray_cluster_end__
The ``address`` parameter in ``ray.init()`` connects your Serve script to a
running local Ray cluster. Later, we'll discuss how to start a local Ray
cluster.
.. note::
``ray.init()`` connects to or starts a single-node Ray cluster on your
local machine, which allows you to use all your CPU cores to serve
requests in parallel. To start a multi-node cluster, see
:ref:`serve-deploy-tutorial`.
Next, we start the Ray Serve runtime:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __start_serve_start__
:end-before: __start_serve_end__
.. note::
``detached=True`` means Ray Serve will continue running even when the Python
script exits. If you would rather stop Ray Serve after the script exits, use
``serve.start()`` instead (see :ref:`ray-serve-instance-lifetime` for
details).
Now that we have defined our ``summarize`` function, connected to a Ray
Cluster, and started the Ray Serve runtime, we can define a function that
accepts HTTP requests and routes them to the ``summarize`` function. We
define a function called ``router`` that takes in a Starlette ``request``
object [#f2]_:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:linenos:
:language: python
:start-after: __router_start__
:end-before: __router_end__
- In line 1, we add the decorator ``@serve.deployment``
to the ``router`` function to turn the function into a Serve ``Deployment``
object.
- In line 3, ``router`` uses the ``"txt"`` query parameter in the ``request``
to get the article text to summarize.
- In line 4, it then passes this article text into the ``summarize`` function
and returns the value.
.. note::
Lines 3 and 4 define our HTTP request schema. The HTTP requests sent to this
endpoint must have a ``"txt"`` query parameter that contains a string.
In general, you can accept HTTP data using query parameters or the
request body. Additionally, you can add other Serve deployments with
different names to create more endpoints that can accept different schemas.
For more complex validation, you can also use FastAPI (see
:ref:`serve-fastapi-http` for more info).
.. tip::
This routing function's name doesn't have to be ``router``.
It can be any function name as long as the corresponding name is present in
the HTTP request. If you want the function name to be different than the name
in the HTTP request, you can add the ``name`` keyword parameter to the
``@serve.deployment`` decorator to specify the name sent in the HTTP request.
For example, if the decorator is ``@serve.deployment(name="responder")`` and
the function signature is ``def request_manager(request)``, the HTTP request
should use ``responder``, not ``request_manager``. If no ``name`` is passed
into ``@serve.deployment``, the ``request`` uses the function's name by
default. For example, if the decorator is ``@serve.deployment`` and the
function's signature is ``def manager(request)``, the HTTP request should use
``manager``.
Since ``@serve.deployment`` makes ``router`` a ``Deployment`` object, it can be
deployed using ``router.deploy()``:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __router_deploy_start__
:end-before: __router_deploy_end__
Once we deploy ``router``, we can query the model over HTTP.
With that, we can run our model on Ray Serve!
Here's the full Ray Serve deployment script that we built for our model:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment_full.py
:linenos:
:language: python
:start-after: __deployment_full_start__
:end-before: __deployment_full_end__
To deploy ``router``, we first start a local Ray cluster:
.. code-block:: bash
$ ray start --head
The Ray cluster that this command launches is the same Ray cluster that the
Python code connects to using ``ray.init(address="auto", namespace="serve")``.
It is also the same Ray cluster that keeps Ray Serve (and any deployments on
it, such as ``router``) alive even after the Python script exits as long as
``detached=True`` inside ``serve.start()``.
.. tip::
To stop the Ray cluster, run the command ``ray stop``.
After starting the Ray cluster, we can run the Python file to deploy ``router``
and begin accepting HTTP requests:
.. code-block:: bash
$ python model_on_ray_serve.py
Testing the Ray Serve Deployment
================================
We can now test our model over HTTP. The structure of our HTTP query is:
``http://127.0.0.1:8000/[Deployment Name]?[Parameter Name-1]=[Parameter Value-1]&[Parameter Name-2]=[Parameter Value-2]&...&[Parameter Name-n]=[Parameter Value-n]``
Since the cluster is deployed locally in this tutorial, the ``127.0.0.1:8000``
refers to a localhost with port 8000. The ``[Deployment Name]`` refers to
either the name of the function that we called ``.deploy()`` on (in our case,
this is ``router``), or the ``name`` keyword parameter's value in
``@serve.deployment`` (see the Tip under the ``router`` function definition
above for more info).
Each ``[Parameter Name]`` refers to a field's name in the
request's ``query_params`` dictionary for our deployed function. In our
example, the only parameter we need to pass in is ``txt``. This parameter is
referenced in the ``txt = request.query_params["txt"]`` line in the ``router``
function. Each [Parameter Name] object has a corresponding [Parameter Value]
object. The ``txt``'s [Parameter Value] is a string containing the article
text to summarize. We can chain together any number of the name-value pairs
using the ``&`` symbol in the request URL.
Now that the ``summarize`` function is deployed on Ray Serve, we can make HTTP
requests to it. Here's a client script that requests a summary from the same
article as the original Python script:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_router_client.py
:language: python
:start-after: __client_function_start__
:end-before: __client_function_end__
We can run this script while the model is deployed to get a response over HTTP:
.. code-block:: bash
$ python router_client.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
Using Classes in the Ray Serve Deployment
=========================================
Our application is still a bit inefficient though. In particular, the
``summarize`` function loads the model on each call when it sets the
``summarizer`` variable. However, the model never changes, so it would be more
efficient to define ``summarizer`` only once and keep its value in memory
instead of reloading it for each HTTP query.
We can achieve this by converting our ``summarize`` function into a class:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_class_deployment.py
:linenos:
:language: python
:start-after: __deployment_class_start__
:end-before: __deployment_class_end__
In this configuration, we can query the ``Summarizer`` class directly.
The ``Summarizer`` is initialized once (after calling ``Summarizer.deploy()``).
In line 13, its ``__init__`` function loads and stores the model in
``self.summarize``. HTTP queries for the ``Summarizer`` class are routed to its
``__call__`` method by default, which takes in the Starlette ``request``
object. The ``Summarizer`` class can then take the request's ``txt`` data and
call the ``self.summarize`` function on it without loading the model on each
query.
.. tip::
Instance variables can also store state. For example, to
count the number of requests served, a ``@serve.deployment`` class can define
a ``self.counter`` instance variable in its ``__init__`` function and set it
to 0. When the class is queried, it can increment the ``self.counter``
variable inside of the function responding to the query. The ``self.counter``
will keep track of the number of requests served across requests.
HTTP queries for the Ray Serve class deployments follow a similar format to Ray
Serve function deployments. Here's an example client script for the
``Summarizer`` class. Notice that the only difference from the ``router``'s
client script is that the URL uses the ``Summarizer`` path instead of
``router``.
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_summarizer_client.py
:language: python
:start-after: __client_class_start__
:end-before: __client_class_end__
We can deploy the class-based model on Serve without stopping the Ray cluster.
However, for the purposes of this tutorial, let's restart the cluster, deploy
the model, and query it over HTTP:
.. code-block:: bash
$ ray stop
$ ray start --head
$ python summarizer_on_ray_serve.py
$ python summarizer_client.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
Adding Functionality with FastAPI
=================================
Now suppose we want to expose additional functionality in our model. In
particular, the ``summarize`` function also has ``min_length`` and
``max_length`` parameters. Although we could expose these options as additional
parameters in URL, Ray Serve also allows us to add more route options to the
URL itself and handle each route separately.
Because this logic can get complex, Serve integrates with
`FastAPI <https://fastapi.tiangolo.com/>`_. This allows us to define a Serve
deployment by adding the ``@serve.ingress`` decorator to a FastAPI app. For
more info about FastAPI with Serve, please see :ref:`serve-fastapi-http`.
As an example of FastAPI, here's a modified version of our ``Summarizer`` class
with route options to request a minimum or maximum length of ten words in the
summaries:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_fastapi_deployment.py
:linenos:
:language: python
:start-after: __fastapi_start__
:end-before: __fastapi_end__
The class now exposes three routes:
- ``/Summarizer``: As before, this route takes in article text and returns
a summary.
- ``/Summarizer/min10``: This route takes in article text and returns a
summary with at least 10 words.
- ``/Summarizer/max10``: This route takes in article text and returns a
summary with at most 10 words.
Notice that ``Summarizer``'s methods no longer take in a Starlette ``request``
object. Instead, they take in the URL's `txt` parameter directly with FastAPI's
`query parameter <https://fastapi.tiangolo.com/tutorial/query-params/>`_
feature.
Since we still deploy our model locally, the full URL still uses the
localhost IP. This means each of our three routes comes after the
``http://127.0.0.1:8000`` IP and port address. As an example, we can make
requests to the ``max10`` route using this client script:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_fastapi_client.py
:language: python
:start-after: __client_fastapi_start__
:end-before: __client_fastapi_end__
.. code-block:: bash
$ ray stop
$ ray start --head
$ python serve_with_fastapi.py
$ python fastapi_client.py
"two astronauts steered their fragile lunar"
Congratulations! You just built and deployed a machine learning model on Ray
Serve! You should now have enough context to dive into the :doc:`core-apis` to
get a deeper understanding of Ray Serve.
To learn more about how to start a multi-node cluster for your Ray Serve
deployments, see :ref:`serve-deploy-tutorial`. For more interesting example
applications, including integrations with popular machine learning frameworks
and Python web servers, be sure to check out :doc:`tutorials/index`.
.. rubric:: Footnotes
.. [#f1] The article text comes from the New York Times article "Astronauts
Land on Plain; Collect Rocks, Plant Flag" archived
`here <https://archive.nytimes.com/www.nytimes.com/library/national/science/nasa/072169sci-nasa.html>`_.
.. [#f2] `Starlette <https://www.starlette.io/>`_ is a web server framework
used by Ray Serve. Its `Request <https://www.starlette.io/requests/>`_ class
provides a nice interface for incoming HTTP requests.
View file
@ -1,46 +1,43 @@
.. _serve-faq:
(serve-faq)=
Ray Serve FAQ
=============
# Ray Serve FAQ
This page answers some common questions about Ray Serve. If you have more
questions, feel free to ask them in the `Discussion Board <https://discuss.ray.io/>`_.
questions, feel free to ask them in the [Discussion Board](https://discuss.ray.io/).
.. contents::
```{contents}
```
How do I deploy Ray Serve?
--------------------------
## How do I deploy Ray Serve?
See :doc:`deployment` for information about how to deploy Serve.
See {doc}`deployment` for information about how to deploy Serve.
## How fast is Ray Serve?
How fast is Ray Serve?
----------------------
We are continuously benchmarking Ray Serve. We can confidently say:
- Ray Serve's **latency** overhead is single-digit milliseconds, often just 1-2 milliseconds.
- For **throughput**, Serve achieves about 3-4k qps on a single machine.
- It is **horizontally scalable** so you can add more machines to increase the overall throughput.
You can checkout our `microbenchmark instruction <https://github.com/ray-project/ray/tree/master/python/ray/serve/benchmarks>`_
You can check out our [microbenchmark instructions](https://github.com/ray-project/ray/tree/master/python/ray/serve/benchmarks)
to benchmark on your hardware.
## Can I use `asyncio` along with Ray Serve?
Can I use ``asyncio`` along with Ray Serve?
-------------------------------------------
Yes! You can make your servable methods ``async def`` and Serve will run them
Yes! You can make your servable methods `async def` and Serve will run them
concurrently inside a Python asyncio event loop.
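For example, a deployment's `__call__` can be declared `async` (a minimal sketch):
```python
import asyncio

from ray import serve

@serve.deployment
class AsyncDeployment:
    async def __call__(self, request):
        # Awaiting non-blocking work lets other requests proceed on this
        # replica's event loop in the meantime.
        await asyncio.sleep(0.1)
        return "done"
```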
Are there any other similar frameworks?
---------------------------------------
## Are there any other similar frameworks?
Yes and no. We truly believe Serve is unique because it gives you end-to-end control
over the API while delivering scalability and high performance. To achieve
something like what Serve offers, you often need to glue together multiple
frameworks like TensorFlow Serving and SageMaker, or even roll your own
batching server.
How does Serve compare to TFServing, TorchServe, ONNXRuntime, and others?
-------------------------------------------------------------------------
## How does Serve compare to TFServing, TorchServe, ONNXRuntime, and others?
Ray Serve is *framework-agnostic*: you can use any Python framework and libraries.
We believe data scientists are not bound to a particular machine learning framework.
They use the best tool available for the job.
@ -48,12 +45,12 @@ They use the best tool available for the job.
Compared to these framework-specific solutions, Ray Serve doesn't perform any optimizations
to make your ML model run faster. However, you can still optimize the models yourself
and run them in Ray Serve: for example, you can run a model compiled by
`PyTorch JIT <https://pytorch.org/docs/stable/jit.html>`_.
[PyTorch JIT](https://pytorch.org/docs/stable/jit.html).
## How does Serve compare to AWS SageMaker, Azure ML, Google AI Platform?
How does Serve compare to AWS SageMaker, Azure ML, Google AI Platform?
----------------------------------------------------------------------
Ray Serve brings the scalability and parallelism of these hosted offerings to
your own infrastructure. You can use our :ref:`cluster launcher <cluster-cloud>`
your own infrastructure. You can use our [cluster launcher](cluster-cloud)
to deploy Ray Serve to all major public clouds, K8s, as well as on bare-metal, on-premise machines.
Compared to these offerings, Ray Serve lacks a unified user interface and functionality
@ -61,20 +58,20 @@ let you manage the lifecycle of the models, visualize its performance, etc. Ray
Serve focuses on just model serving and provides the primitives for you to
build your own ML platform on top.
How does Serve compare to Seldon, KFServing, Cortex?
----------------------------------------------------
## How does Serve compare to Seldon, KFServing, Cortex?
You can develop Ray Serve on your laptop, deploy it on a dev box, and scale it out
to multiple machines or a K8s cluster without changing a single line of code. It's a lot
easier to get started with when you don't need to provision and manage a K8s cluster.
When it's time to deploy, you can use Ray :ref:`cluster launcher <cluster-cloud>`
When it's time to deploy, you can use Ray [cluster launcher](cluster-cloud)
to transparently put your Ray Serve application in K8s.
Compared to these frameworks, which let you deploy ML models on K8s, Ray Serve lacks
the ability to declaratively configure your ML application via YAML files. In
Ray Serve, you configure everything in Python code.
Is Ray Serve only for ML models?
--------------------------------
## Is Ray Serve only for ML models?
Nope! Ray Serve can be used to build any type of Python microservices
application. You can also use the full power of Ray within your Ray Serve
programs, so it's easy to run parallel computations within your deployments.
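For example, a deployment can fan work out to Ray tasks (a minimal sketch):
```python
import ray
from ray import serve

@ray.remote
def square(x: int) -> int:
    return x * x

@serve.deployment
class ParallelSquares:
    def __call__(self, request):
        # Launch Ray tasks in parallel and gather the results.
        refs = [square.remote(i) for i in range(10)]
        return ray.get(refs)
```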
View file
@ -0,0 +1,415 @@
# Calling Deployments via HTTP and Python
This section should help you:
- understand how deployments can be called in two ways: from HTTP and from Python
- integrate Ray Serve with an existing web server
```{contents} Calling Deployments via HTTP and Python
```
(serve-http)=
## Calling Deployments via HTTP
### Basic Example
As shown in the {ref}`serve-quickstart`, when you create a deployment, it is exposed over HTTP by default at `/{deployment_name}`. You can change the route by specifying the `route_prefix` argument to the {mod}`@serve.deployment <ray.serve.api.deployment>` decorator.
```python
@serve.deployment(route_prefix="/counter")
class Counter:
def __call__(self, request):
pass
```
When you make a request to the Serve HTTP server at `/counter`, it will forward the request to the deployment's `__call__` method and provide a [Starlette Request object](https://www.starlette.io/requests/) as the sole argument. The `__call__` method can return any JSON-serializable object or a [Starlette Response object](https://www.starlette.io/responses/) (e.g., to return a custom status code).
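For example, returning a Starlette response lets you control the status code explicitly (a minimal sketch):
```python
from starlette.responses import JSONResponse

from ray import serve

@serve.deployment(route_prefix="/counter")
class Counter:
    def __call__(self, request):
        # Returning a Starlette Response lets you set status codes and headers.
        return JSONResponse({"count": 1}, status_code=202)
```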
Below, we discuss some advanced features for customizing Ray Serve's HTTP functionality.
(serve-fastapi-http)=
### FastAPI HTTP Deployments
If you want to define more complex HTTP handling logic, Serve integrates with [FastAPI](https://fastapi.tiangolo.com/). This allows you to define a Serve deployment using the {mod}`@serve.ingress <ray.serve.api.ingress>` decorator that wraps a FastAPI app with its full range of features. The most basic example of this is shown below, but for more details on all that FastAPI has to offer such as variable routes, automatic type validation, dependency injection (e.g., for database connections), and more, please check out [their documentation](https://fastapi.tiangolo.com/).
```python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class MyFastAPIDeployment:
@app.get("/")
def root(self):
return "Hello, world!"
MyFastAPIDeployment.deploy()
```
Now if you send a request to `/hello`, this will be routed to the `root` method of our deployment. We can also easily leverage FastAPI to define multiple routes with different HTTP methods:
```python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class MyFastAPIDeployment:
@app.get("/")
def root(self):
return "Hello, world!"
@app.post("/{subpath}")
def root(self, subpath: str):
return f"Hello from {subpath}!"
MyFastAPIDeployment.deploy()
```
You can also pass in an existing FastAPI app to a deployment to serve it as-is:
```python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@app.get("/")
def f():
return "Hello from the root!"
# ... add more routes, routers, etc. to `app` ...
@serve.deployment(route_prefix="/")
@serve.ingress(app)
class FastAPIWrapper:
pass
FastAPIWrapper.deploy()
```
This is useful for scaling out an existing FastAPI app with no modifications necessary.
Existing middlewares, automatic OpenAPI documentation generation, and other advanced FastAPI features should work as-is.
You can also combine routes defined this way with routes defined on the deployment:
```python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@app.get("/")
def f():
return "Hello from the root!"
@serve.deployment(route_prefix="/api1")
@serve.ingress(app)
class FastAPIWrapper1:
@app.get("/subpath")
def method(self):
return "Hello 1!"
@serve.deployment(route_prefix="/api2")
@serve.ingress(app)
class FastAPIWrapper2:
@app.get("/subpath")
def method(self):
return "Hello 2!"
FastAPIWrapper1.deploy()
FastAPIWrapper2.deploy()
```
In this example, requests to both `/api1` and `/api2` would return `Hello from the root!` while a request to `/api1/subpath` would return `Hello 1!` and a request to `/api2/subpath` would return `Hello 2!`.
To try it out, save a code snippet in a local Python file (e.g., main.py) and, in the same directory, run the following commands to start a local Ray cluster on your machine.
```bash
ray start --head
python main.py
```
(serve-http-adapters)=
### HTTP Adapters
HTTP adapters are functions that convert a raw HTTP request into Python types that you know and recognize.
An adapter's input arguments should be type annotated. At minimum, it should accept a `starlette.requests.Request` type,
but it can also accept any type that's recognized by FastAPI's dependency injection framework.
For example, here is an adapter that extracts the JSON content from the request.
```python
import starlette.requests


async def json_resolver(request: starlette.requests.Request):
return await request.json()
```
Here is an adapter that accepts two HTTP query parameters.
```python
def parse_query_args(field_a: int, field_b: str):
return YourDataClass(field_a, field_b)
```
You can specify different type signatures to facilitate extracting HTTP fields,
including
[query parameters](https://fastapi.tiangolo.com/tutorial/query-params/),
[body parameters](https://fastapi.tiangolo.com/tutorial/body/),
and [many other data types](https://fastapi.tiangolo.com/tutorial/extra-data-types/).
For more detail, take a look at the [FastAPI documentation](https://fastapi.tiangolo.com/).
You can use adapters in different scenarios within Serve:
- Ray AIR `ModelWrapper`
- Serve Deployment Graph `DAGDriver`
- Embedded in Bring Your Own `FastAPI` Application
Let's go over them one by one.
#### Ray AIR `ModelWrapper`
Ray Serve provides a suite of adapters to convert HTTP requests to ML inputs like `numpy` arrays.
You can use them with the [Ray AI Runtime (AIR) model wrapper](air-serve-integration) feature
to deploy pre-trained models with one click.
For example, we provide a simple adapter for n-dimensional arrays.
With [model wrappers](air-serve-integration), you can specify it via the `http_adapter` field.
```python
from ray import serve
from ray.serve.http_adapters import json_to_ndarray
from ray.serve.model_wrappers import ModelWrapperDeployment
ModelWrapperDeployment.options(name="my_model").deploy(
my_ray_air_predictor,
my_ray_air_checkpoint,
http_adapter=json_to_ndarray
)
```
:::{note}
Serve also supports pydantic models as a short-hand for HTTP adapters in model wrappers. Instead of functions,
you can directly pass in a pydantic model class to mean "validate the HTTP body with this schema".
Once validated, the model instance will be passed to the predictor.
```python
from pydantic import BaseModel
class User(BaseModel):
user_id: int
user_name: str
...
ModelWrapperDeployment.deploy(..., http_adapter=User)
```
:::
#### Serve Deployment Graph `DAGDriver`
In a [Serve Deployment Graph](serve-deployment-graph), you can configure
`ray.serve.drivers.DAGDriver` to accept an HTTP adapter via its `http_adapter` field.
For example, the JSON request adapter parses JSON in the HTTP body:
```python
from ray.serve.drivers import DAGDriver
from ray.serve.http_adapters import json_request
from ray.experimental.dag.input_node import InputNode
with InputNode() as input_node:
...
dag = DAGDriver.bind(other_node, http_adapter=json_request)
```
:::{note}
Serve also supports pydantic models as a short-hand for HTTP adapters in deployment graphs. Instead of functions,
you can directly pass in a pydantic model class to mean "validate the HTTP body with this schema".
Once validated, the model instance will be passed as the `input_node` variable.
```python
from pydantic import BaseModel
class User(BaseModel):
user_id: int
user_name: str
...
DAGDriver.bind(other_node, http_adapter=User)
```
:::
#### Embedded in Bring Your Own `FastAPI` Application
You can also bring the adapter to your own FastAPI app using
[Depends](https://fastapi.tiangolo.com/tutorial/dependencies/#import-depends).
The input schema will automatically be part of the generated OpenAPI schema with FastAPI.
```python
from fastapi import FastAPI, Depends
from ray.serve.http_adapters import json_to_ndarray
app = FastAPI()
@app.post("/endpoint")
async def endpoint(np_array = Depends(json_to_ndarray)):
...
```
It has the following schema for input:
(serve-ndarray-schema)=
```{eval-rst}
.. autopydantic_model:: ray.serve.http_adapters.NdArray
```
#### List of Built-in Adapters
Here is a list of built-in adapters; please feel free to [contribute more](https://github.com/ray-project/ray/issues/new/choose)!
```{eval-rst}
.. automodule:: ray.serve.http_adapters
:members: json_to_ndarray, image_to_ndarray, starlette_request, json_request
```
### Configuring HTTP Server Locations
By default, Ray Serve starts a single HTTP server on the head node of the Ray cluster.
You can configure this behavior using the `http_options={"location": ...}` flag
in {mod}`serve.start <ray.serve.start>`:
- "HeadOnly": start one HTTP server on the head node. Serve
assumes the head node is the node you executed serve.start
on. This is the default.
- "EveryNode": start one HTTP server per node.
- "NoServer" or `None`: disable HTTP server.
:::{note}
Using the "EveryNode" option, you can point a cloud load balancer to the
instance group of Ray cluster to achieve high availability of Serve's HTTP
proxies.
:::
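For example, a minimal sketch of starting one HTTP proxy per node (assuming a multi-node Ray cluster is already running) might look like:
```python
import ray
from ray import serve

ray.init(address="auto")
# Start Serve with one HTTP server per node in the cluster.
serve.start(detached=True, http_options={"location": "EveryNode"})
```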
### Enabling CORS and other HTTP middlewares
Serve supports arbitrary [Starlette middlewares](https://www.starlette.io/middleware/)
and custom middlewares in Starlette format. The example below shows how to enable
[Cross-Origin Resource Sharing (CORS)](https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS).
You can follow the same pattern for other Starlette middlewares.
```python
from ray import serve
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
client = serve.start(
http_options={"middlewares": [
Middleware(
CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
]})
```
(serve-handle-explainer)=
## ServeHandle: Calling Deployments from Python
Ray Serve enables you to query models both from HTTP and Python. This feature
enables seamless [model composition](serve-model-composition). You can
get a `ServeHandle` corresponding to a deployment, similar to how you can
reach a deployment through HTTP via a specific route. When you issue a request
to a deployment through `ServeHandle`, the request is load balanced across
available replicas in the same way an HTTP request is.
To call a Ray Serve deployment from Python, use {mod}`Deployment.get_handle <ray.serve.api.Deployment>`
to get a handle to the deployment, then use
{mod}`handle.remote <ray.serve.handle.RayServeHandle.remote>` to send requests
to that deployment. These requests can pass ordinary args and kwargs that are
passed directly to the method. This returns a Ray `ObjectRef` whose result
can be waited for or retrieved using `ray.wait` or `ray.get`.
```python
@serve.deployment
class Deployment:
def method1(self, arg):
return f"Method1: {arg}"
def __call__(self, arg):
return f"__call__: {arg}"
Deployment.deploy()
handle = Deployment.get_handle()
ray.get(handle.remote("hi")) # Defaults to calling the __call__ method.
ray.get(handle.method1.remote("hi")) # Call a different method.
```
If you want to use the same deployment to serve both HTTP and ServeHandle traffic, the recommended best practice is to define an internal method that the HTTP handling logic will call:
```python
@serve.deployment(route_prefix="/api")
class Deployment:
def say_hello(self, name: str):
return f"Hello {name}!"
def __call__(self, request):
return self.say_hello(request.query_params["name"])
Deployment.deploy()
```
Now we can invoke the same logic from both HTTP and Python:
```python
print(requests.get("http://localhost:8000/api?name=Alice").text)
# Hello Alice!
handle = Deployment.get_handle()
print(ray.get(handle.say_hello.remote("Alice")))
# Hello Alice!
```
(serve-sync-async-handles)=
### Sync and Async Handles
Ray Serve offers two types of `ServeHandle`. You can use the `Deployment.get_handle(..., sync=True|False)`
flag to toggle between them.
- When you set `sync=True` (the default), a synchronous handle is returned.
  Calling `handle.remote()` returns a Ray `ObjectRef`.
- When you set `sync=False`, an asyncio-based handle is returned. You need to
  call it with `await handle.remote()` to get a Ray `ObjectRef`. To use `await`,
  you have to run `Deployment.get_handle` and `handle.remote` in a Python asyncio event loop.
The async handle has a performance advantage because it uses asyncio directly, as compared
to the sync handle, which talks to an asyncio event loop in a thread. To learn more about
the reasoning behind this, check out our [architecture documentation](serve-architecture).
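As a rough sketch (assuming the `Deployment` class defined above has already been deployed), the async handle can be used from an asyncio event loop like this:
```python
import asyncio


async def query():
    # get_handle(sync=False) must be called inside an asyncio event loop.
    handle = Deployment.get_handle(sync=False)
    # `await handle.remote(...)` returns a Ray ObjectRef,
    # which can itself be awaited to retrieve the result.
    ref = await handle.remote("hi")
    return await ref


print(asyncio.run(query()))  # __call__: hi
```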
## Integrating with existing web servers
Ray Serve comes with its own HTTP server out of the box, but if you have an existing
web application, you can still plug in Ray Serve to scale up your compute using the `ServeHandle`.
For a tutorial with sample code, see {ref}`serve-web-server-integration-tutorial`.
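As a rough sketch of the idea (the `Doubler` deployment and `/double` route here are illustrative, not part of the tutorial), an existing FastAPI app served by e.g. `uvicorn` can offload work to a deployment through a handle:
```python
from fastapi import FastAPI

import ray
from ray import serve

app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)


@serve.deployment
class Doubler:
    def __call__(self, value: int) -> int:
        return 2 * value


Doubler.deploy()
handle = Doubler.get_handle()


@app.get("/double")
async def double(value: int):
    # Offload the computation to the Ray Serve deployment; awaiting the
    # returned ObjectRef yields the result without blocking the event loop.
    return await handle.remote(value)
```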

View file

@ -1,428 +0,0 @@
=======================================
Calling Deployments via HTTP and Python
=======================================
This section should help you:
- understand how deployments can be called in two ways: from HTTP and from Python
- integrate Ray Serve with an existing web server
.. contents:: Calling Deployments via HTTP and Python
.. _serve-http:
Calling Deployments via HTTP
============================
Basic Example
^^^^^^^^^^^^^
As shown in the :ref:`serve_quickstart`, when you create a deployment, it is exposed over HTTP by default at ``/{deployment_name}``. You can change the route by specifying the ``route_prefix`` argument to the :mod:`@serve.deployment <ray.serve.api.deployment>` decorator.
.. code-block:: python
@serve.deployment(route_prefix="/counter")
class Counter:
def __call__(self, request):
pass
When you make a request to the Serve HTTP server at ``/counter``, it will forward the request to the deployment's ``__call__`` method and provide a `Starlette Request object <https://www.starlette.io/requests/>`_ as the sole argument. The ``__call__`` method can return any JSON-serializable object or a `Starlette Response object <https://www.starlette.io/responses/>`_ (e.g., to return a custom status code).
Below, we discuss some advanced features for customizing Ray Serve's HTTP functionality.
.. _serve-fastapi-http:
FastAPI HTTP Deployments
^^^^^^^^^^^^^^^^^^^^^^^^
If you want to define more complex HTTP handling logic, Serve integrates with `FastAPI <https://fastapi.tiangolo.com/>`_. This allows you to define a Serve deployment using the :mod:`@serve.ingress <ray.serve.api.ingress>` decorator that wraps a FastAPI app with its full range of features. The most basic example of this is shown below, but for more details on all that FastAPI has to offer such as variable routes, automatic type validation, dependency injection (e.g., for database connections), and more, please check out `their documentation <https://fastapi.tiangolo.com/>`_.
.. code-block:: python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class MyFastAPIDeployment:
@app.get("/")
def root(self):
return "Hello, world!"
MyFastAPIDeployment.deploy()
Now if you send a request to ``/hello``, this will be routed to the ``root`` method of our deployment. We can also easily leverage FastAPI to define multiple routes with different HTTP methods:
.. code-block:: python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class MyFastAPIDeployment:
@app.get("/")
def root(self):
return "Hello, world!"
@app.post("/{subpath}")
def root(self, subpath: str):
return f"Hello from {subpath}!"
MyFastAPIDeployment.deploy()
You can also pass in an existing FastAPI app to a deployment to serve it as-is:
.. code-block:: python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@app.get("/")
def f():
return "Hello from the root!"
# ... add more routes, routers, etc. to `app` ...
@serve.deployment(route_prefix="/")
@serve.ingress(app)
class FastAPIWrapper:
pass
FastAPIWrapper.deploy()
This is useful for scaling out an existing FastAPI app with no modifications necessary.
Existing middlewares, automatic OpenAPI documentation generation, and other advanced FastAPI features should work as-is.
You can also combine routes defined this way with routes defined on the deployment:
.. code-block:: python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@app.get("/")
def f():
return "Hello from the root!"
@serve.deployment(route_prefix="/api1")
@serve.ingress(app)
class FastAPIWrapper1:
@app.get("/subpath")
def method(self):
return "Hello 1!"
@serve.deployment(route_prefix="/api2")
@serve.ingress(app)
class FastAPIWrapper2:
@app.get("/subpath")
def method(self):
return "Hello 2!"
FastAPIWrapper1.deploy()
FastAPIWrapper2.deploy()
In this example, requests to both ``/api1`` and ``/api2`` would return ``Hello from the root!`` while a request to ``/api1/subpath`` would return ``Hello 1!`` and a request to ``/api2/subpath`` would return ``Hello 2!``.
To try it out, save a code snippet in a local python file (i.e. main.py) and in the same directory, run the following commands to start a local Ray cluster on your machine.
.. code-block:: bash
ray start --head
python main.py
.. _serve-http-adapters:
HTTP Adapters
^^^^^^^^^^^^^
HTTP adapters are functions that convert a raw HTTP request into Python types that you know and recognize.
An adapter's input arguments should be type annotated. At minimum, it should accept a ``starlette.requests.Request`` type,
but it can also accept any type that's recognized by FastAPI's dependency injection framework.
For example, here is an adapter that extracts the JSON content from the request.
.. code-block:: python
async def json_resolver(request: starlette.requests.Request):
return await request.json()
Here is an adapter that accepts two HTTP query parameters.
.. code-block:: python
def parse_query_args(field_a: int, field_b: str):
return YourDataClass(field_a, field_b)
You can specify different type signatures to facilitate extracting HTTP fields,
including
`query parameters <https://fastapi.tiangolo.com/tutorial/query-params/>`_,
`body parameters <https://fastapi.tiangolo.com/tutorial/body/>`_,
and `many other data types <https://fastapi.tiangolo.com/tutorial/extra-data-types/>`_.
For more detail, you can take a look at `FastAPI documentation <https://fastapi.tiangolo.com/>`_.
You can use adapters in different scenarios within Serve:
- Ray AIR ``ModelWrapper``
- Serve Deployment Graph ``DAGDriver``
- Embedded in Bring Your Own ``FastAPI`` Application
Let's go over them one by one.
Ray AIR ``ModelWrapper``
""""""""""""""""""""""""
Ray Serve provides a suite of adapters to convert HTTP requests to ML inputs like `numpy` arrays.
You can just use it with :ref:`Ray AI Runtime (AIR) model wrapper<air-serve-integration>` feature
to one click deploy pre-trained models.
For example, we provide a simple adapter for n-dimensional array.
With :ref:`model wrappers<air-serve-integration>`, you can specify it via the ``http_adapter`` field.
.. code-block:: python
from ray import serve
from ray.serve.http_adapters import json_to_ndarray
from ray.serve.model_wrappers import ModelWrapperDeployment
ModelWrapperDeployment.options(name="my_model").deploy(
my_ray_air_predictor,
my_ray_air_checkpoint,
http_adapter=json_to_ndarray
)
.. note::
Serve also supports pydantic models as a short-hand for HTTP adapters in model wrappers. Instead of functions,
you can directly pass in a pydantic model class to mean "validate the HTTP body with this schema".
Once validated, the model instance will be passed to the predictor.
.. code-block:: python
from pydantic import BaseModel
class User(BaseModel):
user_id: int
user_name: str
...
ModelWrapperDeployment.deploy(..., http_adapter=User)
Serve Deployment Graph ``DAGDriver``
""""""""""""""""""""""""""""""""""""
In :ref:`Serve Deployment Graph <serve-deployment-graph>`, you can configure
``ray.serve.drivers.DAGDriver`` to accept an HTTP adapter via its ``http_adapter`` field.
For example, the json request adapters parse JSON in HTTP body:
.. code-block:: python
from ray.serve.drivers import DAGDriver
from ray.serve.http_adapters import json_request
from ray.experimental.dag.input_node import InputNode
with InputNode() as input_node:
...
dag = DAGDriver.bind(other_node, http_adapter=json_request)
.. note::
Serve also supports pydantic models as a short-hand for HTTP adapters in model wrappers. Instead of functions,
you can directly pass in a pydantic model class to mean "validate the HTTP body with this schema".
Once validated, the model instance will be passed as the ``input_node`` variable.
.. code-block:: python
from pydantic import BaseModel
class User(BaseModel):
user_id: int
user_name: str
...
DAGDriver.bind(other_node, http_adapter=User)
Embedded in Bring Your Own ``FastAPI`` Application
""""""""""""""""""""""""""""""""""""""""""""""""""
You can also bring the adapter to your own FastAPI app using
`Depends <https://fastapi.tiangolo.com/tutorial/dependencies/#import-depends>`_.
The input schema will automatically be part of the generated OpenAPI schema with FastAPI.
.. code-block:: python
from fastapi import FastAPI, Depends
from ray.serve.http_adapters import json_to_ndarray
app = FastAPI()
@app.post("/endpoint")
async def endpoint(np_array = Depends(json_to_ndarray)):
...
It has the following schema for input:
.. _serve-ndarray-schema:
.. autopydantic_model:: ray.serve.http_adapters.NdArray
List of Built-in Adapters
"""""""""""""""""""""""""
Here is a list of adapters and please feel free to `contribute more <https://github.com/ray-project/ray/issues/new/choose>`_!
.. automodule:: ray.serve.http_adapters
:members: json_to_ndarray, image_to_ndarray, starlette_request, json_request
Configuring HTTP Server Locations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default, Ray Serve starts a single HTTP server on the head node of the Ray cluster.
You can configure this behavior using the ``http_options={"location": ...}`` flag
in :mod:`serve.start <ray.serve.start>`:
- "HeadOnly": start one HTTP server on the head node. Serve
assumes the head node is the node you executed serve.start
on. This is the default.
- "EveryNode": start one HTTP server per node.
- "NoServer" or ``None``: disable HTTP server.
.. note::
Using the "EveryNode" option, you can point a cloud load balancer to the
instance group of Ray cluster to achieve high availability of Serve's HTTP
proxies.
Enabling CORS and other HTTP middlewares
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Serve supports arbitrary `Starlette middlewares <https://www.starlette.io/middleware/>`_
and custom middlewares in Starlette format. The example below shows how to enable
`Cross-Origin Resource Sharing (CORS) <https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS>`_.
You can follow the same pattern for other Starlette middlewares.
.. code-block:: python
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
client = serve.start(
http_options={"middlewares": [
Middleware(
CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
]})
.. _serve-handle-explainer:
ServeHandle: Calling Deployments from Python
============================================
Ray Serve enables you to query models both from HTTP and Python. This feature
enables seamless :ref:`model composition<serve-model-composition>`. You can
get a ``ServeHandle`` corresponding to a deployment, similar to how you can
reach a deployment through HTTP via a specific route. When you issue a request
to a deployment through ``ServeHandle``, the request is load balanced across
available replicas in the same way an HTTP request is.
To call a Ray Serve deployment from python, use :mod:`Deployment.get_handle <ray.serve.api.Deployment>`
to get a handle to the deployment, then use
:mod:`handle.remote <ray.serve.handle.RayServeHandle.remote>` to send requests
to that deployment. These requests can pass ordinary args and kwargs that are
passed directly to the method. This returns a Ray ``ObjectRef`` whose result
can be waited for or retrieved using ``ray.wait`` or ``ray.get``.
.. code-block:: python
@serve.deployment
class Deployment:
def method1(self, arg):
return f"Method1: {arg}"
def __call__(self, arg):
return f"__call__: {arg}"
Deployment.deploy()
handle = Deployment.get_handle()
ray.get(handle.remote("hi")) # Defaults to calling the __call__ method.
ray.get(handle.method1.remote("hi")) # Call a different method.
If you want to use the same deployment to serve both HTTP and ServeHandle traffic, the recommended best practice is to define an internal method that the HTTP handling logic will call:
.. code-block:: python
@serve.deployment(route_prefix="/api")
class Deployment:
def say_hello(self, name: str):
return f"Hello {name}!"
def __call__(self, request):
return self.say_hello(request.query_params["name"])
Deployment.deploy()
Now we can invoke the same logic from both HTTP and Python:
.. code-block:: python
print(requests.get("http://localhost:8000/api?name=Alice"))
# Hello Alice!
handle = Deployment.get_handle()
print(ray.get(handle.say_hello.remote("Alice")))
# Hello Alice!
.. _serve-sync-async-handles:
Sync and Async Handles
^^^^^^^^^^^^^^^^^^^^^^
Ray Serve offers two types of ``ServeHandle``. You can use the ``Deployment.get_handle(..., sync=True|False)``
flag to toggle between them.
- When you set ``sync=True`` (the default), a synchronous handle is returned.
Calling ``handle.remote()`` should return a Ray ``ObjectRef``.
- When you set ``sync=False``, an asyncio-based handle is returned. You need to
  call it with ``await handle.remote()`` to get a Ray ``ObjectRef``. To use ``await``,
  you have to run ``Deployment.get_handle`` and ``handle.remote`` in a Python asyncio event loop.
The async handle has a performance advantage because it uses asyncio directly, as compared
to the sync handle, which talks to an asyncio event loop in a thread. To learn more about
the reasoning behind this, check out our `architecture documentation <./architecture.html>`_.
Integrating with existing web servers
=====================================
Ray Serve comes with its own HTTP server out of the box, but if you have an existing
web application, you can still plug in Ray Serve to scale up your compute using the ``ServeHandle``.
For a tutorial with sample code, see :ref:`serve-web-server-integration-tutorial`.

doc/source/serve/index.md Normal file
View file

@ -0,0 +1,226 @@
```{eval-rst}
.. include:: /_includes/serve/announcement.rst
```
(rayserve)=
# Serve: Scalable and Programmable Serving
:::{tip}
Get in touch with us if you're using or considering using [Ray Serve](https://docs.google.com/forms/d/1l8HT35jXMPtxVUtQPeGoe09VGp5jcvSv0TqPgyz6lGU).
:::
```{image} logo.svg
:align: center
:height: 250px
:width: 400px
```
(rayserve-overview)=
Ray Serve is an easy-to-use scalable model serving library built on Ray. Ray Serve is:
- **Framework-agnostic**: Use a single toolkit to serve everything from deep learning models
built with frameworks like [PyTorch](serve-pytorch-tutorial),
[Tensorflow, and Keras](serve-tensorflow-tutorial), to [Scikit-Learn](serve-sklearn-tutorial) models, to arbitrary Python business logic.
- **Python-first**: Configure your model serving declaratively in pure Python, without needing YAML or JSON configs.
Ray Serve enables composing multiple ML models into a [deployment graph](serve-deployment-graph). This allows you to write a complex inference service consisting of multiple ML models and business logic all in Python code.
Since Ray Serve is built on Ray, it allows you to easily scale to many machines, both in your datacenter and in the cloud.
Ray Serve can be used in two primary ways to deploy your models at scale:
1. Have Python functions and classes automatically placed behind HTTP endpoints.
2. Alternatively, call them from [within your existing Python web server](serve-web-server-integration-tutorial) using the Python-native {ref}`servehandle-api`.
:::{note}
Serve recently added an experimental API for building deployment graphs of multiple models.
Please take a look at the [Deployment Graph API](serve-deployment-graph) and try it out!
:::
:::{tip}
Chat with Ray Serve users and developers on our [forum](https://discuss.ray.io/)!
:::
(serve-quickstart)=
## Ray Serve Quickstart
First install Ray Serve and all of its dependencies by running the following
command in your terminal:
```bash
pip install "ray[serve]"
```
:::{note}
Ray Serve supports the same Python versions as Ray. See {ref}`installation`
for a list of supported Python versions.
:::
Now we will write a Python script to serve a simple "Counter" class over HTTP. You may open an interactive Python terminal and copy in the lines below as we go.
First, import Ray and Ray Serve:
```python
import ray
from ray import serve
```
Ray Serve runs on top of a Ray cluster, so the next step is to start a local Ray cluster:
```python
ray.init()
```
:::{note}
`ray.init()` will start a single-node Ray cluster on your local machine, which will allow you to use all your CPU cores to serve requests in parallel. To start a multi-node cluster, see {doc}`../cluster/index`.
:::
Next, start the Ray Serve runtime:
```python
serve.start()
```
:::{warning}
When the Python script exits, Ray Serve will shut down.
If you would rather keep Ray Serve running in the background you can use `serve.start(detached=True)` (see {doc}`deployment` for details).
:::
Now we will define a simple Counter class. The goal is to serve this class behind an HTTP endpoint using Ray Serve.
By default, Ray Serve offers a simple HTTP proxy that will send requests to the class' `__call__` method. The argument to this method will be a Starlette `Request` object.
```python
@serve.deployment
class Counter:
def __init__(self):
self.count = 0
def __call__(self, request):
self.count += 1
return {"count": self.count}
```
:::{note}
Besides classes, you can also serve standalone functions with Ray Serve in the same way.
:::
Notice that we made this class into a `Deployment` with the {mod}`@serve.deployment <ray.serve.api.deployment>` decorator.
This decorator is where we could set various configuration options such as the number of replicas, unique name of the deployment (it defaults to the class name), or the HTTP route prefix to expose the deployment at.
See the {mod}`Deployment package reference <ray.serve.api.Deployment>` for more details.
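For instance, a sketch of the same `Counter` with a few of these options set (the values here are purely illustrative):
```python
@serve.deployment(name="my_counter", num_replicas=2, route_prefix="/my_counter")
class Counter:
    # ... same body as above ...
    pass
```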
In order to deploy this, we simply need to call `Counter.deploy()`.
```python
Counter.deploy()
```
:::{note}
Deployments can be configured to improve performance, for example by increasing the number of replicas of the class being served in parallel. For details, see {ref}`configuring-a-deployment`.
:::
Now that our deployment is up and running, let's test it out by making a query over HTTP.
In your browser, simply visit `http://127.0.0.1:8000/Counter`, and you should see the output `{"count": 1}`.
If you keep refreshing the page, the count should increase, as expected.
Now let's say we want to update this deployment to add another method to decrement the counter.
Here, because we want more flexible HTTP configuration we'll use Serve's FastAPI integration.
For more information on this, please see {ref}`serve-fastapi-http`.
```python
from fastapi import FastAPI
app = FastAPI()
@serve.deployment
@serve.ingress(app)
class Counter:
def __init__(self):
self.count = 0
@app.get("/")
def get(self):
return {"count": self.count}
@app.get("/incr")
def incr(self):
self.count += 1
return {"count": self.count}
@app.get("/decr")
def decr(self):
self.count -= 1
return {"count": self.count}
```
We've now redefined the `Counter` class to wrap a `FastAPI` application.
This class is exposing three HTTP routes: `/Counter` will get the current count, `/Counter/incr` will increment the count, and `/Counter/decr` will decrement the count.
To redeploy this updated version of the `Counter`, all we need to do is run `Counter.deploy()` again.
Serve will perform a rolling update here to replace the existing replicas with the new version we defined.
```python
Counter.deploy()
```
If we test out the HTTP endpoint again, we can see this in action.
Note that the count has been reset to zero because the new version of `Counter` was deployed.
```bash
> curl -X GET localhost:8000/Counter/
{"count": 0}
> curl -X GET localhost:8000/Counter/incr
{"count": 1}
> curl -X GET localhost:8000/Counter/decr
{"count": 0}
```
Congratulations, you just built and ran your first Ray Serve application! You should now have enough context to dive into the {doc}`core-apis` to get a deeper understanding of Ray Serve.
For more interesting example applications, including integrations with popular machine learning frameworks and Python web servers, be sure to check out {doc}`tutorials/index`.
For a high-level view of the architecture underlying Ray Serve, see {doc}`architecture`.
## Why Ray Serve?
There are generally two ways of serving machine learning applications, both with serious limitations:
you can use a **traditional web server**---your own Flask app---or you can use a cloud-hosted solution.
The first approach is easy to get started with, but it's hard to scale each component. The second approach
comes with vendor lock-in (SageMaker), framework-specific tooling (TFServing), and a general
lack of flexibility.
Ray Serve solves these problems by giving you a simple web server (and the ability to [use your own](serve-web-server-integration-tutorial)) while still handling the complex routing, scaling, and testing logic
necessary for production deployments.
Beyond scaling up your deployments with multiple replicas, Ray Serve also enables:
- {ref}`serve-model-composition`---ability to flexibly compose multiple models and independently scale and update each.
- {ref}`serve-batching`---built in request batching to help you meet your performance objectives.
- {ref}`serve-cpus-gpus`---specify fractional resource requirements to fully saturate each of your GPUs with several models.
For more on the motivation behind Ray Serve, check out these [meetup slides](https://tinyurl.com/serve-meetup) and this [blog post](https://medium.com/distributed-computing-with-ray/machine-learning-serving-is-broken-f59aff2d607f).
### When should I use Ray Serve?
Ray Serve is a flexible tool that's easy to use for deploying, operating, and monitoring Python-based machine learning applications.
Ray Serve excels when you want to mix business logic with ML models and scaling out in production is a necessity. This might be because of large-scale batch processing
requirements or because you want to scale up a deployment graph consisting of many individual models with different performance properties.
If you plan on running on multiple machines, Ray Serve will serve you well!
## What's next?
Check out the {ref}`end-to-end-tutorial` and {doc}`core-apis`, look at the {ref}`serve-faq`,
or head over to the {doc}`tutorials/index` to get started building your Ray Serve applications.
For more, see the following blog posts about Ray Serve:
- [Serving ML Models in Production: Common Patterns](https://www.anyscale.com/blog/serving-ml-models-in-production-common-patterns) by Simon Mo, Edward Oakes, and Michael Galarnyk
- [How to Scale Up Your FastAPI Application Using Ray Serve](https://medium.com/distributed-computing-with-ray/how-to-scale-up-your-fastapi-application-using-ray-serve-c9a7b69e786) by Archit Kulkarni
- [Machine Learning Serving is Broken](https://medium.com/distributed-computing-with-ray/machine-learning-serving-is-broken-f59aff2d607f) by Simon Mo
- [The Simplest Way to Serve your NLP Model in Production with Pure Python](https://medium.com/distributed-computing-with-ray/the-simplest-way-to-serve-your-nlp-model-in-production-with-pure-python-d42b6a97ad55) by Edward Oakes and Bill Chambers
```{eval-rst}
.. include:: /_includes/serve/announcement_bottom.rst
```

View file

@ -1,225 +0,0 @@
.. include:: /_includes/serve/announcement.rst
.. _rayserve:
========================================
Serve: Scalable and Programmable Serving
========================================
.. tip::
Get in touch with us if you're using or considering using `Ray Serve <https://docs.google.com/forms/d/1l8HT35jXMPtxVUtQPeGoe09VGp5jcvSv0TqPgyz6lGU>`_.
.. image:: logo.svg
:align: center
:height: 250px
:width: 400px
.. _rayserve-overview:
Ray Serve is an easy-to-use scalable model serving library built on Ray. Ray Serve is:
- **Framework-agnostic**: Use a single toolkit to serve everything from deep learning models
built with frameworks like :ref:`PyTorch <serve-pytorch-tutorial>`,
:ref:`Tensorflow, and Keras <serve-tensorflow-tutorial>`, to :ref:`Scikit-Learn <serve-sklearn-tutorial>` models, to arbitrary Python business logic.
- **Python-first**: Configure your model serving declaratively in pure Python, without needing YAML or JSON configs.
Ray Serve enables composing multiple ML models into a :ref:`deployment graph <serve-deployment-graph>`. This allows you to write a complex inference service consisting of multiple ML models and business logic all in Python code.
Since Ray Serve is built on Ray, it allows you to easily scale to many machines, both in your datacenter and in the cloud.
Ray Serve can be used in two primary ways to deploy your models at scale:
1. Have Python functions and classes automatically placed behind HTTP endpoints.
2. Alternatively, call them from :ref:`within your existing Python web server <serve-web-server-integration-tutorial>` using the Python-native :ref:`servehandle-api`.
.. note::
Serve recently added an experimental API for building deployment graphs of multiple models.
Please take a look at the :ref:`Deployment Graph API <serve-deployment-graph>` and try it out!
.. tip::
Chat with Ray Serve users and developers on our `forum <https://discuss.ray.io/>`_!
.. _serve_quickstart:
Ray Serve Quickstart
====================
First install Ray Serve and all of its dependencies by running the following
command in your terminal:
.. code-block:: bash
pip install "ray[serve]"
.. note::
Ray Serve supports the same Python versions as Ray. See :ref:`installation`
for a list of supported Python versions.
Now we will write a Python script to serve a simple "Counter" class over HTTP. You may open an interactive Python terminal and copy in the lines below as we go.
First, import Ray and Ray Serve:
.. code-block:: python
import ray
from ray import serve
Ray Serve runs on top of a Ray cluster, so the next step is to start a local Ray cluster:
.. code-block:: python
ray.init()
.. note::
``ray.init()`` will start a single-node Ray cluster on your local machine, which will allow you to use all your CPU cores to serve requests in parallel. To start a multi-node cluster, see :doc:`../cluster/index`.
Next, start the Ray Serve runtime:
.. code-block:: python
serve.start()
.. warning::
When the Python script exits, Ray Serve will shut down.
If you would rather keep Ray Serve running in the background you can use ``serve.start(detached=True)`` (see :doc:`deployment` for details).
Now we will define a simple Counter class. The goal is to serve this class behind an HTTP endpoint using Ray Serve.
By default, Ray Serve offers a simple HTTP proxy that will send requests to the class' ``__call__`` method. The argument to this method will be a Starlette ``Request`` object.
.. code-block:: python
@serve.deployment
class Counter:
def __init__(self):
self.count = 0
def __call__(self, request):
self.count += 1
return {"count": self.count}
.. note::
Besides classes, you can also serve standalone functions with Ray Serve in the same way.
Notice that we made this class into a ``Deployment`` with the :mod:`@serve.deployment <ray.serve.api.deployment>` decorator.
This decorator is where we could set various configuration options such as the number of replicas, unique name of the deployment (it defaults to the class name), or the HTTP route prefix to expose the deployment at.
See the :mod:`Deployment package reference <ray.serve.api.Deployment>` for more details.
In order to deploy this, we simply need to call ``Counter.deploy()``.
.. code-block:: python
Counter.deploy()
.. note::
Deployments can be configured to improve performance, for example by increasing the number of replicas of the class being served in parallel. For details, see :ref:`configuring-a-deployment`.
Now that our deployment is up and running, let's test it out by making a query over HTTP.
In your browser, simply visit ``http://127.0.0.1:8000/Counter``, and you should see the output ``{"count": 1}``.
If you keep refreshing the page, the count should increase, as expected.
Now let's say we want to update this deployment to add another method to decrement the counter.
Here, because we want more flexible HTTP configuration we'll use Serve's FastAPI integration.
For more information on this, please see :ref:`serve-fastapi-http`.
.. code-block:: python
from fastapi import FastAPI
app = FastAPI()
@serve.deployment
@serve.ingress(app)
class Counter:
def __init__(self):
self.count = 0
@app.get("/")
def get(self):
return {"count": self.count}
@app.get("/incr")
def incr(self):
self.count += 1
return {"count": self.count}
@app.get("/decr")
def decr(self):
self.count -= 1
return {"count": self.count}
We've now redefined the ``Counter`` class to wrap a ``FastAPI`` application.
This class is exposing three HTTP routes: ``/Counter`` will get the current count, ``/Counter/incr`` will increment the count, and ``/Counter/decr`` will decrement the count.
To redeploy this updated version of the ``Counter``, all we need to do is run ``Counter.deploy()`` again.
Serve will perform a rolling update here to replace the existing replicas with the new version we defined.
.. code-block:: python
Counter.deploy()
If we test out the HTTP endpoint again, we can see this in action.
Note that the count has been reset to zero because the new version of ``Counter`` was deployed.
.. code-block:: bash
> curl -X GET localhost:8000/Counter/
{"count": 0}
> curl -X GET localhost:8000/Counter/incr
{"count": 1}
> curl -X GET localhost:8000/Counter/decr
{"count": 0}
Congratulations, you just built and ran your first Ray Serve application! You should now have enough context to dive into the :doc:`core-apis` to get a deeper understanding of Ray Serve.
For more interesting example applications, including integrations with popular machine learning frameworks and Python web servers, be sure to check out :doc:`tutorials/index`.
For a high-level view of the architecture underlying Ray Serve, see :doc:`architecture`.
Why Ray Serve?
==============
There are generally two ways of serving machine learning applications, both with serious limitations:
you can use a **traditional web server**---your own Flask app---or you can use a cloud-hosted solution.
The first approach is easy to get started with, but it's hard to scale each component. The second approach
requires vendor lock-in (SageMaker), framework-specific tooling (TFServing), and a general
lack of flexibility.
Ray Serve solves these problems by giving you a simple web server (and the ability to :ref:`use your own <serve-web-server-integration-tutorial>`) while still handling the complex routing, scaling, and testing logic
necessary for production deployments.
Beyond scaling up your deployments with multiple replicas, Ray Serve also enables:
- :ref:`serve-model-composition`---ability to flexibly compose multiple models and independently scale and update each.
- :ref:`serve-batching`---built in request batching to help you meet your performance objectives.
- :ref:`serve-cpus-gpus`---specify fractional resource requirements to fully saturate each of your GPUs with several models.
For more on the motivation behind Ray Serve, check out these `meetup slides <https://tinyurl.com/serve-meetup>`_ and this `blog post <https://medium.com/distributed-computing-with-ray/machine-learning-serving-is-broken-f59aff2d607f>`_.
When should I use Ray Serve?
----------------------------
Ray Serve is a flexible tool that's easy to use for deploying, operating, and monitoring Python-based machine learning applications.
Ray Serve excels when you want to mix business logic with ML models and scaling out in production is a necessity. This might be because of large-scale batch processing
requirements or because you want to scale up a deployment graph consisting of many individual models with different performance properties.
If you plan on running on multiple machines, Ray Serve will serve you well!
What's next?
============
Check out the :ref:`end_to_end_tutorial` and :doc:`core-apis`, look at the :ref:`serve-faq`,
or head over to the :doc:`tutorials/index` to get started building your Ray Serve applications.
For more, see the following blog posts about Ray Serve:
- `Serving ML Models in Production: Common Patterns <https://www.anyscale.com/blog/serving-ml-models-in-production-common-patterns>`_ by Simon Mo, Edward Oakes, and Michael Galarnyk
- `How to Scale Up Your FastAPI Application Using Ray Serve <https://medium.com/distributed-computing-with-ray/how-to-scale-up-your-fastapi-application-using-ray-serve-c9a7b69e786>`_ by Archit Kulkarni
- `Machine Learning is Broken <https://medium.com/distributed-computing-with-ray/machine-learning-serving-is-broken-f59aff2d607f>`_ by Simon Mo
- `The Simplest Way to Serve your NLP Model in Production with Pure Python <https://medium.com/distributed-computing-with-ray/the-simplest-way-to-serve-your-nlp-model-in-production-with-pure-python-d42b6a97ad55>`_ by Edward Oakes and Bill Chambers
.. include:: /_includes/serve/announcement_bottom.rst

View file

@ -0,0 +1,300 @@
# Serving ML Models
This section should help you:
- batch requests to optimize performance
- serve multiple models by composing deployments
- serve multiple models by making ensemble deployments
```{contents}
```
(serve-batching)=
## Request Batching
You can also have Ray Serve batch requests for performance, which is especially important for some ML models that run on GPUs. In order to use this feature, you need to do the following two things:
1. Use `async def` for your request handling logic to process queries concurrently.
2. Use the `@serve.batch` decorator to batch individual queries that come into the replica. The method/function that's decorated should handle a list of requests and return a list of the same length.
```python
@serve.deployment(route_prefix="/increment")
class BatchingExample:
def __init__(self):
self.count = 0
@serve.batch
async def handle_batch(self, requests):
responses = []
for request in requests:
responses.append(request.json())
return responses
async def __call__(self, request):
return await self.handle_batch(request)
BatchingExample.deploy()
```
Please take a look at [Batching Tutorial](serve-batch-tutorial) for a deep
dive.
(serve-model-composition)=
## Model Composition
:::{note}
Serve recently added an experimental API for building deployment graphs of multiple models.
Please take a look at the [Deployment Graph API](serve-deployment-graph) and try it out!
:::
Ray Serve supports composing individually scalable models into a single model
out of the box. For instance, you can combine multiple models to perform
stacking or ensembles.
To define a higher-level composed model you need to do three things:
1. Define your underlying models (the ones that you will compose together) as
Ray Serve deployments.
2. Define your composed model, using the handles of the underlying models
(see the example below).
3. Define a deployment representing this composed model and query it!
In order to avoid synchronous execution in the composed model (which would make
calls to the composed model very slow), you'll need to make the function
asynchronous by using an `async def`. You'll see this in the example below.
That's it. Let's take a look at an example:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_model_composition.py
```
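In case the example file isn't at hand, here is a condensed sketch of the same pattern; the deployment names and the scoring logic are made up for illustration:
```python
import random

import ray
from ray import serve

ray.init()
serve.start()


@serve.deployment
class ModelOne:
    def __call__(self, data):
        # Placeholder "model": returns a random score.
        return random.random()


@serve.deployment
class ModelTwo:
    def __call__(self, data):
        # Placeholder "model": echoes its input.
        return data


@serve.deployment(route_prefix="/composed")
class ComposedModel:
    def __init__(self):
        # Handles to the underlying deployments (deployed below, before this one).
        self.model_one = ModelOne.get_handle()
        self.model_two = ModelTwo.get_handle()

    # `async def` so awaiting the sub-model calls doesn't block the replica.
    async def __call__(self, request):
        data = await request.json()
        score = await self.model_one.remote(data)
        if score > 0.5:
            return {"model_used": "two", "result": await self.model_two.remote(data)}
        return {"model_used": "one", "score": score}


ModelOne.deploy()
ModelTwo.deploy()
ComposedModel.deploy()
```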
(serve-model-ensemble)=
## Model Ensemble
Ray Serve supports creating ensembles of different models.
To define an ensemble of different models you need to do three things:
1. Define your underlying sub models (the ones that make up the ensemble) as
Ray Serve deployments.
2. Define your ensemble model, using the handles of the underlying models
(see the example below).
3. Define a deployment representing this ensemble model and query it!
In order to avoid synchronous execution in the ensemble model, you'll need to make
the function asynchronous by using an `async def`. In contrast to a composition model,
within an ensemble model you want to call **all** sub models in parallel. This is
achieved by sending all prediction calls to the sub models asynchronously and gathering
them with `asyncio.wait()`. Each Serve deployment used in an ensemble is independently
scalable by changing `num_replicas`.
That's it. Let's take a look at an example:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_model_ensemble.py
```
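Again, in case the example file isn't at hand, here is a condensed sketch of the parallel-call pattern (the sub-models and the averaging logic are made up for illustration, and `asyncio.gather` is used as a compact equivalent of `asyncio.wait()`):
```python
import asyncio

from ray import serve

serve.start()


@serve.deployment
class SubModelA:
    def __call__(self, data):
        # Placeholder "model": constant prediction.
        return 0.2


@serve.deployment
class SubModelB:
    def __call__(self, data):
        return 0.8


@serve.deployment(route_prefix="/ensemble")
class EnsembleModel:
    def __init__(self):
        self.handles = [SubModelA.get_handle(), SubModelB.get_handle()]

    async def __call__(self, request):
        data = await request.json()
        # Kick off all sub-model calls in parallel, then wait for all of them.
        refs = [handle.remote(data) for handle in self.handles]
        predictions = await asyncio.gather(*refs)
        return {"average_prediction": sum(predictions) / len(predictions)}


SubModelA.deploy()
SubModelB.deploy()
EnsembleModel.deploy()
```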
## Integration with Model Registries
Ray Serve is flexible. If you can load your model as a Python
function or class, then you can scale it up and serve it with Ray Serve.
For example, if you are using the
[MLflow Model Registry](https://www.mlflow.org/docs/latest/model-registry.html)
to manage your models, the following wrapper
class will allow you to load a model using its MLflow `Model URI`:
```python
import pandas as pd
import mlflow.pyfunc
@serve.deployment
class MLflowDeployment:
def __init__(self, model_uri):
self.model = mlflow.pyfunc.load_model(model_uri=model_uri)
async def __call__(self, request):
csv_text = await request.body() # The body contains just raw csv text.
df = pd.read_csv(csv_text)
return self.model.predict(df)
model_uri = "models:/my_registered_model/Production"
MLflowDeployment.deploy(model_uri)
```
To serve multiple different MLflow models in the same program, use the `name` option:
```python
MLflowDeployment.options(name="my_mlflow_model_1").deploy(model_uri)
```
:::{tip}
The above approach will work for any model registry, not just MLflow.
Namely, load the model from the registry in `__init__`, and forward the request to the model in `__call__`.
:::
For a complete hands-on and seamless integration with MLflow, try this self-contained example on your laptop.
But first install `mlflow`.
```bash
pip install mlflow
```
```python
# This brief example shows how to deploy models saved in a model registry such as
# MLflow to Ray Serve, using the simple Ray Serve deployment APIs. You can peruse
# the saved models' metrics and parameters in MLflow ui.
#
import json
import numpy as np
import pandas as pd
import requests
import os
import tempfile
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from mlflow.tracking import MlflowClient
from ray import serve
import mlflow
def create_and_save_model():
# load Iris data
iris_data = load_iris()
data, target, target_names = (iris_data['data'],
iris_data['target'],
iris_data['target_names'])
# Instantiate a model
model = GradientBoostingClassifier()
# Training and validation split
    # Shuffle features and labels together so each row keeps its matching label.
    perm = np.random.permutation(len(data))
    data, target = data[perm], target[perm]
train_x, train_y = data[:100], target[:100]
val_x, val_y = data[100:], target[100:]
# Create labels list as file
LABEL_PATH = os.path.join(tempfile.gettempdir(), "iris_labels.json")
with open(LABEL_PATH, "w") as f:
json.dump(target_names.tolist(), f)
# Train the model and save our label list as an MLflow artifact
# mlflow.sklearn.autolog automatically logs all parameters and metrics during
# the training.
mlflow.sklearn.autolog()
with mlflow.start_run() as run:
model.fit(train_x, train_y)
# Log label list as a artifact
mlflow.log_artifact(LABEL_PATH, artifact_path="labels")
return run.info.run_id
#
# Create our Ray Serve deployment class
#
@serve.deployment(route_prefix="/regressor")
class BoostingModel:
def __init__(self, uri):
# Load the model and label artifact from the local
# Mlflow model registry as a PyFunc Model
self.model = mlflow.pyfunc.load_model(model_uri=uri)
# Download the artifact list of labels
local_dir = "/tmp/artifact_downloads"
if not os.path.exists(local_dir):
os.mkdir(local_dir)
client = MlflowClient()
local_path = f"{client.download_artifacts(run_id, 'labels', local_dir)}/iris_labels.json"
with open(local_path, "r") as f:
self.label_list = json.load(f)
async def __call__(self, starlette_request):
payload = await starlette_request.json()
print(f"Worker: received Starlette request with data: {payload}")
# Get the input vector from the payload
input_vector = [
payload["sepal length"],
payload["sepal width"],
payload["petal length"],
payload["petal width"],
]
# Convert the input vector in a Pandas DataFrame for prediction since
# an MLflow PythonFunc model, model.predict(...), takes pandas DataFrame
prediction = self.model.predict(pd.DataFrame([input_vector]))[0]
human_name = self.label_list[prediction]
return {"result": human_name}
if __name__ == '__main__':
# Train and save the model artifacts in MLflow.
# Here our MLflow model registry is local file
# directory ./mlruns
run_id = create_and_save_model()
# Start the Ray Serve instance
serve.start()
# Construct model uri to load the model from our model registry
uri = f"runs:/{run_id}/model"
# Deploy our model.
BoostingModel.deploy(uri)
# Send in a request for labels types virginica, setosa, versicolor
sample_request_inputs = [{
"sepal length": 6.3,
"sepal width": 3.3,
"petal length": 6.0,
"petal width": 2.5},
{
"sepal length": 5.1,
"sepal width": 3.5,
"petal length": 1.4,
"petal width": 0.2},
{
"sepal length": 6.4,
"sepal width": 3.2,
"petal length": 4.5,
"petal width": 1.5},
]
for input_request in sample_request_inputs:
response = requests.get("http://localhost:8000/regressor",
json=input_request)
print(response.text)
print("Launch MLflow ui to see the model parameters, metrics, and artifacts: `mlflow ui` from current directory.")
#output
#{
# "result": "versicolor"
#}
#{
# "result": "virginica"
#}
#{
# "result": "setosa"
#}
#
# Launch MLflow ui to see the model parameters, metrics, and artifacts: `mlflow ui` from current directory.
```
For an even more hands-off and seamless integration with MLflow, check out the
[Ray Serve MLflow deployment plugin](https://github.com/ray-project/mlflow-ray-serve). A full
tutorial is available [here](https://github.com/mlflow/mlflow/tree/master/examples/ray_serve).
## Framework-Specific Tutorials
Ray Serve seamlessly integrates with popular Python ML libraries.
Below are tutorials with some of these frameworks to help get you started.
- [PyTorch Tutorial](serve-pytorch-tutorial)
- [Scikit-Learn Tutorial](serve-sklearn-tutorial)
- [Keras and Tensorflow Tutorial](serve-tensorflow-tutorial)
- [RLlib Tutorial](serve-rllib-tutorial)

View file

@ -1,307 +0,0 @@
=================
Serving ML Models
=================
This section should help you:
- batch requests to optimize performance
- serve multiple models by composing deployments
- serve multiple models by making ensemble deployments
.. contents::
.. _serve-batching:
Request Batching
================
You can also have Ray Serve batch requests for performance, which is especially important for some ML models that run on GPUs. In order to use this feature, you need to do the following two things:
1. Use ``async def`` for your request handling logic to process queries concurrently.
2. Use the ``@serve.batch`` decorator to batch individual queries that come into the replica. The method/function that's decorated should handle a list of requests and return a list of the same length.
.. code-block:: python
@serve.deployment(route_prefix="/increment")
class BatchingExample:
def __init__(self):
self.count = 0
@serve.batch
async def handle_batch(self, requests):
responses = []
for request in requests:
responses.append(request.json())
return responses
async def __call__(self, request):
return await self.handle_batch(request)
BatchingExample.deploy()
Please take a look at :ref:`Batching Tutorial<serve-batch-tutorial>` for a deep
dive.
.. _serve-model-composition:
Model Composition
=================
.. note::
Serve recently added an experimental API for building deployment graphs of multiple models.
Please take a look at the :ref:`Deployment Graph API <serve-deployment-graph>` and try it out!
Ray Serve supports composing individually scalable models into a single model
out of the box. For instance, you can combine multiple models to perform
stacking or ensembles.
To define a higher-level composed model you need to do three things:
1. Define your underlying models (the ones that you will compose together) as
Ray Serve deployments.
2. Define your composed model, using the handles of the underlying models
(see the example below).
3. Define a deployment representing this composed model and query it!
In order to avoid synchronous execution in the composed model (e.g., it's very
slow to make calls to the composed model), you'll need to make the function
asynchronous by using an ``async def``. You'll see this in the example below.
That's it. Let's take a look at an example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_model_composition.py
.. _serve-model-ensemble:
Model Ensemble
=================
Ray Serve supports creating different ensemble models
To define an ensemble of different models you need to do three things:
1. Define your underlying sub models (the ones that make up the ensemble) as
Ray Serve deployments.
2. Define your ensemble model, using the handles of the underlying models
(see the example below).
3. Define a deployment representing this ensemble model and query it!
In order to avoid synchronous execution in the ensemble model, you'll need to make
the function asynchronous by using an ``async def``. In contrast to a composition model,
within an ensemble model, you want to call **all** sub models in parallel. This will be
achieved by sending all prediction calls to the sub models via async by using
``asyncio.wait()``. Each serve deployment used in an ensemble use case is independently
scalable via changing ``num_replicas``.
That's it. Let's take a look at an example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_model_ensemble.py
Integration with Model Registries
=================================
Ray Serve is flexible. If you can load your model as a Python
function or class, then you can scale it up and serve it with Ray Serve.
For example, if you are using the
`MLflow Model Registry <https://www.mlflow.org/docs/latest/model-registry.html>`_
to manage your models, the following wrapper
class will allow you to load a model using its MLflow `Model URI`:
.. code-block:: python
import io

import pandas as pd
import mlflow.pyfunc
from ray import serve

@serve.deployment
class MLflowDeployment:
    def __init__(self, model_uri):
        self.model = mlflow.pyfunc.load_model(model_uri=model_uri)

    async def __call__(self, request):
        csv_bytes = await request.body()  # The body contains just raw csv text.
        df = pd.read_csv(io.BytesIO(csv_bytes))
        return self.model.predict(df)
model_uri = "models:/my_registered_model/Production"
MLflowDeployment.deploy(model_uri)
To serve multiple different MLflow models in the same program, use the ``name`` option:
.. code-block:: python
MLflowDeployment.options(name="my_mlflow_model_1").deploy(model_uri)
.. tip::
The above approach will work for any model registry, not just MLflow.
Namely, load the model from the registry in ``__init__``, and forward the request to the model in ``__call__``.
For a complete hands-on and seamless integration with MLflow, try this self-contained example on your laptop.
But first install ``mlflow``.
.. code-block:: bash
pip install mlflow
.. code-block:: python
# This brief example shows how to deploy models saved in a model registry such as
# MLflow to Ray Serve, using the simple Ray Serve deployment APIs. You can peruse
# the saved models' metrics and parameters in MLflow ui.
#
import json
import numpy as np
import pandas as pd
import requests
import os
import tempfile
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from mlflow.tracking import MlflowClient
from ray import serve
import mlflow
def create_and_save_model():
# load Iris data
iris_data = load_iris()
data, target, target_names = (iris_data['data'],
iris_data['target'],
iris_data['target_names'])
# Instantiate a model
model = GradientBoostingClassifier()
# Training and validation split; shuffle features and labels together so they stay aligned
indices = np.random.permutation(len(data))
data, target = data[indices], target[indices]
train_x, train_y = data[:100], target[:100]
val_x, val_y = data[100:], target[100:]
# Create labels list as file
LABEL_PATH = os.path.join(tempfile.gettempdir(), "iris_labels.json")
with open(LABEL_PATH, "w") as f:
json.dump(target_names.tolist(), f)
# Train the model and save our label list as an MLflow artifact
# mlflow.sklearn.autolog automatically logs all parameters and metrics during
# the training.
mlflow.sklearn.autolog()
with mlflow.start_run() as run:
model.fit(train_x, train_y)
# Log the label list as an artifact
mlflow.log_artifact(LABEL_PATH, artifact_path="labels")
return run.info.run_id
#
# Create our Ray Serve deployment class
#
@serve.deployment(route_prefix="/regressor")
class BoostingModel:
def __init__(self, uri):
# Load the model and label artifact from the local
# Mlflow model registry as a PyFunc Model
self.model = mlflow.pyfunc.load_model(model_uri=uri)
# Download the artifact list of labels
local_dir = "/tmp/artifact_downloads"
if not os.path.exists(local_dir):
os.mkdir(local_dir)
client = MlflowClient()
local_path = f"{client.download_artifacts(run_id, 'labels', local_dir)}/iris_labels.json"
with open(local_path, "r") as f:
self.label_list = json.load(f)
async def __call__(self, starlette_request):
payload = await starlette_request.json()
print(f"Worker: received Starlette request with data: {payload}")
# Get the input vector from the payload
input_vector = [
payload["sepal length"],
payload["sepal width"],
payload["petal length"],
payload["petal width"],
]
# Convert the input vector into a Pandas DataFrame for prediction, since
# an MLflow PyFunc model's model.predict(...) takes a pandas DataFrame
prediction = self.model.predict(pd.DataFrame([input_vector]))[0]
human_name = self.label_list[prediction]
return {"result": human_name}
if __name__ == '__main__':
# Train and save the model artifacts in MLflow.
# Here our MLflow model registry is local file
# directory ./mlruns
run_id = create_and_save_model()
# Start the Ray Serve instance
serve.start()
# Construct model uri to load the model from our model registry
uri = f"runs:/{run_id}/model"
# Deploy our model.
BoostingModel.deploy(uri)
# Send in a request for labels types virginica, setosa, versicolor
sample_request_inputs = [{
"sepal length": 6.3,
"sepal width": 3.3,
"petal length": 6.0,
"petal width": 2.5},
{
"sepal length": 5.1,
"sepal width": 3.5,
"petal length": 1.4,
"petal width": 0.2},
{
"sepal length": 6.4,
"sepal width": 3.2,
"petal length": 4.5,
"petal width": 1.5},
]
for input_request in sample_request_inputs:
response = requests.get("http://localhost:8000/regressor",
json=input_request)
print(response.text)
print("Launch MLflow ui to see the model parameters, metrics, and artifacts: `mlflow ui` from current directory.")
#output
#{
# "result": "versicolor"
#}
#{
# "result": "virginica"
#}
#{
# "result": "setosa"
#}
#
# Launch MLflow ui to see the model parameters, metrics, and artifacts: `mlflow ui` from current directory.
For an even more hands-off and seamless integration with MLflow, check out the
`Ray Serve MLflow deployment plugin <https://github.com/ray-project/mlflow-ray-serve>`__. A full
tutorial is available `here <https://github.com/mlflow/mlflow/tree/master/examples/ray_serve>`__.
Framework-Specific Tutorials
============================
Ray Serve seamlessly integrates with popular Python ML libraries.
Below are tutorials with some of these frameworks to help get you started.
- :ref:`PyTorch Tutorial<serve-pytorch-tutorial>`
- :ref:`Scikit-Learn Tutorial<serve-sklearn-tutorial>`
- :ref:`Keras and Tensorflow Tutorial<serve-tensorflow-tutorial>`
- :ref:`RLlib Tutorial<serve-rllib-tutorial>`

@ -0,0 +1,266 @@
(serve-monitoring)=
# Debugging & Monitoring
This section should help you understand how to debug and monitor your Serve application.
## Ray Dashboard
A high-level way to monitor your Ray Serve application is via the Ray Dashboard.
See the [Ray Dashboard documentation](ray-dashboard) for a detailed overview, including instructions on how to view the dashboard.
Below is an example of what the Ray Dashboard might look like for a Serve deployment:
```{image} https://raw.githubusercontent.com/ray-project/Images/master/docs/dashboard/serve-dashboard.png
:align: center
```
Here you can see the Serve controller actor, an HTTP proxy actor, and all of the replicas for each Serve deployment.
To learn about the function of the controller and proxy actors, see the [Serve Architecture page](serve-architecture).
In the example pictured above, we have a single-node cluster with a deployment named Counter with `num_replicas=2`.
## Logging
:::{note}
For an overview of logging in Ray, see [Ray Logging](ray-logging).
:::
Ray Serve uses Python's standard `logging` facility with the `"ray.serve"` named logger.
By default, logs are emitted from actors both to `stderr` and on disk on each node at `/tmp/ray/session_latest/logs/serve/`.
This includes both system-level logs from the Serve controller and HTTP proxy as well as access logs and custom user logs produced from within deployment replicas.
In development, logs are streamed to the driver Ray program (the program that calls `.deploy()` or `serve.run`, or the `serve run` CLI command) that deployed the deployments, so it's most convenient to keep the driver running for debugging.
For example, let's run a basic Serve application and view the logs that are emitted.
You can run this in an interactive shell like IPython to follow along.
First we call `serve.start()`:
```python
from ray import serve
serve.start()
```
This produces a few INFO-level log messages about startup from the Serve controller.
```bash
2022-04-02 09:10:49,906 INFO services.py:1460 -- View the Ray dashboard at http://127.0.0.1:8265
(ServeController pid=67312) INFO 2022-04-02 09:10:51,386 controller 67312 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.
(ServeController pid=67312) INFO 2022-04-02 09:10:51,492 controller 67312 http_state.py:108 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:xlehoa:SERVE_PROXY_ACTOR-node:127.0.0.1-0' on node 'node:127.0.0.1-0' listening on '127.0.0.1:8000'
```
Next, let's create a simple deployment that logs a custom log message when it's queried:
```python
import logging
logger = logging.getLogger("ray.serve")
@serve.deployment(route_prefix="/")
class SayHello:
def __call__(self, *args):
logger.info("Hello world!")
return "hi"
SayHello.deploy()
```
Running this code block, we first get some log messages from the controller saying that a new replica of the deployment is being created:
```bash
(ServeController pid=67312) INFO 2022-04-02 09:16:13,323 controller 67312 deployment_state.py:1198 - Adding 1 replicas to deployment 'SayHello'.
```
Then when we query the deployment, we get both a default access log as well as our custom `"Hello world!"` message.
Note that these log lines are tagged with the deployment name followed by a unique identifier for the specific replica.
These can be parsed by a logging stack such as ELK or Loki to enable searching logs by deployment and replica.
```bash
handle = SayHello.get_handle()
ray.get(handle.remote())
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh <ipython-input-4-1e8854e5c9ba>:8 - Hello world!
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh replica.py:466 - HANDLE __call__ OK 0.3ms
```
Querying the deployment over HTTP produces a similar access log message from the HTTP proxy:
```bash
curl -X GET http://localhost:8000/
(HTTPProxyActor pid=67315) INFO 2022-04-02 09:20:08,976 http_proxy 127.0.0.1 http_proxy.py:310 - GET / 200 2.6ms
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh <ipython-input-4-1e8854e5c9ba>:8 - Hello world!
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh replica.py:466 - HANDLE __call__ OK 0.3ms
```
You can also view all of these log messages in the files in `/tmp/ray/session_latest/logs/serve/`.
To silence the replica-level logs or otherwise configure logging, configure the `"ray.serve"` logger *from inside the deployment constructor:*
```python
import logging
logger = logging.getLogger("ray.serve")
@serve.deployment
class Silenced:
def __init__(self):
logger.setLevel(logging.ERROR)
```
This will prevent the replica INFO-level logs from being written to STDOUT or to files on disk.
You can also use your own custom logger, in which case you'll need to configure the behavior to write to STDOUT/STDERR, files on disk, or both.
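For instance, a deployment could attach its own file handler in its constructor (the logger name, file path, and format below are illustrative, not a Serve convention):

```python
import logging

from ray import serve

@serve.deployment
class CustomLogging:
    def __init__(self):
        # Configure a dedicated application logger for this replica.
        self.logger = logging.getLogger("my_app")
        self.logger.setLevel(logging.INFO)
        handler = logging.FileHandler("/tmp/my_app.log")
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        )
        self.logger.addHandler(handler)

    def __call__(self, request):
        self.logger.info("Handled a request.")
        return "ok"
```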
### Tutorial: Ray Serve with Loki
Here is a quick walkthrough of how to explore and filter your logs using [Loki](https://grafana.com/oss/loki/).
Setup and configuration is very easy on Kubernetes, but in this tutorial we'll just set things up manually.
First, install Loki and Promtail using the instructions on <https://grafana.com>.
It will be convenient to save the Loki and Promtail executables in the same directory, and to navigate to this directory in your terminal before beginning this walkthrough.
Now let's get our logs into Loki using Promtail.
Save the following file as `promtail-local-config.yaml`:
```yaml
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/positions.yaml
clients:
- url: http://localhost:3100/loki/api/v1/push
scrape_configs:
- job_name: ray
static_configs:
- labels:
job: ray
__path__: /tmp/ray/session_latest/logs/serve/*.*
```
The relevant part for Ray is the `static_configs` field, where we have indicated the location of our log files with `__path__`.
The expression `*.*` will match all files, but not directories, which would cause an error with Promtail.
We will run Loki locally. Grab the default config file for Loki with the following command in your terminal:
```shell
wget https://raw.githubusercontent.com/grafana/loki/v2.1.0/cmd/loki/loki-local-config.yaml
```
Now start Loki:
```shell
./loki-darwin-amd64 -config.file=loki-local-config.yaml
```
Here you may need to replace `./loki-darwin-amd64` with the path to your Loki executable file, which may have a different name depending on your operating system.
Start Promtail and pass in the path to the config file we saved earlier:
```shell
./promtail-darwin-amd64 -config.file=promtail-local-config.yaml
```
As above, you may need to replace `./promtail-darwin-amd64` with the appropriate filename and path.
Now we are ready to start our Ray Serve deployment. Start a long-running Ray cluster and Ray Serve instance in your terminal:
```shell
ray start --head
serve start
```
Now run the following Python script to deploy a basic Serve deployment with a Serve deployment logger:
```{literalinclude} ../../../python/ray/serve/examples/doc/deployment_logger.py
```
Now [install and run Grafana](https://grafana.com/docs/grafana/latest/installation/) and navigate to `http://localhost:3000`, where you can log in with the default username "admin" and default password "admin".
On the welcome page, click "Add your first data source" and click "Loki" to add Loki as a data source.
Now click "Explore" in the left-side panel. You are ready to run some queries!
To filter all these Ray logs for the ones relevant to our deployment, use the following [LogQL](https://grafana.com/docs/loki/latest/logql/) query:
```shell
{job="ray"} |= "Counter"
```
You should see something similar to the following:
```{image} https://raw.githubusercontent.com/ray-project/Images/master/docs/serve/loki-serve.png
:align: center
```
## Metrics
Ray Serve exposes important system metrics like the number of successful and
errored requests through the [Ray metrics monitoring infrastructure](ray-metrics). By default,
the metrics are exposed in Prometheus format on each node.
The following metrics are exposed by Ray Serve:
```{eval-rst}
.. list-table::
:header-rows: 1
* - Name
- Description
* - ``serve_deployment_request_counter``
- The number of queries that have been processed in this replica.
* - ``serve_deployment_error_counter``
- The number of exceptions that have occurred in the deployment.
* - ``serve_deployment_replica_starts``
- The number of times this replica has been restarted due to failure.
* - ``serve_deployment_queuing_latency_ms``
- The latency for queries in the replica's queue waiting to be processed.
* - ``serve_deployment_processing_latency_ms``
- The latency for queries to be processed.
* - ``serve_replica_queued_queries``
- The current number of queries queued in the deployment replicas.
* - ``serve_replica_processing_queries``
- The current number of queries being processed.
* - ``serve_num_http_requests``
- The number of HTTP requests processed.
* - ``serve_num_http_error_requests``
- The number of non-200 HTTP responses.
* - ``serve_num_router_requests``
- The number of requests processed by the router.
* - ``serve_handle_request_counter``
- The number of requests processed by this ServeHandle.
* - ``serve_deployment_queued_queries``
- The number of queries for this deployment waiting to be assigned to a replica.
* - ``serve_num_deployment_http_error_requests``
- The number of non-200 HTTP responses returned by each deployment.
```
To see this in action, run `ray start --head --metrics-export-port=8080` in your terminal, and then run the following script:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_metrics.py
```
In your web browser, navigate to `localhost:8080`.
In the output there, you can search for `serve_` to locate the metrics above.
The metrics are updated once every ten seconds, and you will need to refresh the page to see the new values.
For example, after running the script for some time and refreshing `localhost:8080` you might see something that looks like:
```
ray_serve_deployment_processing_latency_ms_count{...,deployment="f",...} 99.0
ray_serve_deployment_processing_latency_ms_sum{...,deployment="f",...} 99279.30498123169
```
which indicates that the average processing latency is just over one second, as expected.
You can even define a [custom metric](application-level-metrics) to use in your deployment, and tag it with the current deployment or replica.
Here's an example:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_custom_metric.py
:end-before: __custom_metrics_deployment_end__
:start-after: __custom_metrics_deployment_start__
```
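If you don't have the snippet file handy, a rough sketch of the idea (not the exact contents of the linked snippet) looks like this, using `ray.util.metrics` and tagging the counter with the deployment name:

```python
from ray import serve
from ray.util import metrics

@serve.deployment
class RequestCounter:
    def __init__(self):
        self.counter = metrics.Counter(
            "my_request_counter",
            description="Number of requests served by this replica.",
            tag_keys=("deployment",),
        )
        self.counter.set_default_tags(
            {"deployment": serve.get_replica_context().deployment}
        )

    def __call__(self, request):
        # Each call increments the custom counter, which is exported
        # alongside the built-in Serve metrics.
        self.counter.inc()
        return "hello"
```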
See the
[Ray Metrics documentation](ray-metrics) for more details, including instructions for scraping these metrics using Prometheus.

@ -1,269 +0,0 @@
.. _serve-monitoring:
======================
Debugging & Monitoring
======================
This section should help you understand how to debug and monitor your Serve application.
Ray Dashboard
=============
A high-level way to monitor your Ray Serve application is via the Ray Dashboard.
See the `Ray Dashboard documentation <../ray-dashboard.html>`__ for a detailed overview, including instructions on how to view the dashboard.
Below is an example of what the Ray Dashboard might look like for a Serve deployment:
.. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/dashboard/serve-dashboard.png
:align: center
Here you can see the Serve controller actor, an HTTP proxy actor, and all of the replicas for each Serve deployment.
To learn about the function of the controller and proxy actors, see the `Serve Architecture page <architecture.html>`__.
In this example pictured above, we have a single-node cluster with a deployment named Counter with ``num_replicas=2``.
Logging
=======
.. note::
For an overview of logging in Ray, see `Ray Logging <../ray-logging.html>`__.
Ray Serve uses Python's standard ``logging`` facility with the ``"ray.serve"`` named logger.
By default, logs are emitted from actors both to ``stderr`` and on disk on each node at ``/tmp/ray/session_latest/logs/serve/``.
This includes both system-level logs from the Serve controller and HTTP proxy as well as access logs and custom user logs produced from within deployment replicas.
In development, logs are streamed to the driver Ray program (the program that calls ``.deploy()`` or ``serve.run``, or the ``serve run`` CLI command) that deployed the deployments, so it's most convenient to keep the driver running for debugging.
For example, let's run a basic Serve application and view the logs that are emitted.
You can run this in an interactive shell like IPython to follow along.
First we call ``serve.start()``:
.. code-block:: python
from ray import serve
serve.start()
This produces a few INFO-level log messages about startup from the Serve controller.
.. code-block:: bash
2022-04-02 09:10:49,906 INFO services.py:1460 -- View the Ray dashboard at http://127.0.0.1:8265
(ServeController pid=67312) INFO 2022-04-02 09:10:51,386 controller 67312 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.
(ServeController pid=67312) INFO 2022-04-02 09:10:51,492 controller 67312 http_state.py:108 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:xlehoa:SERVE_PROXY_ACTOR-node:127.0.0.1-0' on node 'node:127.0.0.1-0' listening on '127.0.0.1:8000'
Next, let's create a simple deployment that logs a custom log message when it's queried:
.. code-block:: python
import logging
logger = logging.getLogger("ray.serve")
@serve.deployment(route_prefix="/")
class SayHello:
def __call__(self, *args):
logger.info("Hello world!")
return "hi"
SayHello.deploy()
Running this code block, we first get some log messages from the controller saying that a new replica of the deployment is being created:
.. code-block:: bash
(ServeController pid=67312) INFO 2022-04-02 09:16:13,323 controller 67312 deployment_state.py:1198 - Adding 1 replicas to deployment 'SayHello'.
Then when we query the deployment, we get both a default access log as well as our custom ``"Hello world!"`` message.
Note that these log lines are tagged with the deployment name followed by a unique identifier for the specific replica.
These can be parsed by a logging stack such as ELK or Loki to enable searching logs by deployment and replica.
.. code-block:: bash
handle = SayHello.get_handle()
ray.get(handle.remote())
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh <ipython-input-4-1e8854e5c9ba>:8 - Hello world!
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh replica.py:466 - HANDLE __call__ OK 0.3ms
Querying the deployment over HTTP produces a similar access log message from the HTTP proxy:
.. code-block:: bash
curl -X GET http://localhost:8000/
(HTTPProxyActor pid=67315) INFO 2022-04-02 09:20:08,976 http_proxy 127.0.0.1 http_proxy.py:310 - GET / 200 2.6ms
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh <ipython-input-4-1e8854e5c9ba>:8 - Hello world!
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh replica.py:466 - HANDLE __call__ OK 0.3ms
You can also be able to view all of these log messages in the files in ``/tmp/ray/session_latest/logs/serve/``.
To silence the replica-level logs or otherwise configure logging, configure the ``"ray.serve"`` logger *from inside the deployment constructor:*
.. code-block:: python
import logging
logger = logging.getLogger("ray.serve")
@serve.deployment
class Silenced:
def __init__(self):
logger.setLevel(logging.ERROR)
This will prevent the replica INFO-level logs from being written to STDOUT or to files on disk.
You can also use your own custom logger, in which case you'll need to configure the behavior to write to STDOUT/STDERR, files on disk, or both.
Tutorial: Ray Serve with Loki
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is a quick walkthrough of how to explore and filter your logs using `Loki <https://grafana.com/oss/loki/>`__.
Setup and configuration is very easy on Kubernetes, but in this tutorial we'll just set things up manually.
First, install Loki and Promtail using the instructions on https://grafana.com.
It will be convenient to save the Loki and Promtail executables in the same directory, and to navigate to this directory in your terminal before beginning this walkthrough.
Now let's get our logs into Loki using Promtail.
Save the following file as ``promtail-local-config.yaml``:
.. code-block:: yaml
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/positions.yaml
clients:
- url: http://localhost:3100/loki/api/v1/push
scrape_configs:
- job_name: ray
static_configs:
- labels:
job: ray
__path__: /tmp/ray/session_latest/logs/serve/*.*
The relevant part for Ray is the ``static_configs`` field, where we have indicated the location of our log files with ``__path__``.
The expression ``*.*`` will match all files, but not directories, which cause an error with Promtail.
We will run Loki locally. Grab the default config file for Loki with the following command in your terminal:
.. code-block:: shell
wget https://raw.githubusercontent.com/grafana/loki/v2.1.0/cmd/loki/loki-local-config.yaml
Now start Loki:
.. code-block:: shell
./loki-darwin-amd64 -config.file=loki-local-config.yaml
Here you may need to replace ``./loki-darwin-amd64`` with the path to your Loki executable file, which may have a different name depending on your operating system.
Start Promtail and pass in the path to the config file we saved earlier:
.. code-block:: shell
./promtail-darwin-amd64 -config.file=promtail-local-config.yaml
As above, you may need to replace ``./promtail-darwin-amd64`` with the appropriate filename and path.
Now we are ready to start our Ray Serve deployment. Start a long-running Ray cluster and Ray Serve instance in your terminal:
.. code-block:: shell
ray start --head
serve start
Now run the following Python script to deploy a basic Serve deployment with a Serve deployment logger:
.. literalinclude:: ../../../python/ray/serve/examples/doc/deployment_logger.py
Now `install and run Grafana <https://grafana.com/docs/grafana/latest/installation/>`__ and navigate to ``http://localhost:3000``, where you can log in with the default username "admin" and default password "admin".
On the welcome page, click "Add your first data source" and click "Loki" to add Loki as a data source.
Now click "Explore" in the left-side panel. You are ready to run some queries!
To filter all these Ray logs for the ones relevant to our deployment, use the following `LogQL <https://grafana.com/docs/loki/latest/logql/>`__ query:
.. code-block:: shell
{job="ray"} |= "Counter"
You should see something similar to the following:
.. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/serve/loki-serve.png
:align: center
Metrics
=======
Ray Serve exposes important system metrics like the number of successful and
errored requests through the `Ray metrics monitoring infrastructure <../ray-metrics.html>`__. By default,
the metrics are exposed in Prometheus format on each node.
The following metrics are exposed by Ray Serve:
.. list-table::
:header-rows: 1
* - Name
- Description
* - ``serve_deployment_request_counter``
- The number of queries that have been processed in this replica.
* - ``serve_deployment_error_counter``
- The number of exceptions that have occurred in the deployment.
* - ``serve_deployment_replica_starts``
- The number of times this replica has been restarted due to failure.
* - ``serve_deployment_queuing_latency_ms``
- The latency for queries in the replica's queue waiting to be processed.
* - ``serve_deployment_processing_latency_ms``
- The latency for queries to be processed.
* - ``serve_replica_queued_queries``
- The current number of queries queued in the deployment replicas.
* - ``serve_replica_processing_queries``
- The current number of queries being processed.
* - ``serve_num_http_requests``
- The number of HTTP requests processed.
* - ``serve_num_http_error_requests``
- The number of non-200 HTTP responses.
* - ``serve_num_router_requests``
- The number of requests processed by the router.
* - ``serve_handle_request_counter``
- The number of requests processed by this ServeHandle.
* - ``serve_deployment_queued_queries``
- The number of queries for this deployment waiting to be assigned to a replica.
* - ``serve_num_deployment_http_error_requests``
- The number of non-200 HTTP responses returned by each deployment.
To see this in action, run ``ray start --head --metrics-export-port=8080`` in your terminal, and then run the following script:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_metrics.py
In your web browser, navigate to ``localhost:8080``.
In the output there, you can search for ``serve_`` to locate the metrics above.
The metrics are updated once every ten seconds, and you will need to refresh the page to see the new values.
For example, after running the script for some time and refreshing ``localhost:8080`` you might see something that looks like::
ray_serve_deployment_processing_latency_ms_count{...,deployment="f",...} 99.0
ray_serve_deployment_processing_latency_ms_sum{...,deployment="f",...} 99279.30498123169
which indicates that the average processing latency is just over one second, as expected.
You can even define a :ref:`custom metric <application-level-metrics>` to use in your deployment, and tag it with the current deployment or replica.
Here's an example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_custom_metric.py
:start-after: __custom_metrics_deployment_start__
:end-before: __custom_metrics_deployment_end__
See the
:ref:`Ray Metrics documentation <ray-metrics>` for more details, including instructions for scraping these metrics using Prometheus.

@ -1,29 +1,47 @@
Ray Serve API
=============
# Ray Serve API
Core APIs
---------
## Core APIs
```{eval-rst}
.. autofunction:: ray.serve.start
```
```{eval-rst}
.. autofunction:: ray.serve.deployment
```
```{eval-rst}
.. autofunction:: ray.serve.list_deployments
```
```{eval-rst}
.. autofunction:: ray.serve.get_deployment
```
```{eval-rst}
.. autofunction:: ray.serve.shutdown
```
.. _`deployment-api`:
(deployment-api)=
Deployment API
--------------
## Deployment API
```{eval-rst}
.. autoclass:: ray.serve.deployment.Deployment
:members: deploy, delete, options, get_handle
```
.. _`servehandle-api`:
(servehandle-api)=
ServeHandle API
---------------
## ServeHandle API
```{eval-rst}
.. autoclass:: ray.serve.handle.RayServeHandle
:members: remote, options
```
Batching Requests
-----------------
## Batching Requests
```{eval-rst}
.. autofunction:: ray.serve.batch(max_batch_size=10, batch_wait_timeout_s=0.0)
```

@ -1,79 +1,80 @@
Performance Tuning
==================
# Performance Tuning
This section should help you:
- understand the performance characteristics of Ray Serve
- find ways to debug and tune the performance of your Serve deployment
.. note::
While this section offers some tips and tricks to improve the performance of your Serve deployment,
the :ref:`architecture doc <serve-architecture>` is helpful to gain a deeper understanding of these contexts and parameters.
:::{note}
While this section offers some tips and tricks to improve the performance of your Serve deployment,
the [architecture doc](serve-architecture) is helpful to gain a deeper understanding of these contexts and parameters.
:::
.. contents::
```{contents}
```
## Performance and known benchmarks
Performance and known benchmarks
--------------------------------
We are continuously benchmarking Ray Serve. The metrics we care about are latency, throughput, and scalability. We can confidently say:
- Ray Serves latency overhead is single digit milliseconds, around 1-2 milliseconds on average.
- For throughput, Serve achieves about 3-4k queries per second on a single machine (8 cores) using 1 http proxy and 8 replicas performing noop requests.
- It is horizontally scalable so you can add more machines to increase the overall throughput. Ray Serve is built on top of Ray,
so its scalability is bounded by Rays scalability. Please check out Rays `scalability envelope <https://github.com/ray-project/ray/blob/master/release/benchmarks/README.md>`_
- It is horizontally scalable so you can add more machines to increase the overall throughput. Ray Serve is built on top of Ray,
so its scalability is bounded by Rays scalability. Please check out Rays [scalability envelope](https://github.com/ray-project/ray/blob/master/release/benchmarks/README.md)
to learn more about the maximum number of nodes and other limitations.
You can check out our `microbenchmark instruction <https://github.com/ray-project/ray/blob/master/python/ray/serve/benchmarks/README.md>`_
You can check out our [microbenchmark instruction](https://github.com/ray-project/ray/blob/master/python/ray/serve/benchmarks/README.md)
to benchmark on your hardware.
Debugging performance issues
----------------------------
## Debugging performance issues
The performance issue you're most likely to encounter is high latency and/or low throughput for requests.
If you have set up :ref:`monitoring <serve-monitoring>` with Ray and Ray Serve, you will likely observe that
``serve_num_router_requests`` is constant while your load increases
``serve_deployment_queuing_latency_ms`` is spiking up as queries queue up in the background
If you have set up [monitoring](serve-monitoring) with Ray and Ray Serve, you will likely observe that

- `serve_num_router_requests` stays constant while your load increases, and
- `serve_deployment_queuing_latency_ms` spikes up as queries queue up in the background.
Given the symptom, there are several ways to fix it.
Choosing the right hardware
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Make sure you are using the right hardware and resources.
Are you using GPUs (``actor_init_options={“num_gpus”: 1}``) or 1+ cores (``actor_init_options={“num_cpus”: 2}``, and setting ``OMP_NUM_THREADS``)
### Choosing the right hardware
Make sure you are using the right hardware and resources.
Are you using GPUs (`actor_init_options={"num_gpus": 1}`) or 1+ cores (`actor_init_options={"num_cpus": 2}`, and setting `OMP_NUM_THREADS`)
to increase the performance of your deep learning framework?
Async functions
^^^^^^^^^^^^^^^
Are you using ``async def`` in your callable? If you are using asyncio and
hitting the same queuing issue mentioned above, you might want to increase
``max_concurrent_queries``. Serve sets a low number by default so the client gets
### Async functions
Are you using `async def` in your callable? If you are using asyncio and
hitting the same queuing issue mentioned above, you might want to increase
`max_concurrent_queries`. Serve sets a low number by default so the client gets
proper backpressure. You can increase the value in the Deployment decorator.
Batching
^^^^^^^^
If your deployment can process a batch at a time at a sublinear latency
(for example, if it takes 1ms to process 1 query and 5ms to process 10 of them)
then batching is your best approach. Check out the :ref:`batching guide <serve-batching>` to
make your deployment accept batches (especially for GPU-based ML inference). You might want to tune your ``max_batch_size`` and ``batch_wait_timeout`` in the ``@serve.batch`` decorator to maximize the benefits:
### Batching
- ``max_batch_size`` specifies how big the batch should be. Generally,
we recommend choosing the largest batch size your function can handle
AND the performance improvement is no longer sublinear. Take a dummy
If your deployment can process a batch at a time at a sublinear latency
(for example, if it takes 1ms to process 1 query and 5ms to process 10 of them)
then batching is your best approach. Check out the [batching guide](serve-batching) to
make your deployment accept batches (especially for GPU-based ML inference). You might want to tune your `max_batch_size` and `batch_wait_timeout_s` in the `@serve.batch` decorator to maximize the benefits:
- `max_batch_size` specifies how big the batch should be. Generally,
we recommend choosing the largest batch size your function can handle
AND the performance improvement is no longer sublinear. Take a dummy
example: suppose it takes 1ms to process 1 query, 5ms to process 10 queries,
and 6ms to process 11 queries. Here you should set the batch size to 10
and 6ms to process 11 queries. Here you should set the batch size to 10
because adding more queries wont improve the performance.
- ``batch_wait_timeout`` specifies how the maximum amount of time to wait before
a batch should be processed, even if its not full. It should be set according
to `batch-wait-timeout + full batch processing time ~= expected latency`. The idea
here is to have the first query wait for the longest possible time to achieve high throughput.
This means you should set ``batch_wait_timeout`` as large as possible without exceeding your desired expected latency in the equation above.
- `batch_wait_timeout_s` specifies the maximum amount of time to wait before
a batch is processed, even if it's not full. It should be set so that
`batch_wait_timeout_s + full batch processing time ~= expected latency`. The idea
here is to have the first query wait for the longest possible time to achieve high throughput.
This means you should set `batch_wait_timeout_s` as large as possible without exceeding your desired expected latency in the equation above.
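Putting the two knobs together, a tuned batching decorator might look like the sketch below (the numbers and the toy model are illustrative starting points, not recommendations):

```python
from ray import serve

@serve.deployment
class VectorizedModel:
    @serve.batch(max_batch_size=10, batch_wait_timeout_s=0.01)
    async def handle_batch(self, inputs):
        # Process the whole batch at once and return one result per input.
        return [i * 2 for i in inputs]

    async def __call__(self, request):
        return await self.handle_batch(await request.json())

# Assumes serve.start() has already been called.
VectorizedModel.deploy()
```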
### Scaling HTTP servers
Scaling HTTP servers
^^^^^^^^^^^^^^^^^^^^
Sometimes its not about your code: Serves HTTP server can become the bottleneck.
If you observe that the CPU utilization of the HTTPProxy actor spikes up to 100%, the HTTP server is the bottleneck.
Serve only starts a single HTTP server on the Ray head node by default.
This single HTTP server can handle about 3k queries per second.
Serve only starts a single HTTP server on the Ray head node by default.
This single HTTP server can handle about 3k queries per second.
If your workload exceeds this number, you might want to consider starting one
HTTP server per Ray node to spread the load by ``serve.start(http_options={“location”: “EveryNode”})``.
This configuration tells Serve to spawn one HTTP server per node.
HTTP server per Ray node to spread the load by `serve.start(http_options={"location": "EveryNode"})`.
This configuration tells Serve to spawn one HTTP server per node.
You should put an external load balancer in front of it.

@ -0,0 +1,105 @@
(serve-batch-tutorial)=
# Batching Tutorial
In this guide, we will deploy a simple vectorized adder that takes
a batch of queries and adds them at once. In particular, we show:
- How to implement and deploy a Ray Serve deployment that accepts batches.
- How to configure the batch size.
- How to query the model in Python.
This tutorial should help with the following use cases:
- You want to perform offline batch inference on a cluster of machines.
- You want to serve online queries and your model can take advantage of batching.
For example, linear regressions and neural networks use CPU and GPU's
vectorized instructions to perform computation in parallel. Performing
inference with batching can increase the *throughput* of the model as well as
*utilization* of the hardware.
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
You can use the `@serve.batch` decorator to annotate a function or a method.
This annotation will automatically cause calls to the function to be batched together.
The decorated method must take a list of objects and return a list of results of the same length; callers still invoke it with a single object, and Serve groups those individual calls into batches.
This function must also be `async def` so that you can handle multiple queries concurrently:
```python
@serve.batch
async def my_batch_handler(self, requests: List):
pass
```
This batch handler can then be called from another `async def` method in your deployment.
These calls will be batched and executed together, but return an individual result as if
they were a normal function call:
```python
class MyBackend:
@serve.batch
async def my_batch_handler(self, requests: List):
results = []
for request in requests:
results.append(request.json())
return results
async def __call__(self, request):
return await self.my_batch_handler(request)
```
:::{note}
By default, Ray Serve performs *opportunistic batching*. This means that as
soon as the batch handler is called, the method will be executed without
waiting for a full batch. If there are more queries available after this call
finishes, a larger batch may be executed. This behavior can be tuned using the
`batch_wait_timeout_s` option to `@serve.batch` (defaults to 0). Increasing this
timeout may improve throughput at the cost of latency under low load.
:::
Let's define a deployment that takes in a list of requests, extracts the input value,
converts them into an array, and uses NumPy to add 1 to each element.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
Let's deploy it. Note that in the `@serve.batch` decorator, we are
specifying the maximum batch size via `max_batch_size=4`. This option limits
the maximum possible batch size that will be executed at once.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's define a [Ray remote task](ray-remote-functions) to send queries in
parallel. As you can see, the first batch has a batch size of 1, and the subsequent
queries have a batch size of 4. Even though each query is issued independently,
Ray Serve was able to evaluate them in batches.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
```
What if you want to evaluate a whole batch in Python? Ray Serve allows you to send
queries via the Python API. A batch of queries can either come from the web server
or the Python API. Learn more [here](serve-handle-explainer).
To query the deployment via the Python API, we can use `Deployment.get_handle` to receive
a handle to the corresponding deployment. To enqueue a query, you can call
`handle.method.remote(data)`. This call returns immediately
with a [Ray ObjectRef](ray-object-refs). You can call `ray.get` to retrieve
the result.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_query_handle_end__
:start-after: __doc_query_handle_begin__
```

@ -1,101 +0,0 @@
.. _serve-batch-tutorial:
Batching Tutorial
=================
In this guide, we will deploy a simple vectorized adder that takes
a batch of queries and adds them at once. In particular, we show:
- How to implement and deploy a Ray Serve deployment that accepts batches.
- How to configure the batch size.
- How to query the model in Python.
This tutorial should help the following use cases:
- You want to perform offline batch inference on a cluster of machines.
- You want to serve online queries and your model can take advantage of batching.
For example, linear regressions and neural networks use CPU and GPU's
vectorized instructions to perform computation in parallel. Performing
inference with batching can increase the *throughput* of the model as well as
*utilization* of the hardware.
Let's import Ray Serve and some other helpers.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_import_begin__
:end-before: __doc_import_end__
You can use the ``@serve.batch`` decorator to annotate a function or a method.
This annotation will automatically cause calls to the function to be batched together.
The function must handle a list of objects and will be called with a single object.
This function must also be ``async def`` so that you can handle multiple queries concurrently:
.. code-block:: python
@serve.batch
async def my_batch_handler(self, requests: List):
pass
This batch handler can then be called from another ``async def`` method in your deployment.
These calls will be batched and executed together, but return an individual result as if
they were a normal function call:
.. code-block:: python
class MyBackend:
@serve.batch
async def my_batch_handler(self, requests: List):
results = []
for request in requests:
results.append(request.json())
return results
async def __call__(self, request):
await self.my_batch_handler(request)
.. note::
By default, Ray Serve performs *opportunistic batching*. This means that as
soon as the batch handler is called, the method will be executed without
waiting for a full batch. If there are more queries available after this call
finishes, a larger batch may be executed. This behavior can be tuned using the
``batch_wait_timeout_s`` option to ``@serve.batch`` (defaults to 0). Increasing this
timeout may improve throughput at the cost of latency under low load.
Let's define a deployment that takes in a list of requests, extracts the input value,
converts them into an array, and uses NumPy to add 1 to each element.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_define_servable_begin__
:end-before: __doc_define_servable_end__
Let's deploy it. Note that in the ``@serve.batch`` decorator, we are specifying
specifying the maximum batch size via ``max_batch_size=4``. This option limits
the maximum possible batch size that will be executed at once.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_deploy_begin__
:end-before: __doc_deploy_end__
Let's define a :ref:`Ray remote task<ray-remote-functions>` to send queries in
parallel. As you can see, the first batch has a batch size of 1, and the subsequent
queries have a batch size of 4. Even though each query is issued independently,
Ray Serve was able to evaluate them in batches.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_query_begin__
:end-before: __doc_query_end__
What if you want to evaluate a whole batch in Python? Ray Serve allows you to send
queries via the Python API. A batch of queries can either come from the web server
or the Python API. Learn more :ref:`here<serve-handle-explainer>`.
To query the deployment via the Python API, we can use ``Deployment.get_handle`` to receive
a handle to the corresponding deployment. To enqueue a query, you can call
``handle.method.remote(data)``. This call returns immediately
with a :ref:`Ray ObjectRef<ray-object-refs>`. You can call `ray.get` to retrieve
the result.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_query_handle_begin__
:end-before: __doc_query_handle_end__

@ -0,0 +1,22 @@
# Advanced Tutorials
Below is a list of tutorials that you can use to learn more about the different pieces of
Ray Serve functionality and how to integrate different modeling frameworks.
```{toctree}
:caption: Serve Tutorials
:maxdepth: '-1'
:name: serve-tutorials
tensorflow
pytorch
sklearn
batch
web-server-integration
rllib
gradio
```
Other Topics:
- {doc}`../deployment`

@ -1,23 +0,0 @@
==================
Advanced Tutorials
==================
Below is a list of tutorials that you can use to learn more about the different pieces of
Ray Serve functionality and how to integrate different modeling frameworks.
.. toctree::
:caption: Serve Tutorials
:name: serve-tutorials
:maxdepth: -1
tensorflow
pytorch
sklearn
batch
web-server-integration
rllib
gradio
Other Topics:
- :doc:`../deployment`

@ -0,0 +1,48 @@
(serve-pytorch-tutorial)=
# PyTorch Tutorial
In this guide, we will load and serve a PyTorch Resnet Model.
In particular, we show:
- How to load the model from PyTorch's pre-trained modelzoo.
- How to parse the JSON request, transform the payload, and evaluate it with the model.
Please see the {doc}`../core-apis` to learn more general information about Ray Serve.
This tutorial requires Pytorch and Torchvision installed in your system. Ray Serve
is framework agnostic and works with any version of PyTorch.
```bash
pip install torch torchvision
```
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
Services are just defined as normal classes with `__init__` and `__call__` methods.
The `__call__` method will be invoked per request.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
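If you don't have the repository checked out, a rough sketch of such a servable (assuming torchvision's `resnet18` and standard ImageNet preprocessing; not the exact contents of the linked file) might look like this:

```python
from io import BytesIO

from PIL import Image
import torch
from torchvision import models, transforms

from ray import serve

@serve.deployment(route_prefix="/image_predict")
class ImageModel:
    def __init__(self):
        self.model = models.resnet18(pretrained=True).eval()
        self.preprocessor = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    async def __call__(self, starlette_request):
        # The request body is expected to contain the raw image bytes.
        image_payload_bytes = await starlette_request.body()
        pil_image = Image.open(BytesIO(image_payload_bytes)).convert("RGB")
        input_tensor = self.preprocessor(pil_image).unsqueeze(0)
        with torch.no_grad():
            output = self.model(input_tensor)
        return {"class_index": int(torch.argmax(output[0]))}
```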
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's query it!
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
```

@ -1,46 +0,0 @@
.. _serve-pytorch-tutorial:
PyTorch Tutorial
================
In this guide, we will load and serve a PyTorch Resnet Model.
In particular, we show:
- How to load the model from PyTorch's pre-trained modelzoo.
- How to parse the JSON request, transform the payload and evaluated in the model.
Please see the :doc:`../core-apis` to learn more general information about Ray Serve.
This tutorial requires Pytorch and Torchvision installed in your system. Ray Serve
is framework agnostic and works with any version of PyTorch.
.. code-block:: bash
pip install torch torchvision
Let's import Ray Serve and some other helpers.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:start-after: __doc_import_begin__
:end-before: __doc_import_end__
Services are just defined as normal classes with ``__init__`` and ``__call__`` methods.
The ``__call__`` method will be invoked per request.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:start-after: __doc_define_servable_begin__
:end-before: __doc_define_servable_end__
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:start-after: __doc_deploy_begin__
:end-before: __doc_deploy_end__
Let's query it!
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:start-after: __doc_query_begin__
:end-before: __doc_query_end__

@ -91,7 +91,7 @@ class ServePPOModel:
:::{tip}
Although we used a single input and `trainer.compute_single_action(...)` here, you
can process a batch of input using Ray Serve's {ref}`batching<serve-batching>` feature
can process a batch of input using Ray Serve's [batching](serve-batching) feature
and use `trainer.compute_actions(...)` to process a batch of inputs.
:::

@ -0,0 +1,54 @@
(serve-sklearn-tutorial)=
# Scikit-Learn Tutorial
In this guide, we will train and deploy a simple Scikit-Learn classifier.
In particular, we show:
- How to load the model from file system in your Ray Serve definition
- How to parse the JSON request and evaluate it with the scikit-learn model
Please see the {doc}`../core-apis` to learn more general information about Ray Serve.
Ray Serve is framework agnostic. You can use any version of sklearn.
```bash
pip install scikit-learn
```
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
We will train a logistic regression with the iris dataset.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_train_model_end__
:start-after: __doc_train_model_begin__
```
Services are just defined as normal classes with `__init__` and `__call__` methods.
The `__call__` method will be invoked per request.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
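As a rough, illustrative sketch (not the exact contents of the linked file), such a servable could load a model saved with `joblib` and score JSON feature payloads:

```python
import joblib
import pandas as pd

from ray import serve

@serve.deployment(route_prefix="/iris")
class SklearnClassifier:
    def __init__(self, model_path: str):
        # `model_path` is a hypothetical path to a model saved with joblib.dump.
        self.model = joblib.load(model_path)

    async def __call__(self, starlette_request):
        payload = await starlette_request.json()
        # Expect a flat dict of feature name -> value.
        input_df = pd.DataFrame([payload])
        prediction = self.model.predict(input_df)[0]
        return {"prediction": int(prediction)}
```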
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's query it!
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
```

@ -1,50 +0,0 @@
.. _serve-sklearn-tutorial:
Scikit-Learn Tutorial
=====================
In this guide, we will train and deploy a simple Scikit-Learn classifier.
In particular, we show:
- How to load the model from file system in your Ray Serve definition
- How to parse the JSON request and evaluated in sklearn model
Please see the :doc:`../core-apis` to learn more general information about Ray Serve.
Ray Serve is framework agnostic. You can use any version of sklearn.
.. code-block:: bash
pip install scikit-learn
Let's import Ray Serve and some other helpers.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_import_begin__
:end-before: __doc_import_end__
We will train a logistic regression with the iris dataset.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_train_model_begin__
:end-before: __doc_train_model_end__
Services are just defined as normal classes with ``__init__`` and ``__call__`` methods.
The ``__call__`` method will be invoked per request.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_define_servable_begin__
:end-before: __doc_define_servable_end__
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_deploy_begin__
:end-before: __doc_deploy_end__
Let's query it!
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_query_begin__
:end-before: __doc_query_end__

@ -0,0 +1,56 @@
(serve-tensorflow-tutorial)=
# Keras and Tensorflow Tutorial
In this guide, we will train and deploy a simple Tensorflow neural net.
In particular, we show:
- How to load the model from file system in your Ray Serve definition
- How to parse the JSON request and evaluate it with Tensorflow
Please see the {doc}`../core-apis` to learn more general information about Ray Serve.
Ray Serve is framework agnostic -- you can use any version of Tensorflow.
However, for this tutorial, we use Tensorflow 2 and Keras. Please make sure you have
Tensorflow 2 installed.
```bash
pip install "tensorflow>=2.0"
```
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
We will train a simple MNIST model using Keras.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_train_model_end__
:start-after: __doc_train_model_begin__
```
Services are just defined as normal classes with `__init__` and `__call__` methods.
The `__call__` method will be invoked per request.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
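As a rough, illustrative sketch (not the exact contents of the linked file; the saved-model path and input shape are assumptions), the servable could look like this:

```python
import numpy as np
from tensorflow import keras

from ray import serve

TRAINED_MODEL_PATH = "/tmp/mnist_model.h5"  # hypothetical location of the saved model

@serve.deployment(route_prefix="/mnist")
class TFMnistModel:
    def __init__(self, model_path: str):
        self.model = keras.models.load_model(model_path)

    async def __call__(self, starlette_request):
        payload = await starlette_request.json()
        # Expect {"array": [...]} containing a flattened 28x28 image.
        input_array = np.array(payload["array"]).reshape((1, 28, 28))
        prediction = self.model(input_array).numpy().tolist()
        return {"prediction": prediction}
```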
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's query it!
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
```

@ -1,53 +0,0 @@
.. _serve-tensorflow-tutorial:
Keras and Tensorflow Tutorial
=============================
In this guide, we will train and deploy a simple Tensorflow neural net.
In particular, we show:
- How to load the model from file system in your Ray Serve definition
- How to parse the JSON request and evaluated in Tensorflow
Please see the :doc:`../core-apis` to learn more general information about Ray Serve.
Ray Serve is framework agnostic -- you can use any version of Tensorflow.
However, for this tutorial, we use Tensorflow 2 and Keras. Please make sure you have
Tensorflow 2 installed.
.. code-block:: bash
pip install "tensorflow>=2.0"
Let's import Ray Serve and some other helpers.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_import_begin__
:end-before: __doc_import_end__
We will train a simple MNIST model using Keras.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_train_model_begin__
:end-before: __doc_train_model_end__
Services are just defined as normal classes with ``__init__`` and ``__call__`` methods.
The ``__call__`` method will be invoked per request.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_define_servable_begin__
:end-before: __doc_define_servable_end__
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_deploy_begin__
:end-before: __doc_deploy_end__
Let's query it!
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_query_begin__
:end-before: __doc_query_end__

@ -0,0 +1,33 @@
(serve-web-server-integration-tutorial)=
# Integration with Existing Web Servers
In this guide, you will learn how to use Ray Serve to scale up your existing web application. The key feature of Ray Serve that makes this possible is the Python-native {ref}`servehandle-api`, which allows you to keep using your existing Python web server while offloading your heavy computation to Ray Serve.
We give two examples, one using a [FastAPI](https://fastapi.tiangolo.com/) web server and another using an [AIOHTTP](https://docs.aiohttp.org/en/stable/) web server, but the same approach will work with any Python web server.
## Scaling Up a FastAPI Application
Ray Serve has a native integration with FastAPI - please see {ref}`serve-fastapi-http`.
## Scaling Up an AIOHTTP Application
In this section, we'll integrate Ray Serve with an [AIOHTTP](https://docs.aiohttp.org/en/stable/) web server run using [Gunicorn](https://gunicorn.org/). You'll need to install AIOHTTP and gunicorn with the command `pip install aiohttp gunicorn`.
First, here is the script that deploys Ray Serve:
```{literalinclude} ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_deploy_serve.py
```
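The deploy script itself is only referenced by the literalinclude; a minimal sketch of what such a script might contain is below. The deployment name `my_model` and its response text are chosen to line up with step 4 of the run instructions, but they are assumptions rather than the example's actual code, and depending on your Ray version you may also need to pass a `namespace` to `ray.init`:
```python
import ray
from ray import serve

# Connect to the cluster started by `ray start --head` and start Serve in
# detached mode so the deployment outlives this script.
ray.init(address="auto")
serve.start(detached=True)


@serve.deployment
def my_model(request_text: str) -> str:
    return f"Model received data: {request_text}"


my_model.deploy()
```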
Next is the script that defines the AIOHTTP server:
```{literalinclude} ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_app.py
```
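The AIOHTTP app is likewise not shown in the diff; a hypothetical sketch that would satisfy step 4 below (again reusing the assumed `my_model` deployment name) might look like:
```python
from aiohttp import web

import ray
from ray import serve

ray.init(address="auto")
serve.start(detached=True)

# Get an async handle to the deployment created by aiohttp_deploy_serve.py.
my_model_handle = serve.get_deployment("my_model").get_handle(sync=False)


async def dummy_model(request: web.Request) -> web.Response:
    # The first await resolves the handle call to an ObjectRef,
    # the second await resolves the ObjectRef to the actual result.
    result = await (await my_model_handle.remote("dummy input"))
    return web.Response(text=result)


app = web.Application()
app.add_routes([web.get("/dummy-model", dummy_model)])
```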
Here's how to run this example:
1. Run `ray start --head` to start a local Ray cluster in the background.
2. In the directory where the example files are saved, run `python aiohttp_deploy_serve.py` to deploy our Ray Serve deployment.
3. Run `gunicorn aiohttp_app:app --worker-class aiohttp.GunicornWebWorker` to start the AIOHTTP app using gunicorn.
4. To test out the server, run `curl localhost:8000/dummy-model`. This should output `Model received data: dummy input`.
5. For cleanup, you can press Ctrl-C to stop the Gunicorn server, and run `ray stop` to stop the background Ray cluster.

View file

@ -1,39 +0,0 @@
.. _serve-web-server-integration-tutorial:
Integration with Existing Web Servers
=====================================
In this guide, you will learn how to use Ray Serve to scale up your existing web application. The key feature of Ray Serve that makes this possible is the Python-native :ref:`servehandle-api`, which allows you to keep using the same Python web server while offloading your heavy computation to Ray Serve.
We give two examples, one using a `FastAPI <https://fastapi.tiangolo.com/>`__ web server and another using an `AIOHTTP <https://docs.aiohttp.org/en/stable/>`__ web server, but the same approach will work with any Python web server.
Scaling Up a FastAPI Application
--------------------------------
Ray Serve has a native integration with FastAPI - please see :ref:`serve-fastapi-http`.
Scaling Up an AIOHTTP Application
---------------------------------
In this section, we'll integrate Ray Serve with an `AIOHTTP <https://docs.aiohttp.org/en/stable/>`_ web server run using `Gunicorn <https://gunicorn.org/>`_. You'll need to install AIOHTTP and gunicorn with the command ``pip install aiohttp gunicorn``.
First, here is the script that deploys Ray Serve:
.. literalinclude:: ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_deploy_serve.py
Next is the script that defines the AIOHTTP server:
.. literalinclude:: ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_app.py
Here's how to run this example:
1. Run ``ray start --head`` to start a local Ray cluster in the background.
2. In the directory where the example files are saved, run ``python aiohttp_deploy_serve.py`` to deploy our Ray Serve deployment.
3. Run ``gunicorn aiohttp_app:app --worker-class aiohttp.GunicornWebWorker`` to start the AIOHTTP app using gunicorn.
4. To test out the server, run ``curl localhost:8000/dummy-model``. This should output ``Model received data: dummy input``.
5. For cleanup, you can press Ctrl-C to stop the Gunicorn server, and run ``ray stop`` to stop the background Ray cluster.

View file

@ -12,7 +12,7 @@ USAGE_STATS_ENABLED_MESSAGE = (
"Usage stats collection is enabled. To disable this, add `--disable-usage-stats` "
"to the command that starts the cluster, or run the following command:"
" `ray disable-usage-stats` before starting the cluster. "
"See https://github.com/ray-project/ray/issues/20857 for more details."
"See https://docs.ray.io/en/master/cluster/usage-stats.html for more details."
)
USAGE_STATS_DISABLED_MESSAGE = "Usage stats collection is disabled."
@ -23,7 +23,7 @@ USAGE_STATS_ENABLED_BY_DEFAULT_MESSAGE = (
"To disable this, add `--disable-usage-stats` to the command that starts "
"the cluster, or run the following command:"
" `ray disable-usage-stats` before starting the cluster. "
"See https://github.com/ray-project/ray/issues/20857 for more details."
"See https://docs.ray.io/en/master/cluster/usage-stats.html for more details."
)
USAGE_STATS_CONFIRMATION_MESSAGE = (

View file

@ -92,30 +92,49 @@ class ClusterStatusToReport:
class UsageStatsToReport:
"""Usage stats to report"""
#: The Ray version in use.
ray_version: str
#: The Python version in use.
python_version: str
#: The schema version of the report.
schema_version: str
#: The source of the data (i.e. OSS).
source: str
#: A random id of the cluster session.
session_id: str
#: The git commit hash of Ray (i.e. ray.__commit__).
git_commit: str
#: The operating system in use.
os: str
#: When the data is collected and reported.
collect_timestamp_ms: int
#: When the cluster is started.
session_start_timestamp_ms: int
#: The cloud provider found in the cluster.yaml file (e.g., aws).
cloud_provider: Optional[str]
#: The min_workers found in the cluster.yaml file.
min_workers: Optional[int]
#: The max_workers found in the cluster.yaml file.
max_workers: Optional[int]
#: The head node instance type found in the cluster.yaml file (e.g., i3.8xlarge).
head_node_instance_type: Optional[str]
#: The worker node instance types found in the cluster.yaml file (e.g., i3.8xlarge).
worker_node_instance_types: Optional[List[str]]
#: The total num of cpus in the cluster.
total_num_cpus: Optional[int]
#: The total num of gpus in the cluster.
total_num_gpus: Optional[int]
#: The total size of memory in the cluster.
total_memory_gb: Optional[float]
#: The total size of object store memory in the cluster.
total_object_store_memory_gb: Optional[float]
#: The Ray libraries that are used (e.g., rllib).
library_usages: Optional[List[str]]
# The total number of successful reports for the lifetime of the cluster.
#: The total number of successful reports for the lifetime of the cluster.
total_success: int
# The total number of failed reports for the lifetime of the cluster.
#: The total number of failed reports for the lifetime of the cluster.
total_failed: int
# The sequence number of the report.
#: The sequence number of the report.
seq_number: int
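For context (not part of this diff): the switch from `#` to `#:` matters because Sphinx autodoc treats `#:` comments as attribute docstrings, which is what allows an ``autoclass`` directive to render per-field documentation for this class. A minimal, hypothetical illustration:
```python
from dataclasses import dataclass


@dataclass
class ExampleReport:
    #: Rendered by Sphinx ``autoclass`` as this attribute's documentation.
    ray_version: str
    # A plain ``#`` comment, by contrast, is ignored by autodoc.
    seq_number: int = 0
```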

View file

@ -44,6 +44,9 @@ from ray.serve.handle import RayServeHandle, RayServeSyncHandle
logger = logging.getLogger(__file__)
# Whether to issue warnings about using sync handles in async context
# or using async handle in sync context.
_WARN_SYNC_ASYNC_HANDLE_CONTEXT: bool = True
def _ensure_connected(f: Callable) -> Callable:
@ -393,7 +396,7 @@ class ServeControllerClient:
else:
raise ex
if asyncio_loop_running and sync:
if asyncio_loop_running and sync and _WARN_SYNC_ASYNC_HANDLE_CONTEXT:
logger.warning(
"You are retrieving a sync handle inside an asyncio loop. "
"Try getting client.get_handle(.., sync=False) to get better "
@ -401,7 +404,7 @@ class ServeControllerClient:
"serve/http-servehandle.html#sync-and-async-handles"
)
if not asyncio_loop_running and not sync:
if not asyncio_loop_running and not sync and _WARN_SYNC_ASYNC_HANDLE_CONTEXT:
logger.warning(
"You are retrieving an async handle outside an asyncio loop. "
"You should make sure client.get_handle is called inside a "

View file

@ -1,9 +1,19 @@
from contextlib import contextmanager
import json
from ray.experimental.dag.class_node import ClassNode # noqa: F401
from ray.experimental.dag.function_node import FunctionNode # noqa: F401
from ray.experimental.dag.input_node import InputNode # noqa: F401
from ray.experimental.dag import DAGNode # noqa: F401
from ray.util.annotations import PublicAPI
import ray.serve.client
@contextmanager
def _mute_sync_handle_warnings():
ray.serve.client._WARN_SYNC_ASYNC_HANDLE_CONTEXT = False
yield
ray.serve.client._WARN_SYNC_ASYNC_HANDLE_CONTEXT = True
@PublicAPI(stability="alpha")
@ -31,10 +41,12 @@ class RayServeDAGHandle:
return RayServeDAGHandle._deserialize, (self.dag_node_json,)
def remote(self, *args, **kwargs):
if self.dag_node is None:
from ray.serve.pipeline.json_serde import dagnode_from_json
# NOTE: There's nothing the user can do about these warnings, so we should hide them.
with _mute_sync_handle_warnings():
if self.dag_node is None:
from ray.serve.pipeline.json_serde import dagnode_from_json
self.dag_node = json.loads(
self.dag_node_json, object_hook=dagnode_from_json
)
return self.dag_node.execute(*args, **kwargs)
self.dag_node = json.loads(
self.dag_node_json, object_hook=dagnode_from_json
)
return self.dag_node.execute(*args, **kwargs)
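For illustration only (not part of the diff): the module-level flag plus context-manager pattern used here can be exercised as in the following self-contained sketch. `_WARN_SYNC_ASYNC_HANDLE_CONTEXT` and `_mute_sync_handle_warnings` are the names introduced above; everything else is hypothetical, and the sketch adds a `try/finally` so the flag is restored even if the body raises:
```python
import logging
from contextlib import contextmanager

logger = logging.getLogger(__name__)

# Module-level switch mirroring _WARN_SYNC_ASYNC_HANDLE_CONTEXT.
_WARN_SYNC_ASYNC = True


@contextmanager
def _mute_sync_handle_warnings():
    global _WARN_SYNC_ASYNC
    _WARN_SYNC_ASYNC = False
    try:
        yield
    finally:
        _WARN_SYNC_ASYNC = True


def get_handle(sync: bool, in_asyncio_loop: bool):
    if in_asyncio_loop and sync and _WARN_SYNC_ASYNC:
        logger.warning("You are retrieving a sync handle inside an asyncio loop.")


with _mute_sync_handle_warnings():
    get_handle(sync=True, in_asyncio_loop=True)  # warning suppressed
get_handle(sync=True, in_asyncio_loop=True)  # warning emitted again
```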

View file

@ -1,3 +1,5 @@
import contextlib
import io
import sys
import numpy as np
from pydantic import BaseModel
@ -12,6 +14,7 @@ from ray.serve.http_adapters import json_request
from ray.experimental.dag.input_node import InputNode
from ray import serve
import ray
from ray._private.test_utils import wait_for_condition
def my_resolver(a: int):
@ -170,5 +173,30 @@ def test_driver_np_serializer(serve_instance):
assert requests.get("http://127.0.0.1:8000/").json() == [42]
def test_dag_driver_sync_warning(serve_instance):
with InputNode() as inp:
dag = echo.bind(inp)
log_file = io.StringIO()
with contextlib.redirect_stderr(log_file):
handle = serve.run(DAGDriver.bind(dag))
assert ray.get(handle.predict.remote(42)) == 42
def wait_for_request_success_log():
lines = log_file.getvalue().splitlines()
for line in lines:
if "DAGDriver" in line and "HANDLE predict OK" in line:
return True
return False
wait_for_condition(wait_for_request_success_log)
assert (
"You are retrieving a sync handle inside an asyncio loop."
not in log_file.getvalue()
)
if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))

View file

@ -456,6 +456,13 @@ class GlobalState:
self._check_connected()
# Add a small delay to account for propagation delay of events to the GCS.
# This should be harmless enough but prevents calls to timeline() from
# missing recent timeline data.
import time
time.sleep(1)
profile_table = self.profile_table()
all_events = []