[serve][doc] Update Serve API in tutorials code (#27579)

zcin 2022-08-09 19:59:14 -07:00 committed by GitHub
parent e33edcb0b7
commit ea2a11080f
14 changed files with 173 additions and 212 deletions

View file

@@ -0,0 +1,34 @@
# fmt: off
# __doc_import_begin__
from typing import List

from starlette.requests import Request
from transformers import pipeline, Pipeline

from ray import serve
# __doc_import_end__
# fmt: on


# __doc_define_servable_begin__
@serve.deployment
class BatchTextGenerator:
    def __init__(self, model: Pipeline):
        self.model = model

    @serve.batch(max_batch_size=4)
    async def handle_batch(self, inputs: List[str]) -> List[str]:
        print("Our input array has length:", len(inputs))

        results = self.model(inputs)
        return [result[0]["generated_text"] for result in results]

    async def __call__(self, request: Request) -> List[str]:
        return await self.handle_batch(request.query_params["text"])
# __doc_define_servable_end__

# __doc_deploy_begin__
model = pipeline("text-generation", "gpt2")
generator = BatchTextGenerator.bind(model)
# __doc_deploy_end__

View file

@@ -4,6 +4,8 @@ from ray import serve
from io import BytesIO
from PIL import Image
from starlette.requests import Request
from typing import Dict
import torch
from torchvision import transforms
@@ -13,7 +15,7 @@ from torchvision.models import resnet18
# __doc_define_servable_begin__
@serve.deployment(route_prefix="/image_predict")
@serve.deployment
class ImageModel:
    def __init__(self):
        self.model = resnet18(pretrained=True).eval()
@@ -29,7 +31,7 @@ class ImageModel:
            ]
        )
    async def __call__(self, starlette_request):
    async def __call__(self, starlette_request: Request) -> Dict:
        image_payload_bytes = await starlette_request.body()
        pil_image = Image.open(BytesIO(image_payload_bytes))
        print("[1/3] Parsed image data: {}".format(pil_image))
@@ -48,5 +50,5 @@ class ImageModel:
# __doc_deploy_begin__
app = ImageModel.bind()
image_model = ImageModel.bind()
# __doc_deploy_end__

View file

@@ -7,6 +7,8 @@ import json
import numpy as np
import os
import tempfile
from starlette.requests import Request
from typing import Dict
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
@@ -49,15 +51,15 @@ with open(LABEL_PATH, "w") as f:
# __doc_define_servable_begin__
@serve.deployment(route_prefix="/classifier")
@serve.deployment
class BoostingModel:
    def __init__(self, model_path, label_path):
    def __init__(self, model_path: str, label_path: str):
        with open(model_path, "rb") as f:
            self.model = pickle.load(f)
        with open(label_path) as f:
            self.label_list = json.load(f)
    async def __call__(self, starlette_request):
    async def __call__(self, starlette_request: Request) -> Dict:
        payload = await starlette_request.json()
        print("Worker: received starlette request with data", payload)
@@ -74,5 +76,5 @@ class BoostingModel:
# __doc_deploy_begin__
app = BoostingModel.bind(MODEL_PATH, LABEL_PATH)
boosting_model = BoostingModel.bind(MODEL_PATH, LABEL_PATH)
# __doc_deploy_end__

View file

@@ -5,6 +5,8 @@ from ray import serve
import os
import tempfile
import numpy as np
from starlette.requests import Request
from typing import Dict
import tensorflow as tf
# __doc_import_end__
@@ -46,15 +48,15 @@ if not os.path.exists(TRAINED_MODEL_PATH):
# __doc_define_servable_begin__
@serve.deployment(route_prefix="/mnist")
@serve.deployment
class TFMnistModel:
    def __init__(self, model_path):
    def __init__(self, model_path: str):
        import tensorflow as tf
        self.model_path = model_path
        self.model = tf.keras.models.load_model(model_path)
    async def __call__(self, starlette_request):
    async def __call__(self, starlette_request: Request) -> Dict:
        # Step 1: transform HTTP request -> tensorflow input
        # Here we define the request schema to be a json array.
        input_array = np.array((await starlette_request.json())["array"])
@@ -69,5 +71,5 @@ class TFMnistModel:
# __doc_deploy_begin__
app = TFMnistModel.bind(TRAINED_MODEL_PATH)
mnist_model = TFMnistModel.bind(TRAINED_MODEL_PATH)
# __doc_deploy_end__

View file

@@ -73,9 +73,3 @@ flag to toggle between them.
The async handle has a performance advantage because it uses asyncio directly, whereas
the sync handle talks to an asyncio event loop running in a thread. To learn more about
the reasoning behind this design, check out our [architecture documentation](serve-architecture).
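As a minimal sketch of how the two are used (the deployment and names below are illustrative, not taken from this change), a sync handle returned by `serve.run()` can be queried from ordinary Python code, while async code would typically `await` the calls:

```python
import ray
from ray import serve


@serve.deployment
class Doubler:
    def __call__(self, value: int) -> int:
        return 2 * value


# serve.run() deploys the application and returns a sync handle:
# .remote() gives back a Ray ObjectRef that we resolve with ray.get().
handle = serve.run(Doubler.bind())
print(ray.get(handle.remote(21)))  # 42

# From async code (for example, inside another deployment), the async
# handle is awaited instead: once to get the ObjectRef, once for the result.
#
#     ref = await async_handle.remote(21)
#     result = await ref
```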
## Integrating with existing web servers
Ray Serve comes with its own HTTP server out of the box, but if you have an existing
web application, you can still plug in Ray Serve to scale up your compute using the `ServeHandle`.
For a tutorial with sample code, see {ref}`serve-web-server-integration-tutorial`.

View file

@@ -173,7 +173,7 @@ or head over to the {doc}`tutorials/index` to get started building your Ray Serv
**Examples**
^^^
Follow the tutorials to learn how to integrate Ray Serve with :ref:`TensorFlow<serve-ml-models-tutorial>`, :ref:`Scikit-Learn<serve-ml-models-tutorial>`, and :ref:`RLlib<serve-rllib-tutorial>`. Learn how Ray Serve also integrates with :ref:`existing web applications<serve-web-server-integration-tutorial>`
Follow the tutorials to learn how to integrate Ray Serve with :ref:`TensorFlow<serve-ml-models-tutorial>`, :ref:`Scikit-Learn<serve-ml-models-tutorial>`, and :ref:`RLlib<serve-rllib-tutorial>`.
+++
.. link-button:: serve-examples

View file

@@ -2,8 +2,8 @@
# Batching Tutorial
In this guide, we will deploy a simple vectorized adder that takes
a batch of queries and adds them at once. In particular, we show:
In this guide, we will deploy a simple text generator that takes in
a batch of queries and processes them at once. In particular, we show:
- How to implement and deploy a Ray Serve deployment that accepts batches.
- How to configure the batch size.
@@ -18,9 +18,11 @@ This tutorial should help the following use cases:
inference with batching can increase the *throughput* of the model as well as
*utilization* of the hardware.
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
## Define the Deployment
Open a new Python file called `tutorial_batch.py`. First, let's import Ray Serve and some other helpers.
```{literalinclude} ../doc_code/tutorial_batch.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
@@ -62,44 +64,126 @@ finishes, a larger batch may be executed. This behavior can be tuned using the
timeout may improve throughput at the cost of latency under low load.
:::
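For instance, both knobs can be set on the decorator. Here is a minimal sketch (a hypothetical adder deployment, not part of this change) assuming the `max_batch_size` and `batch_wait_timeout_s` options described above:

```python
from typing import List

from starlette.requests import Request

from ray import serve


@serve.deployment
class BatchAdder:
    # Wait up to 100 ms to fill a batch of at most 8 calls before running
    # handle_batch on the accumulated inputs.
    @serve.batch(max_batch_size=8, batch_wait_timeout_s=0.1)
    async def handle_batch(self, numbers: List[int]) -> List[int]:
        # The whole batch arrives as one list; return one result per input.
        return [n + 1 for n in numbers]

    async def __call__(self, request: Request) -> int:
        # Each HTTP request contributes a single element to the batch.
        return await self.handle_batch(int(request.query_params["number"]))
```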
Let's define a deployment that takes in a list of requests, extracts the input value,
converts them into an array, and uses NumPy to add 1 to each element.
Let's define a deployment that takes in a list of input strings and runs
vectorized text generation on the inputs.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
```{literalinclude} ../doc_code/tutorial_batch.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
Let's deploy it. Note that in the `@serve.batch` decorator, we are specifying
the maximum batch size via `max_batch_size=4`. This option limits
Let's prepare to deploy the deployment. Note that in the `@serve.batch` decorator, we
are specifying the maximum batch size via `max_batch_size=4`. This option limits
the maximum possible batch size that will be executed at once.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
```{literalinclude} ../doc_code/tutorial_batch.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's define a [Ray remote task](ray-remote-functions) to send queries in
parallel. As you can see, the first batch has a batch size of 1, and the subsequent
queries have a batch size of 4. Even though each query is issued independently,
Ray Serve was able to evaluate them in batches.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
## Deploy the Deployment
Deploy the deployment by running the following command in the terminal.
```console
$ serve run tutorial_batch:generator
```
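Here, `tutorial_batch:generator` is the import path that `serve run` uses to find the application: the module `tutorial_batch`, followed by the bound deployment object `generator = BatchTextGenerator.bind(model)` defined in that file.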
Let's define a [Ray remote task](ray-remote-functions) to send queries in
parallel. While Serve is running, open a separate terminal window, and run the
following in an interactive Python shell or a separate Python script:
```python
import ray
import requests
import numpy as np
@ray.remote
def send_query(text):
    resp = requests.get("http://localhost:8000/?text={}".format(text))
    return resp.text


# Let's use Ray to send all queries in parallel
texts = [
    'Once upon a time,',
    'Hi my name is Lewis and I like to',
    'My name is Mary, and my favorite',
    'My name is Clara and I am',
    'My name is Julien and I like to',
    'Today I accidentally',
    'My greatest wish is to',
    'In a galaxy far far away',
    'My best talent is',
]
results = ray.get([send_query.remote(text) for text in texts])
print("Result returned:", results)
```
You should get an output like the following. As you can see, the first batch has a
batch size of 1, and the subsequent queries have a batch size of 4. Even though each
query is issued independently, Ray Serve was able to evaluate them in batches.
```python
(pid=...) Our input array has length: 1
(pid=...) Our input array has length: 4
(pid=...) Our input array has length: 4
Result returned: [
'Once upon a time, when I got to look at and see the work of my parents (I still can\'t stand them,) they said, "Boys, you\'re going to like it if you\'ll stay away from him or make him look',
"Hi my name is Lewis and I like to look great. When I'm not playing against, it's when I play my best and always feel most comfortable. I get paid by the same people who make my games, who work hardest for me.",
"My name is Mary, and my favorite person in these two universes, the Green Lantern and the Red Lantern, are the same, except they're two of the Green Lanterns, but they also have their own different traits. Now their relationship is known",
'My name is Clara and I am married and live in Philadelphia. I am an English language teacher and translator. I am passionate about the issues that have so inspired me and my journey. My story begins with the discovery of my own child having been born',
'My name is Julien and I like to travel with my son on vacations... In fact I really prefer to spend more time with my son."\n\nIn 2011, the following year he was diagnosed with terminal Alzheimer\'s disease, and since then,',
"Today I accidentally got lost and went on another tour in August. My story was different, but it had so many emotions that it made me happy. I'm proud to still be able to go back to Oregon for work.\n\nFor the longest",
'My greatest wish is to return your loved ones to this earth where they can begin their own free and prosperous lives. This is true only on occasion as it is not intended or even encouraged to be so.\n\nThe Gospel of Luke 8:29',
'In a galaxy far far away, the most brilliant and powerful beings known would soon enter upon New York, setting out to restore order to the state. When the world turned against them, Darth Vader himself and Obi-Wan Kenobi, along with the Jedi',
'My best talent is that I can make a movie with somebody who really has a big and strong voice. I do believe that they would be great writers. I can tell you that to make sure."\n\n\nWith this in mind, "Ghostbusters'
]
```
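As an aside, the client sketch above interpolates the prompt directly into the URL string. If your prompts contain characters that need escaping, one option (an illustrative variant, not part of this change) is to let `requests` build and encode the query string for you:

```python
resp = requests.get("http://localhost:8000/", params={"text": text})
```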
## Deploy the Deployment Using the Python API
What if you want to evaluate a whole batch in Python? Ray Serve allows you to send
queries via the Python API. A batch of queries can either come from the web server
or the Python API. Learn more [here](serve-handle-explainer).
or the Python API.
To query the deployment via the Python API, we can use `Deployment.get_handle` to receive
a handle to the corresponding deployment. To enqueue a query, you can call
`handle.method.remote(data)`. This call returns immediately
with a [Ray ObjectRef](ray-object-refs). You can call `ray.get` to retrieve
the result.
To query the deployment via the Python API, we can use `serve.run()` instead of
running `serve run` from the console. Add the following to the Python script
`tutorial_batch.py`:
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_query_handle_end__
:start-after: __doc_query_handle_begin__
```python
handle = serve.run(generator)
```
Generally, to enqueue a query, you can call `handle.method.remote(data)`. This call
returns immediately with a [Ray ObjectRef](ray-object-refs). You can call `ray.get` to
retrieve the result. Add the following to the same Python script.
```python
input_batch = [
    'Once upon a time,',
    'Hi my name is Lewis and I like to',
    'My name is Mary, and my favorite',
    'My name is Clara and I am',
    'My name is Julien and I like to',
    'Today I accidentally',
    'My greatest wish is to',
    'In a galaxy far far away',
    'My best talent is',
]
print("Input batch is", input_batch)
import ray
result_batch = ray.get([handle.handle_batch.remote(batch) for batch in input_batch])
print("Result batch is", result_batch)
```
Finally, let's run the script.
```console
$ python tutorial_batch.py
```
You should get output similar to before!

View file

@@ -11,7 +11,6 @@ Ray Serve functionality and how to integrate different modeling frameworks.
serve-ml-models
batch
web-server-integration
rllib
gradio
```

View file

@@ -72,7 +72,7 @@ pass them into the restored `Algorithm` using the `compute_single_action` method
from starlette.requests import Request
@serve.deployment(route_prefix="/cartpole-ppo")
@serve.deployment
class ServePPOModel:
    def __init__(self, checkpoint_path) -> None:
        # Re-create the originally used config.
@@ -99,12 +99,11 @@ and use `Algorithm.compute_actions(...)` to process a batch of inputs.
:::
Now that we've defined our `ServePPOModel` service, let's deploy it to Ray Serve.
The deployment will be exposed through the `/cartpole-ppo` route.
```{code-cell} python3
:tags: [hide-output]
serve.start()
ServePPOModel.deploy(checkpoint_path)
ppo_model = ServePPOModel.bind(checkpoint_path)
serve.run(ppo_model)
```
Note that the `checkpoint_path` that we passed to the `bind()` method will be passed to
@@ -123,7 +122,7 @@ for _ in range(5):
    print(f"-> Sending observation {obs}")
    resp = requests.get(
        "http://localhost:8000/cartpole-ppo", json={"observation": obs.tolist()}
        "http://localhost:8000/", json={"observation": obs.tolist()}
    )
    print(f"<- Received response {resp.json()}")
```

View file

@@ -62,7 +62,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be
Finally, we can deploy our model to Ray Serve through the terminal.
```console
$ serve run tutorial_tensorflow:app
$ serve run tutorial_tensorflow:mnist_model
```
Let's query it! While Serve is running, open a separate terminal window, and run the following in an interactive Python shell or a separate Python script:
@@ -72,7 +72,7 @@ import requests
import numpy as np
resp = requests.get(
    "http://localhost:8000/mnist", json={"array": np.random.randn(28 * 28).tolist()}
    "http://localhost:8000/", json={"array": np.random.randn(28 * 28).tolist()}
)
print(resp.json())
```
@@ -132,7 +132,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be
Finally, we can deploy our model to Ray Serve through the terminal.
```console
$ serve run tutorial_pytorch:app
$ serve run tutorial_pytorch:image_model
```
Let's query it! While Serve is running, open a separate terminal window, and run the following in an interactive Python shell or a separate Python script:
@@ -145,7 +145,7 @@ ray_logo_bytes = requests.get(
    "ray/master/doc/source/images/ray_header_logo.png"
).content
resp = requests.post("http://localhost:8000/image_predict", data=ray_logo_bytes)
resp = requests.post("http://localhost:8000/", data=ray_logo_bytes)
print(resp.json())
```
@@ -231,7 +231,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be
Finally, we can deploy our model to Ray Serve through the terminal.
```console
$ serve run tutorial_sklearn:app
$ serve run tutorial_sklearn:boosting_model
```
Let's query it! While Serve is running, open a separate terminal window, and run the following in an interactive Python shell or a separate Python script:
@@ -245,7 +245,7 @@ sample_request_input = {
    "petal length": 1.1,
    "petal width": 0.9,
}
response = requests.get("http://localhost:8000/classifier", json=sample_request_input)
response = requests.get("http://localhost:8000/", json=sample_request_input)
print(response.text)
```

View file

@@ -1,33 +0,0 @@
(serve-web-server-integration-tutorial)=
# Integration with Existing Web Servers
In this guide, you will learn how to use Ray Serve to scale up your existing web application. The key feature of Ray Serve that makes this possible is the Python-native {ref}`servehandle-api`, which allows you to keep using the same Python web server while offloading your heavy computation to Ray Serve.
We give two examples, one using a [FastAPI](https://fastapi.tiangolo.com/) web server and another using an [AIOHTTP](https://docs.aiohttp.org/en/stable/) web server, but the same approach will work with any Python web server.
## Scaling Up a FastAPI Application
Ray Serve has a native integration with FastAPI - please see {ref}`serve-fastapi-http`.
## Scaling Up an AIOHTTP Application
In this section, we'll integrate Ray Serve with an [AIOHTTP](https://docs.aiohttp.org/en/stable/) web server run using [Gunicorn](https://gunicorn.org/). You'll need to install AIOHTTP and gunicorn with the command `pip install aiohttp gunicorn`.
First, here is the script that deploys Ray Serve:
```{literalinclude} ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_deploy_serve.py
```
Next is the script that defines the AIOHTTP server:
```{literalinclude} ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_app.py
```
Here's how to run this example:
1. Run `ray start --head` to start a local Ray cluster in the background.
2. In the directory where the example files are saved, run `python aiohttp_deploy_serve.py` to deploy our Ray Serve deployment.
3. Run `gunicorn aiohttp_app:app --worker-class aiohttp.GunicornWebWorker` to start the AIOHTTP app using gunicorn.
4. To test out the server, run `curl localhost:8000/dummy-model`. This should output `Model received data: dummy input`.
5. For cleanup, you can press Ctrl-C to stop the Gunicorn server, and run `ray stop` to stop the background Ray cluster.

View file

@@ -1,26 +0,0 @@
# File name: aiohttp_app.py
from aiohttp import web

import ray
from ray import serve

# Connect to the running Ray cluster.
ray.init(address="auto")

# Fetch the ServeHandle to query our model.
my_handle = serve.get_deployment("my_model").get_handle()


# Define our AIOHTTP request handler.
async def handle_request(request):
    # Offload the computation to our Ray Serve deployment.
    result = await my_handle.remote("dummy input")
    return web.Response(text=result)


# Set up an HTTP endpoint.
app = web.Application()
app.add_routes([web.get("/dummy-model", handle_request)])

if __name__ == "__main__":
    web.run_app(app)

View file

@@ -1,20 +0,0 @@
# File name: aiohttp_deploy_serve.py
import ray
from ray import serve

# Connect to the running Ray cluster.
ray.init(address="auto")

# Start a detached Ray Serve instance. It will persist after the script exits.
serve.start(http_options={"http_host": None}, detached=True)


# Set up a deployment with the desired number of replicas. This could also be
# a stateful class (e.g., if we had an expensive model to set up).
@serve.deployment(name="my_model", num_replicas=2)
async def my_model(request):
    data = await request.body()
    return f"Model received data: {data}"


my_model.deploy()

View file

@@ -1,76 +0,0 @@
# fmt: off
# __doc_import_begin__
from typing import List
import time

import numpy as np
import requests
from starlette.requests import Request

import ray
from ray import serve
# __doc_import_end__
# fmt: on


# __doc_define_servable_begin__
@serve.deployment(route_prefix="/adder")
class BatchAdder:
    @serve.batch(max_batch_size=4)
    async def handle_batch(self, numbers: List[int]):
        input_array = np.array(numbers)
        print("Our input array has shape:", input_array.shape)
        # Sleep for 200ms, this could be performing CPU intensive computation
        # in real models
        time.sleep(0.2)
        output_array = input_array + 1
        return output_array.astype(int).tolist()

    async def __call__(self, request: Request):
        return await self.handle_batch(int(request.query_params["number"]))
# __doc_define_servable_end__

# __doc_deploy_begin__
ray.init(num_cpus=8)
serve.start()
BatchAdder.deploy()
# __doc_deploy_end__

# __doc_query_begin__
@ray.remote
def send_query(number):
    resp = requests.get("http://localhost:8000/adder?number={}".format(number))
    return int(resp.text)


# Let's use Ray to send all queries in parallel
results = ray.get([send_query.remote(i) for i in range(9)])
print("Result returned:", results)
# Output
# (pid=...) Our input array has shape: (1,)
# (pid=...) Our input array has shape: (4,)
# (pid=...) Our input array has shape: (4,)
# Result returned: [1, 2, 3, 4, 5, 6, 7, 8, 9]
# __doc_query_end__

# __doc_query_handle_begin__
handle = BatchAdder.get_handle()
input_batch = list(range(9))
print("Input batch is", input_batch)
# Input batch is [0, 1, 2, 3, 4, 5, 6, 7, 8]

result_batch = ray.get([handle.handle_batch.remote(i) for i in input_batch])
# Output
# (pid=...) Current context is python
# (pid=...) Our input array has shape: (1,)
# (pid=...) Current context is python
# (pid=...) Our input array has shape: (4,)
# (pid=...) Current context is python
# (pid=...) Our input array has shape: (4,)
print("Result batch is", result_batch)
# Result batch is [1, 2, 3, 4, 5, 6, 7, 8, 9]
# __doc_query_handle_end__