[serve][doc] Update Serve API in tutorials code (#27579)

zcin 2022-08-09 19:59:14 -07:00 committed by GitHub
parent e33edcb0b7
commit ea2a11080f
14 changed files with 173 additions and 212 deletions

View file

@@ -0,0 +1,34 @@
# fmt: off
# __doc_import_begin__
from typing import List

from starlette.requests import Request
from transformers import pipeline, Pipeline

from ray import serve
# __doc_import_end__
# fmt: on


# __doc_define_servable_begin__
@serve.deployment
class BatchTextGenerator:
    def __init__(self, model: Pipeline):
        self.model = model

    @serve.batch(max_batch_size=4)
    async def handle_batch(self, inputs: List[str]) -> List[str]:
        print("Our input array has length:", len(inputs))

        results = self.model(inputs)
        return [result[0]["generated_text"] for result in results]

    async def __call__(self, request: Request) -> List[str]:
        return await self.handle_batch(request.query_params["text"])
# __doc_define_servable_end__

# __doc_deploy_begin__
model = pipeline("text-generation", "gpt2")
generator = BatchTextGenerator.bind(model)
# __doc_deploy_end__

View file

@@ -4,6 +4,8 @@ from ray import serve
from io import BytesIO
from PIL import Image
from starlette.requests import Request
from typing import Dict
import torch
from torchvision import transforms
@@ -13,7 +15,7 @@ from torchvision.models import resnet18
# __doc_define_servable_begin__
@serve.deployment(route_prefix="/image_predict")
@serve.deployment
class ImageModel:
    def __init__(self):
        self.model = resnet18(pretrained=True).eval()
@@ -29,7 +31,7 @@ class ImageModel:
            ]
        )
    async def __call__(self, starlette_request):
    async def __call__(self, starlette_request: Request) -> Dict:
        image_payload_bytes = await starlette_request.body()
        pil_image = Image.open(BytesIO(image_payload_bytes))
        print("[1/3] Parsed image data: {}".format(pil_image))
@@ -48,5 +50,5 @@ class ImageModel:
# __doc_deploy_begin__
app = ImageModel.bind()
image_model = ImageModel.bind()
# __doc_deploy_end__

View file

@@ -7,6 +7,8 @@ import json
import numpy as np
import os
import tempfile
from starlette.requests import Request
from typing import Dict
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
@@ -49,15 +51,15 @@ with open(LABEL_PATH, "w") as f:
# __doc_define_servable_begin__
@serve.deployment(route_prefix="/classifier")
@serve.deployment
class BoostingModel:
    def __init__(self, model_path, label_path):
    def __init__(self, model_path: str, label_path: str):
        with open(model_path, "rb") as f:
            self.model = pickle.load(f)
        with open(label_path) as f:
            self.label_list = json.load(f)
    async def __call__(self, starlette_request):
    async def __call__(self, starlette_request: Request) -> Dict:
        payload = await starlette_request.json()
        print("Worker: received starlette request with data", payload)
@@ -74,5 +76,5 @@ class BoostingModel:
# __doc_deploy_begin__
app = BoostingModel.bind(MODEL_PATH, LABEL_PATH)
boosting_model = BoostingModel.bind(MODEL_PATH, LABEL_PATH)
# __doc_deploy_end__

View file

@@ -5,6 +5,8 @@ from ray import serve
import os
import tempfile
import numpy as np
from starlette.requests import Request
from typing import Dict
import tensorflow as tf
# __doc_import_end__
@@ -46,15 +48,15 @@ if not os.path.exists(TRAINED_MODEL_PATH):
# __doc_define_servable_begin__
@serve.deployment(route_prefix="/mnist")
@serve.deployment
class TFMnistModel:
    def __init__(self, model_path):
    def __init__(self, model_path: str):
        import tensorflow as tf
        self.model_path = model_path
        self.model = tf.keras.models.load_model(model_path)
    async def __call__(self, starlette_request):
    async def __call__(self, starlette_request: Request) -> Dict:
        # Step 1: transform HTTP request -> tensorflow input
        # Here we define the request schema to be a json array.
        input_array = np.array((await starlette_request.json())["array"])
@@ -69,5 +71,5 @@ class TFMnistModel:
# __doc_deploy_begin__
app = TFMnistModel.bind(TRAINED_MODEL_PATH)
mnist_model = TFMnistModel.bind(TRAINED_MODEL_PATH)
# __doc_deploy_end__

View file

@@ -73,9 +73,3 @@ flag to toggle between them.
The async handle has a performance advantage because it uses asyncio directly, whereas
the sync handle talks to an asyncio event loop running in a thread. To learn more about
the reasoning behind this design, check out our [architecture documentation](serve-architecture).
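As a minimal sketch of how the two are used (the deployment and names below are illustrative, not taken from this change), a sync handle returned by `serve.run()` can be queried from ordinary Python code, while async code would typically `await` the calls:

```python
import ray
from ray import serve


@serve.deployment
class Doubler:
    def __call__(self, value: int) -> int:
        return 2 * value


# serve.run() deploys the application and returns a sync handle:
# .remote() gives back a Ray ObjectRef that we resolve with ray.get().
handle = serve.run(Doubler.bind())
print(ray.get(handle.remote(21)))  # 42

# From async code (for example, inside another deployment), the async
# handle is awaited instead: once to get the ObjectRef, once for the result.
#
#     ref = await async_handle.remote(21)
#     result = await ref
```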
## Integrating with existing web servers
Ray Serve comes with its own HTTP server out of the box, but if you have an existing
web application, you can still plug in Ray Serve to scale up your compute using the `ServeHandle`.
For a tutorial with sample code, see {ref}`serve-web-server-integration-tutorial`.

View file

@@ -173,7 +173,7 @@ or head over to the {doc}`tutorials/index` to get started building your Ray Serv
**Examples**
^^^
Follow the tutorials to learn how to integrate Ray Serve with :ref:`TensorFlow<serve-ml-models-tutorial>`, :ref:`Scikit-Learn<serve-ml-models-tutorial>`, and :ref:`RLlib<serve-rllib-tutorial>`. Learn how Ray Serve also integrates with :ref:`existing web applications<serve-web-server-integration-tutorial>`
Follow the tutorials to learn how to integrate Ray Serve with :ref:`TensorFlow<serve-ml-models-tutorial>`, :ref:`Scikit-Learn<serve-ml-models-tutorial>`, and :ref:`RLlib<serve-rllib-tutorial>`.
+++
.. link-button:: serve-examples

View file

@@ -2,8 +2,8 @@
# Batching Tutorial
In this guide, we will deploy a simple vectorized adder that takes
a batch of queries and adds them at once. In particular, we show:
In this guide, we will deploy a simple text generator that takes in
a batch of queries and processes them at once. In particular, we show:
- How to implement and deploy a Ray Serve deployment that accepts batches.
- How to configure the batch size.
@@ -18,9 +18,11 @@ This tutorial should help the following use cases:
inference with batching can increase the *throughput* of the model as well as
*utilization* of the hardware.
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
## Define the Deployment
Open a new Python file called `tutorial_batch.py`. First, let's import Ray Serve and some other helpers.
```{literalinclude} ../doc_code/tutorial_batch.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
@@ -62,44 +64,126 @@ finishes, a larger batch may be executed. This behavior can be tuned using the
timeout may improve throughput at the cost of latency under low load.
:::
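For instance, both knobs can be set on the decorator. Here is a minimal sketch (a hypothetical adder deployment, not part of this change) assuming the `max_batch_size` and `batch_wait_timeout_s` options described above:

```python
from typing import List

from starlette.requests import Request

from ray import serve


@serve.deployment
class BatchAdder:
    # Wait up to 100 ms to fill a batch of at most 8 calls before running
    # handle_batch on the accumulated inputs.
    @serve.batch(max_batch_size=8, batch_wait_timeout_s=0.1)
    async def handle_batch(self, numbers: List[int]) -> List[int]:
        # The whole batch arrives as one list; return one result per input.
        return [n + 1 for n in numbers]

    async def __call__(self, request: Request) -> int:
        # Each HTTP request contributes a single element to the batch.
        return await self.handle_batch(int(request.query_params["number"]))
```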
Let's define a deployment that takes in a list of requests, extracts the input value,
converts them into an array, and uses NumPy to add 1 to each element.
Let's define a deployment that takes in a list of input strings and runs
vectorized text generation on the inputs.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
```{literalinclude} ../doc_code/tutorial_batch.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
Let's deploy it. Note that in the `@serve.batch` decorator, we are specifying
the maximum batch size via `max_batch_size=4`. This option limits
Let's prepare to deploy the deployment. Note that in the `@serve.batch` decorator, we
are specifying the maximum batch size via `max_batch_size=4`. This option limits
the maximum possible batch size that will be executed at once.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
```{literalinclude} ../doc_code/tutorial_batch.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's define a [Ray remote task](ray-remote-functions) to send queries in
parallel. As you can see, the first batch has a batch size of 1, and the subsequent
queries have a batch size of 4. Even though each query is issued independently,
Ray Serve was able to evaluate them in batches.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
## Deploy the Deployment
Deploy the deployment by running the following command in the terminal.
```console
$ serve run tutorial_batch:generator
```
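Here, `tutorial_batch:generator` is the import path that `serve run` uses to find the application: the module `tutorial_batch`, followed by the bound deployment object `generator = BatchTextGenerator.bind(model)` defined in that file.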
Let's define a [Ray remote task](ray-remote-functions) to send queries in
parallel. While Serve is running, open a separate terminal window, and run the
following in an interactive Python shell or a separate Python script:
```python
import ray
import requests
import numpy as np
@ray.remote
def send_query(text):
    resp = requests.get("http://localhost:8000/?text={}".format(text))
    return resp.text


# Let's use Ray to send all queries in parallel
texts = [
    'Once upon a time,',
    'Hi my name is Lewis and I like to',
    'My name is Mary, and my favorite',
    'My name is Clara and I am',
    'My name is Julien and I like to',
    'Today I accidentally',
    'My greatest wish is to',
    'In a galaxy far far away',
    'My best talent is',
]
results = ray.get([send_query.remote(text) for text in texts])
print("Result returned:", results)
```
You should get an output like the following. As you can see, the first batch has a
batch size of 1, and the subsequent queries have a batch size of 4. Even though each
query is issued independently, Ray Serve was able to evaluate them in batches.
```python
(pid=...) Our input array has length: 1
(pid=...) Our input array has length: 4
(pid=...) Our input array has length: 4
Result returned: [
'Once upon a time, when I got to look at and see the work of my parents (I still can\'t stand them,) they said, "Boys, you\'re going to like it if you\'ll stay away from him or make him look',
"Hi my name is Lewis and I like to look great. When I'm not playing against, it's when I play my best and always feel most comfortable. I get paid by the same people who make my games, who work hardest for me.",
"My name is Mary, and my favorite person in these two universes, the Green Lantern and the Red Lantern, are the same, except they're two of the Green Lanterns, but they also have their own different traits. Now their relationship is known",
'My name is Clara and I am married and live in Philadelphia. I am an English language teacher and translator. I am passionate about the issues that have so inspired me and my journey. My story begins with the discovery of my own child having been born',
'My name is Julien and I like to travel with my son on vacations... In fact I really prefer to spend more time with my son."\n\nIn 2011, the following year he was diagnosed with terminal Alzheimer\'s disease, and since then,',
"Today I accidentally got lost and went on another tour in August. My story was different, but it had so many emotions that it made me happy. I'm proud to still be able to go back to Oregon for work.\n\nFor the longest",
'My greatest wish is to return your loved ones to this earth where they can begin their own free and prosperous lives. This is true only on occasion as it is not intended or even encouraged to be so.\n\nThe Gospel of Luke 8:29',
'In a galaxy far far away, the most brilliant and powerful beings known would soon enter upon New York, setting out to restore order to the state. When the world turned against them, Darth Vader himself and Obi-Wan Kenobi, along with the Jedi',
'My best talent is that I can make a movie with somebody who really has a big and strong voice. I do believe that they would be great writers. I can tell you that to make sure."\n\n\nWith this in mind, "Ghostbusters'
]
```
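As an aside, the client sketch above interpolates the prompt directly into the URL string. If your prompts contain characters that need escaping, one option (an illustrative variant, not part of this change) is to let `requests` build and encode the query string for you:

```python
resp = requests.get("http://localhost:8000/", params={"text": text})
```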
## Deploy the Deployment Using the Python API
What if you want to evaluate a whole batch in Python? Ray Serve allows you to send
queries via the Python API. A batch of queries can either come from the web server
or the Python API. Learn more [here](serve-handle-explainer).
or the Python API.
To query the deployment via the Python API, we can use `Deployment.get_handle` to receive
a handle to the corresponding deployment. To enqueue a query, you can call
`handle.method.remote(data)`. This call returns immediately
with a [Ray ObjectRef](ray-object-refs). You can call `ray.get` to retrieve
the result.
To query the deployment via the Python API, we can use `serve.run()` instead of
running `serve run` from the console. Add the following to the Python script
`tutorial_batch.py`:
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_query_handle_end__
:start-after: __doc_query_handle_begin__
```python
handle = serve.run(generator)
```
Generally, to enqueue a query, you can call `handle.method.remote(data)`. This call
returns immediately with a [Ray ObjectRef](ray-object-refs). You can call `ray.get` to
retrieve the result. Add the following to the same Python script.
```python
input_batch = [
    'Once upon a time,',
    'Hi my name is Lewis and I like to',
    'My name is Mary, and my favorite',
    'My name is Clara and I am',
    'My name is Julien and I like to',
    'Today I accidentally',
    'My greatest wish is to',
    'In a galaxy far far away',
    'My best talent is',
]
print("Input batch is", input_batch)
import ray
result_batch = ray.get([handle.handle_batch.remote(batch) for batch in input_batch])
print("Result batch is", result_batch)
```
Finally, let's run the script.
```console
$ python tutorial_batch.py
```
You should get output similar to before!

View file

@@ -11,7 +11,6 @@ Ray Serve functionality and how to integrate different modeling frameworks.
serve-ml-models
batch
web-server-integration
rllib
gradio
```

View file

@@ -72,7 +72,7 @@ pass them into the restored `Algorithm` using the `compute_single_action` method
from starlette.requests import Request
@serve.deployment(route_prefix="/cartpole-ppo")
@serve.deployment
class ServePPOModel:
    def __init__(self, checkpoint_path) -> None:
        # Re-create the originally used config.
@@ -99,12 +99,11 @@ and use `Algorithm.compute_actions(...)` to process a batch of inputs.
:::
Now that we've defined our `ServePPOModel` service, let's deploy it to Ray Serve.
The deployment will be exposed through the `/cartpole-ppo` route.
```{code-cell} python3
:tags: [hide-output]
serve.start()
ServePPOModel.deploy(checkpoint_path)
ppo_model = ServePPOModel.bind(checkpoint_path)
serve.run(ppo_model)
```
Note that the `checkpoint_path` that we passed to the `bind()` method will be passed to
@@ -123,7 +122,7 @@ for _ in range(5):
    print(f"-> Sending observation {obs}")
    resp = requests.get(
        "http://localhost:8000/cartpole-ppo", json={"observation": obs.tolist()}
        "http://localhost:8000/", json={"observation": obs.tolist()}
    )
    print(f"<- Received response {resp.json()}")
```

View file

@@ -62,7 +62,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be
Finally, we can deploy our model to Ray Serve through the terminal.
```console
$ serve run tutorial_tensorflow:app
$ serve run tutorial_tensorflow:mnist_model
```
Let's query it! While Serve is running, open a separate terminal window, and run the following in an interactive Python shell or a separate Python script:
@@ -72,7 +72,7 @@ import requests
import numpy as np
resp = requests.get(
    "http://localhost:8000/mnist", json={"array": np.random.randn(28 * 28).tolist()}
    "http://localhost:8000/", json={"array": np.random.randn(28 * 28).tolist()}
)
print(resp.json())
```
@@ -132,7 +132,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be
Finally, we can deploy our model to Ray Serve through the terminal.
```console
$ serve run tutorial_pytorch:app
$ serve run tutorial_pytorch:image_model
```
Let's query it! While Serve is running, open a separate terminal window, and run the following in an interactive Python shell or a separate Python script:
@@ -145,7 +145,7 @@ ray_logo_bytes = requests.get(
    "ray/master/doc/source/images/ray_header_logo.png"
).content
resp = requests.post("http://localhost:8000/image_predict", data=ray_logo_bytes)
resp = requests.post("http://localhost:8000/", data=ray_logo_bytes)
print(resp.json())
```
@@ -231,7 +231,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be
Finally, we can deploy our model to Ray Serve through the terminal.
```console
$ serve run tutorial_sklearn:app
$ serve run tutorial_sklearn:boosting_model
```
Let's query it! While Serve is running, open a separate terminal window, and run the following in an interactive Python shell or a separate Python script:
@@ -245,7 +245,7 @@ sample_request_input = {
    "petal length": 1.1,
    "petal width": 0.9,
}
response = requests.get("http://localhost:8000/classifier", json=sample_request_input)
response = requests.get("http://localhost:8000/", json=sample_request_input)
print(response.text)
```

View file

@@ -1,33 +0,0 @@
(serve-web-server-integration-tutorial)=
# Integration with Existing Web Servers
In this guide, you will learn how to use Ray Serve to scale up your existing web application. The key feature of Ray Serve that makes this possible is the Python-native {ref}`servehandle-api`, which allows you to keep using the same Python web server while offloading your heavy computation to Ray Serve.
We give two examples, one using a [FastAPI](https://fastapi.tiangolo.com/) web server and another using an [AIOHTTP](https://docs.aiohttp.org/en/stable/) web server, but the same approach will work with any Python web server.
## Scaling Up a FastAPI Application
Ray Serve has a native integration with FastAPI - please see {ref}`serve-fastapi-http`.
## Scaling Up an AIOHTTP Application
In this section, we'll integrate Ray Serve with an [AIOHTTP](https://docs.aiohttp.org/en/stable/) web server run using [Gunicorn](https://gunicorn.org/). You'll need to install AIOHTTP and gunicorn with the command `pip install aiohttp gunicorn`.
First, here is the script that deploys Ray Serve:
```{literalinclude} ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_deploy_serve.py
```
Next is the script that defines the AIOHTTP server:
```{literalinclude} ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_app.py
```
Here's how to run this example:
1. Run `ray start --head` to start a local Ray cluster in the background.
2. In the directory where the example files are saved, run `python aiohttp_deploy_serve.py` to deploy our Ray Serve deployment.
3. Run `gunicorn aiohttp_app:app --worker-class aiohttp.GunicornWebWorker` to start the AIOHTTP app using gunicorn.
4. To test out the server, run `curl localhost:8000/dummy-model`. This should output `Model received data: dummy input`.
5. For cleanup, you can press Ctrl-C to stop the Gunicorn server, and run `ray stop` to stop the background Ray cluster.

View file

@@ -1,26 +0,0 @@
# File name: aiohttp_app.py
from aiohttp import web

import ray
from ray import serve

# Connect to the running Ray cluster.
ray.init(address="auto")

# Fetch the ServeHandle to query our model.
my_handle = serve.get_deployment("my_model").get_handle()


# Define our AIOHTTP request handler.
async def handle_request(request):
    # Offload the computation to our Ray Serve deployment.
    result = await my_handle.remote("dummy input")
    return web.Response(text=result)


# Set up an HTTP endpoint.
app = web.Application()
app.add_routes([web.get("/dummy-model", handle_request)])

if __name__ == "__main__":
    web.run_app(app)

View file

@@ -1,20 +0,0 @@
# File name: aiohttp_deploy_serve.py
import ray
from ray import serve

# Connect to the running Ray cluster.
ray.init(address="auto")

# Start a detached Ray Serve instance. It will persist after the script exits.
serve.start(http_options={"http_host": None}, detached=True)


# Set up a deployment with the desired number of replicas. This could also be
# a stateful class (e.g., if we had an expensive model to set up).
@serve.deployment(name="my_model", num_replicas=2)
async def my_model(request):
    data = await request.body()
    return f"Model received data: {data}"


my_model.deploy()

View file

@@ -1,76 +0,0 @@
# fmt: off
# __doc_import_begin__
from typing import List
import time

import numpy as np
import requests
from starlette.requests import Request

import ray
from ray import serve
# __doc_import_end__
# fmt: on


# __doc_define_servable_begin__
@serve.deployment(route_prefix="/adder")
class BatchAdder:
    @serve.batch(max_batch_size=4)
    async def handle_batch(self, numbers: List[int]):
        input_array = np.array(numbers)
        print("Our input array has shape:", input_array.shape)
        # Sleep for 200ms, this could be performing CPU intensive computation
        # in real models
        time.sleep(0.2)
        output_array = input_array + 1
        return output_array.astype(int).tolist()

    async def __call__(self, request: Request):
        return await self.handle_batch(int(request.query_params["number"]))
# __doc_define_servable_end__

# __doc_deploy_begin__
ray.init(num_cpus=8)
serve.start()
BatchAdder.deploy()
# __doc_deploy_end__

# __doc_query_begin__
@ray.remote
def send_query(number):
    resp = requests.get("http://localhost:8000/adder?number={}".format(number))
    return int(resp.text)


# Let's use Ray to send all queries in parallel
results = ray.get([send_query.remote(i) for i in range(9)])
print("Result returned:", results)
# Output
# (pid=...) Our input array has shape: (1,)
# (pid=...) Our input array has shape: (4,)
# (pid=...) Our input array has shape: (4,)
# Result returned: [1, 2, 3, 4, 5, 6, 7, 8, 9]
# __doc_query_end__

# __doc_query_handle_begin__
handle = BatchAdder.get_handle()
input_batch = list(range(9))
print("Input batch is", input_batch)
# Input batch is [0, 1, 2, 3, 4, 5, 6, 7, 8]

result_batch = ray.get([handle.handle_batch.remote(i) for i in input_batch])
# Output
# (pid=...) Current context is python
# (pid=...) Our input array has shape: (1,)
# (pid=...) Current context is python
# (pid=...) Our input array has shape: (4,)
# (pid=...) Current context is python
# (pid=...) Our input array has shape: (4,)
print("Result batch is", result_batch)
# Result batch is [1, 2, 3, 4, 5, 6, 7, 8, 9]
# __doc_query_handle_end__