[Serve] Add doc for model composition (#8871)
Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
This commit is contained in:
parent 0ba7472da9
commit cf53b35147

3 changed files with 34 additions and 2 deletions
@@ -185,6 +185,28 @@ The shard key can either be specified via the X-SERVE-SHARD-KEY HTTP header or `
  handle = serve.get_handle("api_endpoint")
  handle.options(shard_key=session_id).remote(args)

Composing Multiple Models
=========================

Ray Serve supports composing individually scalable models into a single model
out of the box. For instance, you can combine multiple models to perform
stacking or ensembles.

To define a higher-level composed model, you need to do three things:

1. Define your underlying models (the ones that you will compose together) as
   Ray Serve backends.
2. Define your composed model, using the handles of the underlying models
   (see the example below).
3. Define an endpoint representing this composed model and query it!

To avoid synchronous execution in the composed model (which would make calls
to it very slow), you'll need to make its call method asynchronous by using
``async def``. You'll see this in the example below.

That's it. Let's take a look at an example:

.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_model_composition.py


.. _serve-faq:

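For orientation, here is a minimal sketch of the three steps. The backend names (``adder``, ``doubler``, ``pipeline``) and their trivial bodies are invented for illustration; the complete, runnable example is the ``snippet_model_composition.py`` file included above and shown in the hunks below.

.. code-block:: python

    from ray import serve

    serve.init()


    # Step 1: register the underlying models as Ray Serve backends.
    def adder(_flask_request, data=None):
        return data + 1


    def doubler(_flask_request, data=None):
        return data * 2


    serve.create_backend("adder", adder)
    serve.create_endpoint("adder", backend="adder")
    serve.create_backend("doubler", doubler)
    serve.create_endpoint("doubler", backend="doubler")


    # Step 2: the composed model holds handles to the underlying endpoints
    # and awaits their results instead of blocking on them.
    class Pipeline:
        def __init__(self):
            self.adder = serve.get_handle("adder")
            self.doubler = serve.get_handle("doubler")

        async def __call__(self, flask_request):
            value = int(flask_request.data)
            added = await self.adder.remote(data=value)
            return await self.doubler.remote(data=added)


    # Step 3: expose the composed model behind its own endpoint.
    serve.create_backend("pipeline", Pipeline)
    serve.create_endpoint("pipeline", backend="pipeline")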
@@ -11,6 +11,7 @@ To follow along, you'll need to make the necessary imports.
  from ray import serve
  serve.init()  # Initializes Ray and Ray Serve.

.. _`serve-backend`:

Backends
========

@@ -1,11 +1,17 @@
from random import random

import requests

from ray import serve

serve.init()

# Our pipeline will be structured as follows:
# - Input comes in, the composed model sends it to model_one.
# - model_one outputs a random number between 0 and 1; if the value is
#   greater than 0.5, the data is sent to model_two.
# - Otherwise, the data is returned to the user.

# Let's define two models that just print out the data they received.


def model_one(_unused_flask_request, data=None):
    print("Model 1 called with data ", data)

@@ -22,6 +28,7 @@ class ComposedModel:
        self.model_one = serve.get_handle("model_one")
        self.model_two = serve.get_handle("model_two")

    # This method can be called concurrently!
    async def __call__(self, flask_request):
        data = flask_request.data

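The rest of ``__call__`` is cut off by the hunk above. Based on the pipeline comments at the top of the file, the routing inside it could look roughly like the following sketch (an illustration of the described behavior, not the verbatim contents of the file):

.. code-block:: python

    async def __call__(self, flask_request):
        data = flask_request.data

        # model_one produces a score between 0 and 1; per the comments at the
        # top of the file, only scores above 0.5 are forwarded to model_two.
        score = await self.model_one.remote(data=data)
        if score > 0.5:
            await self.model_two.remote(data=data)
            return {"model_used": 2, "score": score}
        return {"model_used": 1, "score": score}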
@@ -41,6 +48,8 @@ serve.create_endpoint("model_one", backend="model_one")
serve.create_backend("model_two", model_two)
serve.create_endpoint("model_two", backend="model_two")

# max_concurrent_queries is optional. By default, if you pass in an async
# function, Ray Serve sets the limit to a high number.
serve.create_backend(
    "composed_backend", ComposedModel, config={"max_concurrent_queries": 10})
serve.create_endpoint(
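Step 3 finishes by querying the composed endpoint over HTTP. The ``serve.create_endpoint`` call above is truncated by the hunk, so the route isn't visible here; assuming the endpoint is exposed at a route such as ``/composed`` on the default Serve HTTP address (both are assumptions), a query could look like:

.. code-block:: python

    import requests

    # The route and address below are assumptions; adjust them to match the
    # create_endpoint call and your Serve HTTP configuration.
    resp = requests.get("http://127.0.0.1:8000/composed", data="hey!")
    print(resp.json())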