[Serve] Add doc for model composition (#8871)
Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
This commit is contained in:
parent 0ba7472da9
commit cf53b35147

3 changed files with 34 additions and 2 deletions
@@ -185,6 +185,28 @@ The shard key can either be specified via the X-SERVE-SHARD-KEY HTTP header or `
  handle = serve.get_handle("api_endpoint")
  handle.options(shard_key=session_id).remote(args)

Composing Multiple Models
=========================

Ray Serve supports composing individually scalable models into a single model
out of the box. For instance, you can combine multiple models to perform
stacking or ensembles.

To define a higher-level composed model, you need to do three things:

1. Define your underlying models (the ones that you will compose together) as
   Ray Serve backends.
2. Define your composed model, using the handles of the underlying models
   (see the example below).
3. Define an endpoint representing this composed model and query it!

To avoid synchronous execution in the composed model (which would make calls
to it very slow), you'll need to make its call method asynchronous by using
``async def``. You'll see this in the example below.

That's it. Let's take a look at an example:

.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_model_composition.py


.. _serve-faq:

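For orientation, here is a minimal sketch of the three steps. The backend names (``adder``, ``doubler``, ``pipeline``) and their trivial bodies are invented for illustration; the complete, runnable example is the ``snippet_model_composition.py`` file included above and shown in the hunks below.

.. code-block:: python

    from ray import serve

    serve.init()


    # Step 1: register the underlying models as Ray Serve backends.
    def adder(_flask_request, data=None):
        return data + 1


    def doubler(_flask_request, data=None):
        return data * 2


    serve.create_backend("adder", adder)
    serve.create_endpoint("adder", backend="adder")
    serve.create_backend("doubler", doubler)
    serve.create_endpoint("doubler", backend="doubler")


    # Step 2: the composed model holds handles to the underlying endpoints
    # and awaits their results instead of blocking on them.
    class Pipeline:
        def __init__(self):
            self.adder = serve.get_handle("adder")
            self.doubler = serve.get_handle("doubler")

        async def __call__(self, flask_request):
            value = int(flask_request.data)
            added = await self.adder.remote(data=value)
            return await self.doubler.remote(data=added)


    # Step 3: expose the composed model behind its own endpoint.
    serve.create_backend("pipeline", Pipeline)
    serve.create_endpoint("pipeline", backend="pipeline")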
@@ -11,6 +11,7 @@ To follow along, you'll need to make the necessary imports.
  from ray import serve
  serve.init()  # Initializes Ray and Ray Serve.

.. _`serve-backend`:

Backends
========

@@ -1,11 +1,17 @@
from random import random

import requests

from ray import serve

serve.init()

# Our pipeline will be structured as follows:
# - Input comes in, the composed model sends it to model_one.
# - model_one outputs a random number between 0 and 1; if the value is
#   greater than 0.5, the data is sent to model_two.
# - Otherwise, the data is returned to the user.

# Let's define two models that just print out the data they received.


def model_one(_unused_flask_request, data=None):
    print("Model 1 called with data ", data)

@@ -22,6 +28,7 @@ class ComposedModel:
        self.model_one = serve.get_handle("model_one")
        self.model_two = serve.get_handle("model_two")

    # This method can be called concurrently!
    async def __call__(self, flask_request):
        data = flask_request.data

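The rest of ``__call__`` is cut off by the hunk above. Based on the pipeline comments at the top of the file, the routing inside it could look roughly like the following sketch (an illustration of the described behavior, not the verbatim contents of the file):

.. code-block:: python

    async def __call__(self, flask_request):
        data = flask_request.data

        # model_one produces a score between 0 and 1; per the comments at the
        # top of the file, only scores above 0.5 are forwarded to model_two.
        score = await self.model_one.remote(data=data)
        if score > 0.5:
            await self.model_two.remote(data=data)
            return {"model_used": 2, "score": score}
        return {"model_used": 1, "score": score}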
@@ -41,6 +48,8 @@ serve.create_endpoint("model_one", backend="model_one")
serve.create_backend("model_two", model_two)
serve.create_endpoint("model_two", backend="model_two")

# max_concurrent_queries is optional. By default, if you pass in an async
# function, Ray Serve sets the limit to a high number.
serve.create_backend(
    "composed_backend", ComposedModel, config={"max_concurrent_queries": 10})
serve.create_endpoint(
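Step 3 finishes by querying the composed endpoint over HTTP. The ``serve.create_endpoint`` call above is truncated by the hunk, so the route isn't visible here; assuming the endpoint is exposed at a route such as ``/composed`` on the default Serve HTTP address (both are assumptions), a query could look like:

.. code-block:: python

    import requests

    # The route and address below are assumptions; adjust them to match the
    # create_endpoint call and your Serve HTTP configuration.
    resp = requests.get("http://127.0.0.1:8000/composed", data="hey!")
    print(resp.json())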