Merge remote-tracking branch 'upstream/master' into grpc-channel-reconnect

This commit is contained in:
Yi Cheng 2022-05-11 05:52:18 +00:00
commit 707329a170
52 changed files with 3005 additions and 2776 deletions

View file

@ -26,6 +26,7 @@ epilogue_commands: &epilogue_commands |-
steps:
- label: ":mac: :apple: Wheels and Jars"
<<: *common
conditions: ["RAY_CI_MACOS_WHEELS_AFFECTED", "RAY_CI_PYTHON_DEPENDENCIES_AFFECTED"]
commands:
# Cleanup environments
- ./ci/build/upload_build_info.sh
@ -62,6 +63,7 @@ steps:
- label: ":mac: :apple: Ray C++, Java and Libraries"
<<: *common
conditions: ["RAY_CI_SERVE_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_CPP_AFFECTED", "RAY_CI_JAVA_AFFECTED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DASHBOARD_AFFECTED"]
commands:
- export RAY_INSTALL_JAVA=1
- *prelude_commands
@ -73,17 +75,13 @@ steps:
# clang-format is needed by java/test.sh
- pip install clang-format==12.0.1
- ./java/test.sh
- ./ci/ci.sh test_cpp
- *epilogue_commands
- label: ":mac: :apple: Worker"
<<: *common
commands:
- *prelude_commands
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- ./ci/ci.sh test_cpp
- label: ":mac: :apple: Small & Client"
<<: *common
conditions: ["RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED"]
commands:
- *prelude_commands
- bazel test $(./ci/run/bazel_export_options) --config=ci
@ -96,6 +94,7 @@ steps:
- label: ":mac: :apple: Large"
<<: *common
parallelism: 3
conditions: ["RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED"]
commands:
- *prelude_commands
- . ./ci/ci.sh test_large
@ -103,6 +102,7 @@ steps:
- label: ":mac: :apple: Medium A-J"
<<: *common
conditions: ["RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED"]
commands:
- *prelude_commands
- bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CI
@ -112,6 +112,7 @@ steps:
- label: ":mac: :apple: Medium K-Z"
<<: *common
conditions: ["RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED"]
commands:
- *prelude_commands
- bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CI

View file

@ -7,7 +7,6 @@ import argparse
import json
import os
from pprint import pformat
import py_dep_analysis as pda
import re
import subprocess
import sys
@ -102,6 +101,8 @@ if __name__ == "__main__":
# Dry run py_dep_analysis.py to see which tests we would have run.
try:
import py_dep_analysis as pda
graph = pda.build_dep_graph()
rllib_tests = pda.list_rllib_tests()
print("Total # of RLlib tests: ", len(rllib_tests), file=sys.stderr)

View file

@ -147,11 +147,11 @@ const Dashboard: React.FC = () => {
run the following command: `ray disable-usage-stats` before starting
the cluster. See{" "}
<a
href="https://github.com/ray-project/ray/issues/20857"
href="https://docs.ray.io/en/master/cluster/usage-stats.html"
target="_blank"
rel="noreferrer"
>
https://github.com/ray-project/ray/issues/20857
https://docs.ray.io/en/master/cluster/usage-stats.html
</a>{" "}
for more details.
</span>

View file

@ -132,6 +132,7 @@ parts:
- file: cluster/cloud
- file: cluster/deploy
- file: cluster/api
- file: cluster/usage-stats
- caption: References
chapters:

View file

@ -0,0 +1,10 @@
Usage Stats Data API
====================
.. _ray-usage-stats-data-ref:
UsageStatsToReport
~~~~~~~~~~~~~~~~~~
.. autoclass:: ray._private.usage.usage_lib.UsageStatsToReport
:members:

View file

@ -0,0 +1,84 @@
.. _ref-usage-stats:
Usage Stats Collection
======================
Starting in Ray 1.13, Ray collects usage stats data by default (guarded by an opt-out prompt).
This data will be used by the open-source Ray engineering team to better understand how to improve our libraries and core APIs, and how to prioritize bug fixes and enhancements.
Here are the guiding principles of our collection policy:
- **No surprises**: you will be notified before we begin collecting data, and of any changes to the data being collected or how it is used.
- **Easy opt-out**: you will be able to easily opt out of data collection.
- **Transparency**: you will be able to review all data that is sent to us.
- **Control**: you will have control over your data, and we will honor requests to delete your data.
- We will **not** collect any personally identifiable data or proprietary code/data.
- We will **not** sell data or buy data about you.
You will always be able to :ref:`disable the usage stats collection<usage-disable>`.
For more context, please refer to this `RFC <https://github.com/ray-project/ray/issues/20857>`_.
What data is collected?
-----------------------
We collect non-sensitive data that helps us understand how Ray is used (e.g., which Ray libraries are used).
**Personally identifiable data will never be collected.** Please check :ref:`UsageStatsToReport <ray-usage-stats-data-ref>` to see the data we collect.
.. _usage-disable:
How to disable it
-----------------
There are multiple ways to disable usage stats collection before starting a cluster:
#. Add the ``--disable-usage-stats`` option to the command that starts the Ray cluster (e.g., the ``ray start --head --disable-usage-stats`` :ref:`command <ray-start-doc>`).
#. Run :ref:`ray disable-usage-stats <ray-disable-usage-stats-doc>` to disable collection for all future clusters. This won't affect currently running clusters. Under the hood, this command writes ``{"usage_stats": false}`` to the global config file ``~/.ray/config.json``.
#. Set the environment variable ``RAY_USAGE_STATS_ENABLED`` to 0 (e.g., ``RAY_USAGE_STATS_ENABLED=0 ray start --head`` :ref:`command <ray-start-doc>`).
Currently there is no way to enable or disable collection for a running cluster; you have to stop and restart the cluster.
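As a minimal sketch of the second option, the config-file toggle amounts to the following; it only touches ``~/.ray/config.json`` and does not affect clusters that are already running:

.. code-block:: python

    import json
    import os

    config_path = os.path.expanduser("~/.ray/config.json")

    # Read the existing config (if any) and turn the usage_stats flag off.
    config = {}
    if os.path.exists(config_path):
        with open(config_path) as f:
            config = json.load(f)
    config["usage_stats"] = False  # False disables collection for future clusters.

    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    with open(config_path, "w") as f:
        json.dump(config, f)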
How does it work?
-----------------
When a Ray cluster is started via :ref:`ray start --head <ray-start-doc>`, :ref:`ray up <ray-up-doc>`, :ref:`ray submit --start <ray-submit-doc>` or :ref:`ray exec --start <ray-exec-doc>`,
Ray will decide whether usage stats collection should be enabled or not by considering the following factors in order:
#. It checks whether the environment variable ``RAY_USAGE_STATS_ENABLED`` is set: 1 means enabled and 0 means disabled.
#. If the environment variable is not set, it reads the value of key ``usage_stats`` in the global config file ``~/.ray/config.json``: true means enabled and false means disabled.
#. If neither is set and the console is interactive, then the user will be prompted to enable or disable the collection. If the console is non-interactive, usage stats collection will be enabled by default. The decision will be saved to ``~/.ray/config.json``, so the prompt is only shown once.
Note: usage stats collection is not enabled when using local dev clusters started via ``ray.init()``. This means that Ray will never collect data from third-party library users not using Ray directly.
If usage stats collection is enabled, a background process on the head node will collect the usage stats
and report to ``https://usage-stats.ray.io/`` every hour. The reported usage stats will also be saved to
``/tmp/ray/session_xxx/usage_stats.json`` on the head node for inspection. You can check the existence of this file to see if collection is enabled.
Usage stats collection is very lightweight and should have no impact on your workload in any way.
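If you want to inspect what is being reported, a small sketch along these lines works (it assumes the default ``/tmp/ray`` temp directory; the concrete ``session_xxx`` directory name varies per run):

.. code-block:: python

    import glob
    import json

    # Pick the most recent session directory that has a usage stats file.
    paths = sorted(glob.glob("/tmp/ray/session_*/usage_stats.json"))
    if paths:
        with open(paths[-1]) as f:
            stats = json.load(f)
        print(stats.get("session_id"))  # Handy when requesting data deletion.
    else:
        print("No usage_stats.json found; collection is likely disabled.")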
Requesting removal of collected data
------------------------------------
To request removal of collected data, please email us at ``usage_stats@ray.io`` with the ``session_id`` that you can find in ``/tmp/ray/session_xxx/usage_stats.json``.
Frequently Asked Questions (FAQ)
--------------------------------
**Does the session_id map to personal data?**
No, the uuid is a Ray session/job-specific random ID that cannot be used to identify a specific person or machine. It does not live beyond the lifetime of your Ray session and is captured primarily so that deletion requests can be honored.
**Could an enterprise easily configure an additional endpoint or substitute a different endpoint?**
We definitely see this use case and would love to chat with you to make this work -- email ``usage_stats@ray.io``.
Contact us
----------
If you have any feedback regarding usage stats collection, please email us at ``usage_stats@ray.io``.

View file

@ -127,7 +127,7 @@ Transformations are executed *eagerly* and block until the operation is finished
.. code-block:: python
def transform_batch(df: pandas.DataFrame) -> pandas.DataFrame:
def transform_batch(df: pandas.DataFrame) -> pd.DataFrame:
return df.applymap(lambda x: x * 2)
ds = ray.data.range_arrow(10000)

View file

@ -0,0 +1,16 @@
import ray
# Put the values (1, 2, 3) into Ray's object store.
a, b, c = ray.put(1), ray.put(2), ray.put(3)
@ray.remote
def print_via_capture():
"""This function prints the values of (a, b, c) to stdout."""
print(ray.get([a, b, c]))
# Passing object references via closure-capture. Inside the `print_via_capture`
# function, the global object refs (a, b, c) can be retrieved and printed.
print_via_capture.remote()
# -> prints [1, 2, 3]

View file

@ -0,0 +1,18 @@
import ray
@ray.remote
def echo_and_get(x_list): # List[ObjectRef]
"""This function prints its input values to stdout."""
print("args:", x_list)
print("values:", ray.get(x_list))
# Put the values (1, 2, 3) into Ray's object store.
a, b, c = ray.put(1), ray.put(2), ray.put(3)
# Passing an object as a nested argument to `echo_and_get`. Ray does not
# de-reference nested args, so `echo_and_get` sees the references.
echo_and_get.remote([a, b, c])
# -> prints args: [ObjectRef(...), ObjectRef(...), ObjectRef(...)]
# values: [1, 2, 3]

View file

@ -0,0 +1,20 @@
import ray
@ray.remote
def echo(a: int, b: int, c: int):
"""This function prints its input values to stdout."""
print(a, b, c)
# Passing the literal values (1, 2, 3) to `echo`.
echo.remote(1, 2, 3)
# -> prints "1 2 3"
# Put the values (1, 2, 3) into Ray's object store.
a, b, c = ray.put(1), ray.put(2), ray.put(3)
# Passing an object as a top-level argument to `echo`. Ray will de-reference top-level
# arguments, so `echo` will see the literal values (1, 2, 3) in this case as well.
echo.remote(a, b, c)
# -> prints "1 2 3"

View file

@ -128,41 +128,51 @@ If the current node's object store does not contain the object, the object is do
assert(*results[1] == 1);
assert(*results[2] == 2);
Passing Objects by Reference
----------------------------
Passing Object Arguments
------------------------
Ray object references can be freely passed around a Ray application. This means that they can be passed as arguments to tasks, actor methods, and even stored in other objects. Objects are tracked via *distributed reference counting*, and their data is automatically freed once all references to the object are deleted.
There are two different ways one can pass an object to a Ray task or method. Depending on the way an object is passed, Ray will decide whether to *de-reference* the object prior to task execution.
**Passing an object as a top-level argument**: When an object is passed directly as a top-level argument to a task, Ray will de-reference the object. This means that Ray will fetch the underlying data for all top-level object reference arguments, and will not execute the task until the object data is fully available.
.. literalinclude:: doc_code/obj_val.py
**Passing an object as a nested argument**: When an object is passed within a nested object, for example, within a Python list, Ray will *not* de-reference it. This means that the task will need to call ``ray.get()`` on the reference to fetch the concrete value. However, if the task never calls ``ray.get()``, then the object value never needs to be transferred to the machine the task is running on. We recommend passing objects as top-level arguments where possible, but nested arguments can be useful for passing objects on to other tasks without needing to see the data.
.. literalinclude:: doc_code/obj_ref.py
The top-level vs. nested passing convention also applies to actor constructors and actor method calls:
.. code-block:: python
@ray.remote
def echo(x):
print(x)
# Examples of passing objects to actor constructors.
actor_handle = Actor.remote(obj) # by-value
actor_handle = Actor.remote([obj]) # by-reference
# Put an object in Ray's object store.
object_ref = ray.put(1)
# Examples of passing objects to actor method calls.
actor_handle.method.remote(obj) # by-value
actor_handle.method.remote([obj]) # by-reference
# Pass-by-value: send the object to a task as a top-level argument.
# The object will be de-referenced, so the task only sees its value.
echo.remote(object_ref)
# -> prints "1"
Closure Capture of Objects
--------------------------
# Pass-by-reference: when passed inside a Python list or other data structure,
# the object ref is preserved. The object data is not transferred to the worker
# when it is passed by reference, until ray.get() is called on the reference.
echo.remote({"obj": object_ref})
# -> prints "{"obj": ObjectRef(...)}"
You can also pass objects to tasks via *closure-capture*. This can be convenient when you have a large object that you want to share verbatim between many tasks or actors, and don't want to pass it repeatedly as an argument. Be aware however that defining a task that closes over an object ref will pin the object via reference-counting, so the object will not be evicted until the job completes.
.. literalinclude:: doc_code/obj_capture.py
Nested Objects
--------------
Ray also supports nested object references. This allows you to build composite objects that themselves hold references to further sub-objects.
.. code-block:: python
# Objects can be nested within each other. Ray will keep the inner object
# alive via reference counting until all outer object references are deleted.
object_ref_2 = ray.put([object_ref])
# Examples of passing objects to actors.
actor_handle = Actor.remote(obj) # by-value
actor_handle = Actor.remote([obj]) # by-reference
actor_handle.method.remote(obj) # by-value
actor_handle.method.remote([obj]) # by-reference
More about Ray Objects
----------------------

View file

@ -314,3 +314,15 @@ The Ray Command Line API
.. click:: ray.scripts.scripts:debug
:prog: ray debug
:show-nested:
.. _ray-disable-usage-stats-doc:
.. click:: ray.scripts.scripts:disable_usage_stats
:prog: ray disable-usage-stats
:show-nested:
.. _ray-enable-usage-stats-doc:
.. click:: ray.scripts.scripts:enable_usage_stats
:prog: ray enable-usage-stats
:show-nested:

View file

@ -1,3 +1,5 @@
.. _ray-logging:
Logging
=======
This document will explain Ray's logging system and its best practices.

View file

@ -13,4 +13,5 @@ API References
../workflows/package-ref.rst
../ray-core/package-ref.rst
../cluster/reference.rst
../cluster/jobs-package-ref.rst
../cluster/jobs-package-ref.rst
../cluster/usage-stats-data-ref.rst

View file

@ -1,22 +1,22 @@
.. _serve-architecture:
(serve-architecture)=
# Serve Architecture
Serve Architecture
==================
This section should help you:
- understand an overview of how each component in Serve works
- understand the different types of actors that make up a Serve instance
.. Figure source: https://docs.google.com/drawings/d/1jSuBN5dkSj2s9-0eGzlU_ldsRa3TsswQUZM-cMQ29a0/edit?usp=sharing
% Figure source: https://docs.google.com/drawings/d/1jSuBN5dkSj2s9-0eGzlU_ldsRa3TsswQUZM-cMQ29a0/edit?usp=sharing
.. image:: architecture.svg
:align: center
:width: 600px
```{image} architecture.svg
:align: center
:width: 600px
```
High Level View
---------------
## High Level View
Serve runs on Ray and utilizes :ref:`Ray actors<actor-guide>`.
Serve runs on Ray and utilizes [Ray actors](actor-guide).
There are three kinds of actors that are created to make up a Serve instance:
@ -24,17 +24,16 @@ There are three kinds of actors that are created to make up a Serve instance:
the control plane. The Controller is responsible for creating, updating, and
destroying other actors. Serve API calls like creating or getting a deployment
make remote calls to the Controller.
- Router: There is one router per node. Each router is a `Uvicorn <https://www.uvicorn.org/>`_ HTTP
- Router: There is one router per node. Each router is a [Uvicorn](https://www.uvicorn.org/) HTTP
server that accepts incoming requests, forwards them to replicas, and
responds once they are completed.
- Worker Replica: Worker replicas actually execute the code in response to a
request. For example, they may contain an instantiation of an ML model. Each
replica processes individual requests from the routers (they may be batched
by the replica using ``@serve.batch``, see the :ref:`batching<serve-batching>` docs).
by the replica using `@serve.batch`, see the [batching](serve-batching) docs).
## Lifetime of a Request
Lifetime of a Request
---------------------
When an HTTP request is sent to the router, the following things happen:
- The HTTP request is received and parsed.
@ -42,36 +41,33 @@ When an HTTP request is sent to the router, the follow things happen:
request is placed on a queue.
- For each request in a deployment queue, an available replica is looked up
and the request is sent to it. If there are no available replicas (there
are more than ``max_concurrent_queries`` requests outstanding), the request
are more than `max_concurrent_queries` requests outstanding), the request
is left in the queue until an outstanding request is finished.
Each replica maintains a queue of requests and executes one at a time, possibly
using asyncio to process them concurrently. If the handler (the function for the
deployment or ``__call__``) is ``async``, the replica will not wait for the
deployment or `__call__`) is `async`, the replica will not wait for the
handler to run; otherwise, the replica will block until the handler returns.
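For illustration, here is a minimal sketch of an `async` handler (the deployment class and the sleep are made up for the example, and it assumes Serve has already been started):

```python
import asyncio
from ray import serve

@serve.deployment
class AsyncEcho:
    async def __call__(self, request):
        # While this coroutine awaits, the replica can begin processing
        # other queued requests instead of blocking on this one.
        await asyncio.sleep(0.1)
        return "done"

AsyncEcho.deploy()
```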
FAQ
---
## FAQ
.. _serve-ft-detail:
(serve-ft-detail)=
How does Serve handle fault tolerance?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### How does Serve handle fault tolerance?
Application errors like exceptions in your model evaluation code are caught and
wrapped. A 500 status code will be returned with the traceback information. The
replica will be able to continue to handle requests.
Machine errors and faults will be handled by Ray. Serve utilizes the :ref:`actor
reconstruction <actor-fault-tolerance>` capability. For example, when a machine hosting any of the
Machine errors and faults will be handled by Ray. Serve utilizes the [actor
reconstruction](actor-fault-tolerance) capability. For example, when a machine hosting any of the
actors crashes, those actors will be automatically restarted on another
available machine. All data in the Controller (routing policies, deployment
configurations, etc.) is checkpointed to Ray. Transient data in the
router and the replica (like network connections and internal request
queues) will be lost upon failure.
How does Serve ensure horizontal scalability and availability?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
### How does Serve ensure horizontal scalability and availability?
Serve starts one router per node. Each router binds to the same port. You
should be able to reach Serve and send requests to any model via any of the
@ -81,20 +77,17 @@ This architecture ensures horizontal scalability for Serve. You can scale the
router by adding more nodes and scale the model by increasing the number
of replicas.
How do ServeHandles work?
^^^^^^^^^^^^^^^^^^^^^^^^^
### How do ServeHandles work?
:mod:`ServeHandles <ray.serve.handle.RayServeHandle>` wrap a handle to the router actor on the same node. When a
{mod}`ServeHandles <ray.serve.handle.RayServeHandle>` wrap a handle to the router actor on the same node. When a
request is sent from one replica to another via the handle, the
requests go through the same data path as incoming HTTP requests. This enables
the same deployment selection and batching procedures to happen. ServeHandles are
often used to implement :ref:`model composition <serve-model-composition>`.
often used to implement [model composition](serve-model-composition).
### What happens to large requests?
What happens to large requests?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Serve utilizes Ray's :ref:`shared memory object store <plasma-store>` and in-process memory
Serve utilizes Ray's [shared memory object store](plasma-store) and in-process memory
store. Small request objects are sent directly between actors via a network
call. Larger request objects (100KiB+) are written to a distributed shared
memory store and the replica can read them via zero-copy read.

View file

@ -0,0 +1,364 @@
# Core API: Deployments
This section should help you:
- create, query, update and configure deployments
- configure resources of your deployments
- specify different Python dependencies across different deployments using Runtime Environments
:::{tip}
Get in touch with us if you're using or considering using [Ray Serve](https://docs.google.com/forms/d/1l8HT35jXMPtxVUtQPeGoe09VGp5jcvSv0TqPgyz6lGU).
:::
```{contents}
```
## Creating a Deployment
Deployments are the central concept in Ray Serve.
They allow you to define and update your business logic or models that will handle incoming requests as well as how this is exposed over HTTP or in Python.
A deployment is defined using {mod}`@serve.deployment <ray.serve.api.deployment>` on a Python class (or function for simple use cases).
You can specify arguments to be passed to the constructor when you call `Deployment.deploy()`, shown below.
A deployment consists of a number of *replicas*, which are individual copies of the function or class that are started in separate Ray Actors (processes).
```python
@serve.deployment
class MyFirstDeployment:
# Take the message to return as an argument to the constructor.
def __init__(self, msg):
self.msg = msg
def __call__(self, request):
return self.msg
def other_method(self, arg):
return self.msg
MyFirstDeployment.deploy("Hello world!")
```
Deployments can be exposed in two ways: over HTTP or in Python via the {ref}`servehandle-api`.
By default, HTTP requests will be forwarded to the `__call__` method of the class (or the function) and a `Starlette Request` object will be the sole argument.
You can also define a deployment that wraps a FastAPI app for more flexible handling of HTTP requests. See {ref}`serve-fastapi-http` for details.
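As a rough sketch of the FastAPI-wrapping pattern (the class name and route are illustrative; see {ref}`serve-fastapi-http` for the authoritative version):

```python
from fastapi import FastAPI
from ray import serve

app = FastAPI()

@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class FastAPIWrapper:
    @app.get("/")
    def say_hello(self):
        return {"message": "Hello world!"}

FastAPIWrapper.deploy()
```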
To serve multiple deployments defined by the same class, use the `name` option:
```python
MyFirstDeployment.options(name="hello_service").deploy("Hello!")
MyFirstDeployment.options(name="hi_service").deploy("Hi!")
```
You can also list all available deployments and dynamically get references to them:
```python
>> serve.list_deployments()
{'A': Deployment(name=A,version=None,route_prefix=/A)}
{'MyFirstDeployment': Deployment(name=MyFirstDeployment,version=None,route_prefix=/MyFirstDeployment)}
# Returns the same object as the original MyFirstDeployment object.
# This can be used to redeploy, get a handle, etc.
deployment = serve.get_deployment("MyFirstDeployment")
```
## Exposing a Deployment
By default, deployments are exposed over HTTP at `http://localhost:8000/<deployment_name>`.
The HTTP path that the deployment is available at can be changed using the `route_prefix` option.
All requests to `/{route_prefix}` and any subpaths will be routed to the deployment (using a longest-prefix match for overlapping route prefixes).
Here's an example:
```python
@serve.deployment(name="http_deployment", route_prefix="/api")
class HTTPDeployment:
def __call__(self, request):
return "Hello world!"
```
Once the deployment is created, it is exposed by the HTTP server and handles requests using the specified class.
We can query the model to verify that it's working.
```python
import requests
print(requests.get("http://127.0.0.1:8000/api").text)
```
We can also query the deployment using the {mod}`ServeHandle <ray.serve.handle.RayServeHandle>` interface.
```python
# To get a handle from the same script, use the Deployment object directly:
handle = HTTPDeployment.get_handle()
# To get a handle from a different script, reference it by name:
handle = serve.get_deployment("http_deployment").get_handle()
print(ray.get(handle.remote()))
```
As noted above, there are two ways to expose deployments. The first is the {mod}`ServeHandle <ray.serve.handle.RayServeHandle>`
interface, which lets you access deployments from a Python script or other code, making it convenient for a
Python developer. The second is over HTTP, which allows access to deployments from a web client application.
Let's look at a simple end-to-end example using both ways to expose and access deployments. Your output may
vary due to the random nature of how the prediction is computed; however, the example illustrates two things:
1\) how to expose and use deployments, and 2) how to use replicas, to which requests are sent. Note that each pid
is a separate replica associated with each deployment name, `rep-1` and `rep-2` respectively.
```{literalinclude} _examples/doc_code/create_deployment.py
:end-before: __serve_example_end__
:language: python
:start-after: __serve_example_begin__
```
```python
# Output:
# {'rep-1': Deployment(name=rep-1,version=None,route_prefix=/rep-1),
# 'rep-2': Deployment(name=rep-2,version=None,route_prefix=/rep-2)}
#
# ServerHandle API responses: ----------
# handle name : rep-1
# prediction : (pid: 62636); path: /model/rep-1.pkl; data: 0.600; prediction: 1.292
# --
# handle name : rep-2
# prediction : (pid: 62635); path: /model/rep-2.pkl; data: 0.075; prediction: 0.075
# --
# handle name : rep-1
# prediction : (pid: 62634); path: /model/rep-1.pkl; data: 0.186; prediction: 0.186
# --
# handle name : rep-2
# prediction : (pid: 62637); path: /model/rep-2.pkl; data: 0.751; prediction: 1.444
# --
# HTTP responses: ----------
# handle name : rep-1
# prediction : (pid: 62636); path: /model/rep-1.pkl; data: 0.582; prediction: 1.481
# handle name : rep-2
# prediction : (pid: 62637); path: /model/rep-2.pkl; data: 0.778; prediction: 1.678
# handle name : rep-1
# prediction : (pid: 62634); path: /model/rep-1.pkl; data: 0.139; prediction: 0.139
# handle name : rep-2
# prediction : (pid: 62635); path: /model/rep-2.pkl; data: 0.569; prediction: 1.262
```
## Updating a Deployment
Often you want to be able to update your code or configuration options for a deployment over time.
Deployments can be updated simply by updating the code or configuration options and calling `deploy()` again.
```python
@serve.deployment(name="my_deployment", num_replicas=1)
class SimpleDeployment:
pass
# Creates one initial replica.
SimpleDeployment.deploy()
# Re-deploys, creating an additional replica.
# This could be the SAME Python script, modified and re-run.
@serve.deployment(name="my_deployment", num_replicas=2)
class SimpleDeployment:
pass
SimpleDeployment.deploy()
# You can also use Deployment.options() to change options without redefining
# the class. This is useful for programmatically updating deployments.
SimpleDeployment.options(num_replicas=2).deploy()
```
By default, each call to `.deploy()` will cause a redeployment, even if the underlying code and options didn't change.
This could be detrimental if you have many deployments in a script and only want to update one: if you re-run the script, all of the deployments will be redeployed, not just the one you updated.
To prevent this, you may provide a `version` string for the deployment as a keyword argument in the decorator or `Deployment.options()`.
If provided, the replicas will only be updated if the value of `version` is updated; if the value of `version` is unchanged, the call to `.deploy()` will be a no-op.
When a redeployment happens, Serve will perform a rolling update, bringing down at most 20% of the replicas at any given time.
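A hypothetical sketch of this behavior (the deployment name and version strings are made up):

```python
from ray import serve

@serve.deployment(name="versioned_deployment", version="v1")
def handler(request):
    return "response from v1"

handler.deploy()  # First call: creates the replicas.
handler.deploy()  # Same code, same version: this call is a no-op.

# Bumping the version triggers a rolling update of the replicas.
handler.options(version="v2").deploy()
```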
(configuring-a-deployment)=
## Configuring a Deployment
There are a number of things you'll likely want to do with your serving application including
scaling out or configuring the maximum number of in-flight requests for a deployment.
All of these options can be specified either in {mod}`@serve.deployment <ray.serve.api.deployment>` or in `Deployment.options()`.
To update the config options for a running deployment, simply redeploy it with the new options set.
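For example, a running deployment's options can be updated by redeploying with the new values (the option value here is illustrative):

```python
from ray import serve

@serve.deployment(max_concurrent_queries=5)
def handler(request):
    return "ok"

handler.deploy()

# Update a config option on the running deployment by redeploying with new options.
handler.options(max_concurrent_queries=20).deploy()
```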
### Scaling Out
To scale out a deployment to many processes, simply configure the number of replicas.
```python
# Create with a single replica.
@serve.deployment(num_replicas=1)
def func(*args):
pass
func.deploy()
# Scale up to 10 replicas.
func.options(num_replicas=10).deploy()
# Scale back down to 1 replica.
func.options(num_replicas=1).deploy()
```
#### Autoscaling
Serve also has experimental support for a demand-based replica autoscaler.
It reacts to traffic spikes by observing queue sizes and making scaling decisions.
To configure it, you can set the `_autoscaling_config` field in deployment options.
:::{warning}
The API is experimental and subject to change. We welcome you to test it out
and leave us feedback through [Github Issues](https://github.com/ray-project/ray/issues) or our [discussion forum](https://discuss.ray.io/)!
:::
```python
@serve.deployment(
_autoscaling_config={
"min_replicas": 1,
"max_replicas": 5,
"target_num_ongoing_requests_per_replica": 10,
},
version="v1")
def func(_):
time.sleep(1)
return ""
func.deploy() # The func deployment will now autoscale based on request demand.
```
The `min_replicas` and `max_replicas` fields configure the range of replicas which the
Serve autoscaler chooses from. Deployments will start with `min_replicas` initially.
The `target_num_ongoing_requests_per_replica` configuration specifies how aggressively the
autoscaler should react to traffic. Serve will try to make sure that each replica has roughly that number
of requests being processed and waiting in the queue. For example, if your processing time is `10ms`
and the latency constraint is `100ms`, you can have at most `10` requests ongoing per replica so
the last requests can finish within the latency constraint. We recommend you benchmark your application
code and set this number based on your end-to-end latency objective.
:::{note}
The `version` field is required for autoscaling. We are actively working on removing
this limitation.
:::
:::{note}
The Ray Serve Autoscaler is an application-level autoscaler that sits on top of the [Ray Autoscaler](cluster-index).
Concretely, this means that the Ray Serve autoscaler asks Ray to start a number of replica actors based on the request demand.
If the Ray Autoscaler determines there aren't enough available CPUs to place these actors, it responds by adding more nodes.
Similarly, when Ray Serve scales down and terminates some replica actors, it may result in some nodes being empty, at which point the Ray autoscaler will remove those nodes.
:::
(serve-cpus-gpus)=
### Resource Management (CPUs, GPUs)
To assign hardware resources per replica, you can pass resource requirements to
`ray_actor_options`.
By default, each replica requires one CPU.
To learn about options to pass in, take a look at [Resources with Actor](actor-resource-guide) guide.
For example, to create a deployment where each replica uses a single GPU, you can do the
following:
```python
@serve.deployment(ray_actor_options={"num_gpus": 1})
def func(*args):
return do_something_with_my_gpu()
```
### Fractional Resources
The resources specified in `ray_actor_options` can also be *fractional*.
This allows you to flexibly share resources between replicas.
For example, if you have two models and each doesn't fully saturate a GPU, you might want to have them share a GPU by allocating 0.5 GPUs each.
The same could be done to multiplex over CPUs.
```python
@serve.deployment(name="deployment1", ray_actor_options={"num_gpus": 0.5})
def func(*args):
return do_something_with_my_gpu()
@serve.deployment(name="deployment2", ray_actor_options={"num_gpus": 0.5})
def func(*args):
return do_something_with_my_gpu()
```
### Configuring Parallelism with OMP_NUM_THREADS
Deep learning models like PyTorch and Tensorflow often use multithreading when performing inference.
The number of CPUs they use is controlled by the OMP_NUM_THREADS environment variable.
To [avoid contention](omp-num-thread-note), Ray sets `OMP_NUM_THREADS=1` by default because Ray workers and actors use a single CPU by default.
If you *do* want to enable this parallelism in your Serve deployment, just set OMP_NUM_THREADS to the desired value either when starting Ray or in your function/class definition:
```bash
OMP_NUM_THREADS=12 ray start --head
OMP_NUM_THREADS=12 ray start --address=$HEAD_NODE_ADDRESS
```
```python
@serve.deployment
class MyDeployment:
def __init__(self, parallelism):
        os.environ["OMP_NUM_THREADS"] = str(parallelism)
# Download model weights, initialize model, etc.
MyDeployment.deploy()
```
:::{note}
Some other libraries may not respect `OMP_NUM_THREADS` and have their own way to configure parallelism.
For example, if you're using OpenCV, you'll need to manually set the number of threads using `cv2.setNumThreads(num_threads)` (set to 0 to disable multi-threading).
You can check the configuration using `cv2.getNumThreads()` and `cv2.getNumberOfCPUs()`.
:::
### User Configuration (Experimental)
Suppose you want to update a parameter in your model without needing to restart
the replicas in your deployment. You can do this by writing a `reconfigure` method
for the class underlying your deployment. At runtime, you can then pass in your
new parameters by setting the `user_config` option.
The following simple example will make the usage clear:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_reconfigure.py
```
The `reconfigure` method is called when the class is created if `user_config`
is set. In particular, it's also called when new replicas are created in the
future if you scale up your deployment later. The `reconfigure` method is also called
each time `user_config` is updated.
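Since the linked snippet is not reproduced here, a rough sketch of the pattern (the class, the keys inside `user_config`, and the threshold values are illustrative):

```python
from ray import serve

@serve.deployment(user_config={"threshold": 0.5})
class Classifier:
    def __init__(self):
        self.threshold = None

    def reconfigure(self, config: dict):
        # Called once on replica creation and again whenever user_config changes.
        self.threshold = config["threshold"]

    def __call__(self, request):
        return f"current threshold: {self.threshold}"

Classifier.deploy()

# Later, update the parameter without restarting the replicas.
Classifier.options(user_config={"threshold": 0.8}).deploy()
```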
## Handling Dependencies
Ray Serve supports serving deployments with different (possibly conflicting)
Python dependencies. For example, you can simultaneously serve one deployment
that uses legacy Tensorflow 1 and another that uses Tensorflow 2.
This is supported on Mac OS and Linux using Ray's {ref}`runtime-environments` feature.
As with all other Ray actor options, pass the runtime environment in via `ray_actor_options` in
your deployment. Be sure to first run `pip install "ray[default]"` to ensure the
Runtime Environments feature is installed.
Example:
```{literalinclude} ../../../python/ray/serve/examples/doc/conda_env.py
```
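Since the linked example is not reproduced here, the following is only a sketch of passing a runtime environment through `ray_actor_options` (the pinned package is an assumption; match it to your own model code):

```python
from ray import serve

@serve.deployment(
    ray_actor_options={
        "runtime_env": {
            # Hypothetical per-deployment dependency.
            "pip": ["tensorflow==1.15.5"],
        }
    }
)
def tf1_model(request):
    import tensorflow as tf  # Resolved inside the replica's environment.
    return tf.__version__

tf1_model.deploy()
```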
:::{tip}
Avoid dynamically installing packages that install from source: these can be slow and
use up all resources while installing, leading to problems with the Ray cluster. Consider
precompiling such packages in a private repository or Docker image.
:::
The dependencies required in the deployment may be different than
the dependencies installed in the driver program (the one running Serve API
calls). In this case, you should use a delayed import within the class to avoid
importing unavailable packages in the driver. This applies even when not
using runtime environments.
Example:
```{literalinclude} ../../../python/ray/serve/examples/doc/delayed_import.py
```
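Again, the linked example is not shown here, so the following is only a sketch of the delayed-import pattern (the torch model is a stand-in):

```python
from ray import serve

@serve.deployment
class TorchPredictor:
    def __init__(self):
        # Delayed import: torch only needs to be installed in the replica's
        # environment, not in the driver running the Serve API calls.
        import torch
        self.model = torch.nn.Linear(4, 1)

    def __call__(self, request):
        import torch
        return self.model(torch.zeros(4)).item()

TorchPredictor.deploy()
```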

View file

@ -1,369 +0,0 @@
=====================
Core API: Deployments
=====================
This section should help you:
- create, query, update and configure deployments
- configure resources of your deployments
- specify different Python dependencies across different deployments using Runtime Environments
.. tip::
Get in touch with us if you're using or considering using `Ray Serve <https://docs.google.com/forms/d/1l8HT35jXMPtxVUtQPeGoe09VGp5jcvSv0TqPgyz6lGU>`_.
.. contents::
Creating a Deployment
=====================
Deployments are the central concept in Ray Serve.
They allow you to define and update your business logic or models that will handle incoming requests as well as how this is exposed over HTTP or in Python.
A deployment is defined using :mod:`@serve.deployment <ray.serve.api.deployment>` on a Python class (or function for simple use cases).
You can specify arguments to be passed to the constructor when you call ``Deployment.deploy()``, shown below.
A deployment consists of a number of *replicas*, which are individual copies of the function or class that are started in separate Ray Actors (processes).
.. code-block:: python
@serve.deployment
class MyFirstDeployment:
# Take the message to return as an argument to the constructor.
def __init__(self, msg):
self.msg = msg
def __call__(self, request):
return self.msg
def other_method(self, arg):
return self.msg
MyFirstDeployment.deploy("Hello world!")
Deployments can be exposed in two ways: over HTTP or in Python via the :ref:`servehandle-api`.
By default, HTTP requests will be forwarded to the ``__call__`` method of the class (or the function) and a ``Starlette Request`` object will be the sole argument.
You can also define a deployment that wraps a FastAPI app for more flexible handling of HTTP requests. See :ref:`serve-fastapi-http` for details.
To serve multiple deployments defined by the same class, use the ``name`` option:
.. code-block:: python
MyFirstDeployment.options(name="hello_service").deploy("Hello!")
MyFirstDeployment.options(name="hi_service").deploy("Hi!")
You can also list all available deployments and dynamically get references to them:
.. code-block:: python
>> serve.list_deployments()
{'A': Deployment(name=A,version=None,route_prefix=/A)}
{'MyFirstDeployment': Deployment(name=MyFirstDeployment,version=None,route_prefix=/MyFirstDeployment}
# Returns the same object as the original MyFirstDeployment object.
# This can be used to redeploy, get a handle, etc.
deployment = serve.get_deployment("MyFirstDeployment")
Exposing a Deployment
=====================
By default, deployments are exposed over HTTP at ``http://localhost:8000/<deployment_name>``.
The HTTP path that the deployment is available at can be changed using the ``route_prefix`` option.
All requests to ``/{route_prefix}`` and any subpaths will be routed to the deployment (using a longest-prefix match for overlapping route prefixes).
Here's an example:
.. code-block:: python
@serve.deployment(name="http_deployment", route_prefix="/api")
class HTTPDeployment:
def __call__(self, request):
return "Hello world!"
After creating the deployment, it is now exposed by the HTTP server and handles requests using the specified class.
We can query the model to verify that it's working.
.. code-block:: python
import requests
print(requests.get("http://127.0.0.1:8000/api").text)
We can also query the deployment using the :mod:`ServeHandle <ray.serve.handle.RayServeHandle>` interface.
.. code-block:: python
# To get a handle from the same script, use the Deployment object directly:
handle = HTTPDeployment.get_handle()
# To get a handle from a different script, reference it by name:
handle = serve.get_deployment("http_deployment").get_handle()
print(ray.get(handle.remote()))
As noted above, there are two ways to expose deployments. The first is by using the :mod:`ServeHandle <ray.serve.handle.RayServeHandle>`
interface. This method allows you to access deployments within a Python script or code, making it convenient for a
Python developer. And the second is by using the HTTP request, allowing access to deployments via a web client application.
Let's look at a simple end-to-end example using both ways to expose and access deployments. Your output may
vary due to random nature of how the prediction is computed; however, the example illustrates two things:
1) how to expose and use deployments and 2) how to use replicas, to which requests are sent. Note that each pid
is a separate replica associated with each deployment name, ``rep-1`` and ``rep-2`` respectively.
.. literalinclude:: _examples/doc_code/create_deployment.py
:language: python
:start-after: __serve_example_begin__
:end-before: __serve_example_end__
.. code-block:: python
# Output:
# {'rep-1': Deployment(name=rep-1,version=None,route_prefix=/rep-1),
# 'rep-2': Deployment(name=rep-2,version=None,route_prefix=/rep-2)}
#
# ServerHandle API responses: ----------
# handle name : rep-1
# prediction : (pid: 62636); path: /model/rep-1.pkl; data: 0.600; prediction: 1.292
# --
# handle name : rep-2
# prediction : (pid: 62635); path: /model/rep-2.pkl; data: 0.075; prediction: 0.075
# --
# handle name : rep-1
# prediction : (pid: 62634); path: /model/rep-1.pkl; data: 0.186; prediction: 0.186
# --
# handle name : rep-2
# prediction : (pid: 62637); path: /model/rep-2.pkl; data: 0.751; prediction: 1.444
# --
# HTTP responses: ----------
# handle name : rep-1
# prediction : (pid: 62636); path: /model/rep-1.pkl; data: 0.582; prediction: 1.481
# handle name : rep-2
# prediction : (pid: 62637); path: /model/rep-2.pkl; data: 0.778; prediction: 1.678
# handle name : rep-1
# prediction : (pid: 62634); path: /model/rep-1.pkl; data: 0.139; prediction: 0.139
# handle name : rep-2
# prediction : (pid: 62635); path: /model/rep-2.pkl; data: 0.569; prediction: 1.262
Updating a Deployment
=====================
Often you want to be able to update your code or configuration options for a deployment over time.
Deployments can be updated simply by updating the code or configuration options and calling ``deploy()`` again.
.. code-block:: python
@serve.deployment(name="my_deployment", num_replicas=1)
class SimpleDeployment:
pass
# Creates one initial replica.
SimpleDeployment.deploy()
# Re-deploys, creating an additional replica.
# This could be the SAME Python script, modified and re-run.
@serve.deployment(name="my_deployment", num_replicas=2)
class SimpleDeployment:
pass
SimpleDeployment.deploy()
# You can also use Deployment.options() to change options without redefining
# the class. This is useful for programmatically updating deployments.
SimpleDeployment.options(num_replicas=2).deploy()
By default, each call to ``.deploy()`` will cause a redeployment, even if the underlying code and options didn't change.
This could be detrimental if you have many deployments in a script and only want to update one: if you re-run the script, all of the deployments will be redeployed, not just the one you updated.
To prevent this, you may provide a ``version`` string for the deployment as a keyword argument in the decorator or ``Deployment.options()``.
If provided, the replicas will only be updated if the value of ``version`` is updated; if the value of ``version`` is unchanged, the call to ``.deploy()`` will be a no-op.
When a redeployment happens, Serve will perform a rolling update, bringing down at most 20% of the replicas at any given time.
.. _configuring-a-deployment:
Configuring a Deployment
========================
There are a number of things you'll likely want to do with your serving application including
scaling out or configuring the maximum number of in-flight requests for a deployment.
All of these options can be specified either in :mod:`@serve.deployment <ray.serve.api.deployment>` or in ``Deployment.options()``.
To update the config options for a running deployment, simply redeploy it with the new options set.
Scaling Out
-----------
To scale out a deployment to many processes, simply configure the number of replicas.
.. code-block:: python
# Create with a single replica.
@serve.deployment(num_replicas=1)
def func(*args):
pass
func.deploy()
# Scale up to 10 replicas.
func.options(num_replicas=10).deploy()
# Scale back down to 1 replica.
func.options(num_replicas=1).deploy()
Autoscaling
^^^^^^^^^^^
Serve also has experimental support for a demand-based replica autoscaler.
It reacts to traffic spikes via observing queue sizes and making scaling decisions.
To configure it, you can set the ``_autoscaling_config`` field in deployment options.
.. warning::
The API is experimental and subject to change. We welcome you to test it out
and leave us feedback through `Github Issues <https://github.com/ray-project/ray/issues>`_ or our `discussion forum <https://discuss.ray.io/>`_!
.. code-block:: python
@serve.deployment(
_autoscaling_config={
"min_replicas": 1,
"max_replicas": 5,
"target_num_ongoing_requests_per_replica": 10,
},
version="v1")
def func(_):
time.sleep(1)
return ""
func.deploy() # The func deployment will now autoscale based on requests demand.
The ``min_replicas`` and ``max_replicas`` fields configure the range of replicas which the
Serve autoscaler chooses from. Deployments will start with ``min_replicas`` initially.
The ``target_num_ongoing_requests_per_replica`` configuration specifies how aggressively the
autoscaler should react to traffic. Serve will try to make sure that each replica has roughly that number
of requests being processed and waiting in the queue. For example, if your processing time is ``10ms``
and the latency constraint is ``100ms``, you can have at most ``10`` requests ongoing per replica so
the last requests can finish within the latency constraint. We recommend you benchmark your application
code and set this number based on end to end latency objective.
.. note::
The ``version`` field is required for autoscaling. We are actively working on removing
this limitation.
.. note::
The Ray Serve Autoscaler is an application-level autoscaler that sits on top of the :ref:`Ray Autoscaler<cluster-index>`.
Concretely, this means that the Ray Serve autoscaler asks Ray to start a number of replica actors based on the request demand.
If the Ray Autoscaler determines there aren't enough available CPUs to place these actors, it responds by adding more nodes.
Similarly, when Ray Serve scales down and terminates some replica actors, it may result in some nodes being empty, at which point the Ray autoscaler will remove those nodes.
.. _`serve-cpus-gpus`:
Resource Management (CPUs, GPUs)
--------------------------------
To assign hardware resources per replica, you can pass resource requirements to
``ray_actor_options``.
By default, each replica requires one CPU.
To learn about options to pass in, take a look at :ref:`Resources with Actor<actor-resource-guide>` guide.
For example, to create a deployment where each replica uses a single GPU, you can do the
following:
.. code-block:: python
@serve.deployment(ray_actor_options={"num_gpus": 1})
def func(*args):
return do_something_with_my_gpu()
Fractional Resources
--------------------
The resources specified in ``ray_actor_options`` can also be *fractional*.
This allows you to flexibly share resources between replicas.
For example, if you have two models and each doesn't fully saturate a GPU, you might want to have them share a GPU by allocating 0.5 GPUs each.
The same could be done to multiplex over CPUs.
.. code-block:: python
@serve.deployment(name="deployment1", ray_actor_options={"num_gpus": 0.5})
def func(*args):
return do_something_with_my_gpu()
@serve.deployment(name="deployment2", ray_actor_options={"num_gpus": 0.5})
def func(*args):
return do_something_with_my_gpu()
Configuring Parallelism with OMP_NUM_THREADS
--------------------------------------------
Deep learning models like PyTorch and Tensorflow often use multithreading when performing inference.
The number of CPUs they use is controlled by the OMP_NUM_THREADS environment variable.
To :ref:`avoid contention<omp-num-thread-note>`, Ray sets ``OMP_NUM_THREADS=1`` by default because Ray workers and actors use a single CPU by default.
If you *do* want to enable this parallelism in your Serve deployment, just set OMP_NUM_THREADS to the desired value either when starting Ray or in your function/class definition:
.. code-block:: bash
OMP_NUM_THREADS=12 ray start --head
OMP_NUM_THREADS=12 ray start --address=$HEAD_NODE_ADDRESS
.. code-block:: python
@serve.deployment
class MyDeployment:
def __init__(self, parallelism):
os.environ["OMP_NUM_THREADS"] = parallelism
# Download model weights, initialize model, etc.
MyDeployment.deploy()
.. note::
Some other libraries may not respect ``OMP_NUM_THREADS`` and have their own way to configure parallelism.
For example, if you're using OpenCV, you'll need to manually set the number of threads using ``cv2.setNumThreads(num_threads)`` (set to 0 to disable multi-threading).
You can check the configuration using ``cv2.getNumThreads()`` and ``cv2.getNumberOfCPUs()``.
User Configuration (Experimental)
---------------------------------
Suppose you want to update a parameter in your model without needing to restart
the replicas in your deployment. You can do this by writing a `reconfigure` method
for the class underlying your deployment. At runtime, you can then pass in your
new parameters by setting the `user_config` option.
The following simple example will make the usage clear:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_reconfigure.py
The `reconfigure` method is called when the class is created if `user_config`
is set. In particular, it's also called when new replicas are created in the
future if you scale up your deployment later. The `reconfigure` method is also called
each time `user_config` is updated.
Handling Dependencies
=====================
Ray Serve supports serving deployments with different (possibly conflicting)
Python dependencies. For example, you can simultaneously serve one deployment
that uses legacy Tensorflow 1 and another that uses Tensorflow 2.
This is supported on Mac OS and Linux using Ray's :ref:`runtime-environments` feature.
As with all other Ray actor options, pass the runtime environment in via ``ray_actor_options`` in
your deployment. Be sure to first run ``pip install "ray[default]"`` to ensure the
Runtime Environments feature is installed.
Example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/conda_env.py
.. tip::
Avoid dynamically installing packages that install from source: these can be slow and
use up all resources while installing, leading to problems with the Ray cluster. Consider
precompiling such packages in a private repository or Docker image.
The dependencies required in the deployment may be different than
the dependencies installed in the driver program (the one running Serve API
calls). In this case, you should use a delayed import within the class to avoid
importing unavailable packages in the driver. This applies even when not
using runtime environments.
Example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/delayed_import.py

View file

@ -298,7 +298,7 @@ Serve provides a default DAGDriver implementation that accepts HTTP request and
You can configure how the DAGDriver converts HTTP request types. By default, we directly send in a [`starlette.requests.Request`](https://www.starlette.io/requests/) object to represent the whole request. You can also specify built-in adapters. In this example, we will use a `json_request` adapter that parses the HTTP body with a JSON parser.
```{tip}
There are several useful adapters, like ndarray JSON, image object, etc. You can check out {ref}`the list of adapters here <serve-http-adapters>`. You can also easily plug in your own adapter by passing it in via the `http_adapter` field.
There are several useful adapters, like ndarray JSON, image object, etc. You can check out [the list of adapters here](serve-http-adapters). You can also easily plug in your own adapter by passing it in via the `http_adapter` field.
```
+++

View file

@ -0,0 +1,299 @@
(serve-deploy-tutorial)=
# Deploying Ray Serve
This section should help you:
- understand how Ray Serve runs on a Ray cluster beyond the basics mentioned in {doc}`core-apis`
- deploy and update your Serve application over time
- monitor your Serve application using the Ray Dashboard and logging
```{contents} Deploying Ray Serve
```
(ray-serve-instance-lifetime)=
## Lifetime of a Ray Serve Instance
Ray Serve instances run on top of Ray clusters and are started using {mod}`serve.start <ray.serve.start>`.
Once {mod}`serve.start <ray.serve.start>` has been called, further API calls can be used to create and update the deployments that will be used to serve your Python code (including ML models).
The Serve instance will be torn down when the script exits.
When running on a long-lived Ray cluster (e.g., one started using `ray start` and connected
to using `ray.init(address="auto", namespace="serve")`), you can also deploy a Ray Serve instance as a long-running
service using `serve.start(detached=True)`. In this case, the Serve instance will continue to
run on the Ray cluster even after the script that calls it exits. If you want to update the Serve instance later,
you can run another script that connects to the same Ray cluster and makes further API calls (e.g., to create, update, or delete a deployment). Note that there can only be one detached Serve instance on each Ray cluster.
All non-detached Serve instances will be started in the current namespace that was specified when connecting to the cluster. If a namespace is specified for a detached Serve instance, it will be used. Otherwise if the current namespace is anonymous, the Serve instance will be started in the `serve` namespace.
If `serve.start()` is called again in a process in which there is already a running Serve instance, Serve will re-connect to the existing instance (regardless of whether the original instance was detached or not). To reconnect to a Serve instance that exists in the Ray cluster but not in the current process, connect to the cluster with the same namespace that was specified when starting the instance and run `serve.start()`.
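As a rough sketch of that workflow (the script names and the deployment itself are illustrative):

```python
# deploy_initial.py -- run once against a long-lived cluster.
import ray
from ray import serve

ray.init(address="auto", namespace="serve")
serve.start(detached=True)  # Keeps running after this script exits.

@serve.deployment
def greeter(request):
    return "hello"

greeter.deploy()
```

```python
# update_later.py -- run at any later time, from the same cluster and namespace.
import ray
from ray import serve

ray.init(address="auto", namespace="serve")
serve.start(detached=True)  # Reconnects to the existing detached Serve instance.

@serve.deployment(num_replicas=2)
def greeter(request):
    return "hello again"

greeter.deploy()
```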
## Deploying on a Single Node
While Ray Serve makes it easy to scale out on a multi-node Ray cluster, in some scenarios a single node may suit your needs.
There are two ways you can run Ray Serve on a single node, shown below.
In general, **Option 2 is recommended for most users** because it allows you to fully make use of Serve's ability to dynamically update running deployments.
1. Start Ray and deploy with Ray Serve all in a single Python file.
```python
import ray
from ray import serve
import time
# This will start Ray locally and start Serve on top of it.
serve.start()
@serve.deployment
def my_func(request):
return "hello"
my_func.deploy()
# Serve will be shut down once the script exits, so keep it alive manually.
while True:
time.sleep(5)
print(serve.list_deployments())
```
2. First run `ray start --head` on the machine, then connect to the running local Ray cluster using `ray.init(address="auto", namespace="serve")` in your Serve script(s) (this is the Ray namespace, not the Kubernetes namespace, and you can specify any namespace you like). You can run multiple scripts to update your deployments over time.
```bash
ray start --head # Start local Ray cluster.
serve start # Start Serve on the local Ray cluster.
```
```python
import ray
from ray import serve
# This will connect to the running Ray cluster.
ray.init(address="auto", namespace="serve")
@serve.deployment
def my_func(request):
return "hello"
my_func.deploy()
```
## Deploying on Kubernetes
In order to deploy Ray Serve on Kubernetes, we need to do the following:
1. Start a Ray cluster on Kubernetes.
2. Expose the head node of the cluster as a [Service].
3. Start Ray Serve on the cluster.
There are multiple ways to start a Ray cluster on Kubernetes, see {ref}`ray-k8s-deploy` for more information.
Here, we will be using the [Ray Cluster Launcher](cluster-cloud) tool, which has support for Kubernetes as a backend.
The cluster launcher takes in a yaml config file that describes the cluster.
Here, we'll be using the [Kubernetes default config] with a few small modifications.
First, we need to make sure that the head node of the cluster, where Ray Serve will run its HTTP server, is exposed as a Kubernetes [Service].
There is already a default head node service defined in the `services` field of the config, so we just need to make sure that it's exposing the right port: 8000, which Ray Serve binds on by default.
```yaml
# Service that maps to the head node of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
name: ray-head
spec:
# Must match the label in the head pod spec below.
selector:
component: ray-head
ports:
- protocol: TCP
# Port that this service will listen on.
port: 8000
# Port that requests will be sent to in pods backing the service.
targetPort: 8000
```
Then, we also need to make sure that the head node pod spec matches the selector defined here and exposes the same port:
```yaml
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
# Matches the selector in the service definition above.
labels:
component: ray-head
spec:
# ...
containers:
- name: ray-node
# ...
ports:
- containerPort: 8000 # Ray Serve default port.
# ...
```
The rest of the config remains unchanged for this example, though you may want to change the container image or the number of worker pods started by default when running your own deployment.
Now, we just need to start the cluster:
```shell
# Start the cluster.
$ ray up ray/python/ray/autoscaler/kubernetes/example-full.yaml
# Check the status of the service pointing to the head node. If configured
# properly, you should see the 'Endpoints' field populated with an IP
# address like below. If not, make sure the head node pod started
# successfully and the selector/labels match.
$ kubectl -n ray describe service ray-head
Name: ray-head
Namespace: ray
Labels: <none>
Annotations: <none>
Selector: component=ray-head
Type: ClusterIP
IP: 10.100.188.203
Port: <unset> 8000/TCP
TargetPort: 8000/TCP
Endpoints: 192.168.73.98:8000
Session Affinity: None
Events: <none>
```
With the cluster now running, we can run a simple script to start Ray Serve and deploy a "hello world" deployment:
> ```python
> import ray
> from ray import serve
>
> # Connect to the running Ray cluster.
> ray.init(address="auto", namespace="serve")
> # Bind on 0.0.0.0 to expose the HTTP server on external IPs.
> serve.start(detached=True, http_options={"host": "0.0.0.0"})
>
>
> @serve.deployment(route_prefix="/hello")
> def hello(request):
> return "hello world"
>
> hello.deploy()
> ```
Save this script locally as `deploy.py` and run it on the head node using `ray submit`:
> ```shell
> $ ray submit ray/python/ray/autoscaler/kubernetes/example-full.yaml deploy.py
> ```
Now we can try querying the service by sending an HTTP request to it from within the Kubernetes cluster.
> ```shell
> # Get a shell inside of the head node.
> $ ray attach ray/python/ray/autoscaler/kubernetes/example-full.yaml
>
> # Query the Ray Serve deployment. This can be run from anywhere in the
> # Kubernetes cluster.
> $ curl -X GET http://$RAY_HEAD_SERVICE_HOST:8000/hello
> hello world
> ```
In order to expose the Ray Serve deployment externally, we would need to deploy the Service we created here behind an [Ingress] or a [NodePort].
Please refer to the Kubernetes documentation for more information.
## Health Checking
By default, each actor making up a Serve deployment is health checked and restarted on failure.
:::{note}
User-defined health checks are experimental and may be subject to change before the interface is stabilized. If you have any feedback or run into any issues or unexpected behaviors, please file an issue on GitHub.
:::
You can customize this behavior to perform an application-level health check or to adjust the frequency/timeout.
To define a custom healthcheck, define a `check_health` method on your deployment class.
This method should take no arguments and return no result, raising an exception if the replica should be considered unhealthy.
You can also use the deployment options to customize how frequently the health check is run and the timeout after which a replica is marked unhealthy if it hasn't responded.
> ```python
> @serve.deployment(_health_check_period_s=10, _health_check_timeout_s=30)
> class MyDeployment:
> def __init__(self, db_addr: str):
> self._my_db_connection = connect_to_db(db_addr)
>
> def __call__(self, request):
> return self._do_something_cool()
>
> # Will be called by Serve to check the health of the replica.
> def check_health(self):
> if not self._my_db_connection.is_connected():
> # The specific type of exception is not important.
> raise RuntimeError("uh-oh, DB connection is broken.")
> ```
:::{tip}
You can use the Serve CLI command `serve status` to get status info
about your live deployments. The CLI was included with Serve when you did
`pip install "ray[serve]"`. If you're checking your deployments on a
remote Ray cluster, make sure to include the Ray cluster's dashboard address
in the command: `serve status --address [dashboard_address]`.
:::
## Failure Recovery
Ray Serve is resilient to any component failures within the Ray cluster out of the box.
You can check out the details of how process and worker node failures are handled at {ref}`serve-ft-detail`.
However, when the Ray head node goes down, you would need to recover the state by creating a new
Ray cluster and re-deploying all Serve deployments into that cluster.
:::{note}
Ray currently cannot survive head node failure, so we recommend using application-specific
failure recovery solutions. Although Ray is not currently highly available (HA), it is on
the long-term roadmap and is being actively worked on.
:::
Ray Serve provides an experimental feature to help recover this state.
This feature enables Serve to write all of your deployment configuration and code to a storage location.
After a Ray cluster failure and restart, you can simply call Serve to reconstruct the state.
Here is how to use it:
:::{warning}
The API is experimental and subject to change. We welcome you to test it out
and leave us feedback through GitHub issues or the discussion forum!
:::
You can specify it using either the `serve.start()` argument or the CLI:
```python
serve.start(_checkpoint_path=...)
```
or
```shell
serve start --checkpoint-path ...
```
The checkpoint path argument accepts the following format:
- `file://local_file_path`
- `s3://bucket/path`
- `gs://bucket/path`
- `custom://importable.custom_python.Class/path`
While we have native support for local disk, AWS S3, and Google Cloud Storage (GCS), there is no reason we cannot support more.
In a Kubernetes environment, we recommend using [Persistent Volumes] to create a disk and mount it into the Ray head node.
For example, you can provision an Azure Disk, AWS Elastic Block Store, or GCP Persistent Disk using the K8s [Persistent Volumes] API.
Alternatively, you can write directly to an object store like S3.
You can also plug in your own implementation by using the `custom://` path and inheriting from the [KVStoreBase] class.
Feel free to open new GitHub issues and contribute more storage backends!
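For illustration only, a custom backend registered via the `custom://` path might look roughly like the sketch below. The method names and signatures here are assumptions, so check the [KVStoreBase] source for the actual abstract interface before implementing one.
```python
from typing import Optional

# Hypothetical sketch of a checkpoint storage backend backed by Redis.
# In practice you would subclass KVStoreBase; the get/put/delete methods
# below are assumed, not verified against the real interface.
class RedisKVStore:
    def __init__(self, namespace: str, redis_url: str = "redis://localhost:6379"):
        import redis  # assumed third-party dependency for this sketch

        self._client = redis.Redis.from_url(redis_url)
        self._namespace = namespace

    def get(self, key: str) -> Optional[bytes]:
        return self._client.get(f"{self._namespace}:{key}")

    def put(self, key: str, val: bytes) -> bool:
        return bool(self._client.set(f"{self._namespace}:{key}", val))

    def delete(self, key: str) -> bool:
        return bool(self._client.delete(f"{self._namespace}:{key}"))
```
Such a class could then be referenced with a path like `custom://my_module.RedisKVStore/serve-checkpoints` (hypothetical module name).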
[ingress]: https://kubernetes.io/docs/concepts/services-networking/ingress/
[kubernetes default config]: https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/kubernetes/example-full.yaml
[kvstorebase]: https://github.com/ray-project/ray/blob/master/python/ray/serve/storage/kv_store_base.py
[nodeport]: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
[persistent volumes]: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
[service]: https://kubernetes.io/docs/concepts/services-networking/service/
View file
@ -1,308 +0,0 @@
.. _serve-deploy-tutorial:
===================
Deploying Ray Serve
===================
This section should help you:
- understand how Ray Serve runs on a Ray cluster beyond the basics mentioned in :doc:`core-apis`
- deploy and update your Serve application over time
- monitor your Serve application using the Ray Dashboard and logging
.. contents:: Deploying Ray Serve
.. _ray-serve-instance-lifetime:
Lifetime of a Ray Serve Instance
================================
Ray Serve instances run on top of Ray clusters and are started using :mod:`serve.start <ray.serve.start>`.
Once :mod:`serve.start <ray.serve.start>` has been called, further API calls can be used to create and update the deployments that will be used to serve your Python code (including ML models).
The Serve instance will be torn down when the script exits.
When running on a long-lived Ray cluster (e.g., one started using ``ray start`` and connected
to using ``ray.init(address="auto", namespace="serve")``, you can also deploy a Ray Serve instance as a long-running
service using ``serve.start(detached=True)``. In this case, the Serve instance will continue to
run on the Ray cluster even after the script that calls it exits. If you want to run another script
to update the Serve instance, you can run another script that connects to the same Ray cluster and makes further API calls (e.g., to create, update, or delete a deployment). Note that there can only be one detached Serve instance on each Ray cluster.
All non-detached Serve instances will be started in the current namespace that was specified when connecting to the cluster. If a namespace is specified for a detached Serve instance, it will be used. Otherwise if the current namespace is anonymous, the Serve instance will be started in the ``serve`` namespace.
If ``serve.start()`` is called again in a process in which there is already a running Serve instance, Serve will re-connect to the existing instance (regardless of whether the original instance was detached or not). To reconnect to a Serve instance that exists in the Ray cluster but not in the current process, connect to the cluster with the same namespace that was specified when starting the instance and run ``serve.start()``.
Deploying on a Single Node
==========================
While Ray Serve makes it easy to scale out on a multi-node Ray cluster, in some scenarios a single node may suit your needs.
There are two ways you can run Ray Serve on a single node, shown below.
In general, **Option 2 is recommended for most users** because it allows you to fully make use of Serve's ability to dynamically update running deployments.
1. Start Ray and deploy with Ray Serve all in a single Python file.
.. code-block:: python
import ray
from ray import serve
import time
# This will start Ray locally and start Serve on top of it.
serve.start()
@serve.deployment
def my_func(request):
return "hello"
my_func.deploy()
# Serve will be shut down once the script exits, so keep it alive manually.
while True:
time.sleep(5)
print(serve.list_deployments())
2. First running ``ray start --head`` on the machine, then connecting to the running local Ray cluster using ``ray.init(address="auto", namespace="serve")`` in your Serve script(s) (this is the Ray namespace, not Kubernetes namespace, and you can specify any namespace that you like). You can run multiple scripts to update your deployments over time.
.. code-block:: bash
ray start --head # Start local Ray cluster.
serve start # Start Serve on the local Ray cluster.
.. code-block:: python
import ray
from ray import serve
# This will connect to the running Ray cluster.
ray.init(address="auto", namespace="serve")
@serve.deployment
def my_func(request):
return "hello"
my_func.deploy()
Deploying on Kubernetes
=======================
In order to deploy Ray Serve on Kubernetes, we need to do the following:
1. Start a Ray cluster on Kubernetes.
2. Expose the head node of the cluster as a `Service`_.
3. Start Ray Serve on the cluster.
There are multiple ways to start a Ray cluster on Kubernetes, see :ref:`ray-k8s-deploy` for more information.
Here, we will be using the :ref:`Ray Cluster Launcher <cluster-cloud>` tool, which has support for Kubernetes as a backend.
The cluster launcher takes in a yaml config file that describes the cluster.
Here, we'll be using the `Kubernetes default config`_ with a few small modifications.
First, we need to make sure that the head node of the cluster, where Ray Serve will run its HTTP server, is exposed as a Kubernetes `Service`_.
There is already a default head node service defined in the ``services`` field of the config, so we just need to make sure that it's exposing the right port: 8000, which Ray Serve binds on by default.
.. code-block:: yaml
# Service that maps to the head node of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
name: ray-head
spec:
# Must match the label in the head pod spec below.
selector:
component: ray-head
ports:
- protocol: TCP
# Port that this service will listen on.
port: 8000
# Port that requests will be sent to in pods backing the service.
targetPort: 8000
Then, we also need to make sure that the head node pod spec matches the selector defined here and exposes the same port:
.. code-block:: yaml
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: ray-head-
# Matches the selector in the service definition above.
labels:
component: ray-head
spec:
# ...
containers:
- name: ray-node
# ...
ports:
- containerPort: 8000 # Ray Serve default port.
# ...
The rest of the config remains unchanged for this example, though you may want to change the container image or the number of worker pods started by default when running your own deployment.
Now, we just need to start the cluster:
.. code-block:: shell
# Start the cluster.
$ ray up ray/python/ray/autoscaler/kubernetes/example-full.yaml
# Check the status of the service pointing to the head node. If configured
# properly, you should see the 'Endpoints' field populated with an IP
# address like below. If not, make sure the head node pod started
# successfully and the selector/labels match.
$ kubectl -n ray describe service ray-head
Name: ray-head
Namespace: ray
Labels: <none>
Annotations: <none>
Selector: component=ray-head
Type: ClusterIP
IP: 10.100.188.203
Port: <unset> 8000/TCP
TargetPort: 8000/TCP
Endpoints: 192.168.73.98:8000
Session Affinity: None
Events: <none>
With the cluster now running, we can run a simple script to start Ray Serve and deploy a "hello world" deployment:
.. code-block:: python
import ray
from ray import serve
# Connect to the running Ray cluster.
ray.init(address="auto", namespace="serve")
# Bind on 0.0.0.0 to expose the HTTP server on external IPs.
serve.start(detached=True, http_options={"host": "0.0.0.0"})
@serve.deployment(route_prefix="/hello")
def hello(request):
return "hello world"
hello.deploy()
Save this script locally as ``deploy.py`` and run it on the head node using ``ray submit``:
.. code-block:: shell
$ ray submit ray/python/ray/autoscaler/kubernetes/example-full.yaml deploy.py
Now we can try querying the service by sending an HTTP request to the service from within the Kubernetes cluster.
.. code-block:: shell
# Get a shell inside of the head node.
$ ray attach ray/python/ray/autoscaler/kubernetes/example-full.yaml
# Query the Ray Serve deployment. This can be run from anywhere in the
# Kubernetes cluster.
$ curl -X GET http://$RAY_HEAD_SERVICE_HOST:8000/hello
hello world
In order to expose the Ray Serve deployment externally, we would need to deploy the Service we created here behind an `Ingress`_ or a `NodePort`_.
Please refer to the Kubernetes documentation for more information.
.. _`Kubernetes default config`: https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/kubernetes/example-full.yaml
.. _`Service`: https://kubernetes.io/docs/concepts/services-networking/service/
.. _`Ingress`: https://kubernetes.io/docs/concepts/services-networking/ingress/
.. _`NodePort`: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
Health Checking
===============
By default, each actor making up a Serve deployment is health checked and restarted on failure.
.. note::
User-defined health checks are experimental and may be subject to change before the interface is stabilized. If you have any feedback or run into any issues or unexpected behaviors, please file an issue on GitHub.
You can customize this behavior to perform an application-level health check or to adjust the frequency/timeout.
To define a custom healthcheck, define a ``check_health`` method on your deployment class.
This method should take no arguments and return no result, raising an exception if the replica should be considered unhealthy.
You can also customize how frequently the health check is run and the timeout when a replica will be deemed unhealthy if it hasn't responded in the deployment options.
.. code-block:: python
@serve.deployment(_health_check_period_s=10, _health_check_timeout_s=30)
class MyDeployment:
def __init__(self, db_addr: str):
self._my_db_connection = connect_to_db(db_addr)
def __call__(self, request):
return self._do_something_cool()
# Will be called by Serve to check the health of the replica.
def check_health(self):
if not self._my_db_connection.is_connected():
# The specific type of exception is not important.
raise RuntimeError("uh-oh, DB connection is broken.")
.. tip::
You can use the Serve CLI command ``serve status`` to get status info
about your live deployments. The CLI was included with Serve when you did
``pip install "ray[serve]"``. If you're checking your deployments on a
remote Ray cluster, make sure to include the Ray cluster's dashboard address
in the command: ``serve status --address [dashboard_address]``.
Failure Recovery
================
Ray Serve is resilient to any component failures within the Ray cluster out of the box.
You can checkout the detail of how process and worker node failure handled at :ref:`serve-ft-detail`.
However, when the Ray head node goes down, you would need to recover the state by creating a new
Ray cluster and re-deploys all Serve deployments into that cluster.
.. note::
Ray currently cannot survive head node failure and we recommend using application specific
failure recovery solutions. Although Ray is not currently highly available (HA), it is on
the long term roadmap and being actively worked on.
Ray Serve added an experimental feature to help recovering the state.
This features enables Serve to write all your deployment configuration and code into a storage location.
Upon Ray cluster failure and restarts, you can simply call Serve to reconstruct the state.
Here is how to use it:
.. warning::
The API is experimental and subject to change. We welcome you to test it out
and leave us feedback through github issues or discussion forum!
You can use both the start argument and the CLI to specify it:
.. code-block:: python
serve.start(_checkpoint_path=...)
or
.. code-block:: shell
serve start --checkpoint-path ...
The checkpoint path argument accepts the following format:
- ``file://local_file_path``
- ``s3://bucket/path``
- ``gs://bucket/path``
- ``custom://importable.custom_python.Class/path``
While we have native support for on disk, AWS S3, and Google Cloud Storage (GCS), there is no reason we cannot support more.
In Kubernetes environment, we recommend using `Persistent Volumes`_ to create a disk and mount it into the Ray head node.
For example, you can provision Azure Disk, AWS Elastic Block Store, or GCP Persistent Disk using the K8s `Persistent Volumes`_ API.
Alternatively, you can also directly write to object store like S3.
You can easily try to plug into your own implementation using the ``custom://`` path and inherit the `KVStoreBase`_ class.
Feel free to open new github issues and contribute more storage backends!
.. _`Persistent Volumes`: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
.. _`KVStoreBase`: https://github.com/ray-project/ray/blob/master/python/ray/serve/storage/kv_store_base.py
View file
@ -0,0 +1,397 @@
(end-to-end-tutorial)=
# End-to-End Tutorial
By the end of this tutorial you will have learned how to deploy a machine
learning model locally via Ray Serve.
First, install Ray Serve and all of its dependencies by running the following
command in your terminal:
```bash
$ pip install "ray[serve]"
```
For this tutorial, we'll use [HuggingFace's SummarizationPipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.SummarizationPipeline)
to access a model that summarizes text.
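Used standalone, the pipeline looks roughly like this (a quick sketch, assuming the `transformers` package is installed; the tutorial's own example file appears in the next section):
```python
from transformers import pipeline

# Downloads (on first use) and loads a small summarization model.
summarizer = pipeline("summarization", model="t5-small")

result = summarizer("Ray Serve is a scalable model serving library built on top of Ray.")
print(result[0]["summary_text"])
```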
## Example Model
Let's first take a look at how the model works, without using Ray Serve.
This is the code for the model:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_local.py
:end-before: __local_model_end__
:language: python
:linenos: true
:start-after: __local_model_start__
```
The Python file, called `local_model.py`, uses the `summarize` function to
generate summaries of text.
- The `summarizer` variable on line 7 inside `summarize` points to a
function that uses the [t5-small](https://huggingface.co/t5-small)
model to summarize text.
- When `summarizer` is called on a Python String, it returns summarized text
inside a dictionary formatted as `[{"summary_text": "...", ...}, ...]`.
- `summarize` then extracts the summarized text on line 13 by indexing into
the dictionary.
The file can be run locally by executing the Python script, which uses the
model to summarize an article about the Apollo 11 moon landing [^f1].
```bash
$ python local_model.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
```
Keep in mind that the `SummarizationPipeline` is an example machine learning
model for this tutorial. You can follow along using arbitrary models in any
framework that has a Python API. Check out our tutorials on scikit-learn,
PyTorch, and TensorFlow for more info and examples:
- {ref}`serve-sklearn-tutorial`
- {ref}`serve-pytorch-tutorial`
- {ref}`serve-tensorflow-tutorial`
## Converting to Ray Serve Deployment
This tutorial's goal is to deploy this model using Ray Serve, so it can be
scaled up and queried over HTTP. We'll start by converting the above Python
function into a Ray Serve deployment that can be launched locally on a laptop.
We start by opening a new Python file. First, we need to import `ray` and
`ray serve`, to use features in Ray Serve such as `deployments`, which
provide HTTP access to our model.
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __import_end__
:language: python
:start-after: __import_start__
```
After these imports, we can include our model code from above.
We won't call our `summarize` function just yet though!
We will soon add logic to handle HTTP requests, so the `summarize` function
can operate on article text sent via HTTP request.
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __local_model_end__
:language: python
:start-after: __local_model_start__
```
Ray Serve needs to run on top of a Ray cluster, so we connect to a local one.
See {ref}`serve-deploy-tutorial` to learn more about starting a Ray Serve
instance and deploying to a Ray cluster.
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __start_ray_cluster_end__
:language: python
:start-after: __start_ray_cluster_start__
```
The `address` parameter in `ray.init()` connects your Serve script to a
running local Ray cluster. Later, we'll discuss how to start a local Ray
cluster.
:::{note}
`ray.init()` connects to or starts a single-node Ray cluster on your
local machine, which allows you to use all your CPU cores to serve
requests in parallel. To start a multi-node cluster, see
{ref}`serve-deploy-tutorial`.
:::
Next, we start the Ray Serve runtime:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __start_serve_end__
:language: python
:start-after: __start_serve_start__
```
:::{note}
`detached=True` means Ray Serve will continue running even when the Python
script exits. If you would rather stop Ray Serve after the script exits, use
`serve.start()` instead (see {ref}`ray-serve-instance-lifetime` for
details).
:::
Now that we have defined our `summarize` function, connected to a Ray
Cluster, and started the Ray Serve runtime, we can define a function that
accepts HTTP requests and routes them to the `summarize` function. We
define a function called `router` that takes in a Starlette `request`
object [^f2]:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __router_end__
:language: python
:linenos: true
:start-after: __router_start__
```
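In rough outline, the included `router` definition looks like the sketch below (a reconstruction that assumes the imports and the `summarize` function defined earlier; see the example file for the canonical version):
```python
@serve.deployment
def router(request):
    txt = request.query_params["txt"]
    return summarize(txt)
```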
- In line 1, we add the decorator `@serve.deployment`
to the `router` function to turn the function into a Serve `Deployment`
object.
- In line 3, `router` uses the `"txt"` query parameter in the `request`
to get the article text to summarize.
- In line 4, it then passes this article text into the `summarize` function
and returns the value.
:::{note}
Lines 3 and 4 define our HTTP request schema. The HTTP requests sent to this
endpoint must have a `"txt"` query parameter that contains a string.
In general, you can accept HTTP data using query parameters or the
request body. Additionally, you can add other Serve deployments with
different names to create more endpoints that can accept different schemas.
For more complex validation, you can also use FastAPI (see
{ref}`serve-fastapi-http` for more info).
:::
:::{tip}
This routing function's name doesn't have to be `router`.
It can be any function name as long as the corresponding name is present in
the HTTP request. If you want the function name to be different than the name
in the HTTP request, you can add the `name` keyword parameter to the
`@serve.deployment` decorator to specify the name sent in the HTTP request.
For example, if the decorator is `@serve.deployment(name="responder")` and
the function signature is `def request_manager(request)`, the HTTP request
should use `responder`, not `request_manager`. If no `name` is passed
into `@serve.deployment`, the `request` uses the function's name by
default. For example, if the decorator is `@serve.deployment` and the
function's signature is `def manager(request)`, the HTTP request should use
`manager`.
:::
Since `@serve.deployment` makes `router` a `Deployment` object, it can be
deployed using `router.deploy()`:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment.py
:end-before: __router_deploy_end__
:language: python
:start-after: __router_deploy_start__
```
Once we deploy `router`, we can query the model over HTTP.
With that, we can run our model on Ray Serve!
Here's the full Ray Serve deployment script that we built for our model:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_deployment_full.py
:end-before: __deployment_full_end__
:language: python
:linenos: true
:start-after: __deployment_full_start__
```
To deploy `router`, we first start a local Ray cluster:
```bash
$ ray start --head
```
The Ray cluster that this command launches is the same Ray cluster that the
Python code connects to using `ray.init(address="auto", namespace="serve")`.
It is also the same Ray cluster that keeps Ray Serve (and any deployments on
it, such as `router`) alive even after the Python script exits as long as
`detached=True` inside `serve.start()`.
:::{tip}
To stop the Ray cluster, run the command `ray stop`.
:::
After starting the Ray cluster, we can run the Python file to deploy `router`
and begin accepting HTTP requests:
```bash
$ python model_on_ray_serve.py
```
## Testing the Ray Serve Deployment
We can now test our model over HTTP. The structure of our HTTP query is:
`http://127.0.0.1:8000/[Deployment Name]?[Parameter Name-1]=[Parameter Value-1]&[Parameter Name-2]=[Parameter Value-2]&...&[Parameter Name-n]=[Parameter Value-n]`
Since the cluster is deployed locally in this tutorial, the `127.0.0.1:8000`
refers to a localhost with port 8000. The `[Deployment Name]` refers to
either the name of the function that we called `.deploy()` on (in our case,
this is `router`), or the `name` keyword parameter's value in
`@serve.deployment` (see the Tip under the `router` function definition
above for more info).
Each `[Parameter Name]` refers to a field's name in the
request's `query_params` dictionary for our deployed function. In our
example, the only parameter we need to pass in is `txt`. This parameter is
referenced in the `txt = request.query_params["txt"]` line in the `router`
function. Each \[Parameter Name\] object has a corresponding \[Parameter Value\]
object. The `txt`'s \[Parameter Value\] is a string containing the article
text to summarize. We can chain together any number of the name-value pairs
using the `&` symbol in the request URL.
Now that the `summarize` function is deployed on Ray Serve, we can make HTTP
requests to it. Here's a client script that requests a summary from the same
article as the original Python script:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_router_client.py
:end-before: __client_function_end__
:language: python
:start-after: __client_function_start__
```
We can run this script while the model is deployed to get a response over HTTP:
```bash
$ python router_client.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
```
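For reference, a minimal client along these lines could be written with the `requests` library (a sketch; the actual `e2e_router_client.py` example may differ):
```python
import requests

article_text = "..."  # the full article text to summarize

# Send the article as the "txt" query parameter to the `router` deployment.
response = requests.get("http://127.0.0.1:8000/router", params={"txt": article_text})
print(response.text)
```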
## Using Classes in the Ray Serve Deployment
Our application is still a bit inefficient though. In particular, the
`summarize` function loads the model on each call when it sets the
`summarizer` variable. However, the model never changes, so it would be more
efficient to define `summarizer` only once and keep its value in memory
instead of reloading it for each HTTP query.
We can achieve this by converting our `summarize` function into a class:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_class_deployment.py
:end-before: __deployment_class_end__
:language: python
:linenos: true
:start-after: __deployment_class_start__
```
In this configuration, we can query the `Summarizer` class directly.
The `Summarizer` is initialized once (after calling `Summarizer.deploy()`).
In line 13, its `__init__` function loads and stores the model in
`self.summarize`. HTTP queries for the `Summarizer` class are routed to its
`__call__` method by default, which takes in the Starlette `request`
object. The `Summarizer` class can then take the request's `txt` data and
call the `self.summarize` function on it without loading the model on each
query.
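In rough outline, the class-based deployment looks like this sketch (the exact code is in the example file above):
```python
from ray import serve
from transformers import pipeline

# Assumes ray.init(...) and serve.start(...) have been called as earlier in the tutorial.

@serve.deployment
class Summarizer:
    def __init__(self):
        # Load the model once, when the replica starts, instead of on every request.
        self.summarize = pipeline("summarization", model="t5-small")

    def __call__(self, request):
        txt = request.query_params["txt"]
        summary_list = self.summarize(txt)
        return summary_list[0]["summary_text"]

Summarizer.deploy()
```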
:::{tip}
Instance variables can also store state. For example, to
count the number of requests served, a `@serve.deployment` class can define
a `self.counter` instance variable in its `__init__` function and set it
to 0. When the class is queried, it can increment the `self.counter`
variable inside of the function responding to the query. The `self.counter`
will keep track of the number of requests served across requests.
:::
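A minimal sketch of that pattern (note that each replica keeps its own count):
```python
from ray import serve

@serve.deployment
class Counter:
    def __init__(self):
        self.counter = 0  # per-replica request count

    def __call__(self, request):
        self.counter += 1
        return {"requests_served": self.counter}
```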
HTTP queries for the Ray Serve class deployments follow a similar format to Ray
Serve function deployments. Here's an example client script for the
`Summarizer` class. Notice that the only difference from the `router`'s
client script is that the URL uses the `Summarizer` path instead of
`router`.
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_summarizer_client.py
:end-before: __client_class_end__
:language: python
:start-after: __client_class_start__
```
We can deploy the class-based model on Serve without stopping the Ray cluster.
However, for the purposes of this tutorial, let's restart the cluster, deploy
the model, and query it over HTTP:
```bash
$ ray stop
$ ray start --head
$ python summarizer_on_ray_serve.py
$ python summarizer_client.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
```
## Adding Functionality with FastAPI
Now suppose we want to expose additional functionality in our model. In
particular, the `summarize` function also has `min_length` and
`max_length` parameters. Although we could expose these options as additional
parameters in the URL, Ray Serve also allows us to add more route options to the
URL itself and handle each route separately.
Because this logic can get complex, Serve integrates with
[FastAPI](https://fastapi.tiangolo.com/). This allows us to define a Serve
deployment by adding the `@serve.ingress` decorator to a FastAPI app. For
more info about FastAPI with Serve, please see {ref}`serve-fastapi-http`.
As an example of FastAPI, here's a modified version of our `Summarizer` class
with route options to request a minimum or maximum length of ten words in the
summaries:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_fastapi_deployment.py
:end-before: __fastapi_end__
:language: python
:linenos: true
:start-after: __fastapi_start__
```
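In rough outline, the FastAPI-integrated version looks like the sketch below (the exact code lives in the example file above; the `route_prefix` and method names here are assumptions):
```python
from fastapi import FastAPI
from ray import serve
from transformers import pipeline

app = FastAPI()

# Assumes ray.init(...) and serve.start(detached=True) have been called as before.

@serve.deployment(route_prefix="/Summarizer")
@serve.ingress(app)
class Summarizer:
    def __init__(self):
        self.summarize = pipeline("summarization", model="t5-small")

    @app.get("/")
    def summarize_default(self, txt: str) -> str:
        return self.summarize(txt)[0]["summary_text"]

    @app.get("/min10")
    def summarize_min10(self, txt: str) -> str:
        # Ask the pipeline for a longer summary.
        return self.summarize(txt, min_length=10)[0]["summary_text"]

    @app.get("/max10")
    def summarize_max10(self, txt: str) -> str:
        # Ask the pipeline for a shorter summary.
        return self.summarize(txt, max_length=10)[0]["summary_text"]

Summarizer.deploy()
```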
The class now exposes three routes:
- `/Summarizer`: As before, this route takes in article text and returns
a summary.
- `/Summarizer/min10`: This route takes in article text and returns a
summary with at least 10 words.
- `/Summarizer/max10`: This route takes in article text and returns a
summary with at most 10 words.
Notice that `Summarizer`'s methods no longer take in a Starlette `request`
object. Instead, they take in the URL's `txt` parameter directly with FastAPI's
[query parameter](https://fastapi.tiangolo.com/tutorial/query-params/)
feature.
Since we still deploy our model locally, the full URL still uses the
localhost IP. This means each of our three routes comes after the
`http://127.0.0.1:8000` IP and port address. As an example, we can make
requests to the `max10` route using this client script:
```{literalinclude} ../../../python/ray/serve/examples/doc/e2e_fastapi_client.py
:end-before: __client_fastapi_end__
:language: python
:start-after: __client_fastapi_start__
```
```bash
$ ray stop
$ ray start --head
$ python serve_with_fastapi.py
$ python fastapi_client.py
"two astronauts steered their fragile lunar"
```
Congratulations! You just built and deployed a machine learning model on Ray
Serve! You should now have enough context to dive into the {doc}`core-apis` to
get a deeper understanding of Ray Serve.
To learn more about how to start a multi-node cluster for your Ray Serve
deployments, see {ref}`serve-deploy-tutorial`. For more interesting example
applications, including integrations with popular machine learning frameworks
and Python web servers, be sure to check out {doc}`tutorials/index`.
```{rubric} Footnotes
```
[^f1]: The article text comes from the New York Times article "Astronauts
Land on Plain; Collect Rocks, Plant Flag" archived
[here](https://archive.nytimes.com/www.nytimes.com/library/national/science/nasa/072169sci-nasa.html).
[^f2]: [Starlette](https://www.starlette.io/) is a web server framework
used by Ray Serve. Its [Request](https://www.starlette.io/requests/) class
provides a nice interface for incoming HTTP requests.
View file
@ -1,392 +0,0 @@
.. _end_to_end_tutorial:
===================
End-to-End Tutorial
===================
By the end of this tutorial you will have learned how to deploy a machine
learning model locally via Ray Serve.
First, install Ray Serve and all of its dependencies by running the following
command in your terminal:
.. code-block:: bash
$ pip install "ray[serve]"
For this tutorial, we'll use `HuggingFace's SummarizationPipeline <https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.SummarizationPipeline>`_
to access a model that summarizes text.
Example Model
=============
Let's first take a look at how the model works, without using Ray Serve.
This is the code for the model:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_local.py
:linenos:
:language: python
:start-after: __local_model_start__
:end-before: __local_model_end__
The Python file, called ``local_model.py`` uses the ``summarize`` function to
generate summaries of text.
- The ``summarizer`` variable on line 7 inside ``summarize`` points to a
function that uses the `t5-small <https://huggingface.co/t5-small>`_
model to summarize text.
- When ``summarizer`` is called on a Python String, it returns summarized text
inside a dictionary formatted as ``[{"summary_text": "...", ...}, ...]``.
- ``summarize`` then extracts the summarized text on line 13 by indexing into
the dictionary.
The file can be run locally by executing the Python script, which uses the
model to summarize an article about the Apollo 11 moon landing [#f1]_.
.. code-block:: bash
$ python local_model.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
Keep in mind that the ``SummarizationPipeline`` is an example machine learning
model for this tutorial. You can follow along using arbitrary models in any
framework that has a Python API. Check out our tutorials on sckit-learn,
PyTorch, and Tensorflow for more info and examples:
- :ref:`serve-sklearn-tutorial`
- :ref:`serve-pytorch-tutorial`
- :ref:`serve-tensorflow-tutorial`
Converting to Ray Serve Deployment
==================================
This tutorial's goal is to deploy this model using Ray Serve, so it can be
scaled up and queried over HTTP. We'll start by converting the above Python
function into a Ray Serve deployment that can be launched locally on a laptop.
We start by opening a new Python file. First, we need to import ``ray`` and
``ray serve``, to use features in Ray Serve such as ``deployments``, which
provide HTTP access to our model.
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __import_start__
:end-before: __import_end__
After these imports, we can include our model code from above.
We won't call our ``summarize`` function just yet though!
We will soon add logic to handle HTTP requests, so the ``summarize`` function
can operate on article text sent via HTTP request.
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __local_model_start__
:end-before: __local_model_end__
Ray Serve needs to run on top of a Ray cluster, so we connect to a local one.
See :ref:`serve-deploy-tutorial` to learn more about starting a Ray Serve
instance and deploying to a Ray cluster.
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __start_ray_cluster_start__
:end-before: __start_ray_cluster_end__
The ``address`` parameter in ``ray.init()`` connects your Serve script to a
running local Ray cluster. Later, we'll discuss how to start a local Ray
cluster.
.. note::
``ray.init()`` connects to or starts a single-node Ray cluster on your
local machine, which allows you to use all your CPU cores to serve
requests in parallel. To start a multi-node cluster, see
:ref:`serve-deploy-tutorial`.
Next, we start the Ray Serve runtime:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __start_serve_start__
:end-before: __start_serve_end__
.. note::
``detached=True`` means Ray Serve will continue running even when the Python
script exits. If you would rather stop Ray Serve after the script exits, use
``serve.start()`` instead (see :ref:`ray-serve-instance-lifetime` for
details).
Now that we have defined our ``summarize`` function, connected to a Ray
Cluster, and started the Ray Serve runtime, we can define a function that
accepts HTTP requests and routes them to the ``summarize`` function. We
define a function called ``router`` that takes in a Starlette ``request``
object [#f2]_:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:linenos:
:language: python
:start-after: __router_start__
:end-before: __router_end__
- In line 1, we add the decorator ``@serve.deployment``
to the ``router`` function to turn the function into a Serve ``Deployment``
object.
- In line 3, ``router`` uses the ``"txt"`` query parameter in the ``request``
to get the article text to summarize.
- In line 4, it then passes this article text into the ``summarize`` function
and returns the value.
.. note::
Lines 3 and 4 define our HTTP request schema. The HTTP requests sent to this
endpoint must have a ``"txt"`` query parameter that contains a string.
In general, you can accept HTTP data using query parameters or the
request body. Additionally, you can add other Serve deployments with
different names to create more endpoints that can accept different schemas.
For more complex validation, you can also use FastAPI (see
:ref:`serve-fastapi-http` for more info).
.. tip::
This routing function's name doesn't have to be ``router``.
It can be any function name as long as the corresponding name is present in
the HTTP request. If you want the function name to be different than the name
in the HTTP request, you can add the ``name`` keyword parameter to the
``@serve.deployment`` decorator to specify the name sent in the HTTP request.
For example, if the decorator is ``@serve.deployment(name="responder")`` and
the function signature is ``def request_manager(request)``, the HTTP request
should use ``responder``, not ``request_manager``. If no ``name`` is passed
into ``@serve.deployment``, the ``request`` uses the function's name by
default. For example, if the decorator is ``@serve.deployment`` and the
function's signature is ``def manager(request)``, the HTTP request should use
``manager``.
Since ``@serve.deployment`` makes ``router`` a ``Deployment`` object, it can be
deployed using ``router.deploy()``:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment.py
:language: python
:start-after: __router_deploy_start__
:end-before: __router_deploy_end__
Once we deploy ``router``, we can query the model over HTTP.
With that, we can run our model on Ray Serve!
Here's the full Ray Serve deployment script that we built for our model:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_deployment_full.py
:linenos:
:language: python
:start-after: __deployment_full_start__
:end-before: __deployment_full_end__
To deploy ``router``, we first start a local Ray cluster:
.. code-block:: bash
$ ray start --head
The Ray cluster that this command launches is the same Ray cluster that the
Python code connects to using ``ray.init(address="auto", namespace="serve")``.
It is also the same Ray cluster that keeps Ray Serve (and any deployments on
it, such as ``router``) alive even after the Python script exits as long as
``detached=True`` inside ``serve.start()``.
.. tip::
To stop the Ray cluster, run the command ``ray stop``.
After starting the Ray cluster, we can run the Python file to deploy ``router``
and begin accepting HTTP requests:
.. code-block:: bash
$ python model_on_ray_serve.py
Testing the Ray Serve Deployment
================================
We can now test our model over HTTP. The structure of our HTTP query is:
``http://127.0.0.1:8000/[Deployment Name]?[Parameter Name-1]=[Parameter Value-1]&[Parameter Name-2]=[Parameter Value-2]&...&[Parameter Name-n]=[Parameter Value-n]``
Since the cluster is deployed locally in this tutorial, the ``127.0.0.1:8000``
refers to a localhost with port 8000. The ``[Deployment Name]`` refers to
either the name of the function that we called ``.deploy()`` on (in our case,
this is ``router``), or the ``name`` keyword parameter's value in
``@serve.deployment`` (see the Tip under the ``router`` function definition
above for more info).
Each ``[Parameter Name]`` refers to a field's name in the
request's ``query_params`` dictionary for our deployed function. In our
example, the only parameter we need to pass in is ``txt``. This parameter is
referenced in the ``txt = request.query_params["txt"]`` line in the ``router``
function. Each [Parameter Name] object has a corresponding [Parameter Value]
object. The ``txt``'s [Parameter Value] is a string containing the article
text to summarize. We can chain together any number of the name-value pairs
using the ``&`` symbol in the request URL.
Now that the ``summarize`` function is deployed on Ray Serve, we can make HTTP
requests to it. Here's a client script that requests a summary from the same
article as the original Python script:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_router_client.py
:language: python
:start-after: __client_function_start__
:end-before: __client_function_end__
We can run this script while the model is deployed to get a response over HTTP:
.. code-block:: bash
$ python router_client.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
Using Classes in the Ray Serve Deployment
=========================================
Our application is still a bit inefficient though. In particular, the
``summarize`` function loads the model on each call when it sets the
``summarizer`` variable. However, the model never changes, so it would be more
efficient to define ``summarizer`` only once and keep its value in memory
instead of reloading it for each HTTP query.
We can achieve this by converting our ``summarize`` function into a class:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_class_deployment.py
:linenos:
:language: python
:start-after: __deployment_class_start__
:end-before: __deployment_class_end__
In this configuration, we can query the ``Summarizer`` class directly.
The ``Summarizer`` is initialized once (after calling ``Summarizer.deploy()``).
In line 13, its ``__init__`` function loads and stores the model in
``self.summarize``. HTTP queries for the ``Summarizer`` class are routed to its
``__call__`` method by default, which takes in the Starlette ``request``
object. The ``Summarizer`` class can then take the request's ``txt`` data and
call the ``self.summarize`` function on it without loading the model on each
query.
.. tip::
Instance variables can also store state. For example, to
count the number of requests served, a ``@serve.deployment`` class can define
a ``self.counter`` instance variable in its ``__init__`` function and set it
to 0. When the class is queried, it can increment the ``self.counter``
variable inside of the function responding to the query. The ``self.counter``
will keep track of the number of requests served across requests.
HTTP queries for the Ray Serve class deployments follow a similar format to Ray
Serve function deployments. Here's an example client script for the
``Summarizer`` class. Notice that the only difference from the ``router``'s
client script is that the URL uses the ``Summarizer`` path instead of
``router``.
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_summarizer_client.py
:language: python
:start-after: __client_class_start__
:end-before: __client_class_end__
We can deploy the class-based model on Serve without stopping the Ray cluster.
However, for the purposes of this tutorial, let's restart the cluster, deploy
the model, and query it over HTTP:
.. code-block:: bash
$ ray stop
$ ray start --head
$ python summarizer_on_ray_serve.py
$ python summarizer_client.py
"two astronauts steered their fragile lunar module safely and smoothly to the
historic landing . the first men to reach the moon -- Armstrong and his
co-pilot, col. Edwin E. Aldrin Jr. of the air force -- brought their ship to
rest on a level, rock-strewn plain ."
Adding Functionality with FastAPI
=================================
Now suppose we want to expose additional functionality in our model. In
particular, the ``summarize`` function also has ``min_length`` and
``max_length`` parameters. Although we could expose these options as additional
parameters in URL, Ray Serve also allows us to add more route options to the
URL itself and handle each route separately.
Because this logic can get complex, Serve integrates with
`FastAPI <https://fastapi.tiangolo.com/>`_. This allows us to define a Serve
deployment by adding the ``@serve.ingress`` decorator to a FastAPI app. For
more info about FastAPI with Serve, please see :ref:`serve-fastapi-http`.
As an example of FastAPI, here's a modified version of our ``Summarizer`` class
with route options to request a minimum or maximum length of ten words in the
summaries:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_fastapi_deployment.py
:linenos:
:language: python
:start-after: __fastapi_start__
:end-before: __fastapi_end__
The class now exposes three routes:
- ``/Summarizer``: As before, this route takes in article text and returns
a summary.
- ``/Summarizer/min10``: This route takes in article text and returns a
summary with at least 10 words.
- ``/Summarizer/max10``: This route takes in article text and returns a
summary with at most 10 words.
Notice that ``Summarizer``'s methods no longer take in a Starlette ``request``
object. Instead, they take in the URL's `txt` parameter directly with FastAPI's
`query parameter <https://fastapi.tiangolo.com/tutorial/query-params/>`_
feature.
Since we still deploy our model locally, the full URL still uses the
localhost IP. This means each of our three routes comes after the
``http://127.0.0.1:8000`` IP and port address. As an example, we can make
requests to the ``max10`` route using this client script:
.. literalinclude:: ../../../python/ray/serve/examples/doc/e2e_fastapi_client.py
:language: python
:start-after: __client_fastapi_start__
:end-before: __client_fastapi_end__
.. code-block:: bash
$ ray stop
$ ray start --head
$ python serve_with_fastapi.py
$ python fastapi_client.py
"two astronauts steered their fragile lunar"
Congratulations! You just built and deployed a machine learning model on Ray
Serve! You should now have enough context to dive into the :doc:`core-apis` to
get a deeper understanding of Ray Serve.
To learn more about how to start a multi-node cluster for your Ray Serve
deployments, see :ref:`serve-deploy-tutorial`. For more interesting example
applications, including integrations with popular machine learning frameworks
and Python web servers, be sure to check out :doc:`tutorials/index`.
.. rubric:: Footnotes
.. [#f1] The article text comes from the New York Times article "Astronauts
Land on Plain; Collect Rocks, Plant Flag" archived
`here <https://archive.nytimes.com/www.nytimes.com/library/national/science/nasa/072169sci-nasa.html>`_.
.. [#f2] `Starlette <https://www.starlette.io/>`_ is a web server framework
used by Ray Serve. Its `Request <https://www.starlette.io/requests/>`_ class
provides a nice interface for incoming HTTP requests.
View file
@ -1,46 +1,43 @@
.. _serve-faq:
(serve-faq)=
Ray Serve FAQ
=============
# Ray Serve FAQ
This page answers some common questions about Ray Serve. If you have more
questions, feel free to ask them in the `Discussion Board <https://discuss.ray.io/>`_.
questions, feel free to ask them in the [Discussion Board](https://discuss.ray.io/).
.. contents::
```{contents}
```
How do I deploy Ray Serve?
--------------------------
## How do I deploy Ray Serve?
See :doc:`deployment` for information about how to deploy Serve.
See {doc}`deployment` for information about how to deploy Serve.
## How fast is Ray Serve?
How fast is Ray Serve?
----------------------
We are continuously benchmarking Ray Serve. We can confidently say:
- Ray Serve's **latency** overhead is single-digit milliseconds, often just 1-2 milliseconds.
- For **throughput**, Serve achieves about 3-4k qps on a single machine.
- It is **horizontally scalable** so you can add more machines to increase the overall throughput.
You can checkout our `microbenchmark instruction <https://github.com/ray-project/ray/tree/master/python/ray/serve/benchmarks>`_
You can check out our [microbenchmark instructions](https://github.com/ray-project/ray/tree/master/python/ray/serve/benchmarks)
to benchmark on your hardware.
## Can I use `asyncio` along with Ray Serve?
Can I use ``asyncio`` along with Ray Serve?
-------------------------------------------
Yes! You can make your servable methods ``async def`` and Serve will run them
Yes! You can make your servable methods `async def` and Serve will run them
concurrently inside a Python asyncio event loop.
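For example, a deployment's `__call__` can be declared `async` (a minimal sketch):
```python
import asyncio

from ray import serve

@serve.deployment
class AsyncDeployment:
    async def __call__(self, request):
        # Awaiting non-blocking work lets other requests proceed on this
        # replica's event loop in the meantime.
        await asyncio.sleep(0.1)
        return "done"
```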
Are there any other similar frameworks?
---------------------------------------
## Are there any other similar frameworks?
Yes and no. We truly believe Serve is unique because it gives you end-to-end control
over the API while delivering scalability and high performance. To achieve
something like what Serve offers, you often need to glue together multiple
frameworks like TensorFlow Serving and SageMaker, or even roll your own
batching server.
How does Serve compare to TFServing, TorchServe, ONNXRuntime, and others?
-------------------------------------------------------------------------
## How does Serve compare to TFServing, TorchServe, ONNXRuntime, and others?
Ray Serve is *framework-agnostic*: you can use any Python framework and libraries.
We believe data scientists are not bound to a particular machine learning framework.
They use the best tool available for the job.
@ -48,12 +45,12 @@ They use the best tool available for the job.
Compared to these framework-specific solutions, Ray Serve doesn't perform any optimizations
to make your ML model run faster. However, you can still optimize the models yourself
and run them in Ray Serve: for example, you can run a model compiled by
`PyTorch JIT <https://pytorch.org/docs/stable/jit.html>`_.
[PyTorch JIT](https://pytorch.org/docs/stable/jit.html).
## How does Serve compare to AWS SageMaker, Azure ML, Google AI Platform?
How does Serve compare to AWS SageMaker, Azure ML, Google AI Platform?
----------------------------------------------------------------------
Ray Serve brings the scalability and parallelism of these hosted offerings to
your own infrastructure. You can use our :ref:`cluster launcher <cluster-cloud>`
your own infrastructure. You can use our [cluster launcher](cluster-cloud)
to deploy Ray Serve to all major public clouds, K8s, as well as on bare-metal, on-premise machines.
Compared to these offerings, Ray Serve lacks a unified user interface and functionality
@ -61,20 +58,20 @@ let you manage the lifecycle of the models, visualize its performance, etc. Ray
Serve focuses on just model serving and provides the primitives for you to
build your own ML platform on top.
How does Serve compare to Seldon, KFServing, Cortex?
----------------------------------------------------
## How does Serve compare to Seldon, KFServing, Cortex?
You can develop Ray Serve on your laptop, deploy it on a dev box, and scale it out
to multiple machines or a K8s cluster without changing a single line of code. It's a lot
easier to get started with when you don't need to provision and manage a K8s cluster.
When it's time to deploy, you can use Ray :ref:`cluster launcher <cluster-cloud>`
When it's time to deploy, you can use Ray [cluster launcher](cluster-cloud)
to transparently put your Ray Serve application in K8s.
Compared to these frameworks, which let you deploy ML models on K8s, Ray Serve lacks
the ability to declaratively configure your ML application via YAML files. In
Ray Serve, you configure everything in Python code.
Is Ray Serve only for ML models?
--------------------------------
## Is Ray Serve only for ML models?
Nope! Ray Serve can be used to build any type of Python microservices
application. You can also use the full power of Ray within your Ray Serve
programs, so it's easy to run parallel computations within your deployments.
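For example, a deployment can fan work out to Ray tasks (a minimal sketch):
```python
import ray
from ray import serve

@ray.remote
def square(x: int) -> int:
    return x * x

@serve.deployment
class ParallelSquares:
    def __call__(self, request):
        # Launch Ray tasks in parallel and gather the results.
        refs = [square.remote(i) for i in range(10)]
        return ray.get(refs)
```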
View file
@ -0,0 +1,415 @@
# Calling Deployments via HTTP and Python
This section should help you:
- understand how deployments can be called in two ways: from HTTP and from Python
- integrate Ray Serve with an existing web server
```{contents} Calling Deployments via HTTP and Python
```
(serve-http)=
## Calling Deployments via HTTP
### Basic Example
As shown in the {ref}`serve-quickstart`, when you create a deployment, it is exposed over HTTP by default at `/{deployment_name}`. You can change the route by specifying the `route_prefix` argument to the {mod}`@serve.deployment <ray.serve.api.deployment>` decorator.
```python
@serve.deployment(route_prefix="/counter")
class Counter:
def __call__(self, request):
pass
```
When you make a request to the Serve HTTP server at `/counter`, it will forward the request to the deployment's `__call__` method and provide a [Starlette Request object](https://www.starlette.io/requests/) as the sole argument. The `__call__` method can return any JSON-serializable object or a [Starlette Response object](https://www.starlette.io/responses/) (e.g., to return a custom status code).
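For example, returning a Starlette response lets you control the status code explicitly (a minimal sketch):
```python
from starlette.responses import JSONResponse

from ray import serve

@serve.deployment(route_prefix="/counter")
class Counter:
    def __call__(self, request):
        # Returning a Starlette Response lets you set status codes and headers.
        return JSONResponse({"count": 1}, status_code=202)
```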
Below, we discuss some advanced features for customizing Ray Serve's HTTP functionality.
(serve-fastapi-http)=
### FastAPI HTTP Deployments
If you want to define more complex HTTP handling logic, Serve integrates with [FastAPI](https://fastapi.tiangolo.com/). This allows you to define a Serve deployment using the {mod}`@serve.ingress <ray.serve.api.ingress>` decorator that wraps a FastAPI app with its full range of features. The most basic example of this is shown below, but for more details on all that FastAPI has to offer such as variable routes, automatic type validation, dependency injection (e.g., for database connections), and more, please check out [their documentation](https://fastapi.tiangolo.com/).
```python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class MyFastAPIDeployment:
@app.get("/")
def root(self):
return "Hello, world!"
MyFastAPIDeployment.deploy()
```
Now if you send a request to `/hello`, this will be routed to the `root` method of our deployment. We can also easily leverage FastAPI to define multiple routes with different HTTP methods:
```python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class MyFastAPIDeployment:
@app.get("/")
def root(self):
return "Hello, world!"
@app.post("/{subpath}")
def root(self, subpath: str):
return f"Hello from {subpath}!"
MyFastAPIDeployment.deploy()
```
You can also pass in an existing FastAPI app to a deployment to serve it as-is:
```python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@app.get("/")
def f():
return "Hello from the root!"
# ... add more routes, routers, etc. to `app` ...
@serve.deployment(route_prefix="/")
@serve.ingress(app)
class FastAPIWrapper:
pass
FastAPIWrapper.deploy()
```
This is useful for scaling out an existing FastAPI app with no modifications necessary.
Existing middlewares, automatic OpenAPI documentation generation, and other advanced FastAPI features should work as-is.
You can also combine routes defined this way with routes defined on the deployment:
```python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@app.get("/")
def f():
return "Hello from the root!"
@serve.deployment(route_prefix="/api1")
@serve.ingress(app)
class FastAPIWrapper1:
@app.get("/subpath")
def method(self):
return "Hello 1!"
@serve.deployment(route_prefix="/api2")
@serve.ingress(app)
class FastAPIWrapper2:
@app.get("/subpath")
def method(self):
return "Hello 2!"
FastAPIWrapper1.deploy()
FastAPIWrapper2.deploy()
```
In this example, requests to both `/api1` and `/api2` would return `Hello from the root!` while a request to `/api1/subpath` would return `Hello 1!` and a request to `/api2/subpath` would return `Hello 2!`.
To try it out, save a code snippet in a local Python file (e.g., main.py) and, in the same directory, run the following commands to start a local Ray cluster on your machine.
```bash
ray start --head
python main.py
```
(serve-http-adapters)=
### HTTP Adapters
HTTP adapters are functions that convert a raw HTTP request into Python types that you know and recognize.
An adapter's input arguments should be type annotated. At minimum, it should accept a `starlette.requests.Request` type,
but it can also accept any type that's recognized by FastAPI's dependency injection framework.
For example, here is an adapter that extracts the JSON content from the request.
```python
import starlette.requests


async def json_resolver(request: starlette.requests.Request):
return await request.json()
```
Here is an adapter that accepts two HTTP query parameters.
```python
def parse_query_args(field_a: int, field_b: str):
return YourDataClass(field_a, field_b)
```
You can specify different type signatures to facilitate extracting HTTP fields,
including
[query parameters](https://fastapi.tiangolo.com/tutorial/query-params/),
[body parameters](https://fastapi.tiangolo.com/tutorial/body/),
and [many other data types](https://fastapi.tiangolo.com/tutorial/extra-data-types/).
For more detail, take a look at the [FastAPI documentation](https://fastapi.tiangolo.com/).
You can use adapters in different scenarios within Serve:
- Ray AIR `ModelWrapper`
- Serve Deployment Graph `DAGDriver`
- Embedded in Bring Your Own `FastAPI` Application
Let's go over them one by one.
#### Ray AIR `ModelWrapper`
Ray Serve provides a suite of adapters to convert HTTP requests to ML inputs like `numpy` arrays.
You can use them with the [Ray AI Runtime (AIR) model wrapper](air-serve-integration) feature
to deploy pre-trained models with one click.
For example, we provide a simple adapter for n-dimensional arrays.
With [model wrappers](air-serve-integration), you can specify it via the `http_adapter` field.
```python
from ray import serve
from ray.serve.http_adapters import json_to_ndarray
from ray.serve.model_wrappers import ModelWrapperDeployment
ModelWrapperDeployment.options(name="my_model").deploy(
my_ray_air_predictor,
my_ray_air_checkpoint,
http_adapter=json_to_ndarray
)
```
:::{note}
Serve also supports pydantic models as a short-hand for HTTP adapters in model wrappers. Instead of functions,
you can directly pass in a pydantic model class to mean "validate the HTTP body with this schema".
Once validated, the model instance will be passed to the predictor.
```python
from pydantic import BaseModel
class User(BaseModel):
user_id: int
user_name: str
...
ModelWrapperDeployment.deploy(..., http_adapter=User)
```
:::
#### Serve Deployment Graph `DAGDriver`
In a [Serve Deployment Graph](serve-deployment-graph), you can configure
`ray.serve.drivers.DAGDriver` to accept an HTTP adapter via its `http_adapter` field.
For example, the JSON request adapter parses JSON in the HTTP body:
```python
from ray.serve.drivers import DAGDriver
from ray.serve.http_adapters import json_request
from ray.experimental.dag.input_node import InputNode
with InputNode() as input_node:
...
dag = DAGDriver.bind(other_node, http_adapter=json_request)
```
:::{note}
Serve also supports pydantic models as a short-hand for HTTP adapters in deployment graphs. Instead of functions,
you can directly pass in a pydantic model class to mean "validate the HTTP body with this schema".
Once validated, the model instance will be passed as the `input_node` variable.
```python
from pydantic import BaseModel
class User(BaseModel):
user_id: int
user_name: str
...
DAGDriver.bind(other_node, http_adapter=User)
```
:::
#### Embedded in Bring Your Own `FastAPI` Application
You can also bring the adapter to your own FastAPI app using
[Depends](https://fastapi.tiangolo.com/tutorial/dependencies/#import-depends).
The input schema will automatically be part of the generated OpenAPI schema with FastAPI.
```python
from fastapi import FastAPI, Depends
from ray.serve.http_adapters import json_to_ndarray
app = FastAPI()
@app.post("/endpoint")
async def endpoint(np_array = Depends(json_to_ndarray)):
...
```
It has the following schema for input:
(serve-ndarray-schema)=
```{eval-rst}
.. autopydantic_model:: ray.serve.http_adapters.NdArray
```
#### List of Built-in Adapters
Here is a list of built-in adapters; please feel free to [contribute more](https://github.com/ray-project/ray/issues/new/choose)!
```{eval-rst}
.. automodule:: ray.serve.http_adapters
:members: json_to_ndarray, image_to_ndarray, starlette_request, json_request
```
### Configuring HTTP Server Locations
By default, Ray Serve starts a single HTTP server on the head node of the Ray cluster.
You can configure this behavior using the `http_options={"location": ...}` flag
in {mod}`serve.start <ray.serve.start>`:
- "HeadOnly": start one HTTP server on the head node. Serve
assumes the head node is the node you executed serve.start
on. This is the default.
- "EveryNode": start one HTTP server per node.
- "NoServer" or `None`: disable HTTP server.
:::{note}
Using the "EveryNode" option, you can point a cloud load balancer to the
instance group of Ray cluster to achieve high availability of Serve's HTTP
proxies.
:::
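For example, a minimal sketch of starting one HTTP proxy per node (assuming a multi-node Ray cluster is already running) might look like:
```python
import ray
from ray import serve

ray.init(address="auto")
# Start Serve with one HTTP server per node in the cluster.
serve.start(detached=True, http_options={"location": "EveryNode"})
```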
### Enabling CORS and other HTTP middlewares
Serve supports arbitrary [Starlette middlewares](https://www.starlette.io/middleware/)
and custom middlewares in Starlette format. The example below shows how to enable
[Cross-Origin Resource Sharing (CORS)](https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS).
You can follow the same pattern for other Starlette middlewares.
```python
from ray import serve
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
client = serve.start(
http_options={"middlewares": [
Middleware(
CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
]})
```
(serve-handle-explainer)=
## ServeHandle: Calling Deployments from Python
Ray Serve enables you to query models both from HTTP and Python. This feature
enables seamless [model composition](serve-model-composition). You can
get a `ServeHandle` corresponding to a deployment, similar to how you can
reach a deployment through HTTP via a specific route. When you issue a request
to a deployment through `ServeHandle`, the request is load balanced across
available replicas in the same way an HTTP request is.
To call a Ray Serve deployment from Python, use {mod}`Deployment.get_handle <ray.serve.api.Deployment>`
to get a handle to the deployment, then use
{mod}`handle.remote <ray.serve.handle.RayServeHandle.remote>` to send requests
to that deployment. These requests can pass ordinary args and kwargs that are
passed directly to the method. This returns a Ray `ObjectRef` whose result
can be waited for or retrieved using `ray.wait` or `ray.get`.
```python
@serve.deployment
class Deployment:
def method1(self, arg):
return f"Method1: {arg}"
def __call__(self, arg):
return f"__call__: {arg}"
Deployment.deploy()
handle = Deployment.get_handle()
ray.get(handle.remote("hi")) # Defaults to calling the __call__ method.
ray.get(handle.method1.remote("hi")) # Call a different method.
```
If you want to use the same deployment to serve both HTTP and ServeHandle traffic, the recommended best practice is to define an internal method that the HTTP handling logic will call:
```python
@serve.deployment(route_prefix="/api")
class Deployment:
def say_hello(self, name: str):
return f"Hello {name}!"
def __call__(self, request):
return self.say_hello(request.query_params["name"])
Deployment.deploy()
```
Now we can invoke the same logic from both HTTP and Python:
```python
print(requests.get("http://localhost:8000/api?name=Alice").text)
# Hello Alice!
handle = Deployment.get_handle()
print(ray.get(handle.say_hello.remote("Alice")))
# Hello Alice!
```
(serve-sync-async-handles)=
### Sync and Async Handles
Ray Serve offers two types of `ServeHandle`. You can use the `Deployment.get_handle(..., sync=True|False)`
flag to toggle between them.
- When you set `sync=True` (the default), a synchronous handle is returned.
  Calling `handle.remote()` returns a Ray `ObjectRef`.
- When you set `sync=False`, an asyncio-based handle is returned. You need to
  call it with `await handle.remote()` to get a Ray `ObjectRef`. To use `await`,
  you have to run `Deployment.get_handle` and `handle.remote` in a Python asyncio event loop.
The async handle has a performance advantage because it uses asyncio directly, as compared
to the sync handle, which talks to an asyncio event loop in a thread. To learn more about
the reasoning behind this, check out our [architecture documentation](serve-architecture).
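As a rough sketch (assuming the `Deployment` class defined above has already been deployed), the async handle can be used from an asyncio event loop like this:
```python
import asyncio


async def query():
    # get_handle(sync=False) must be called inside an asyncio event loop.
    handle = Deployment.get_handle(sync=False)
    # `await handle.remote(...)` returns a Ray ObjectRef,
    # which can itself be awaited to retrieve the result.
    ref = await handle.remote("hi")
    return await ref


print(asyncio.run(query()))  # __call__: hi
```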
## Integrating with existing web servers
Ray Serve comes with its own HTTP server out of the box, but if you have an existing
web application, you can still plug in Ray Serve to scale up your compute using the `ServeHandle`.
For a tutorial with sample code, see {ref}`serve-web-server-integration-tutorial`.
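As a rough sketch of the idea (the `Doubler` deployment and `/double` route here are illustrative, not part of the tutorial), an existing FastAPI app served by e.g. `uvicorn` can offload work to a deployment through a handle:
```python
from fastapi import FastAPI

import ray
from ray import serve

app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)


@serve.deployment
class Doubler:
    def __call__(self, value: int) -> int:
        return 2 * value


Doubler.deploy()
handle = Doubler.get_handle()


@app.get("/double")
async def double(value: int):
    # Offload the computation to the Ray Serve deployment; awaiting the
    # returned ObjectRef yields the result without blocking the event loop.
    return await handle.remote(value)
```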

View file

@ -1,428 +0,0 @@
=======================================
Calling Deployments via HTTP and Python
=======================================
This section should help you:
- understand how deployments can be called in two ways: from HTTP and from Python
- integrate Ray Serve with an existing web server
.. contents:: Calling Deployments via HTTP and Python
.. _serve-http:
Calling Deployments via HTTP
============================
Basic Example
^^^^^^^^^^^^^
As shown in the :ref:`serve_quickstart`, when you create a deployment, it is exposed over HTTP by default at ``/{deployment_name}``. You can change the route by specifying the ``route_prefix`` argument to the :mod:`@serve.deployment <ray.serve.api.deployment>` decorator.
.. code-block:: python
@serve.deployment(route_prefix="/counter")
class Counter:
def __call__(self, request):
pass
When you make a request to the Serve HTTP server at ``/counter``, it will forward the request to the deployment's ``__call__`` method and provide a `Starlette Request object <https://www.starlette.io/requests/>`_ as the sole argument. The ``__call__`` method can return any JSON-serializable object or a `Starlette Response object <https://www.starlette.io/responses/>`_ (e.g., to return a custom status code).
Below, we discuss some advanced features for customizing Ray Serve's HTTP functionality.
.. _serve-fastapi-http:
FastAPI HTTP Deployments
^^^^^^^^^^^^^^^^^^^^^^^^
If you want to define more complex HTTP handling logic, Serve integrates with `FastAPI <https://fastapi.tiangolo.com/>`_. This allows you to define a Serve deployment using the :mod:`@serve.ingress <ray.serve.api.ingress>` decorator that wraps a FastAPI app with its full range of features. The most basic example of this is shown below, but for more details on all that FastAPI has to offer such as variable routes, automatic type validation, dependency injection (e.g., for database connections), and more, please check out `their documentation <https://fastapi.tiangolo.com/>`_.
.. code-block:: python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class MyFastAPIDeployment:
@app.get("/")
def root(self):
return "Hello, world!"
MyFastAPIDeployment.deploy()
Now if you send a request to ``/hello``, this will be routed to the ``root`` method of our deployment. We can also easily leverage FastAPI to define multiple routes with different HTTP methods:
.. code-block:: python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@serve.deployment(route_prefix="/hello")
@serve.ingress(app)
class MyFastAPIDeployment:
@app.get("/")
def root(self):
return "Hello, world!"
@app.post("/{subpath}")
def root(self, subpath: str):
return f"Hello from {subpath}!"
MyFastAPIDeployment.deploy()
You can also pass in an existing FastAPI app to a deployment to serve it as-is:
.. code-block:: python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@app.get("/")
def f():
return "Hello from the root!"
# ... add more routes, routers, etc. to `app` ...
@serve.deployment(route_prefix="/")
@serve.ingress(app)
class FastAPIWrapper:
pass
FastAPIWrapper.deploy()
This is useful for scaling out an existing FastAPI app with no modifications necessary.
Existing middlewares, automatic OpenAPI documentation generation, and other advanced FastAPI features should work as-is.
You can also combine routes defined this way with routes defined on the deployment:
.. code-block:: python
import ray
from fastapi import FastAPI
from ray import serve
app = FastAPI()
ray.init(address="auto", namespace="summarizer")
serve.start(detached=True)
@app.get("/")
def f():
return "Hello from the root!"
@serve.deployment(route_prefix="/api1")
@serve.ingress(app)
class FastAPIWrapper1:
@app.get("/subpath")
def method(self):
return "Hello 1!"
@serve.deployment(route_prefix="/api2")
@serve.ingress(app)
class FastAPIWrapper2:
@app.get("/subpath")
def method(self):
return "Hello 2!"
FastAPIWrapper1.deploy()
FastAPIWrapper2.deploy()
In this example, requests to both ``/api1`` and ``/api2`` would return ``Hello from the root!`` while a request to ``/api1/subpath`` would return ``Hello 1!`` and a request to ``/api2/subpath`` would return ``Hello 2!``.
To try it out, save a code snippet in a local python file (i.e. main.py) and in the same directory, run the following commands to start a local Ray cluster on your machine.
.. code-block:: bash
ray start --head
python main.py
.. _serve-http-adapters:
HTTP Adapters
^^^^^^^^^^^^^
HTTP adapters are functions that convert a raw HTTP request into Python types that you know and recognize.
An adapter's input arguments should be type annotated. At minimum, it should accept a ``starlette.requests.Request`` type,
but it can also accept any type that's recognized by FastAPI's dependency injection framework.
For example, here is an adapter that extracts the JSON content from the request.
.. code-block:: python
async def json_resolver(request: starlette.requests.Request):
return await request.json()
Here is an adapter that accepts two HTTP query parameters.
.. code-block:: python
def parse_query_args(field_a: int, field_b: str):
return YourDataClass(field_a, field_b)
You can specify different type signatures to facilitate extracting HTTP fields,
including
`query parameters <https://fastapi.tiangolo.com/tutorial/query-params/>`_,
`body parameters <https://fastapi.tiangolo.com/tutorial/body/>`_,
and `many other data types <https://fastapi.tiangolo.com/tutorial/extra-data-types/>`_.
For more detail, you can take a look at `FastAPI documentation <https://fastapi.tiangolo.com/>`_.
You can use adapters in different scenarios within Serve:
- Ray AIR ``ModelWrapper``
- Serve Deployment Graph ``DAGDriver``
- Embedded in Bring Your Own ``FastAPI`` Application
Let's go over them one by one.
Ray AIR ``ModelWrapper``
""""""""""""""""""""""""
Ray Serve provides a suite of adapters to convert HTTP requests to ML inputs like `numpy` arrays.
You can just use it with :ref:`Ray AI Runtime (AIR) model wrapper<air-serve-integration>` feature
to one click deploy pre-trained models.
For example, we provide a simple adapter for n-dimensional array.
With :ref:`model wrappers<air-serve-integration>`, you can specify it via the ``http_adapter`` field.
.. code-block:: python
from ray import serve
from ray.serve.http_adapters import json_to_ndarray
from ray.serve.model_wrappers import ModelWrapperDeployment
ModelWrapperDeployment.options(name="my_model").deploy(
my_ray_air_predictor,
my_ray_air_checkpoint,
http_adapter=json_to_ndarray
)
.. note::
Serve also supports pydantic models as a short-hand for HTTP adapters in model wrappers. Instead of functions,
you can directly pass in a pydantic model class to mean "validate the HTTP body with this schema".
Once validated, the model instance will be passed to the predictor.
.. code-block:: python
from pydantic import BaseModel
class User(BaseModel):
user_id: int
user_name: str
...
ModelWrapperDeployment.deploy(..., http_adapter=User)
Serve Deployment Graph ``DAGDriver``
""""""""""""""""""""""""""""""""""""
In :ref:`Serve Deployment Graph <serve-deployment-graph>`, you can configure
``ray.serve.drivers.DAGDriver`` to accept an HTTP adapter via its ``http_adapter`` field.
For example, the json request adapters parse JSON in HTTP body:
.. code-block:: python
from ray.serve.drivers import DAGDriver
from ray.serve.http_adapters import json_request
from ray.experimental.dag.input_node import InputNode
with InputNode() as input_node:
...
dag = DAGDriver.bind(other_node, http_adapter=json_request)
.. note::
Serve also supports pydantic models as a short-hand for HTTP adapters in model wrappers. Instead of functions,
you can directly pass in a pydantic model class to mean "validate the HTTP body with this schema".
Once validated, the model instance will be passed as the ``input_node`` variable.
.. code-block:: python
from pydantic import BaseModel
class User(BaseModel):
user_id: int
user_name: str
...
DAGDriver.bind(other_node, http_adapter=User)
Embedded in Bring Your Own ``FastAPI`` Application
""""""""""""""""""""""""""""""""""""""""""""""""""
You can also bring the adapter to your own FastAPI app using
`Depends <https://fastapi.tiangolo.com/tutorial/dependencies/#import-depends>`_.
The input schema will automatically be part of the generated OpenAPI schema with FastAPI.
.. code-block:: python
from fastapi import FastAPI, Depends
from ray.serve.http_adapters import json_to_ndarray
app = FastAPI()
@app.post("/endpoint")
async def endpoint(np_array = Depends(json_to_ndarray)):
...
It has the following schema for input:
.. _serve-ndarray-schema:
.. autopydantic_model:: ray.serve.http_adapters.NdArray
List of Built-in Adapters
"""""""""""""""""""""""""
Here is a list of adapters and please feel free to `contribute more <https://github.com/ray-project/ray/issues/new/choose>`_!
.. automodule:: ray.serve.http_adapters
:members: json_to_ndarray, image_to_ndarray, starlette_request, json_request
Configuring HTTP Server Locations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default, Ray Serve starts a single HTTP server on the head node of the Ray cluster.
You can configure this behavior using the ``http_options={"location": ...}`` flag
in :mod:`serve.start <ray.serve.start>`:
- "HeadOnly": start one HTTP server on the head node. Serve
assumes the head node is the node you executed serve.start
on. This is the default.
- "EveryNode": start one HTTP server per node.
- "NoServer" or ``None``: disable HTTP server.
.. note::
Using the "EveryNode" option, you can point a cloud load balancer to the
instance group of Ray cluster to achieve high availability of Serve's HTTP
proxies.
Enabling CORS and other HTTP middlewares
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Serve supports arbitrary `Starlette middlewares <https://www.starlette.io/middleware/>`_
and custom middlewares in Starlette format. The example below shows how to enable
`Cross-Origin Resource Sharing (CORS) <https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS>`_.
You can follow the same pattern for other Starlette middlewares.
.. code-block:: python
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
client = serve.start(
http_options={"middlewares": [
Middleware(
CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
]})
.. _serve-handle-explainer:
ServeHandle: Calling Deployments from Python
============================================
Ray Serve enables you to query models both from HTTP and Python. This feature
enables seamless :ref:`model composition<serve-model-composition>`. You can
get a ``ServeHandle`` corresponding to a deployment, similar to how you can
reach a deployment through HTTP via a specific route. When you issue a request
to a deployment through ``ServeHandle``, the request is load balanced across
available replicas in the same way an HTTP request is.
To call a Ray Serve deployment from python, use :mod:`Deployment.get_handle <ray.serve.api.Deployment>`
to get a handle to the deployment, then use
:mod:`handle.remote <ray.serve.handle.RayServeHandle.remote>` to send requests
to that deployment. These requests can pass ordinary args and kwargs that are
passed directly to the method. This returns a Ray ``ObjectRef`` whose result
can be waited for or retrieved using ``ray.wait`` or ``ray.get``.
.. code-block:: python
@serve.deployment
class Deployment:
def method1(self, arg):
return f"Method1: {arg}"
def __call__(self, arg):
return f"__call__: {arg}"
Deployment.deploy()
handle = Deployment.get_handle()
ray.get(handle.remote("hi")) # Defaults to calling the __call__ method.
ray.get(handle.method1.remote("hi")) # Call a different method.
If you want to use the same deployment to serve both HTTP and ServeHandle traffic, the recommended best practice is to define an internal method that the HTTP handling logic will call:
.. code-block:: python
@serve.deployment(route_prefix="/api")
class Deployment:
def say_hello(self, name: str):
return f"Hello {name}!"
def __call__(self, request):
return self.say_hello(request.query_params["name"])
Deployment.deploy()
Now we can invoke the same logic from both HTTP and Python:
.. code-block:: python
print(requests.get("http://localhost:8000/api?name=Alice"))
# Hello Alice!
handle = Deployment.get_handle()
print(ray.get(handle.say_hello.remote("Alice")))
# Hello Alice!
.. _serve-sync-async-handles:
Sync and Async Handles
^^^^^^^^^^^^^^^^^^^^^^
Ray Serve offers two types of ``ServeHandle``. You can use the ``Deployment.get_handle(..., sync=True|False)``
flag to toggle between them.
- When you set ``sync=True`` (the default), a synchronous handle is returned.
Calling ``handle.remote()`` should return a Ray ``ObjectRef``.
- When you set ``sync=False``, an asyncio-based handle is returned. You need to
  call it with ``await handle.remote()`` to get a Ray ``ObjectRef``. To use ``await``,
  you have to run ``Deployment.get_handle`` and ``handle.remote`` in a Python asyncio event loop.
The async handle has a performance advantage because it uses asyncio directly, as compared
to the sync handle, which talks to an asyncio event loop in a thread. To learn more about
the reasoning behind this, check out our `architecture documentation <./architecture.html>`_.
Integrating with existing web servers
=====================================
Ray Serve comes with its own HTTP server out of the box, but if you have an existing
web application, you can still plug in Ray Serve to scale up your compute using the ``ServeHandle``.
For a tutorial with sample code, see :ref:`serve-web-server-integration-tutorial`.

doc/source/serve/index.md Normal file
View file

@ -0,0 +1,226 @@
```{eval-rst}
.. include:: /_includes/serve/announcement.rst
```
(rayserve)=
# Serve: Scalable and Programmable Serving
:::{tip}
Get in touch with us if you're using or considering using [Ray Serve](https://docs.google.com/forms/d/1l8HT35jXMPtxVUtQPeGoe09VGp5jcvSv0TqPgyz6lGU).
:::
```{image} logo.svg
:align: center
:height: 250px
:width: 400px
```
(rayserve-overview)=
Ray Serve is an easy-to-use scalable model serving library built on Ray. Ray Serve is:
- **Framework-agnostic**: Use a single toolkit to serve everything from deep learning models
built with frameworks like [PyTorch](serve-pytorch-tutorial),
[Tensorflow, and Keras](serve-tensorflow-tutorial), to [Scikit-Learn](serve-sklearn-tutorial) models, to arbitrary Python business logic.
- **Python-first**: Configure your model serving declaratively in pure Python, without needing YAML or JSON configs.
Ray Serve enables composing multiple ML models into a [deployment graph](serve-deployment-graph). This allows you to write a complex inference service consisting of multiple ML models and business logic all in Python code.
Since Ray Serve is built on Ray, it allows you to easily scale to many machines, both in your datacenter and in the cloud.
Ray Serve can be used in two primary ways to deploy your models at scale:
1. Have Python functions and classes automatically placed behind HTTP endpoints.
2. Alternatively, call them from [within your existing Python web server](serve-web-server-integration-tutorial) using the Python-native {ref}`servehandle-api`.
:::{note}
Serve recently added an experimental API for building deployment graphs of multiple models.
Please take a look at the [Deployment Graph API](serve-deployment-graph) and try it out!
:::
:::{tip}
Chat with Ray Serve users and developers on our [forum](https://discuss.ray.io/)!
:::
(serve-quickstart)=
## Ray Serve Quickstart
First install Ray Serve and all of its dependencies by running the following
command in your terminal:
```bash
pip install "ray[serve]"
```
:::{note}
Ray Serve supports the same Python versions as Ray. See {ref}`installation`
for a list of supported Python versions.
:::
Now we will write a Python script to serve a simple "Counter" class over HTTP. You may open an interactive Python terminal and copy in the lines below as we go.
First, import Ray and Ray Serve:
```python
import ray
from ray import serve
```
Ray Serve runs on top of a Ray cluster, so the next step is to start a local Ray cluster:
```python
ray.init()
```
:::{note}
`ray.init()` will start a single-node Ray cluster on your local machine, which will allow you to use all your CPU cores to serve requests in parallel. To start a multi-node cluster, see {doc}`../cluster/index`.
:::
Next, start the Ray Serve runtime:
```python
serve.start()
```
:::{warning}
When the Python script exits, Ray Serve will shut down.
If you would rather keep Ray Serve running in the background you can use `serve.start(detached=True)` (see {doc}`deployment` for details).
:::
Now we will define a simple Counter class. The goal is to serve this class behind an HTTP endpoint using Ray Serve.
By default, Ray Serve offers a simple HTTP proxy that will send requests to the class' `__call__` method. The argument to this method will be a Starlette `Request` object.
```python
@serve.deployment
class Counter:
def __init__(self):
self.count = 0
def __call__(self, request):
self.count += 1
return {"count": self.count}
```
:::{note}
Besides classes, you can also serve standalone functions with Ray Serve in the same way.
:::
Notice that we made this class into a `Deployment` with the {mod}`@serve.deployment <ray.serve.api.deployment>` decorator.
This decorator is where we could set various configuration options such as the number of replicas, unique name of the deployment (it defaults to the class name), or the HTTP route prefix to expose the deployment at.
See the {mod}`Deployment package reference <ray.serve.api.Deployment>` for more details.
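For instance, a sketch of the same `Counter` with a few of these options set (the values here are purely illustrative):
```python
@serve.deployment(name="my_counter", num_replicas=2, route_prefix="/my_counter")
class Counter:
    # ... same body as above ...
    pass
```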
In order to deploy this, we simply need to call `Counter.deploy()`.
```python
Counter.deploy()
```
:::{note}
Deployments can be configured to improve performance, for example by increasing the number of replicas of the class being served in parallel. For details, see {ref}`configuring-a-deployment`.
:::
Now that our deployment is up and running, let's test it out by making a query over HTTP.
In your browser, simply visit `http://127.0.0.1:8000/Counter`, and you should see the output `{"count": 1}`.
If you keep refreshing the page, the count should increase, as expected.
Now let's say we want to update this deployment to add another method to decrement the counter.
Here, because we want more flexible HTTP configuration we'll use Serve's FastAPI integration.
For more information on this, please see {ref}`serve-fastapi-http`.
```python
from fastapi import FastAPI
app = FastAPI()
@serve.deployment
@serve.ingress(app)
class Counter:
def __init__(self):
self.count = 0
@app.get("/")
def get(self):
return {"count": self.count}
@app.get("/incr")
def incr(self):
self.count += 1
return {"count": self.count}
@app.get("/decr")
def decr(self):
self.count -= 1
return {"count": self.count}
```
We've now redefined the `Counter` class to wrap a `FastAPI` application.
This class is exposing three HTTP routes: `/Counter` will get the current count, `/Counter/incr` will increment the count, and `/Counter/decr` will decrement the count.
To redeploy this updated version of the `Counter`, all we need to do is run `Counter.deploy()` again.
Serve will perform a rolling update here to replace the existing replicas with the new version we defined.
```python
Counter.deploy()
```
If we test out the HTTP endpoint again, we can see this in action.
Note that the count has been reset to zero because the new version of `Counter` was deployed.
```bash
> curl -X GET localhost:8000/Counter/
{"count": 0}
> curl -X GET localhost:8000/Counter/incr
{"count": 1}
> curl -X GET localhost:8000/Counter/decr
{"count": 0}
```
Congratulations, you just built and ran your first Ray Serve application! You should now have enough context to dive into the {doc}`core-apis` to get a deeper understanding of Ray Serve.
For more interesting example applications, including integrations with popular machine learning frameworks and Python web servers, be sure to check out {doc}`tutorials/index`.
For a high-level view of the architecture underlying Ray Serve, see {doc}`architecture`.
## Why Ray Serve?
There are generally two ways of serving machine learning applications, both with serious limitations:
you can use a **traditional web server**---your own Flask app---or you can use a cloud-hosted solution.
The first approach is easy to get started with, but it's hard to scale each component. The second approach
comes with vendor lock-in (SageMaker), framework-specific tooling (TFServing), and a general
lack of flexibility.
Ray Serve solves these problems by giving you a simple web server (and the ability to [use your own](serve-web-server-integration-tutorial)) while still handling the complex routing, scaling, and testing logic
necessary for production deployments.
Beyond scaling up your deployments with multiple replicas, Ray Serve also enables:
- {ref}`serve-model-composition`---ability to flexibly compose multiple models and independently scale and update each.
- {ref}`serve-batching`---built in request batching to help you meet your performance objectives.
- {ref}`serve-cpus-gpus`---specify fractional resource requirements to fully saturate each of your GPUs with several models.
For more on the motivation behind Ray Serve, check out these [meetup slides](https://tinyurl.com/serve-meetup) and this [blog post](https://medium.com/distributed-computing-with-ray/machine-learning-serving-is-broken-f59aff2d607f).
### When should I use Ray Serve?
Ray Serve is a flexible tool that's easy to use for deploying, operating, and monitoring Python-based machine learning applications.
Ray Serve excels when you want to mix business logic with ML models and scaling out in production is a necessity. This might be because of large-scale batch processing
requirements or because you want to scale up a deployment graph consisting of many individual models with different performance properties.
If you plan on running on multiple machines, Ray Serve will serve you well!
## What's next?
Check out the {ref}`end-to-end-tutorial` and {doc}`core-apis`, look at the {ref}`serve-faq`,
or head over to the {doc}`tutorials/index` to get started building your Ray Serve applications.
For more, see the following blog posts about Ray Serve:
- [Serving ML Models in Production: Common Patterns](https://www.anyscale.com/blog/serving-ml-models-in-production-common-patterns) by Simon Mo, Edward Oakes, and Michael Galarnyk
- [How to Scale Up Your FastAPI Application Using Ray Serve](https://medium.com/distributed-computing-with-ray/how-to-scale-up-your-fastapi-application-using-ray-serve-c9a7b69e786) by Archit Kulkarni
- [Machine Learning Serving is Broken](https://medium.com/distributed-computing-with-ray/machine-learning-serving-is-broken-f59aff2d607f) by Simon Mo
- [The Simplest Way to Serve your NLP Model in Production with Pure Python](https://medium.com/distributed-computing-with-ray/the-simplest-way-to-serve-your-nlp-model-in-production-with-pure-python-d42b6a97ad55) by Edward Oakes and Bill Chambers
```{eval-rst}
.. include:: /_includes/serve/announcement_bottom.rst
```

View file

@ -1,225 +0,0 @@
.. include:: /_includes/serve/announcement.rst
.. _rayserve:
========================================
Serve: Scalable and Programmable Serving
========================================
.. tip::
Get in touch with us if you're using or considering using `Ray Serve <https://docs.google.com/forms/d/1l8HT35jXMPtxVUtQPeGoe09VGp5jcvSv0TqPgyz6lGU>`_.
.. image:: logo.svg
:align: center
:height: 250px
:width: 400px
.. _rayserve-overview:
Ray Serve is an easy-to-use scalable model serving library built on Ray. Ray Serve is:
- **Framework-agnostic**: Use a single toolkit to serve everything from deep learning models
built with frameworks like :ref:`PyTorch <serve-pytorch-tutorial>`,
:ref:`Tensorflow, and Keras <serve-tensorflow-tutorial>`, to :ref:`Scikit-Learn <serve-sklearn-tutorial>` models, to arbitrary Python business logic.
- **Python-first**: Configure your model serving declaratively in pure Python, without needing YAML or JSON configs.
Ray Serve enables composing multiple ML models into a :ref:`deployment graph <serve-deployment-graph>`. This allows you to write a complex inference service consisting of multiple ML models and business logic all in Python code.
Since Ray Serve is built on Ray, it allows you to easily scale to many machines, both in your datacenter and in the cloud.
Ray Serve can be used in two primary ways to deploy your models at scale:
1. Have Python functions and classes automatically placed behind HTTP endpoints.
2. Alternatively, call them from :ref:`within your existing Python web server <serve-web-server-integration-tutorial>` using the Python-native :ref:`servehandle-api`.
.. note::
Serve recently added an experimental API for building deployment graphs of multiple models.
Please take a look at the :ref:`Deployment Graph API <serve-deployment-graph>` and try it out!
.. tip::
Chat with Ray Serve users and developers on our `forum <https://discuss.ray.io/>`_!
.. _serve_quickstart:
Ray Serve Quickstart
====================
First install Ray Serve and all of its dependencies by running the following
command in your terminal:
.. code-block:: bash
pip install "ray[serve]"
.. note::
Ray Serve supports the same Python versions as Ray. See :ref:`installation`
for a list of supported Python versions.
Now we will write a Python script to serve a simple "Counter" class over HTTP. You may open an interactive Python terminal and copy in the lines below as we go.
First, import Ray and Ray Serve:
.. code-block:: python
import ray
from ray import serve
Ray Serve runs on top of a Ray cluster, so the next step is to start a local Ray cluster:
.. code-block:: python
ray.init()
.. note::
``ray.init()`` will start a single-node Ray cluster on your local machine, which will allow you to use all your CPU cores to serve requests in parallel. To start a multi-node cluster, see :doc:`../cluster/index`.
Next, start the Ray Serve runtime:
.. code-block:: python
serve.start()
.. warning::
When the Python script exits, Ray Serve will shut down.
If you would rather keep Ray Serve running in the background you can use ``serve.start(detached=True)`` (see :doc:`deployment` for details).
Now we will define a simple Counter class. The goal is to serve this class behind an HTTP endpoint using Ray Serve.
By default, Ray Serve offers a simple HTTP proxy that will send requests to the class' ``__call__`` method. The argument to this method will be a Starlette ``Request`` object.
.. code-block:: python
@serve.deployment
class Counter:
def __init__(self):
self.count = 0
def __call__(self, request):
self.count += 1
return {"count": self.count}
.. note::
Besides classes, you can also serve standalone functions with Ray Serve in the same way.
Notice that we made this class into a ``Deployment`` with the :mod:`@serve.deployment <ray.serve.api.deployment>` decorator.
This decorator is where we could set various configuration options such as the number of replicas, unique name of the deployment (it defaults to the class name), or the HTTP route prefix to expose the deployment at.
See the :mod:`Deployment package reference <ray.serve.api.Deployment>` for more details.
In order to deploy this, we simply need to call ``Counter.deploy()``.
.. code-block:: python
Counter.deploy()
.. note::
Deployments can be configured to improve performance, for example by increasing the number of replicas of the class being served in parallel. For details, see :ref:`configuring-a-deployment`.
Now that our deployment is up and running, let's test it out by making a query over HTTP.
In your browser, simply visit ``http://127.0.0.1:8000/Counter``, and you should see the output ``{"count": 1}``.
If you keep refreshing the page, the count should increase, as expected.
Now let's say we want to update this deployment to add another method to decrement the counter.
Here, because we want more flexible HTTP configuration we'll use Serve's FastAPI integration.
For more information on this, please see :ref:`serve-fastapi-http`.
.. code-block:: python
from fastapi import FastAPI
app = FastAPI()
@serve.deployment
@serve.ingress(app)
class Counter:
def __init__(self):
self.count = 0
@app.get("/")
def get(self):
return {"count": self.count}
@app.get("/incr")
def incr(self):
self.count += 1
return {"count": self.count}
@app.get("/decr")
def decr(self):
self.count -= 1
return {"count": self.count}
We've now redefined the ``Counter`` class to wrap a ``FastAPI`` application.
This class is exposing three HTTP routes: ``/Counter`` will get the current count, ``/Counter/incr`` will increment the count, and ``/Counter/decr`` will decrement the count.
To redeploy this updated version of the ``Counter``, all we need to do is run ``Counter.deploy()`` again.
Serve will perform a rolling update here to replace the existing replicas with the new version we defined.
.. code-block:: python
Counter.deploy()
If we test out the HTTP endpoint again, we can see this in action.
Note that the count has been reset to zero because the new version of ``Counter`` was deployed.
.. code-block:: bash
> curl -X GET localhost:8000/Counter/
{"count": 0}
> curl -X GET localhost:8000/Counter/incr
{"count": 1}
> curl -X GET localhost:8000/Counter/decr
{"count": 0}
Congratulations, you just built and ran your first Ray Serve application! You should now have enough context to dive into the :doc:`core-apis` to get a deeper understanding of Ray Serve.
For more interesting example applications, including integrations with popular machine learning frameworks and Python web servers, be sure to check out :doc:`tutorials/index`.
For a high-level view of the architecture underlying Ray Serve, see :doc:`architecture`.
Why Ray Serve?
==============
There are generally two ways of serving machine learning applications, both with serious limitations:
you can use a **traditional web server**---your own Flask app---or you can use a cloud-hosted solution.
The first approach is easy to get started with, but it's hard to scale each component. The second approach
requires vendor lock-in (SageMaker), framework-specific tooling (TFServing), and a general
lack of flexibility.
Ray Serve solves these problems by giving you a simple web server (and the ability to :ref:`use your own <serve-web-server-integration-tutorial>`) while still handling the complex routing, scaling, and testing logic
necessary for production deployments.
Beyond scaling up your deployments with multiple replicas, Ray Serve also enables:
- :ref:`serve-model-composition`---ability to flexibly compose multiple models and independently scale and update each.
- :ref:`serve-batching`---built in request batching to help you meet your performance objectives.
- :ref:`serve-cpus-gpus`---specify fractional resource requirements to fully saturate each of your GPUs with several models.
For more on the motivation behind Ray Serve, check out these `meetup slides <https://tinyurl.com/serve-meetup>`_ and this `blog post <https://medium.com/distributed-computing-with-ray/machine-learning-serving-is-broken-f59aff2d607f>`_.
When should I use Ray Serve?
----------------------------
Ray Serve is a flexible tool that's easy to use for deploying, operating, and monitoring Python-based machine learning applications.
Ray Serve excels when you want to mix business logic with ML models and scaling out in production is a necessity. This might be because of large-scale batch processing
requirements or because you want to scale up a deployment graph consisting of many individual models with different performance properties.
If you plan on running on multiple machines, Ray Serve will serve you well!
What's next?
============
Check out the :ref:`end_to_end_tutorial` and :doc:`core-apis`, look at the :ref:`serve-faq`,
or head over to the :doc:`tutorials/index` to get started building your Ray Serve applications.
For more, see the following blog posts about Ray Serve:
- `Serving ML Models in Production: Common Patterns <https://www.anyscale.com/blog/serving-ml-models-in-production-common-patterns>`_ by Simon Mo, Edward Oakes, and Michael Galarnyk
- `How to Scale Up Your FastAPI Application Using Ray Serve <https://medium.com/distributed-computing-with-ray/how-to-scale-up-your-fastapi-application-using-ray-serve-c9a7b69e786>`_ by Archit Kulkarni
- `Machine Learning is Broken <https://medium.com/distributed-computing-with-ray/machine-learning-serving-is-broken-f59aff2d607f>`_ by Simon Mo
- `The Simplest Way to Serve your NLP Model in Production with Pure Python <https://medium.com/distributed-computing-with-ray/the-simplest-way-to-serve-your-nlp-model-in-production-with-pure-python-d42b6a97ad55>`_ by Edward Oakes and Bill Chambers
.. include:: /_includes/serve/announcement_bottom.rst

View file

@ -0,0 +1,300 @@
# Serving ML Models
This section should help you:
- batch requests to optimize performance
- serve multiple models by composing deployments
- serve multiple models by making ensemble deployments
```{contents}
```
(serve-batching)=
## Request Batching
You can also have Ray Serve batch requests for performance, which is especially important for some ML models that run on GPUs. In order to use this feature, you need to do the following two things:
1. Use `async def` for your request handling logic to process queries concurrently.
2. Use the `@serve.batch` decorator to batch individual queries that come into the replica. The method/function that's decorated should handle a list of requests and return a list of the same length.
```python
@serve.deployment(route_prefix="/increment")
class BatchingExample:
def __init__(self):
self.count = 0
@serve.batch
async def handle_batch(self, requests):
responses = []
for request in requests:
responses.append(request.json())
return responses
async def __call__(self, request):
return await self.handle_batch(request)
BatchingExample.deploy()
```
Please take a look at [Batching Tutorial](serve-batch-tutorial) for a deep
dive.
(serve-model-composition)=
## Model Composition
:::{note}
Serve recently added an experimental API for building deployment graphs of multiple models.
Please take a look at the [Deployment Graph API](serve-deployment-graph) and try it out!
:::
Ray Serve supports composing individually scalable models into a single model
out of the box. For instance, you can combine multiple models to perform
stacking or ensembles.
To define a higher-level composed model you need to do three things:
1. Define your underlying models (the ones that you will compose together) as
Ray Serve deployments.
2. Define your composed model, using the handles of the underlying models
(see the example below).
3. Define a deployment representing this composed model and query it!
In order to avoid synchronous execution in the composed model (which would make
calls to the composed model very slow), you'll need to make the function
asynchronous by using an `async def`. You'll see this in the example below.
That's it. Let's take a look at an example:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_model_composition.py
```
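In case the example file isn't at hand, here is a condensed sketch of the same pattern; the deployment names and the scoring logic are made up for illustration:
```python
import random

import ray
from ray import serve

ray.init()
serve.start()


@serve.deployment
class ModelOne:
    def __call__(self, data):
        # Placeholder "model": returns a random score.
        return random.random()


@serve.deployment
class ModelTwo:
    def __call__(self, data):
        # Placeholder "model": echoes its input.
        return data


@serve.deployment(route_prefix="/composed")
class ComposedModel:
    def __init__(self):
        # Handles to the underlying deployments (deployed below, before this one).
        self.model_one = ModelOne.get_handle()
        self.model_two = ModelTwo.get_handle()

    # `async def` so awaiting the sub-model calls doesn't block the replica.
    async def __call__(self, request):
        data = await request.json()
        score = await self.model_one.remote(data)
        if score > 0.5:
            return {"model_used": "two", "result": await self.model_two.remote(data)}
        return {"model_used": "one", "score": score}


ModelOne.deploy()
ModelTwo.deploy()
ComposedModel.deploy()
```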
(serve-model-ensemble)=
## Model Ensemble
Ray Serve supports creating ensembles of different models.
To define an ensemble of different models you need to do three things:
1. Define your underlying sub models (the ones that make up the ensemble) as
Ray Serve deployments.
2. Define your ensemble model, using the handles of the underlying models
(see the example below).
3. Define a deployment representing this ensemble model and query it!
In order to avoid synchronous execution in the ensemble model, you'll need to make
the function asynchronous by using an `async def`. In contrast to a composition model,
within an ensemble model you want to call **all** sub models in parallel. This is
achieved by sending all prediction calls to the sub models asynchronously and gathering
them with `asyncio.wait()`. Each Serve deployment used in an ensemble is independently
scalable by changing `num_replicas`.
That's it. Let's take a look at an example:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_model_ensemble.py
```
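Again, in case the example file isn't at hand, here is a condensed sketch of the parallel-call pattern (the sub-models and the averaging logic are made up for illustration, and `asyncio.gather` is used as a compact equivalent of `asyncio.wait()`):
```python
import asyncio

from ray import serve

serve.start()


@serve.deployment
class SubModelA:
    def __call__(self, data):
        # Placeholder "model": constant prediction.
        return 0.2


@serve.deployment
class SubModelB:
    def __call__(self, data):
        return 0.8


@serve.deployment(route_prefix="/ensemble")
class EnsembleModel:
    def __init__(self):
        self.handles = [SubModelA.get_handle(), SubModelB.get_handle()]

    async def __call__(self, request):
        data = await request.json()
        # Kick off all sub-model calls in parallel, then wait for all of them.
        refs = [handle.remote(data) for handle in self.handles]
        predictions = await asyncio.gather(*refs)
        return {"average_prediction": sum(predictions) / len(predictions)}


SubModelA.deploy()
SubModelB.deploy()
EnsembleModel.deploy()
```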
## Integration with Model Registries
Ray Serve is flexible. If you can load your model as a Python
function or class, then you can scale it up and serve it with Ray Serve.
For example, if you are using the
[MLflow Model Registry](https://www.mlflow.org/docs/latest/model-registry.html)
to manage your models, the following wrapper
class will allow you to load a model using its MLflow `Model URI`:
```python
import pandas as pd
import mlflow.pyfunc
@serve.deployment
class MLflowDeployment:
def __init__(self, model_uri):
self.model = mlflow.pyfunc.load_model(model_uri=model_uri)
async def __call__(self, request):
csv_text = await request.body() # The body contains just raw csv text.
df = pd.read_csv(csv_text)
return self.model.predict(df)
model_uri = "models:/my_registered_model/Production"
MLflowDeployment.deploy(model_uri)
```
To serve multiple different MLflow models in the same program, use the `name` option:
```python
MLflowDeployment.options(name="my_mlflow_model_1").deploy(model_uri)
```
:::{tip}
The above approach will work for any model registry, not just MLflow.
Namely, load the model from the registry in `__init__`, and forward the request to the model in `__call__`.
:::
For a complete hands-on and seamless integration with MLflow, try this self-contained example on your laptop.
But first install `mlflow`.
```bash
pip install mlflow
```
```python
# This brief example shows how to deploy models saved in a model registry such as
# MLflow to Ray Serve, using the simple Ray Serve deployment APIs. You can peruse
# the saved models' metrics and parameters in MLflow ui.
#
import json
import numpy as np
import pandas as pd
import requests
import os
import tempfile
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from mlflow.tracking import MlflowClient
from ray import serve
import mlflow
def create_and_save_model():
# load Iris data
iris_data = load_iris()
data, target, target_names = (iris_data['data'],
iris_data['target'],
iris_data['target_names'])
# Instantiate a model
model = GradientBoostingClassifier()
# Training and validation split
    # Shuffle features and labels together so each row keeps its matching label.
    perm = np.random.permutation(len(data))
    data, target = data[perm], target[perm]
train_x, train_y = data[:100], target[:100]
val_x, val_y = data[100:], target[100:]
# Create labels list as file
LABEL_PATH = os.path.join(tempfile.gettempdir(), "iris_labels.json")
with open(LABEL_PATH, "w") as f:
json.dump(target_names.tolist(), f)
# Train the model and save our label list as an MLflow artifact
# mlflow.sklearn.autolog automatically logs all parameters and metrics during
# the training.
mlflow.sklearn.autolog()
with mlflow.start_run() as run:
model.fit(train_x, train_y)
# Log label list as a artifact
mlflow.log_artifact(LABEL_PATH, artifact_path="labels")
return run.info.run_id
#
# Create our Ray Serve deployment class
#
@serve.deployment(route_prefix="/regressor")
class BoostingModel:
def __init__(self, uri):
# Load the model and label artifact from the local
# Mlflow model registry as a PyFunc Model
self.model = mlflow.pyfunc.load_model(model_uri=uri)
# Download the artifact list of labels
local_dir = "/tmp/artifact_downloads"
if not os.path.exists(local_dir):
os.mkdir(local_dir)
client = MlflowClient()
local_path = f"{client.download_artifacts(run_id, 'labels', local_dir)}/iris_labels.json"
with open(local_path, "r") as f:
self.label_list = json.load(f)
async def __call__(self, starlette_request):
payload = await starlette_request.json()
print(f"Worker: received Starlette request with data: {payload}")
# Get the input vector from the payload
input_vector = [
payload["sepal length"],
payload["sepal width"],
payload["petal length"],
payload["petal width"],
]
# Convert the input vector in a Pandas DataFrame for prediction since
# an MLflow PythonFunc model, model.predict(...), takes pandas DataFrame
prediction = self.model.predict(pd.DataFrame([input_vector]))[0]
human_name = self.label_list[prediction]
return {"result": human_name}
if __name__ == '__main__':
# Train and save the model artifacts in MLflow.
# Here our MLflow model registry is local file
# directory ./mlruns
run_id = create_and_save_model()
# Start the Ray Serve instance
serve.start()
# Construct model uri to load the model from our model registry
uri = f"runs:/{run_id}/model"
# Deploy our model.
BoostingModel.deploy(uri)
# Send in a request for labels types virginica, setosa, versicolor
sample_request_inputs = [{
"sepal length": 6.3,
"sepal width": 3.3,
"petal length": 6.0,
"petal width": 2.5},
{
"sepal length": 5.1,
"sepal width": 3.5,
"petal length": 1.4,
"petal width": 0.2},
{
"sepal length": 6.4,
"sepal width": 3.2,
"petal length": 4.5,
"petal width": 1.5},
]
for input_request in sample_request_inputs:
response = requests.get("http://localhost:8000/regressor",
json=input_request)
print(response.text)
print("Launch MLflow ui to see the model parameters, metrics, and artifacts: `mlflow ui` from current directory.")
#output
#{
# "result": "versicolor"
#}
#{
# "result": "virginica"
#}
#{
# "result": "setosa"
#}
#
# Launch MLflow ui to see the model parameters, metrics, and artifacts: `mlflow ui` from current directory.
```
For an even more hands-off and seamless integration with MLflow, check out the
[Ray Serve MLflow deployment plugin](https://github.com/ray-project/mlflow-ray-serve). A full
tutorial is available [here](https://github.com/mlflow/mlflow/tree/master/examples/ray_serve).
## Framework-Specific Tutorials
Ray Serve seamlessly integrates with popular Python ML libraries.
Below are tutorials with some of these frameworks to help get you started.
- [PyTorch Tutorial](serve-pytorch-tutorial)
- [Scikit-Learn Tutorial](serve-sklearn-tutorial)
- [Keras and Tensorflow Tutorial](serve-tensorflow-tutorial)
- [RLlib Tutorial](serve-rllib-tutorial)

View file

@ -1,307 +0,0 @@
=================
Serving ML Models
=================
This section should help you:
- batch requests to optimize performance
- serve multiple models by composing deployments
- serve multiple models by making ensemble deployments
.. contents::
.. _serve-batching:
Request Batching
================
You can also have Ray Serve batch requests for performance, which is especially important for some ML models that run on GPUs. In order to use this feature, you need to do the following two things:
1. Use ``async def`` for your request handling logic to process queries concurrently.
2. Use the ``@serve.batch`` decorator to batch individual queries that come into the replica. The method/function that's decorated should handle a list of requests and return a list of the same length.
.. code-block:: python
@serve.deployment(route_prefix="/increment")
class BatchingExample:
def __init__(self):
self.count = 0
@serve.batch
async def handle_batch(self, requests):
responses = []
for request in requests:
responses.append(request.json())
return responses
async def __call__(self, request):
return await self.handle_batch(request)
BatchingExample.deploy()
Please take a look at :ref:`Batching Tutorial<serve-batch-tutorial>` for a deep
dive.
.. _serve-model-composition:
Model Composition
=================
.. note::
Serve recently added an experimental API for building deployment graphs of multiple models.
Please take a look at the :ref:`Deployment Graph API <serve-deployment-graph>` and try it out!
Ray Serve supports composing individually scalable models into a single model
out of the box. For instance, you can combine multiple models to perform
stacking or ensembles.
To define a higher-level composed model you need to do three things:
1. Define your underlying models (the ones that you will compose together) as
Ray Serve deployments.
2. Define your composed model, using the handles of the underlying models
(see the example below).
3. Define a deployment representing this composed model and query it!
In order to avoid synchronous execution in the composed model (e.g., it's very
slow to make calls to the composed model), you'll need to make the function
asynchronous by using an ``async def``. You'll see this in the example below.
That's it. Let's take a look at an example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_model_composition.py
.. _serve-model-ensemble:
Model Ensemble
=================
Ray Serve supports creating different ensemble models
To define an ensemble of different models you need to do three things:
1. Define your underlying sub models (the ones that make up the ensemble) as
Ray Serve deployments.
2. Define your ensemble model, using the handles of the underlying models
(see the example below).
3. Define a deployment representing this ensemble model and query it!
In order to avoid synchronous execution in the ensemble model, you'll need to make
the function asynchronous by using an ``async def``. In contrast to a composition model,
within an ensemble model, you want to call **all** sub models in parallel. This will be
achieved by sending all prediction calls to the sub models via async by using
``asyncio.wait()``. Each serve deployment used in an ensemble use case is independently
scalable via changing ``num_replicas``.
That's it. Let's take a look at an example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_model_ensemble.py
Integration with Model Registries
=================================
Ray Serve is flexible. If you can load your model as a Python
function or class, then you can scale it up and serve it with Ray Serve.
For example, if you are using the
`MLflow Model Registry <https://www.mlflow.org/docs/latest/model-registry.html>`_
to manage your models, the following wrapper
class will allow you to load a model using its MLflow `Model URI`:
.. code-block:: python
import io

import pandas as pd
import mlflow.pyfunc
from ray import serve

@serve.deployment
class MLflowDeployment:
    def __init__(self, model_uri):
        self.model = mlflow.pyfunc.load_model(model_uri=model_uri)

    async def __call__(self, request):
        csv_bytes = await request.body()  # The body contains just raw csv text.
        df = pd.read_csv(io.BytesIO(csv_bytes))
        return self.model.predict(df)
model_uri = "models:/my_registered_model/Production"
MLflowDeployment.deploy(model_uri)
To serve multiple different MLflow models in the same program, use the ``name`` option:
.. code-block:: python
MLflowDeployment.options(name="my_mlflow_model_1").deploy(model_uri)
.. tip::
The above approach will work for any model registry, not just MLflow.
Namely, load the model from the registry in ``__init__``, and forward the request to the model in ``__call__``.
For a complete hands-on and seamless integration with MLflow, try this self-contained example on your laptop.
But first install ``mlflow``.
.. code-block:: bash
pip install mlflow
.. code-block:: python
# This brief example shows how to deploy models saved in a model registry such as
# MLflow to Ray Serve, using the simple Ray Serve deployment APIs. You can peruse
# the saved models' metrics and parameters in MLflow ui.
#
import json
import numpy as np
import pandas as pd
import requests
import os
import tempfile
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from mlflow.tracking import MlflowClient
from ray import serve
import mlflow
def create_and_save_model():
# load Iris data
iris_data = load_iris()
data, target, target_names = (iris_data['data'],
iris_data['target'],
iris_data['target_names'])
# Instantiate a model
model = GradientBoostingClassifier()
# Training and validation split; shuffle features and labels together so they stay aligned
indices = np.random.permutation(len(data))
data, target = data[indices], target[indices]
train_x, train_y = data[:100], target[:100]
val_x, val_y = data[100:], target[100:]
# Create labels list as file
LABEL_PATH = os.path.join(tempfile.gettempdir(), "iris_labels.json")
with open(LABEL_PATH, "w") as f:
json.dump(target_names.tolist(), f)
# Train the model and save our label list as an MLflow artifact
# mlflow.sklearn.autolog automatically logs all parameters and metrics during
# the training.
mlflow.sklearn.autolog()
with mlflow.start_run() as run:
model.fit(train_x, train_y)
# Log the label list as an artifact
mlflow.log_artifact(LABEL_PATH, artifact_path="labels")
return run.info.run_id
#
# Create our Ray Serve deployment class
#
@serve.deployment(route_prefix="/regressor")
class BoostingModel:
def __init__(self, uri):
# Load the model and label artifact from the local
# Mlflow model registry as a PyFunc Model
self.model = mlflow.pyfunc.load_model(model_uri=uri)
# Download the artifact list of labels
local_dir = "/tmp/artifact_downloads"
if not os.path.exists(local_dir):
os.mkdir(local_dir)
client = MlflowClient()
local_path = f"{client.download_artifacts(run_id, 'labels', local_dir)}/iris_labels.json"
with open(local_path, "r") as f:
self.label_list = json.load(f)
async def __call__(self, starlette_request):
payload = await starlette_request.json()
print(f"Worker: received Starlette request with data: {payload}")
# Get the input vector from the payload
input_vector = [
payload["sepal length"],
payload["sepal width"],
payload["petal length"],
payload["petal width"],
]
# Convert the input vector into a Pandas DataFrame for prediction, since
# an MLflow PyFunc model's model.predict(...) takes a pandas DataFrame
prediction = self.model.predict(pd.DataFrame([input_vector]))[0]
human_name = self.label_list[prediction]
return {"result": human_name}
if __name__ == '__main__':
# Train and save the model artifacts in MLflow.
# Here our MLflow model registry is local file
# directory ./mlruns
run_id = create_and_save_model()
# Start the Ray Serve instance
serve.start()
# Construct model uri to load the model from our model registry
uri = f"runs:/{run_id}/model"
# Deploy our model.
BoostingModel.deploy(uri)
# Send in a request for labels types virginica, setosa, versicolor
sample_request_inputs = [{
"sepal length": 6.3,
"sepal width": 3.3,
"petal length": 6.0,
"petal width": 2.5},
{
"sepal length": 5.1,
"sepal width": 3.5,
"petal length": 1.4,
"petal width": 0.2},
{
"sepal length": 6.4,
"sepal width": 3.2,
"petal length": 4.5,
"petal width": 1.5},
]
for input_request in sample_request_inputs:
response = requests.get("http://localhost:8000/regressor",
json=input_request)
print(response.text)
print("Launch MLflow ui to see the model parameters, metrics, and artifacts: `mlflow ui` from current directory.")
#output
#{
# "result": "versicolor"
#}
#{
# "result": "virginica"
#}
#{
# "result": "setosa"
#}
#
# Launch MLflow ui to see the model parameters, metrics, and artifacts: `mlflow ui` from current directory.
For an even more hands-off and seamless integration with MLflow, check out the
`Ray Serve MLflow deployment plugin <https://github.com/ray-project/mlflow-ray-serve>`__. A full
tutorial is available `here <https://github.com/mlflow/mlflow/tree/master/examples/ray_serve>`__.
Framework-Specific Tutorials
============================
Ray Serve seamlessly integrates with popular Python ML libraries.
Below are tutorials with some of these frameworks to help get you started.
- :ref:`PyTorch Tutorial<serve-pytorch-tutorial>`
- :ref:`Scikit-Learn Tutorial<serve-sklearn-tutorial>`
- :ref:`Keras and Tensorflow Tutorial<serve-tensorflow-tutorial>`
- :ref:`RLlib Tutorial<serve-rllib-tutorial>`

@ -0,0 +1,266 @@
(serve-monitoring)=
# Debugging & Monitoring
This section should help you understand how to debug and monitor your Serve application.
## Ray Dashboard
A high-level way to monitor your Ray Serve application is via the Ray Dashboard.
See the [Ray Dashboard documentation](ray-dashboard) for a detailed overview, including instructions on how to view the dashboard.
Below is an example of what the Ray Dashboard might look like for a Serve deployment:
```{image} https://raw.githubusercontent.com/ray-project/Images/master/docs/dashboard/serve-dashboard.png
:align: center
```
Here you can see the Serve controller actor, an HTTP proxy actor, and all of the replicas for each Serve deployment.
To learn about the function of the controller and proxy actors, see the [Serve Architecture page](serve-architecture).
In the example pictured above, we have a single-node cluster with a deployment named Counter with `num_replicas=2`.
## Logging
:::{note}
For an overview of logging in Ray, see [Ray Logging](ray-logging).
:::
Ray Serve uses Python's standard `logging` facility with the `"ray.serve"` named logger.
By default, logs are emitted from actors both to `stderr` and on disk on each node at `/tmp/ray/session_latest/logs/serve/`.
This includes both system-level logs from the Serve controller and HTTP proxy as well as access logs and custom user logs produced from within deployment replicas.
In development, logs are streamed to the driver Ray program (the program that calls `.deploy()` or `serve.run`, or the `serve run` CLI command) that deployed the deployments, so it's most convenient to keep the driver running for debugging.
For example, let's run a basic Serve application and view the logs that are emitted.
You can run this in an interactive shell like IPython to follow along.
First we call `serve.start()`:
```python
from ray import serve
serve.start()
```
This produces a few INFO-level log messages about startup from the Serve controller.
```bash
2022-04-02 09:10:49,906 INFO services.py:1460 -- View the Ray dashboard at http://127.0.0.1:8265
(ServeController pid=67312) INFO 2022-04-02 09:10:51,386 controller 67312 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.
(ServeController pid=67312) INFO 2022-04-02 09:10:51,492 controller 67312 http_state.py:108 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:xlehoa:SERVE_PROXY_ACTOR-node:127.0.0.1-0' on node 'node:127.0.0.1-0' listening on '127.0.0.1:8000'
```
Next, let's create a simple deployment that logs a custom log message when it's queried:
```python
import logging
logger = logging.getLogger("ray.serve")
@serve.deployment(route_prefix="/")
class SayHello:
def __call__(self, *args):
logger.info("Hello world!")
return "hi"
SayHello.deploy()
```
Running this code block, we first get some log messages from the controller saying that a new replica of the deployment is being created:
```bash
(ServeController pid=67312) INFO 2022-04-02 09:16:13,323 controller 67312 deployment_state.py:1198 - Adding 1 replicas to deployment 'SayHello'.
```
Then when we query the deployment, we get both a default access log as well as our custom `"Hello world!"` message.
Note that these log lines are tagged with the deployment name followed by a unique identifier for the specific replica.
These can be parsed by a logging stack such as ELK or Loki to enable searching logs by deployment and replica.
```bash
handle = SayHello.get_handle()
ray.get(handle.remote())
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh <ipython-input-4-1e8854e5c9ba>:8 - Hello world!
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh replica.py:466 - HANDLE __call__ OK 0.3ms
```
Querying the deployment over HTTP produces a similar access log message from the HTTP proxy:
```bash
curl -X GET http://localhost:8000/
(HTTPProxyActor pid=67315) INFO 2022-04-02 09:20:08,976 http_proxy 127.0.0.1 http_proxy.py:310 - GET / 200 2.6ms
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh <ipython-input-4-1e8854e5c9ba>:8 - Hello world!
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh replica.py:466 - HANDLE __call__ OK 0.3ms
```
You can also view all of these log messages in the files in `/tmp/ray/session_latest/logs/serve/`.
To silence the replica-level logs or otherwise configure logging, configure the `"ray.serve"` logger *from inside the deployment constructor:*
```python
import logging
logger = logging.getLogger("ray.serve")
@serve.deployment
class Silenced:
def __init__(self):
logger.setLevel(logging.ERROR)
```
This will prevent the replica INFO-level logs from being written to STDOUT or to files on disk.
You can also use your own custom logger, in which case you'll need to configure the behavior to write to STDOUT/STDERR, files on disk, or both.
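For instance, a deployment could attach its own file handler in its constructor (the logger name, file path, and format below are illustrative, not a Serve convention):

```python
import logging

from ray import serve

@serve.deployment
class CustomLogging:
    def __init__(self):
        # Configure a dedicated application logger for this replica.
        self.logger = logging.getLogger("my_app")
        self.logger.setLevel(logging.INFO)
        handler = logging.FileHandler("/tmp/my_app.log")
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        )
        self.logger.addHandler(handler)

    def __call__(self, request):
        self.logger.info("Handled a request.")
        return "ok"
```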
### Tutorial: Ray Serve with Loki
Here is a quick walkthrough of how to explore and filter your logs using [Loki](https://grafana.com/oss/loki/).
Setup and configuration is very easy on Kubernetes, but in this tutorial we'll just set things up manually.
First, install Loki and Promtail using the instructions on <https://grafana.com>.
It will be convenient to save the Loki and Promtail executables in the same directory, and to navigate to this directory in your terminal before beginning this walkthrough.
Now let's get our logs into Loki using Promtail.
Save the following file as `promtail-local-config.yaml`:
```yaml
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/positions.yaml
clients:
- url: http://localhost:3100/loki/api/v1/push
scrape_configs:
- job_name: ray
static_configs:
- labels:
job: ray
__path__: /tmp/ray/session_latest/logs/serve/*.*
```
The relevant part for Ray is the `static_configs` field, where we have indicated the location of our log files with `__path__`.
The expression `*.*` will match all files, but not directories, which would cause an error with Promtail.
We will run Loki locally. Grab the default config file for Loki with the following command in your terminal:
```shell
wget https://raw.githubusercontent.com/grafana/loki/v2.1.0/cmd/loki/loki-local-config.yaml
```
Now start Loki:
```shell
./loki-darwin-amd64 -config.file=loki-local-config.yaml
```
Here you may need to replace `./loki-darwin-amd64` with the path to your Loki executable file, which may have a different name depending on your operating system.
Start Promtail and pass in the path to the config file we saved earlier:
```shell
./promtail-darwin-amd64 -config.file=promtail-local-config.yaml
```
As above, you may need to replace `./promtail-darwin-amd64` with the appropriate filename and path.
Now we are ready to start our Ray Serve deployment. Start a long-running Ray cluster and Ray Serve instance in your terminal:
```shell
ray start --head
serve start
```
Now run the following Python script to deploy a basic Serve deployment with a Serve deployment logger:
```{literalinclude} ../../../python/ray/serve/examples/doc/deployment_logger.py
```
Now [install and run Grafana](https://grafana.com/docs/grafana/latest/installation/) and navigate to `http://localhost:3000`, where you can log in with the default username "admin" and default password "admin".
On the welcome page, click "Add your first data source" and click "Loki" to add Loki as a data source.
Now click "Explore" in the left-side panel. You are ready to run some queries!
To filter all these Ray logs for the ones relevant to our deployment, use the following [LogQL](https://grafana.com/docs/loki/latest/logql/) query:
```shell
{job="ray"} |= "Counter"
```
You should see something similar to the following:
```{image} https://raw.githubusercontent.com/ray-project/Images/master/docs/serve/loki-serve.png
:align: center
```
## Metrics
Ray Serve exposes important system metrics like the number of successful and
errored requests through the [Ray metrics monitoring infrastructure](ray-metrics). By default,
the metrics are exposed in Prometheus format on each node.
The following metrics are exposed by Ray Serve:
```{eval-rst}
.. list-table::
:header-rows: 1
* - Name
- Description
* - ``serve_deployment_request_counter``
- The number of queries that have been processed in this replica.
* - ``serve_deployment_error_counter``
- The number of exceptions that have occurred in the deployment.
* - ``serve_deployment_replica_starts``
- The number of times this replica has been restarted due to failure.
* - ``serve_deployment_queuing_latency_ms``
- The latency for queries in the replica's queue waiting to be processed.
* - ``serve_deployment_processing_latency_ms``
- The latency for queries to be processed.
* - ``serve_replica_queued_queries``
- The current number of queries queued in the deployment replicas.
* - ``serve_replica_processing_queries``
- The current number of queries being processed.
* - ``serve_num_http_requests``
- The number of HTTP requests processed.
* - ``serve_num_http_error_requests``
- The number of non-200 HTTP responses.
* - ``serve_num_router_requests``
- The number of requests processed by the router.
* - ``serve_handle_request_counter``
- The number of requests processed by this ServeHandle.
* - ``serve_deployment_queued_queries``
- The number of queries for this deployment waiting to be assigned to a replica.
* - ``serve_num_deployment_http_error_requests``
- The number of non-200 HTTP responses returned by each deployment.
```
To see this in action, run `ray start --head --metrics-export-port=8080` in your terminal, and then run the following script:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_metrics.py
```
In your web browser, navigate to `localhost:8080`.
In the output there, you can search for `serve_` to locate the metrics above.
The metrics are updated once every ten seconds, and you will need to refresh the page to see the new values.
For example, after running the script for some time and refreshing `localhost:8080` you might see something that looks like:
```
ray_serve_deployment_processing_latency_ms_count{...,deployment="f",...} 99.0
ray_serve_deployment_processing_latency_ms_sum{...,deployment="f",...} 99279.30498123169
```
which indicates that the average processing latency is just over one second, as expected.
You can even define a [custom metric](application-level-metrics) to use in your deployment, and tag it with the current deployment or replica.
Here's an example:
```{literalinclude} ../../../python/ray/serve/examples/doc/snippet_custom_metric.py
:end-before: __custom_metrics_deployment_end__
:start-after: __custom_metrics_deployment_start__
```
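If you don't have the snippet file handy, a rough sketch of the idea (not the exact contents of the linked snippet) looks like this, using `ray.util.metrics` and tagging the counter with the deployment name:

```python
from ray import serve
from ray.util import metrics

@serve.deployment
class RequestCounter:
    def __init__(self):
        self.counter = metrics.Counter(
            "my_request_counter",
            description="Number of requests served by this replica.",
            tag_keys=("deployment",),
        )
        self.counter.set_default_tags(
            {"deployment": serve.get_replica_context().deployment}
        )

    def __call__(self, request):
        # Each call increments the custom counter, which is exported
        # alongside the built-in Serve metrics.
        self.counter.inc()
        return "hello"
```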
See the
[Ray Metrics documentation](ray-metrics) for more details, including instructions for scraping these metrics using Prometheus.

@ -1,269 +0,0 @@
.. _serve-monitoring:
======================
Debugging & Monitoring
======================
This section should help you understand how to debug and monitor your Serve application.
Ray Dashboard
=============
A high-level way to monitor your Ray Serve application is via the Ray Dashboard.
See the `Ray Dashboard documentation <../ray-dashboard.html>`__ for a detailed overview, including instructions on how to view the dashboard.
Below is an example of what the Ray Dashboard might look like for a Serve deployment:
.. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/dashboard/serve-dashboard.png
:align: center
Here you can see the Serve controller actor, an HTTP proxy actor, and all of the replicas for each Serve deployment.
To learn about the function of the controller and proxy actors, see the `Serve Architecture page <architecture.html>`__.
In this example pictured above, we have a single-node cluster with a deployment named Counter with ``num_replicas=2``.
Logging
=======
.. note::
For an overview of logging in Ray, see `Ray Logging <../ray-logging.html>`__.
Ray Serve uses Python's standard ``logging`` facility with the ``"ray.serve"`` named logger.
By default, logs are emitted from actors both to ``stderr`` and on disk on each node at ``/tmp/ray/session_latest/logs/serve/``.
This includes both system-level logs from the Serve controller and HTTP proxy as well as access logs and custom user logs produced from within deployment replicas.
In development, logs are streamed to the driver Ray program (the program that calls ``.deploy()`` or ``serve.run``, or the ``serve run`` CLI command) that deployed the deployments, so it's most convenient to keep the driver running for debugging.
For example, let's run a basic Serve application and view the logs that are emitted.
You can run this in an interactive shell like IPython to follow along.
First we call ``serve.start()``:
.. code-block:: python
from ray import serve
serve.start()
This produces a few INFO-level log messages about startup from the Serve controller.
.. code-block:: bash
2022-04-02 09:10:49,906 INFO services.py:1460 -- View the Ray dashboard at http://127.0.0.1:8265
(ServeController pid=67312) INFO 2022-04-02 09:10:51,386 controller 67312 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.
(ServeController pid=67312) INFO 2022-04-02 09:10:51,492 controller 67312 http_state.py:108 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:xlehoa:SERVE_PROXY_ACTOR-node:127.0.0.1-0' on node 'node:127.0.0.1-0' listening on '127.0.0.1:8000'
Next, let's create a simple deployment that logs a custom log message when it's queried:
.. code-block:: python
import logging
logger = logging.getLogger("ray.serve")
@serve.deployment(route_prefix="/")
class SayHello:
def __call__(self, *args):
logger.info("Hello world!")
return "hi"
SayHello.deploy()
Running this code block, we first get some log messages from the controller saying that a new replica of the deployment is being created:
.. code-block:: bash
(ServeController pid=67312) INFO 2022-04-02 09:16:13,323 controller 67312 deployment_state.py:1198 - Adding 1 replicas to deployment 'SayHello'.
Then when we query the deployment, we get both a default access log as well as our custom ``"Hello world!"`` message.
Note that these log lines are tagged with the deployment name followed by a unique identifier for the specific replica.
These can be parsed by a logging stack such as ELK or Loki to enable searching logs by deployment and replica.
.. code-block:: bash
handle = SayHello.get_handle()
ray.get(handle.remote())
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh <ipython-input-4-1e8854e5c9ba>:8 - Hello world!
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh replica.py:466 - HANDLE __call__ OK 0.3ms
Querying the deployment over HTTP produces a similar access log message from the HTTP proxy:
.. code-block:: bash
curl -X GET http://localhost:8000/
(HTTPProxyActor pid=67315) INFO 2022-04-02 09:20:08,976 http_proxy 127.0.0.1 http_proxy.py:310 - GET / 200 2.6ms
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh <ipython-input-4-1e8854e5c9ba>:8 - Hello world!
(SayHello pid=67352) INFO 2022-04-02 09:20:08,975 SayHello SayHello#LBINMh replica.py:466 - HANDLE __call__ OK 0.3ms
You can also be able to view all of these log messages in the files in ``/tmp/ray/session_latest/logs/serve/``.
To silence the replica-level logs or otherwise configure logging, configure the ``"ray.serve"`` logger *from inside the deployment constructor:*
.. code-block:: python
import logging
logger = logging.getLogger("ray.serve")
@serve.deployment
class Silenced:
def __init__(self):
logger.setLevel(logging.ERROR)
This will prevent the replica INFO-level logs from being written to STDOUT or to files on disk.
You can also use your own custom logger, in which case you'll need to configure the behavior to write to STDOUT/STDERR, files on disk, or both.
Tutorial: Ray Serve with Loki
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is a quick walkthrough of how to explore and filter your logs using `Loki <https://grafana.com/oss/loki/>`__.
Setup and configuration is very easy on Kubernetes, but in this tutorial we'll just set things up manually.
First, install Loki and Promtail using the instructions on https://grafana.com.
It will be convenient to save the Loki and Promtail executables in the same directory, and to navigate to this directory in your terminal before beginning this walkthrough.
Now let's get our logs into Loki using Promtail.
Save the following file as ``promtail-local-config.yaml``:
.. code-block:: yaml
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/positions.yaml
clients:
- url: http://localhost:3100/loki/api/v1/push
scrape_configs:
- job_name: ray
static_configs:
- labels:
job: ray
__path__: /tmp/ray/session_latest/logs/serve/*.*
The relevant part for Ray is the ``static_configs`` field, where we have indicated the location of our log files with ``__path__``.
The expression ``*.*`` will match all files, but not directories, which cause an error with Promtail.
We will run Loki locally. Grab the default config file for Loki with the following command in your terminal:
.. code-block:: shell
wget https://raw.githubusercontent.com/grafana/loki/v2.1.0/cmd/loki/loki-local-config.yaml
Now start Loki:
.. code-block:: shell
./loki-darwin-amd64 -config.file=loki-local-config.yaml
Here you may need to replace ``./loki-darwin-amd64`` with the path to your Loki executable file, which may have a different name depending on your operating system.
Start Promtail and pass in the path to the config file we saved earlier:
.. code-block:: shell
./promtail-darwin-amd64 -config.file=promtail-local-config.yaml
As above, you may need to replace ``./promtail-darwin-amd64`` with the appropriate filename and path.
Now we are ready to start our Ray Serve deployment. Start a long-running Ray cluster and Ray Serve instance in your terminal:
.. code-block:: shell
ray start --head
serve start
Now run the following Python script to deploy a basic Serve deployment with a Serve deployment logger:
.. literalinclude:: ../../../python/ray/serve/examples/doc/deployment_logger.py
Now `install and run Grafana <https://grafana.com/docs/grafana/latest/installation/>`__ and navigate to ``http://localhost:3000``, where you can log in with the default username "admin" and default password "admin".
On the welcome page, click "Add your first data source" and click "Loki" to add Loki as a data source.
Now click "Explore" in the left-side panel. You are ready to run some queries!
To filter all these Ray logs for the ones relevant to our deployment, use the following `LogQL <https://grafana.com/docs/loki/latest/logql/>`__ query:
.. code-block:: shell
{job="ray"} |= "Counter"
You should see something similar to the following:
.. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/serve/loki-serve.png
:align: center
Metrics
=======
Ray Serve exposes important system metrics like the number of successful and
errored requests through the `Ray metrics monitoring infrastructure <../ray-metrics.html>`__. By default,
the metrics are exposed in Prometheus format on each node.
The following metrics are exposed by Ray Serve:
.. list-table::
:header-rows: 1
* - Name
- Description
* - ``serve_deployment_request_counter``
- The number of queries that have been processed in this replica.
* - ``serve_deployment_error_counter``
- The number of exceptions that have occurred in the deployment.
* - ``serve_deployment_replica_starts``
- The number of times this replica has been restarted due to failure.
* - ``serve_deployment_queuing_latency_ms``
- The latency for queries in the replica's queue waiting to be processed.
* - ``serve_deployment_processing_latency_ms``
- The latency for queries to be processed.
* - ``serve_replica_queued_queries``
- The current number of queries queued in the deployment replicas.
* - ``serve_replica_processing_queries``
- The current number of queries being processed.
* - ``serve_num_http_requests``
- The number of HTTP requests processed.
* - ``serve_num_http_error_requests``
- The number of non-200 HTTP responses.
* - ``serve_num_router_requests``
- The number of requests processed by the router.
* - ``serve_handle_request_counter``
- The number of requests processed by this ServeHandle.
* - ``serve_deployment_queued_queries``
- The number of queries for this deployment waiting to be assigned to a replica.
* - ``serve_num_deployment_http_error_requests``
- The number of non-200 HTTP responses returned by each deployment.
To see this in action, run ``ray start --head --metrics-export-port=8080`` in your terminal, and then run the following script:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_metrics.py
In your web browser, navigate to ``localhost:8080``.
In the output there, you can search for ``serve_`` to locate the metrics above.
The metrics are updated once every ten seconds, and you will need to refresh the page to see the new values.
For example, after running the script for some time and refreshing ``localhost:8080`` you might see something that looks like::
ray_serve_deployment_processing_latency_ms_count{...,deployment="f",...} 99.0
ray_serve_deployment_processing_latency_ms_sum{...,deployment="f",...} 99279.30498123169
which indicates that the average processing latency is just over one second, as expected.
You can even define a :ref:`custom metric <application-level-metrics>` to use in your deployment, and tag it with the current deployment or replica.
Here's an example:
.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_custom_metric.py
:start-after: __custom_metrics_deployment_start__
:end-before: __custom_metrics_deployment_end__
See the
:ref:`Ray Metrics documentation <ray-metrics>` for more details, including instructions for scraping these metrics using Prometheus.

@ -1,29 +1,47 @@
Ray Serve API
=============
# Ray Serve API
Core APIs
---------
## Core APIs
```{eval-rst}
.. autofunction:: ray.serve.start
```
```{eval-rst}
.. autofunction:: ray.serve.deployment
```
```{eval-rst}
.. autofunction:: ray.serve.list_deployments
```
```{eval-rst}
.. autofunction:: ray.serve.get_deployment
```
```{eval-rst}
.. autofunction:: ray.serve.shutdown
```
.. _`deployment-api`:
(deployment-api)=
Deployment API
--------------
## Deployment API
```{eval-rst}
.. autoclass:: ray.serve.deployment.Deployment
:members: deploy, delete, options, get_handle
```
.. _`servehandle-api`:
(servehandle-api)=
ServeHandle API
---------------
## ServeHandle API
```{eval-rst}
.. autoclass:: ray.serve.handle.RayServeHandle
:members: remote, options
```
Batching Requests
-----------------
## Batching Requests
```{eval-rst}
.. autofunction:: ray.serve.batch(max_batch_size=10, batch_wait_timeout_s=0.0)
```

@ -1,79 +1,80 @@
Performance Tuning
==================
# Performance Tuning
This section should help you:
- understand the performance characteristics of Ray Serve
- find ways to debug and tune the performance of your Serve deployment
.. note::
While this section offers some tips and tricks to improve the performance of your Serve deployment,
the :ref:`architecture doc <serve-architecture>` is helpful to gain a deeper understanding of these contexts and parameters.
:::{note}
While this section offers some tips and tricks to improve the performance of your Serve deployment,
the [architecture doc](serve-architecture) is helpful to gain a deeper understanding of these contexts and parameters.
:::
.. contents::
```{contents}
```
## Performance and known benchmarks
Performance and known benchmarks
--------------------------------
We are continuously benchmarking Ray Serve. The metrics we care about are latency, throughput, and scalability. We can confidently say:
- Ray Serves latency overhead is single digit milliseconds, around 1-2 milliseconds on average.
- For throughput, Serve achieves about 3-4k queries per second on a single machine (8 cores) using 1 http proxy and 8 replicas performing noop requests.
- It is horizontally scalable so you can add more machines to increase the overall throughput. Ray Serve is built on top of Ray,
so its scalability is bounded by Rays scalability. Please check out Rays `scalability envelope <https://github.com/ray-project/ray/blob/master/release/benchmarks/README.md>`_
- It is horizontally scalable so you can add more machines to increase the overall throughput. Ray Serve is built on top of Ray,
so its scalability is bounded by Rays scalability. Please check out Rays [scalability envelope](https://github.com/ray-project/ray/blob/master/release/benchmarks/README.md)
to learn more about the maximum number of nodes and other limitations.
You can check out our `microbenchmark instruction <https://github.com/ray-project/ray/blob/master/python/ray/serve/benchmarks/README.md>`_
You can check out our [microbenchmark instruction](https://github.com/ray-project/ray/blob/master/python/ray/serve/benchmarks/README.md)
to benchmark on your hardware.
Debugging performance issues
----------------------------
## Debugging performance issues
The performance issue you're most likely to encounter is high latency and/or low throughput for requests.
If you have set up :ref:`monitoring <serve-monitoring>` with Ray and Ray Serve, you will likely observe that
``serve_num_router_requests`` is constant while your load increases
``serve_deployment_queuing_latency_ms`` is spiking up as queries queue up in the background
If you have set up [monitoring](serve-monitoring) with Ray and Ray Serve, you will likely observe that

- `serve_num_router_requests` stays constant while your load increases, and
- `serve_deployment_queuing_latency_ms` spikes up as queries queue up in the background.
Given the symptom, there are several ways to fix it.
Choosing the right hardware
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Make sure you are using the right hardware and resources.
Are you using GPUs (``actor_init_options={“num_gpus”: 1}``) or 1+ cores (``actor_init_options={“num_cpus”: 2}``, and setting ``OMP_NUM_THREADS``)
### Choosing the right hardware
Make sure you are using the right hardware and resources.
Are you using GPUs (`actor_init_options={"num_gpus": 1}`) or 1+ cores (`actor_init_options={"num_cpus": 2}`, and setting `OMP_NUM_THREADS`)
to increase the performance of your deep learning framework?
Async functions
^^^^^^^^^^^^^^^
Are you using ``async def`` in your callable? If you are using asyncio and
hitting the same queuing issue mentioned above, you might want to increase
``max_concurrent_queries``. Serve sets a low number by default so the client gets
### Async functions
Are you using `async def` in your callable? If you are using asyncio and
hitting the same queuing issue mentioned above, you might want to increase
`max_concurrent_queries`. Serve sets a low number by default so the client gets
proper backpressure. You can increase the value in the Deployment decorator.
Batching
^^^^^^^^
If your deployment can process a batch at a time at a sublinear latency
(for example, if it takes 1ms to process 1 query and 5ms to process 10 of them)
then batching is your best approach. Check out the :ref:`batching guide <serve-batching>` to
make your deployment accept batches (especially for GPU-based ML inference). You might want to tune your ``max_batch_size`` and ``batch_wait_timeout`` in the ``@serve.batch`` decorator to maximize the benefits:
### Batching
- ``max_batch_size`` specifies how big the batch should be. Generally,
we recommend choosing the largest batch size your function can handle
AND the performance improvement is no longer sublinear. Take a dummy
If your deployment can process a batch at a time at a sublinear latency
(for example, if it takes 1ms to process 1 query and 5ms to process 10 of them)
then batching is your best approach. Check out the [batching guide](serve-batching) to
make your deployment accept batches (especially for GPU-based ML inference). You might want to tune your `max_batch_size` and `batch_wait_timeout_s` in the `@serve.batch` decorator to maximize the benefits:
- `max_batch_size` specifies how big the batch should be. Generally,
we recommend choosing the largest batch size your function can handle
AND the performance improvement is no longer sublinear. Take a dummy
example: suppose it takes 1ms to process 1 query, 5ms to process 10 queries,
and 6ms to process 11 queries. Here you should set the batch size to 10
and 6ms to process 11 queries. Here you should set the batch size to 10
because adding more queries wont improve the performance.
- ``batch_wait_timeout`` specifies how the maximum amount of time to wait before
a batch should be processed, even if its not full. It should be set according
to `batch-wait-timeout + full batch processing time ~= expected latency`. The idea
here is to have the first query wait for the longest possible time to achieve high throughput.
This means you should set ``batch_wait_timeout`` as large as possible without exceeding your desired expected latency in the equation above.
- `batch_wait_timeout_s` specifies the maximum amount of time to wait before
a batch is processed, even if it's not full. It should be set so that
`batch_wait_timeout_s + full batch processing time ~= expected latency`. The idea
here is to have the first query wait for the longest possible time to achieve high throughput.
This means you should set `batch_wait_timeout_s` as large as possible without exceeding your desired expected latency in the equation above.
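Putting the two knobs together, a tuned batching decorator might look like the sketch below (the numbers and the toy model are illustrative starting points, not recommendations):

```python
from ray import serve

@serve.deployment
class VectorizedModel:
    @serve.batch(max_batch_size=10, batch_wait_timeout_s=0.01)
    async def handle_batch(self, inputs):
        # Process the whole batch at once and return one result per input.
        return [i * 2 for i in inputs]

    async def __call__(self, request):
        return await self.handle_batch(await request.json())

# Assumes serve.start() has already been called.
VectorizedModel.deploy()
```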
### Scaling HTTP servers
Scaling HTTP servers
^^^^^^^^^^^^^^^^^^^^
Sometimes its not about your code: Serves HTTP server can become the bottleneck.
If you observe that the CPU utilization of the HTTPProxy actor spikes up to 100%, the HTTP server is the bottleneck.
Serve only starts a single HTTP server on the Ray head node by default.
This single HTTP server can handle about 3k queries per second.
Serve only starts a single HTTP server on the Ray head node by default.
This single HTTP server can handle about 3k queries per second.
If your workload exceeds this number, you might want to consider starting one
HTTP server per Ray node to spread the load by ``serve.start(http_options={“location”: “EveryNode”})``.
This configuration tells Serve to spawn one HTTP server per node.
HTTP server per Ray node to spread the load by `serve.start(http_options={"location": "EveryNode"})`.
This configuration tells Serve to spawn one HTTP server per node.
You should put an external load balancer in front of it.

@ -0,0 +1,105 @@
(serve-batch-tutorial)=
# Batching Tutorial
In this guide, we will deploy a simple vectorized adder that takes
a batch of queries and adds them at once. In particular, we show:
- How to implement and deploy a Ray Serve deployment that accepts batches.
- How to configure the batch size.
- How to query the model in Python.
This tutorial should help with the following use cases:
- You want to perform offline batch inference on a cluster of machines.
- You want to serve online queries and your model can take advantage of batching.
For example, linear regressions and neural networks use CPU and GPU's
vectorized instructions to perform computation in parallel. Performing
inference with batching can increase the *throughput* of the model as well as
*utilization* of the hardware.
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
You can use the `@serve.batch` decorator to annotate a function or a method.
This annotation will automatically cause calls to the function to be batched together.
The decorated method must take a list of objects and return a list of results of the same length; callers still invoke it with a single object, and Serve groups those individual calls into batches.
This function must also be `async def` so that you can handle multiple queries concurrently:
```python
@serve.batch
async def my_batch_handler(self, requests: List):
pass
```
This batch handler can then be called from another `async def` method in your deployment.
These calls will be batched and executed together, but return an individual result as if
they were a normal function call:
```python
class MyBackend:
@serve.batch
async def my_batch_handler(self, requests: List):
results = []
for request in requests:
results.append(request.json())
return results
async def __call__(self, request):
return await self.my_batch_handler(request)
```
:::{note}
By default, Ray Serve performs *opportunistic batching*. This means that as
soon as the batch handler is called, the method will be executed without
waiting for a full batch. If there are more queries available after this call
finishes, a larger batch may be executed. This behavior can be tuned using the
`batch_wait_timeout_s` option to `@serve.batch` (defaults to 0). Increasing this
timeout may improve throughput at the cost of latency under low load.
:::
Let's define a deployment that takes in a list of requests, extracts the input value,
converts them into an array, and uses NumPy to add 1 to each element.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
Let's deploy it. Note that in the `@serve.batch` decorator, we are
specifying the maximum batch size via `max_batch_size=4`. This option limits
the maximum possible batch size that will be executed at once.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's define a [Ray remote task](ray-remote-functions) to send queries in
parallel. As you can see, the first batch has a batch size of 1, and the subsequent
queries have a batch size of 4. Even though each query is issued independently,
Ray Serve was able to evaluate them in batches.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
```
What if you want to evaluate a whole batch in Python? Ray Serve allows you to send
queries via the Python API. A batch of queries can either come from the web server
or the Python API. Learn more [here](serve-handle-explainer).
To query the deployment via the Python API, we can use `Deployment.get_handle` to receive
a handle to the corresponding deployment. To enqueue a query, you can call
`handle.method.remote(data)`. This call returns immediately
with a [Ray ObjectRef](ray-object-refs). You can call `ray.get` to retrieve
the result.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:end-before: __doc_query_handle_end__
:start-after: __doc_query_handle_begin__
```

@ -1,101 +0,0 @@
.. _serve-batch-tutorial:
Batching Tutorial
=================
In this guide, we will deploy a simple vectorized adder that takes
a batch of queries and adds them at once. In particular, we show:
- How to implement and deploy a Ray Serve deployment that accepts batches.
- How to configure the batch size.
- How to query the model in Python.
This tutorial should help the following use cases:
- You want to perform offline batch inference on a cluster of machines.
- You want to serve online queries and your model can take advantage of batching.
For example, linear regressions and neural networks use CPU and GPU's
vectorized instructions to perform computation in parallel. Performing
inference with batching can increase the *throughput* of the model as well as
*utilization* of the hardware.
Let's import Ray Serve and some other helpers.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_import_begin__
:end-before: __doc_import_end__
You can use the ``@serve.batch`` decorator to annotate a function or a method.
This annotation will automatically cause calls to the function to be batched together.
The function must handle a list of objects and will be called with a single object.
This function must also be ``async def`` so that you can handle multiple queries concurrently:
.. code-block:: python
@serve.batch
async def my_batch_handler(self, requests: List):
pass
This batch handler can then be called from another ``async def`` method in your deployment.
These calls will be batched and executed together, but return an individual result as if
they were a normal function call:
.. code-block:: python
class MyBackend:
@serve.batch
async def my_batch_handler(self, requests: List):
results = []
for request in requests:
results.append(request.json())
return results
async def __call__(self, request):
await self.my_batch_handler(request)
.. note::
By default, Ray Serve performs *opportunistic batching*. This means that as
soon as the batch handler is called, the method will be executed without
waiting for a full batch. If there are more queries available after this call
finishes, a larger batch may be executed. This behavior can be tuned using the
``batch_wait_timeout_s`` option to ``@serve.batch`` (defaults to 0). Increasing this
timeout may improve throughput at the cost of latency under low load.
Let's define a deployment that takes in a list of requests, extracts the input value,
converts them into an array, and uses NumPy to add 1 to each element.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_define_servable_begin__
:end-before: __doc_define_servable_end__
Let's deploy it. Note that in the ``@serve.batch`` decorator, we are specifying
specifying the maximum batch size via ``max_batch_size=4``. This option limits
the maximum possible batch size that will be executed at once.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_deploy_begin__
:end-before: __doc_deploy_end__
Let's define a :ref:`Ray remote task<ray-remote-functions>` to send queries in
parallel. As you can see, the first batch has a batch size of 1, and the subsequent
queries have a batch size of 4. Even though each query is issued independently,
Ray Serve was able to evaluate them in batches.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_query_begin__
:end-before: __doc_query_end__
What if you want to evaluate a whole batch in Python? Ray Serve allows you to send
queries via the Python API. A batch of queries can either come from the web server
or the Python API. Learn more :ref:`here<serve-handle-explainer>`.
To query the deployment via the Python API, we can use ``Deployment.get_handle`` to receive
a handle to the corresponding deployment. To enqueue a query, you can call
``handle.method.remote(data)``. This call returns immediately
with a :ref:`Ray ObjectRef<ray-object-refs>`. You can call `ray.get` to retrieve
the result.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_batch.py
:start-after: __doc_query_handle_begin__
:end-before: __doc_query_handle_end__

@ -0,0 +1,22 @@
# Advanced Tutorials
Below is a list of tutorials that you can use to learn more about the different pieces of
Ray Serve functionality and how to integrate different modeling frameworks.
```{toctree}
:caption: Serve Tutorials
:maxdepth: '-1'
:name: serve-tutorials
tensorflow
pytorch
sklearn
batch
web-server-integration
rllib
gradio
```
Other Topics:
- {doc}`../deployment`

@ -1,23 +0,0 @@
==================
Advanced Tutorials
==================
Below is a list of tutorials that you can use to learn more about the different pieces of
Ray Serve functionality and how to integrate different modeling frameworks.
.. toctree::
:caption: Serve Tutorials
:name: serve-tutorials
:maxdepth: -1
tensorflow
pytorch
sklearn
batch
web-server-integration
rllib
gradio
Other Topics:
- :doc:`../deployment`

@ -0,0 +1,48 @@
(serve-pytorch-tutorial)=
# PyTorch Tutorial
In this guide, we will load and serve a PyTorch Resnet Model.
In particular, we show:
- How to load the model from PyTorch's pre-trained modelzoo.
- How to parse the JSON request, transform the payload, and evaluate it with the model.
Please see the {doc}`../core-apis` to learn more general information about Ray Serve.
This tutorial requires Pytorch and Torchvision installed in your system. Ray Serve
is framework agnostic and works with any version of PyTorch.
```bash
pip install torch torchvision
```
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
Services are just defined as normal classes with `__init__` and `__call__` methods.
The `__call__` method will be invoked per request.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
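If you don't have the repository checked out, a rough sketch of such a servable (assuming torchvision's `resnet18` and standard ImageNet preprocessing; not the exact contents of the linked file) might look like this:

```python
from io import BytesIO

from PIL import Image
import torch
from torchvision import models, transforms

from ray import serve

@serve.deployment(route_prefix="/image_predict")
class ImageModel:
    def __init__(self):
        self.model = models.resnet18(pretrained=True).eval()
        self.preprocessor = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    async def __call__(self, starlette_request):
        # The request body is expected to contain the raw image bytes.
        image_payload_bytes = await starlette_request.body()
        pil_image = Image.open(BytesIO(image_payload_bytes)).convert("RGB")
        input_tensor = self.preprocessor(pil_image).unsqueeze(0)
        with torch.no_grad():
            output = self.model(input_tensor)
        return {"class_index": int(torch.argmax(output[0]))}
```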
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's query it!
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
```

@ -1,46 +0,0 @@
.. _serve-pytorch-tutorial:
PyTorch Tutorial
================
In this guide, we will load and serve a PyTorch Resnet Model.
In particular, we show:
- How to load the model from PyTorch's pre-trained modelzoo.
- How to parse the JSON request, transform the payload and evaluated in the model.
Please see the :doc:`../core-apis` to learn more general information about Ray Serve.
This tutorial requires Pytorch and Torchvision installed in your system. Ray Serve
is framework agnostic and works with any version of PyTorch.
.. code-block:: bash
pip install torch torchvision
Let's import Ray Serve and some other helpers.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:start-after: __doc_import_begin__
:end-before: __doc_import_end__
Services are just defined as normal classes with ``__init__`` and ``__call__`` methods.
The ``__call__`` method will be invoked per request.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:start-after: __doc_define_servable_begin__
:end-before: __doc_define_servable_end__
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:start-after: __doc_deploy_begin__
:end-before: __doc_deploy_end__
Let's query it!
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_pytorch.py
:start-after: __doc_query_begin__
:end-before: __doc_query_end__

@ -91,7 +91,7 @@ class ServePPOModel:
:::{tip}
Although we used a single input and `trainer.compute_single_action(...)` here, you
can process a batch of input using Ray Serve's {ref}`batching<serve-batching>` feature
can process a batch of input using Ray Serve's [batching](serve-batching) feature
and use `trainer.compute_actions(...)` to process a batch of inputs.
:::

@ -0,0 +1,54 @@
(serve-sklearn-tutorial)=
# Scikit-Learn Tutorial
In this guide, we will train and deploy a simple Scikit-Learn classifier.
In particular, we show:
- How to load the model from file system in your Ray Serve definition
- How to parse the JSON request and evaluate it with the scikit-learn model
Please see the {doc}`../core-apis` to learn more general information about Ray Serve.
Ray Serve is framework agnostic. You can use any version of sklearn.
```bash
pip install scikit-learn
```
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
We will train a logistic regression with the iris dataset.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_train_model_end__
:start-after: __doc_train_model_begin__
```
Services are just defined as normal classes with `__init__` and `__call__` methods.
The `__call__` method will be invoked per request.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
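As a rough, illustrative sketch (not the exact contents of the linked file), such a servable could load a model saved with `joblib` and score JSON feature payloads:

```python
import joblib
import pandas as pd

from ray import serve

@serve.deployment(route_prefix="/iris")
class SklearnClassifier:
    def __init__(self, model_path: str):
        # `model_path` is a hypothetical path to a model saved with joblib.dump.
        self.model = joblib.load(model_path)

    async def __call__(self, starlette_request):
        payload = await starlette_request.json()
        # Expect a flat dict of feature name -> value.
        input_df = pd.DataFrame([payload])
        prediction = self.model.predict(input_df)[0]
        return {"prediction": int(prediction)}
```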
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's query it!
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
```

@ -1,50 +0,0 @@
.. _serve-sklearn-tutorial:
Scikit-Learn Tutorial
=====================
In this guide, we will train and deploy a simple Scikit-Learn classifier.
In particular, we show:
- How to load the model from file system in your Ray Serve definition
- How to parse the JSON request and evaluated in sklearn model
Please see the :doc:`../core-apis` to learn more general information about Ray Serve.
Ray Serve is framework agnostic. You can use any version of sklearn.
.. code-block:: bash
pip install scikit-learn
Let's import Ray Serve and some other helpers.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_import_begin__
:end-before: __doc_import_end__
We will train a logistic regression with the iris dataset.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_train_model_begin__
:end-before: __doc_train_model_end__
Services are just defined as normal classes with ``__init__`` and ``__call__`` methods.
The ``__call__`` method will be invoked per request.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_define_servable_begin__
:end-before: __doc_define_servable_end__
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_deploy_begin__
:end-before: __doc_deploy_end__
Let's query it!
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_sklearn.py
:start-after: __doc_query_begin__
:end-before: __doc_query_end__

@ -0,0 +1,56 @@
(serve-tensorflow-tutorial)=
# Keras and Tensorflow Tutorial
In this guide, we will train and deploy a simple Tensorflow neural net.
In particular, we show:
- How to load the model from file system in your Ray Serve definition
- How to parse the JSON request and evaluate it with Tensorflow
Please see the {doc}`../core-apis` to learn more general information about Ray Serve.
Ray Serve is framework agnostic -- you can use any version of Tensorflow.
However, for this tutorial, we use Tensorflow 2 and Keras. Please make sure you have
Tensorflow 2 installed.
```bash
pip install "tensorflow>=2.0"
```
Let's import Ray Serve and some other helpers.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_import_end__
:start-after: __doc_import_begin__
```
We will train a simple MNIST model using Keras.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_train_model_end__
:start-after: __doc_train_model_begin__
```
Services are just defined as normal classes with `__init__` and `__call__` methods.
The `__call__` method will be invoked per request.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_define_servable_end__
:start-after: __doc_define_servable_begin__
```
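As a rough, illustrative sketch (not the exact contents of the linked file; the saved-model path and input shape are assumptions), the servable could look like this:

```python
import numpy as np
from tensorflow import keras

from ray import serve

TRAINED_MODEL_PATH = "/tmp/mnist_model.h5"  # hypothetical location of the saved model

@serve.deployment(route_prefix="/mnist")
class TFMnistModel:
    def __init__(self, model_path: str):
        self.model = keras.models.load_model(model_path)

    async def __call__(self, starlette_request):
        payload = await starlette_request.json()
        # Expect {"array": [...]} containing a flattened 28x28 image.
        input_array = np.array(payload["array"]).reshape((1, 28, 28))
        prediction = self.model(input_array).numpy().tolist()
        return {"prediction": prediction}
```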
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_deploy_end__
:start-after: __doc_deploy_begin__
```
Let's query it!
```{literalinclude} ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:end-before: __doc_query_end__
:start-after: __doc_query_begin__
```

@ -1,53 +0,0 @@
.. _serve-tensorflow-tutorial:
Keras and Tensorflow Tutorial
=============================
In this guide, we will train and deploy a simple Tensorflow neural net.
In particular, we show:
- How to load the model from file system in your Ray Serve definition
- How to parse the JSON request and evaluated in Tensorflow
Please see the :doc:`../core-apis` to learn more general information about Ray Serve.
Ray Serve is framework agnostic -- you can use any version of Tensorflow.
However, for this tutorial, we use Tensorflow 2 and Keras. Please make sure you have
Tensorflow 2 installed.
.. code-block:: bash
pip install "tensorflow>=2.0"
Let's import Ray Serve and some other helpers.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_import_begin__
:end-before: __doc_import_end__
We will train a simple MNIST model using Keras.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_train_model_begin__
:end-before: __doc_train_model_end__
Services are just defined as normal classes with ``__init__`` and ``__call__`` methods.
The ``__call__`` method will be invoked per request.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_define_servable_begin__
:end-before: __doc_define_servable_end__
Now that we've defined our services, let's deploy the model to Ray Serve. We will
define a Serve deployment that will be exposed over an HTTP route.
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_deploy_begin__
:end-before: __doc_deploy_end__
Let's query it!
.. literalinclude:: ../../../../python/ray/serve/examples/doc/tutorial_tensorflow.py
:start-after: __doc_query_begin__
:end-before: __doc_query_end__

@ -0,0 +1,33 @@
(serve-web-server-integration-tutorial)=
# Integration with Existing Web Servers
In this guide, you will learn how to use Ray Serve to scale up your existing web application. The key feature of Ray Serve that makes this possible is the Python-native {ref}`servehandle-api`, which allows you to keep using your existing Python web server while offloading your heavy computation to Ray Serve.
We give two examples, one using a [FastAPI](https://fastapi.tiangolo.com/) web server and another using an [AIOHTTP](https://docs.aiohttp.org/en/stable/) web server, but the same approach will work with any Python web server.
## Scaling Up a FastAPI Application
Ray Serve has a native integration with FastAPI - please see {ref}`serve-fastapi-http`.
## Scaling Up an AIOHTTP Application
In this section, we'll integrate Ray Serve with an [AIOHTTP](https://docs.aiohttp.org/en/stable/) web server run using [Gunicorn](https://gunicorn.org/). You'll need to install AIOHTTP and gunicorn with the command `pip install aiohttp gunicorn`.
First, here is the script that deploys Ray Serve:
```{literalinclude} ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_deploy_serve.py
```
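The deploy script itself is only referenced by the literalinclude; a minimal sketch of what such a script might contain is below. The deployment name `my_model` and its response text are chosen to line up with step 4 of the run instructions, but they are assumptions rather than the example's actual code, and depending on your Ray version you may also need to pass a `namespace` to `ray.init`:
```python
import ray
from ray import serve

# Connect to the cluster started by `ray start --head` and start Serve in
# detached mode so the deployment outlives this script.
ray.init(address="auto")
serve.start(detached=True)


@serve.deployment
def my_model(request_text: str) -> str:
    return f"Model received data: {request_text}"


my_model.deploy()
```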
Next is the script that defines the AIOHTTP server:
```{literalinclude} ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_app.py
```
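The AIOHTTP app is likewise not shown in the diff; a hypothetical sketch that would satisfy step 4 below (again reusing the assumed `my_model` deployment name) might look like:
```python
from aiohttp import web

import ray
from ray import serve

ray.init(address="auto")
serve.start(detached=True)

# Get an async handle to the deployment created by aiohttp_deploy_serve.py.
my_model_handle = serve.get_deployment("my_model").get_handle(sync=False)


async def dummy_model(request: web.Request) -> web.Response:
    # The first await resolves the handle call to an ObjectRef,
    # the second await resolves the ObjectRef to the actual result.
    result = await (await my_model_handle.remote("dummy input"))
    return web.Response(text=result)


app = web.Application()
app.add_routes([web.get("/dummy-model", dummy_model)])
```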
Here's how to run this example:
1. Run `ray start --head` to start a local Ray cluster in the background.
2. In the directory where the example files are saved, run `python aiohttp_deploy_serve.py` to deploy our Ray Serve deployment.
3. Run `gunicorn aiohttp_app:app --worker-class aiohttp.GunicornWebWorker` to start the AIOHTTP app using gunicorn.
4. To test out the server, run `curl localhost:8000/dummy-model`. This should output `Model received data: dummy input`.
5. For cleanup, you can press Ctrl-C to stop the Gunicorn server, and run `ray stop` to stop the background Ray cluster.

View file

@ -1,39 +0,0 @@
.. _serve-web-server-integration-tutorial:
Integration with Existing Web Servers
=====================================
In this guide, you will learn how to use Ray Serve to scale up your existing web application. The key feature of Ray Serve that makes this possible is the Python-native :ref:`servehandle-api`, which allows you to keep using the same Python web server while offloading your heavy computation to Ray Serve.
We give two examples, one using a `FastAPI <https://fastapi.tiangolo.com/>`__ web server and another using an `AIOHTTP <https://docs.aiohttp.org/en/stable/>`__ web server, but the same approach will work with any Python web server.
Scaling Up a FastAPI Application
--------------------------------
Ray Serve has a native integration with FastAPI - please see :ref:`serve-fastapi-http`.
Scaling Up an AIOHTTP Application
---------------------------------
In this section, we'll integrate Ray Serve with an `AIOHTTP <https://docs.aiohttp.org/en/stable/>`_ web server run using `Gunicorn <https://gunicorn.org/>`_. You'll need to install AIOHTTP and gunicorn with the command ``pip install aiohttp gunicorn``.
First, here is the script that deploys Ray Serve:
.. literalinclude:: ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_deploy_serve.py
Next is the script that defines the AIOHTTP server:
.. literalinclude:: ../../../../python/ray/serve/examples/doc/aiohttp/aiohttp_app.py
Here's how to run this example:
1. Run ``ray start --head`` to start a local Ray cluster in the background.
2. In the directory where the example files are saved, run ``python aiohttp_deploy_serve.py`` to deploy our Ray Serve deployment.
3. Run ``gunicorn aiohttp_app:app --worker-class aiohttp.GunicornWebWorker`` to start the AIOHTTP app using gunicorn.
4. To test out the server, run ``curl localhost:8000/dummy-model``. This should output ``Model received data: dummy input``.
5. For cleanup, you can press Ctrl-C to stop the Gunicorn server, and run ``ray stop`` to stop the background Ray cluster.

View file

@ -12,7 +12,7 @@ USAGE_STATS_ENABLED_MESSAGE = (
"Usage stats collection is enabled. To disable this, add `--disable-usage-stats` "
"to the command that starts the cluster, or run the following command:"
" `ray disable-usage-stats` before starting the cluster. "
"See https://github.com/ray-project/ray/issues/20857 for more details."
"See https://docs.ray.io/en/master/cluster/usage-stats.html for more details."
)
USAGE_STATS_DISABLED_MESSAGE = "Usage stats collection is disabled."
@ -23,7 +23,7 @@ USAGE_STATS_ENABLED_BY_DEFAULT_MESSAGE = (
"To disable this, add `--disable-usage-stats` to the command that starts "
"the cluster, or run the following command:"
" `ray disable-usage-stats` before starting the cluster. "
"See https://github.com/ray-project/ray/issues/20857 for more details."
"See https://docs.ray.io/en/master/cluster/usage-stats.html for more details."
)
USAGE_STATS_CONFIRMATION_MESSAGE = (

View file

@ -92,30 +92,49 @@ class ClusterStatusToReport:
class UsageStatsToReport:
"""Usage stats to report"""
#: The Ray version in use.
ray_version: str
#: The Python version in use.
python_version: str
#: The schema version of the report.
schema_version: str
#: The source of the data (i.e. OSS).
source: str
#: A random id of the cluster session.
session_id: str
#: The git commit hash of Ray (i.e. ray.__commit__).
git_commit: str
#: The operating system in use.
os: str
#: When the data is collected and reported.
collect_timestamp_ms: int
#: When the cluster is started.
session_start_timestamp_ms: int
#: The cloud provider found in the cluster.yaml file (e.g., aws).
cloud_provider: Optional[str]
#: The min_workers found in the cluster.yaml file.
min_workers: Optional[int]
#: The max_workers found in the cluster.yaml file.
max_workers: Optional[int]
#: The head node instance type found in the cluster.yaml file (e.g., i3.8xlarge).
head_node_instance_type: Optional[str]
#: The worker node instance types found in the cluster.yaml file (e.g., i3.8xlarge).
worker_node_instance_types: Optional[List[str]]
#: The total num of cpus in the cluster.
total_num_cpus: Optional[int]
#: The total num of gpus in the cluster.
total_num_gpus: Optional[int]
#: The total size of memory in the cluster.
total_memory_gb: Optional[float]
#: The total size of object store memory in the cluster.
total_object_store_memory_gb: Optional[float]
#: The Ray libraries that are used (e.g., rllib).
library_usages: Optional[List[str]]
# The total number of successful reports for the lifetime of the cluster.
#: The total number of successful reports for the lifetime of the cluster.
total_success: int
# The total number of failed reports for the lifetime of the cluster.
#: The total number of failed reports for the lifetime of the cluster.
total_failed: int
# The sequence number of the report.
#: The sequence number of the report.
seq_number: int
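For context (not part of this diff): the switch from `#` to `#:` matters because Sphinx autodoc treats `#:` comments as attribute docstrings, which is what allows an ``autoclass`` directive to render per-field documentation for this class. A minimal, hypothetical illustration:
```python
from dataclasses import dataclass


@dataclass
class ExampleReport:
    #: Rendered by Sphinx ``autoclass`` as this attribute's documentation.
    ray_version: str
    # A plain ``#`` comment, by contrast, is ignored by autodoc.
    seq_number: int = 0
```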

View file

@ -44,6 +44,9 @@ from ray.serve.handle import RayServeHandle, RayServeSyncHandle
logger = logging.getLogger(__file__)
# Whether to issue warnings about using sync handles in async context
# or using async handle in sync context.
_WARN_SYNC_ASYNC_HANDLE_CONTEXT: bool = True
def _ensure_connected(f: Callable) -> Callable:
@ -393,7 +396,7 @@ class ServeControllerClient:
else:
raise ex
if asyncio_loop_running and sync:
if asyncio_loop_running and sync and _WARN_SYNC_ASYNC_HANDLE_CONTEXT:
logger.warning(
"You are retrieving a sync handle inside an asyncio loop. "
"Try getting client.get_handle(.., sync=False) to get better "
@ -401,7 +404,7 @@ class ServeControllerClient:
"serve/http-servehandle.html#sync-and-async-handles"
)
if not asyncio_loop_running and not sync:
if not asyncio_loop_running and not sync and _WARN_SYNC_ASYNC_HANDLE_CONTEXT:
logger.warning(
"You are retrieving an async handle outside an asyncio loop. "
"You should make sure client.get_handle is called inside a "

View file

@ -1,9 +1,19 @@
from contextlib import contextmanager
import json
from ray.experimental.dag.class_node import ClassNode # noqa: F401
from ray.experimental.dag.function_node import FunctionNode # noqa: F401
from ray.experimental.dag.input_node import InputNode # noqa: F401
from ray.experimental.dag import DAGNode # noqa: F401
from ray.util.annotations import PublicAPI
import ray.serve.client
@contextmanager
def _mute_sync_handle_warnings():
ray.serve.client._WARN_SYNC_ASYNC_HANDLE_CONTEXT = False
yield
ray.serve.client._WARN_SYNC_ASYNC_HANDLE_CONTEXT = True
@PublicAPI(stability="alpha")
@ -31,10 +41,12 @@ class RayServeDAGHandle:
return RayServeDAGHandle._deserialize, (self.dag_node_json,)
def remote(self, *args, **kwargs):
if self.dag_node is None:
from ray.serve.pipeline.json_serde import dagnode_from_json
# NOTE: There's nothing the user can do about these warnings, so we should hide them.
with _mute_sync_handle_warnings():
if self.dag_node is None:
from ray.serve.pipeline.json_serde import dagnode_from_json
self.dag_node = json.loads(
self.dag_node_json, object_hook=dagnode_from_json
)
return self.dag_node.execute(*args, **kwargs)
self.dag_node = json.loads(
self.dag_node_json, object_hook=dagnode_from_json
)
return self.dag_node.execute(*args, **kwargs)
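For illustration only (not part of the diff): the module-level flag plus context-manager pattern used here can be exercised as in the following self-contained sketch. `_WARN_SYNC_ASYNC_HANDLE_CONTEXT` and `_mute_sync_handle_warnings` are the names introduced above; everything else is hypothetical, and the sketch adds a `try/finally` so the flag is restored even if the body raises:
```python
import logging
from contextlib import contextmanager

logger = logging.getLogger(__name__)

# Module-level switch mirroring _WARN_SYNC_ASYNC_HANDLE_CONTEXT.
_WARN_SYNC_ASYNC = True


@contextmanager
def _mute_sync_handle_warnings():
    global _WARN_SYNC_ASYNC
    _WARN_SYNC_ASYNC = False
    try:
        yield
    finally:
        _WARN_SYNC_ASYNC = True


def get_handle(sync: bool, in_asyncio_loop: bool):
    if in_asyncio_loop and sync and _WARN_SYNC_ASYNC:
        logger.warning("You are retrieving a sync handle inside an asyncio loop.")


with _mute_sync_handle_warnings():
    get_handle(sync=True, in_asyncio_loop=True)  # warning suppressed
get_handle(sync=True, in_asyncio_loop=True)  # warning emitted again
```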

View file

@ -1,3 +1,5 @@
import contextlib
import io
import sys
import numpy as np
from pydantic import BaseModel
@ -12,6 +14,7 @@ from ray.serve.http_adapters import json_request
from ray.experimental.dag.input_node import InputNode
from ray import serve
import ray
from ray._private.test_utils import wait_for_condition
def my_resolver(a: int):
@ -170,5 +173,30 @@ def test_driver_np_serializer(serve_instance):
assert requests.get("http://127.0.0.1:8000/").json() == [42]
def test_dag_driver_sync_warning(serve_instance):
with InputNode() as inp:
dag = echo.bind(inp)
log_file = io.StringIO()
with contextlib.redirect_stderr(log_file):
handle = serve.run(DAGDriver.bind(dag))
assert ray.get(handle.predict.remote(42)) == 42
def wait_for_request_success_log():
lines = log_file.getvalue().splitlines()
for line in lines:
if "DAGDriver" in line and "HANDLE predict OK" in line:
return True
return False
wait_for_condition(wait_for_request_success_log)
assert (
"You are retrieving a sync handle inside an asyncio loop."
not in log_file.getvalue()
)
if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))

View file

@ -456,6 +456,13 @@ class GlobalState:
self._check_connected()
# Add a small delay to account for propagation delay of events to the GCS.
# This should be harmless enough but prevents calls to timeline() from
# missing recent timeline data.
import time
time.sleep(1)
profile_table = self.profile_table()
all_events = []