From c3d18bfc8f2b72048dcfdfadb0fb920bdbdb67a7 Mon Sep 17 00:00:00 2001
From: zcin <cindyzyx9@gmail.com>
Date: Fri, 5 Aug 2022 16:49:41 -0700
Subject: [PATCH] =?UTF-8?q?Revert=20"[serve]=20Integrate=20and=20Document?=
 =?UTF-8?q?=20Bring-Your-Own=20Gradio=20Applications=20(#2=E2=80=A6=20(#27?=
 =?UTF-8?q?560)"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 419ba8efd38829287e58da7b4c263c88d1a9e69b.
---
 .buildkite/pipeline.yml                       |  32 +-
 ci/ci.sh                                      |   2 -
 ci/env/install-serve.sh                       |  12 -
 .../serve/tutorials/gradio-integration.md     | 128 ------
 doc/source/serve/tutorials/gradio.ipynb       | 364 ++++++++++++++++++
 doc/source/serve/tutorials/index.md           |   2 +-
 python/ray/serve/BUILD                        |   8 -
 .../doc/gradio-integration-parallel.py        |  52 ---
 .../serve/examples/doc/gradio-integration.py  |  49 ---
 .../ray/serve/examples/doc/gradio-original.py |  25 --
 python/ray/serve/gradio_integrations.py       |  31 --
 python/ray/serve/tests/test_gradio.py         |  70 ----
 python/requirements.txt                       |   1 -
 python/setup.py                               |   8 +-
 14 files changed, 368 insertions(+), 416 deletions(-)
 delete mode 100755 ci/env/install-serve.sh
 delete mode 100644 doc/source/serve/tutorials/gradio-integration.md
 create mode 100644 doc/source/serve/tutorials/gradio.ipynb
 delete mode 100644 python/ray/serve/examples/doc/gradio-integration-parallel.py
 delete mode 100644 python/ray/serve/examples/doc/gradio-integration.py
 delete mode 100644 python/ray/serve/examples/doc/gradio-original.py
 delete mode 100644 python/ray/serve/gradio_integrations.py
 delete mode 100644 python/ray/serve/tests/test_gradio.py

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 8008f25c5..063f68379 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -251,26 +251,9 @@
       > test_shard.txt
     - cat test_shard.txt
     - bazel test --config=ci $(./ci/run/bazel_export_options)
-      --test_tag_filters=-post_wheel_build,-py37
+      --test_tag_filters=-post_wheel_build
       $(cat test_shard.txt)
-- label: ":serverless: Serve Tests (Python 3.7)"
-  conditions:
-    [
-        "RAY_CI_SERVE_AFFECTED",
-        "RAY_CI_PYTHON_AFFECTED",
-    ]
-  commands:
-    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
-    - echo "--- Setting up Python 3.7 environment."
-    - PYTHON=3.7 TORCH_VERSION=1.6 ./ci/env/install-dependencies.sh
-    # Specifying PYTHON=3.7 above somehow messes up the Ray install.
-    # Uninstall and re-install Ray so that we can use Ray Client.
-    # (Remove thirdparty_files to sidestep an issue with psutil.)
-    - pip uninstall -y ray && rm -rf /ray/python/ray/thirdparty_files
-    - ./ci/ci.sh build
-    - bazel test --config=ci $(./ci/run/bazel_export_options)
-      --test_tag_filters=team:serve
-      python/ray/serve/test_gradio
+
 
 - label: ":python: Minimal install 3.6"
   conditions: ["RAY_CI_PYTHON_AFFECTED"]
@@ -305,17 +288,6 @@
     - bazel test --test_output=streamed --config=ci --test_env=RAY_DEFAULT=1 $(./ci/run/bazel_export_options)
       python/ray/dashboard/test_dashboard
 
-- label: ":python: Ray Serve default install"
-  conditions: ["RAY_CI_PYTHON_AFFECTED"]
-  commands:
-    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
-    - ./ci/env/install-serve.sh
-    - ./ci/env/env_info.sh
-    - bazel test --test_output=streamed --config=ci --test_env=RAY_DEFAULT=1 $(./ci/run/bazel_export_options)
-      python/ray/serve/test_deployment_graph
-    - bazel test --test_output=streamed --config=ci --test_env=RAY_DEFAULT=1 $(./ci/run/bazel_export_options)
-      python/ray/serve/test_api
-
 - label: ":python: Release test package unit tests"
   conditions: ["ALWAYS"]
   commands:
diff --git a/ci/ci.sh b/ci/ci.sh
index 20a8d639a..a57622dea 100755
--- a/ci/ci.sh
+++ b/ci/ci.sh
@@ -172,7 +172,6 @@ test_python() {
       -python/ray/serve:test_cross_language # Ray java not built on Windows yet.
       -python/ray/serve:test_gcs_failure # Fork not supported in windows
       -python/ray/serve:test_standalone2 # Multinode not supported on Windows
-      -python/ray/serve:test_gradio
       -python/ray/tests:test_actor_advanced  # crashes in shutdown
       -python/ray/tests:test_autoscaler # We don't support Autoscaler on Windows
       -python/ray/tests:test_autoscaler_aws
@@ -217,7 +216,6 @@ test_python() {
       --test_env=CI="1" \
       --test_env=RAY_CI_POST_WHEEL_TESTS="1" \
       --test_env=USERPROFILE="${USERPROFILE}" \
-      --test_env=WINDIR \
       --test_output=streamed \
       -- \
       ${test_shard_selection};
diff --git a/ci/env/install-serve.sh b/ci/env/install-serve.sh
deleted file mode 100755
index 32d65f425..000000000
--- a/ci/env/install-serve.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env bash
-
-# Installs serve dependencies ("ray[serve]") on top of minimal install
-
-# Get script's directory: https://stackoverflow.com/a/246128
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-
-# Installs minimal dependencies
-"$SCRIPT_DIR"/install-minimal.sh
-
-# Installs serve dependencies
-python -m pip install -U "ray[serve]"
diff --git a/doc/source/serve/tutorials/gradio-integration.md b/doc/source/serve/tutorials/gradio-integration.md
deleted file mode 100644
index 3a75e9b05..000000000
--- a/doc/source/serve/tutorials/gradio-integration.md
+++ /dev/null
@@ -1,128 +0,0 @@
-# Scaling your Gradio app with Ray Serve
-
-In this guide, we will show you how to scale up your [Gradio](https://gradio.app/) application using Ray Serve. There is no need to change the internal architecture of your Gradio app; instead, we will neatly wrap it with Ray Serve and then scale it up to access more resources.
-
-## Dependencies
-
-To follow this tutorial, you will need Ray Serve and Gradio. If you haven't already, install them by running:
-```console
-$ pip install "ray[serve]"
-$ pip install gradio
-```
-For the purposes of this tutorial, we will be working with Gradio apps that run text summarization and text generation models. **Note that you can substitute this Gradio app for any Gradio app of your own!**
-
-We will be using [HuggingFace's Pipelines](https://huggingface.co/docs/transformers/main_classes/pipelines) to access the model. First, let's install the transformers module.
-```console
-$ pip install transformers
-```
-
-## Quickstart: Deploy your Gradio app with Ray Serve
-
-This example will show you an easy, straightforward way to deploy your app onto Ray Serve. Start by creating a new Python file named `demo.py` and import `GradioServer` from Ray Serve for deploying your Gradio app, `gradio`, and `transformers.pipeline` for loading text summarization models.
-```{literalinclude} ../../../../python/ray/serve/examples/doc/gradio-integration.py
-:start-after: __doc_import_begin__
-:end-before: __doc_import_end__
-```
-
-Then, we construct the (optional) Gradio app `io`:
-:::{note} 
-Remember you can substitute this with your own Gradio app if you want to try scaling up your own Gradio app!
-:::
-```{literalinclude} ../../../../python/ray/serve/examples/doc/gradio-integration.py
-:start-after: __doc_gradio_app_begin__
-:end-before: __doc_gradio_app_end__
-```
-
-
-### Understanding `GradioServer`
-In order to deploy your Gradio app onto Ray Serve, you need to wrap your Gradio app in a Serve [deployment](serve-key-concepts-deployment). `GradioServer` acts as that wrapper. It serves your Gradio app remotely on Ray Serve so that it can process and respond to HTTP requests.
-:::{note} 
-`GradioServer` is simply `GradioIngress` but wrapped in a Serve deployment.
-:::
-```{literalinclude} ../../../../python/ray/serve/gradio_integrations.py
-:start-after: __doc_gradio_ingress_begin__
-:end-before: __doc_gradio_ingress_end__
-```
-
-### Deploy your Gradio Server
-Replicas in a deployment are copies of your program living on Ray Serve, and more replicas means your deployment can serve more client requests. You can increase the number of replicas of your application or increase the number of CPUs and/or GPUs available to each replica.
-
-Then, using either the example we created above, or an existing Gradio app (of type `Interface`, `Block`, `Parallel`, etc.), wrap it in your Gradio Server.
-
-```{literalinclude} ../../../../python/ray/serve/examples/doc/gradio-integration.py
-:start-after: __doc_app_begin__
-:end-before: __doc_app_end__
-```
-
-Finally, deploy your Gradio Server! Run the following in your terminal:
-```console
-$ serve run demo:app
-```
-
-Now you can access your Gradio app at `http://localhost:8000`! This is what it should look like:
-![Gradio Result](https://raw.githubusercontent.com/ray-project/images/master/docs/serve/gradio_result.png)
-
-See [Putting Ray Serve Deployment Graphs in Production](https://docs.ray.io/en/master/serve/production.html#id1) for more information on how to deploy your app in production.
-
-
-## Parallelizing models with Ray Serve
-You can run multiple models in parallel with Ray Serve by utilizing the [deployment graph](deployment-graph-e2e-tutorial) in Ray Serve.
-
-### Original Approach
-Suppose you want to run the following program.
-
-1. Take two text generation models, [`gpt2`](https://huggingface.co/gpt2) and [`EleutherAI/gpt-neo-125M`](https://huggingface.co/EleutherAI/gpt-neo-125M).
-2. Run the two models on the same input text, such that the generated text has a minimum length of 20 and maximum length of 100.
-3. Display the outputs of both models using Gradio.
-
-This is how you would do it normally:
-
-```{literalinclude} ../../../../python/ray/serve/examples/doc/gradio-original.py
-:start-after: __doc_code_begin__
-:end-before: __doc_code_end__
-```
-
-### Parallelize using Ray Serve
-
-With Ray Serve, we can parallelize the two text generation models by wrapping each model in a separate Ray Serve [deployment](serve-key-concepts-deployment). Deployments are defined by decorating a Python class or function with `@serve.deployment`, and usually wrap the models that you want to deploy on Ray Serve and handle incoming requests.
-
-First, let's import our dependencies. Note that we need to import `GradioIngress` instead of `GradioServer` like before since we're now building a customized `MyGradioServer` that can run models in parallel.
-
-```{literalinclude} ../../../../python/ray/serve/examples/doc/gradio-integration-parallel.py
-:start-after: __doc_import_begin__
-:end-before: __doc_import_end__
-```
-
-Then, let's wrap our `gpt2` and `EleutherAI/gpt-neo-125M` models in Serve deployments, named `TextGenerationModel`.
-```{literalinclude} ../../../../python/ray/serve/examples/doc/gradio-integration-parallel.py
-:start-after: __doc_models_begin__
-:end-before: __doc_models_end__
-```
-
-Next, instead of simply wrapping our Gradio app in a `GradioServer` deployment, we can build our own `MyGradioServer` that reroutes the Gradio app so that it runs the `TextGenerationModel` deployments:
-
-```{literalinclude} ../../../../python/ray/serve/examples/doc/gradio-integration-parallel.py
-:start-after: __doc_gradio_server_begin__
-:end-before: __doc_gradio_server_end__
-```
-
-Lastly, we link everything together:
-```{literalinclude} ../../../../python/ray/serve/examples/doc/gradio-integration-parallel.py
-:start-after: __doc_app_begin__
-:end-before: __doc_app_end__
-```
-
-:::{note} 
-This will bind your two text generation models (wrapped in Serve deployments) to `MyGradioServer._d1` and `MyGradioServer._d2`, forming a [deployment graph](deployment-graph-e2e-tutorial). Thus, we have built our Gradio Interface `io` such that it calls `MyGradioServer.fanout()`, which simply sends requests to your two text generation models that are deployed on Ray Serve.
-:::
-
-Now, you can run your scalable app, and the two text generation models will run in parallel on Ray Serve! Run your Gradio app:
-
-```console
-$ serve run demo:app
-```
-
-Access your Gradio app at http://localhost:8000. This is what it should look like:
-![Gradio Result](https://raw.githubusercontent.com/ray-project/images/master/docs/serve/gradio_result_parallel.png)
-
-See [Putting Ray Serve Deployment Graphs in Production](https://docs.ray.io/en/master/serve/production.html#id1) for more information on how to deploy your app in production.
\ No newline at end of file
diff --git a/doc/source/serve/tutorials/gradio.ipynb b/doc/source/serve/tutorials/gradio.ipynb
new file mode 100644
index 000000000..5216ae79e
--- /dev/null
+++ b/doc/source/serve/tutorials/gradio.ipynb
@@ -0,0 +1,364 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0c5705f2",
+   "metadata": {},
+   "source": [
+    "(gradio-serve-tutorial)=\n",
+    "\n",
+    "# Building a Gradio demo with Ray Serve\n",
+    "\n",
+    "In this example, we will show you how to wrap a machine learning model served\n",
+    "by Ray Serve in a [Gradio demo](https://gradio.app/).\n",
+    "\n",
+    "Specifically, we're going to download a GPT-2 model from the `transformers` library,\n",
+    "define a Ray Serve deployment with it, and then define and launch a Gradio `Interface`.\n",
+    "Let's take a look."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c017f8c4",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Install all dependencies for this example.\n",
+    "! pip install \"ray[serve]\" gradio transformers requests"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6245b4c3",
+   "metadata": {},
+   "source": [
+    "## Deploying a model with Ray Serve\n",
+    "\n",
+    "To start off, we import Ray Serve, Gradio, the `transformers` and `requests` libraries,\n",
+    "and then simply start Ray Serve:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79d354ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio as gr\n",
+    "from ray import serve\n",
+    "from transformers import pipeline\n",
+    "import requests\n",
+    "\n",
+    "\n",
+    "serve.start()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "85b1eba9",
+   "metadata": {},
+   "source": [
+    "Next, we define a Ray Serve deployment with a GPT-2 model, by using the `@serve.deployment` decorator on a `model`\n",
+    "function that takes a `request` argument.\n",
+    "In this function we define a GPT-2 model with a call to `pipeline` and return the result of querying the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ef8e2c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@serve.deployment\n",
+    "def model(request):\n",
+    "    language_model = pipeline(\"text-generation\", model=\"gpt2\")\n",
+    "    query = request.query_params[\"query\"]\n",
+    "    return language_model(query, max_length=100)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba7be609",
+   "metadata": {},
+   "source": [
+    "This `model` can now easily be deployed using a `model.deploy()` call.\n",
+    "To test this deployment we use a simple `example` query to get a `response` from the model running\n",
+    "on `localhost:8000/model`.\n",
+    "The first time you query this endpoint, the model has to be downloaded, which can take a while to complete.\n",
+    "Subsequent calls will be faster."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c278dfb7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.deploy()\n",
+    "example = \"What's the meaning of life?\"\n",
+    "response = requests.get(f\"http://localhost:8000/model?query={example}\")\n",
+    "print(response.text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b11e675",
+   "metadata": {},
+   "source": [
+    "## Defining and launching a Gradio interface\n",
+    "\n",
+    "Defining a Gradio interface is now straightforward.\n",
+    "All we need is a function that Gradio can call to get the response from the model.\n",
+    "That's just a thin wrapper around our previous `requests` call:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "61c3ab00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gpt2(query):\n",
+    "    response = requests.get(f\"http://localhost:8000/model?query={query}\")\n",
+    "    return response.json()[0][\"generated_text\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53b4a5ef",
+   "metadata": {},
+   "source": [
+    "Apart from our `gpt2` function, the only other thing that we need to define a Gradio interface is\n",
+    "a description of the model inputs and outputs that Gradio understands.\n",
+    "Since our model takes text as input and output, this turns out to be pretty simple:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "115fb25f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iface = gr.Interface(\n",
+    "    fn=gpt2,\n",
+    "    inputs=[gr.inputs.Textbox(\n",
+    "        default=example, label=\"Input prompt\"\n",
+    "    )],\n",
+    "    outputs=[gr.outputs.Textbox(label=\"Model output\")]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2e998109",
+   "metadata": {},
+   "source": [
+    "For more complex models served with Ray, you might need multiple `gr.inputs`\n",
+    "and `gr.outputs` of different types.\n",
+    "\n",
+    "```{margin}\n",
+    "The [Gradio documentation](https://gradio.app/docs/) covers all viable input and output components in detail.\n",
+    "```\n",
+    "\n",
+    "Finally, we can launch the interface using `iface.launch()`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "203ce70e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iface.launch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e5638a9",
+   "metadata": {},
+   "source": [
+    "This should launch an interface that you can interact with that looks like this:\n",
+    "\n",
+    "```{image} https://raw.githubusercontent.com/ray-project/images/master/docs/serve/gradio_serve_gpt.png\n",
+    "```\n",
+    "\n",
+    "You can run these examples in the browser, for instance by launching this notebook directly\n",
+    "into Google Colab or Binder, by clicking on the _rocket icon_ at the top right of this page.\n",
+    "If you run this code locally in Python, Gradio serves the app on the first free port starting at 7860, e.g. `http://127.0.0.1:7860/`.\n",
+    "\n",
+    "## Building a Gradio app from a Scikit-Learn model\n",
+    "\n",
+    "Let's take a look at another example, so that you can see the slight differences to the first example\n",
+    "in direct comparison."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fdc6b92",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Install all dependencies for this example.\n",
+    "! pip install \"ray[serve]\" gradio requests scikit-learn"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "257744c8",
+   "metadata": {},
+   "source": [
+    "This time we're going to use a [Scikit-Learn](https://scikit-learn.org/) model that we quickly train\n",
+    "ourselves on the famous Iris dataset.\n",
+    "To do this, we'll load the Iris dataset using the built-in `load_iris` function from the `sklearn` library,\n",
+    "and we'll use the `GradientBoostingClassifier` from the `sklearn.ensemble` module for training.\n",
+    "\n",
+    "This time we'll use the `@serve.deployment` decorator on a _class_ called `BoostingModel`, which has an\n",
+    "asynchronous `__call__` method that Ray Serve needs to define your deployment.\n",
+    "All else remains the same as in the first example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb92f167",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio as gr\n",
+    "import requests\n",
+    "from sklearn.datasets import load_iris\n",
+    "from sklearn.ensemble import GradientBoostingClassifier\n",
+    "\n",
+    "from ray import serve\n",
+    "\n",
+    "# Train your model.\n",
+    "iris_dataset = load_iris()\n",
+    "model = GradientBoostingClassifier()\n",
+    "model.fit(iris_dataset[\"data\"], iris_dataset[\"target\"])\n",
+    "\n",
+    "# Start Ray Serve.\n",
+    "serve.start()\n",
+    "\n",
+    "# Define your deployment.\n",
+    "@serve.deployment(route_prefix=\"/iris\")\n",
+    "class BoostingModel:\n",
+    "    def __init__(self, model):\n",
+    "        self.model = model\n",
+    "        self.label_list = iris_dataset[\"target_names\"].tolist()\n",
+    "\n",
+    "    async def __call__(self, request):\n",
+    "        payload = (await request.json())[\"vector\"]\n",
+    "        print(f\"Received http request with data {payload}\")\n",
+    "\n",
+    "        prediction = self.model.predict([payload])[0]\n",
+    "        human_name = self.label_list[prediction]\n",
+    "        return {\"result\": human_name}\n",
+    "\n",
+    "\n",
+    "# Deploy your model.\n",
+    "BoostingModel.deploy(model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "30c3ef21",
+   "metadata": {},
+   "source": [
+    "Equipped with our `BoostingModel` class, we can now define and launch a Gradio interface as follows.\n",
+    "The Iris dataset has a total of four features, namely the four numeric values _sepal length_, _sepal width_,\n",
+    "_petal length_, and _petal width_.\n",
+    "We use this fact to define an `iris` function that takes these four features and returns the predicted class,\n",
+    "using our deployed model.\n",
+    "This time, the Gradio interface takes four input `Number`s, and returns the predicted class as `text`.\n",
+    "Go ahead and try it out in the browser yourself."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "733fb4f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define gradio function\n",
+    "def iris(sl, sw, pl, pw):\n",
+    "    request_input = {\"vector\": [sl, sw, pl, pw]}\n",
+    "    response = requests.get(\n",
+    "        \"http://localhost:8000/iris\", json=request_input)\n",
+    "    return response.json()[\"result\"]\n",
+    "\n",
+    "\n",
+    "# Define gradio interface\n",
+    "iface = gr.Interface(\n",
+    "    fn=iris,\n",
+    "    inputs=[\n",
+    "        gr.inputs.Number(default=1.0, label=\"sepal length (cm)\"),\n",
+    "        gr.inputs.Number(default=1.0, label=\"sepal width (cm)\"),\n",
+    "        gr.inputs.Number(default=1.0, label=\"petal length (cm)\"),\n",
+    "        gr.inputs.Number(default=1.0, label=\"petal width (cm)\"),\n",
+    "        ],\n",
+    "    outputs=\"text\")\n",
+    "\n",
+    "# Launch the gradio interface\n",
+    "iface.launch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a3e47ff7",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "Launching this interface, you should see an interactive interface that looks like this:\n",
+    "\n",
+    "```{image} https://raw.githubusercontent.com/ray-project/images/master/docs/serve/gradio_serve_iris.png\n",
+    "```\n",
+    "\n",
+    "## Conclusion\n",
+    "\n",
+    "To summarize, it's easy to build Gradio apps from Ray Serve deployments.\n",
+    "You only need to properly encode your model's inputs and outputs in a Gradio interface, and you're good to go!"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/doc/source/serve/tutorials/index.md b/doc/source/serve/tutorials/index.md
index ccd2a3422..4c12782ed 100644
--- a/doc/source/serve/tutorials/index.md
+++ b/doc/source/serve/tutorials/index.md
@@ -15,7 +15,7 @@ sklearn
 batch
 web-server-integration
 rllib
-gradio-integration
+gradio
 ```
 
 Other Topics:
diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD
index 0cd3d4dde..472925c6b 100644
--- a/python/ray/serve/BUILD
+++ b/python/ray/serve/BUILD
@@ -534,11 +534,3 @@ py_test(
     tags = ["exclusive", "team:serve"],
     deps = [":serve_lib"],
 )
-
-py_test(
-    name = "test_gradio",
-    size = "small",
-    srcs = serve_tests_srcs,
-    tags = ["exclusive", "team:serve", "py37"],
-    deps = [":serve_lib"],
-)
\ No newline at end of file
diff --git a/python/ray/serve/examples/doc/gradio-integration-parallel.py b/python/ray/serve/examples/doc/gradio-integration-parallel.py
deleted file mode 100644
index bc20c51ad..000000000
--- a/python/ray/serve/examples/doc/gradio-integration-parallel.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# __doc_import_begin__
-import ray
-from ray import serve
-from ray.serve.gradio_integrations import GradioIngress
-
-import gradio as gr
-
-from transformers import pipeline
-
-# __doc_import_end__
-
-
-# __doc_models_begin__
-@serve.deployment
-class TextGenerationModel:
-    def __init__(self, model_name):
-        self.generator = pipeline("text-generation", model=model_name)
-
-    def __call__(self, text):
-        generated_list = self.generator(
-            text, do_sample=True, min_length=20, max_length=100
-        )
-        generated = generated_list[0]["generated_text"]
-        return generated
-
-
-app1 = TextGenerationModel.bind("gpt2")
-app2 = TextGenerationModel.bind("EleutherAI/gpt-neo-125M")
-# __doc_models_end__
-
-
-# __doc_gradio_server_begin__
-@serve.deployment
-class MyGradioServer(GradioIngress):
-    def __init__(self, downstream_model_1, downstream_model_2):
-        self._d1 = downstream_model_1
-        self._d2 = downstream_model_2
-
-        io = gr.Interface(self.fanout, "textbox", "textbox")
-        super().__init__(io)
-
-    def fanout(self, text):
-        [result1, result2] = ray.get([self._d1.remote(text), self._d2.remote(text)])
-        return f"{result1}\n------------\n{result2}"
-
-
-# __doc_gradio_server_end__
-
-
-# __doc_app_begin__
-app = MyGradioServer.bind(app1, app2)
-# __doc_app_end__
diff --git a/python/ray/serve/examples/doc/gradio-integration.py b/python/ray/serve/examples/doc/gradio-integration.py
deleted file mode 100644
index 5d72455d9..000000000
--- a/python/ray/serve/examples/doc/gradio-integration.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# __doc_import_begin__
-from ray.serve.gradio_integrations import GradioServer
-
-import gradio as gr
-
-from transformers import pipeline
-
-# __doc_import_end__
-
-# __doc_gradio_app_begin__
-summarizer = pipeline("summarization", model="t5-small")
-
-
-def model(text):
-    summary_list = summarizer(text)
-    summary = summary_list[0]["summary_text"]
-    return summary
-
-
-example = (
-    "HOUSTON -- Men have landed and walked on the moon. "
-    "Two Americans, astronauts of Apollo 11, steered their fragile "
-    "four-legged lunar module safely and smoothly to the historic landing "
-    "yesterday at 4:17:40 P.M., Eastern daylight time. Neil A. Armstrong, the "
-    "38-year-old commander, radioed to earth and the mission control room "
-    'here: "Houston, Tranquility Base here. The Eagle has landed." The '
-    "first men to reach the moon -- Armstrong and his co-pilot, Col. Edwin E. "
-    "Aldrin Jr. of the Air Force -- brought their ship to rest on a level, "
-    "rock-strewn plain near the southwestern shore of the arid Sea of "
-    "Tranquility. About six and a half hours later, Armstrong opened the "
-    "landing craft's hatch, stepped slowly down the ladder and declared as "
-    "he planted the first human footprint on the lunar crust: \"That's one "
-    'small step for man, one giant leap for mankind." His first step on the '
-    "moon came at 10:56:20 P.M., as a television camera outside the craft "
-    "transmitted his every move to an awed and excited audience of hundreds "
-    "of millions of people on earth."
-)
-
-io = gr.Interface(
-    fn=model,
-    inputs=[gr.inputs.Textbox(default=example, label="Input prompt")],
-    outputs=[gr.outputs.Textbox(label="Model output")],
-)
-# __doc_gradio_app_end__
-
-
-# __doc_app_begin__
-app = GradioServer.options(num_replicas=2, ray_actor_options={"num_cpus": 4}).bind(io)
-# __doc_app_end__
diff --git a/python/ray/serve/examples/doc/gradio-original.py b/python/ray/serve/examples/doc/gradio-original.py
deleted file mode 100644
index 9b1b27f39..000000000
--- a/python/ray/serve/examples/doc/gradio-original.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import gradio as gr
-from transformers import pipeline
-
-# __doc_code_begin__
-generator1 = pipeline("text-generation", model="gpt2")
-generator2 = pipeline("text-generation", model="EleutherAI/gpt-neo-125M")
-
-
-def model1(text):
-    generated_list = generator1(text, do_sample=True, min_length=20, max_length=100)
-    generated = generated_list[0]["generated_text"]
-    return generated
-
-
-def model2(text):
-    generated_list = generator2(text, do_sample=True, min_length=20, max_length=100)
-    generated = generated_list[0]["generated_text"]
-    return generated
-
-
-demo = gr.Interface(
-    lambda text: f"{model1(text)}\n------------\n{model2(text)}", "textbox", "textbox"
-)
-demo.launch()
-# __doc_code_end__
diff --git a/python/ray/serve/gradio_integrations.py b/python/ray/serve/gradio_integrations.py
deleted file mode 100644
index 367776da8..000000000
--- a/python/ray/serve/gradio_integrations.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from ray import serve
-from ray.serve._private.http_util import ASGIHTTPSender
-from ray.util.annotations import PublicAPI
-
-import starlette
-
-try:
-    import gradio as gr
-except ModuleNotFoundError:
-    print("Gradio isn't installed. Run `pip install gradio` to install Gradio.")
-    raise
-
-
-@PublicAPI(stability="alpha")
-# __doc_gradio_ingress_begin__
-class GradioIngress:
-    """User-facing class that wraps a Gradio App in a Serve Deployment"""
-
-    def __init__(self, io: gr.Blocks):
-        self.app = gr.routes.App.create_app(io)
-
-    async def __call__(self, request: starlette.requests.Request):
-        sender = ASGIHTTPSender()
-        await self.app(request.scope, receive=request.receive, send=sender)
-        return sender.build_asgi_response()
-
-
-# __doc_gradio_ingress_end__
-
-
-GradioServer = serve.deployment(GradioIngress)
diff --git a/python/ray/serve/tests/test_gradio.py b/python/ray/serve/tests/test_gradio.py
deleted file mode 100644
index 181e66e22..000000000
--- a/python/ray/serve/tests/test_gradio.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import ray
-from ray import serve
-from ray.serve.gradio_integrations import GradioServer
-
-import gradio as gr
-
-import os
-import sys
-import pytest
-import requests
-
-
-@pytest.fixture
-def serve_start_shutdown():
-    ray.init()
-    serve.start()
-    yield
-    serve.shutdown()
-    ray.shutdown()
-
-
-def test_gradio_ingress_correctness(serve_start_shutdown):
-    """
-    Ensure a Gradio app deployed to a cluster through GradioIngress still
-    produces the correct output.
-    """
-
-    def greet(name):
-        return f"Good morning {name}!"
-
-    io = gr.Interface(fn=greet, inputs="text", outputs="text")
-    app = GradioServer.bind(io)
-    serve.run(app)
-
-    test_input = "Alice"
-    response = requests.post(
-        "http://127.0.0.1:8000/api/predict/", json={"data": [test_input]}
-    )
-    assert response.status_code == 200 and response.json()["data"][0] == greet(
-        test_input
-    )
-
-
-def test_gradio_ingress_scaling(serve_start_shutdown):
-    """
-    Check that a Gradio app that has been deployed to a cluster through
-    GradioIngress scales as needed, i.e. separate client requests are served by
-    different replicas.
-    """
-
-    def f(*args):
-        return os.getpid()
-
-    io = gr.Interface(fn=f, inputs="text", outputs="text")
-    app = GradioServer.options(num_replicas=2).bind(io)
-    serve.run(app)
-
-    pids = []
-    for _ in range(3):
-        response = requests.post(
-            "http://127.0.0.1:8000/api/predict/", json={"data": ["input"]}
-        )
-        assert response.status_code == 200
-        pids.append(response.json()["data"][0])
-
-    assert len(set(pids)) == 2
-
-
-if __name__ == "__main__":
-    sys.exit(pytest.main(["-v", "-s", __file__]))
diff --git a/python/requirements.txt b/python/requirements.txt
index ce87e51d2..fc65ca061 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -43,7 +43,6 @@ uvicorn==0.16.0
 dataclasses; python_version < '3.7'
 starlette==0.18.0
 aiorwlock
-gradio
 
 # Requirements for running tests
 pyarrow >= 6.0.1, < 7.0.0
diff --git a/python/setup.py b/python/setup.py
index ae01b1c8c..5df11e4d6 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -224,13 +224,7 @@ if setup_spec.type == SetupType.RAY:
             "prometheus_client >= 0.7.1, < 0.14.0",
             "smart_open",
         ],
-        "serve": [
-            "uvicorn==0.16.0",
-            "requests",
-            "starlette",
-            "fastapi",
-            "aiorwlock",
-        ],
+        "serve": ["uvicorn==0.16.0", "requests", "starlette", "fastapi", "aiorwlock"],
         "tune": ["pandas", "tabulate", "tensorboardX>=1.9", "requests"],
         "k8s": ["kubernetes", "urllib3"],
         "observability": [