"""
|
||
|
This test is parity of
|
||
|
release/serve_tests/workloads/deployment_graph_long_chain.py
|
||
|
Instead of using graph api, the test is using pure handle to
|
||
|
test long chain graph.
|
||
|
|
||
|
INPUT -> Node_1 -> Node_2 -> ... -> Node_10 -> OUTPUT
|
||
|
|
||
|
1) Intermediate blob size can be large / small
|
||
|
2) Compute time each node can be long / short
|
||
|
3) Init time can be long / short
|
||
|
"""
|
||
|
|
||
|
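
# NOTE: The graph-API counterpart referenced above builds the same chain
# declaratively; this file instead deploys each Node explicitly and wires the
# chain together through Serve handles obtained from the Serve client.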

import time
import asyncio
import click

from typing import Optional

import ray
from ray import serve
from ray.serve.context import get_global_client
from serve_test_cluster_utils import (
    setup_local_single_node_cluster,
    setup_anyscale_cluster,
)
from serve_test_utils import save_test_results
from benchmark_utils import benchmark_throughput_tps, benchmark_latency_ms

DEFAULT_CHAIN_LENGTH = 4

DEFAULT_NUM_REQUESTS_PER_CLIENT = 20  # Requests sent per client in the latency test
DEFAULT_NUM_CLIENTS = 1  # Clients concurrently sending requests to the deployment

DEFAULT_THROUGHPUT_TRIAL_DURATION_SECS = 10


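# Each Node adds 1 to its input and forwards the request to the previous node
# in the chain, so a request into the last node of an N-node chain returns N;
# the sanity checks in main() rely on this invariant.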
@serve.deployment
class Node:
    def __init__(
        self,
        id: int,
        prev_node=None,
        init_delay_secs=0,
        compute_delay_secs=0,
        sync_handle=True,
    ):
        time.sleep(init_delay_secs)
        self.id = id
        self.prev_node = prev_node
        self.compute_delay_secs = compute_delay_secs
        self.sync_handle = sync_handle

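    # With a sync handle, .remote() returns an ObjectRef that a single await
    # resolves; with an async handle, .remote() returns a coroutine that must
    # first be awaited to get the ObjectRef, hence the double await below.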
    async def predict(self, input_data: int):
        await asyncio.sleep(self.compute_delay_secs)
        if self.prev_node:
            if self.sync_handle:
                return await self.prev_node.predict.remote(input_data) + 1
            else:
                return await (await self.prev_node.predict.remote(input_data)) + 1
        else:
            return input_data + 1


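# Deploy chain_length Node deployments, each holding a handle to the node
# deployed before it; the returned handle points at the last node in the
# chain, which is the entry point for requests.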
def construct_long_chain_graph_with_pure_handle(
    chain_length, sync_handle: bool, init_delay_secs=0, compute_delay_secs=0
):
    prev_handle = None
    for id in range(chain_length):
        Node.options(name=str(id)).deploy(
            id, prev_handle, init_delay_secs, compute_delay_secs, sync_handle
        )
        prev_handle = get_global_client().get_handle(str(id), sync=sync_handle)
    return prev_handle


async def sanity_check_graph_deployment_with_async_handle(handle, expected_result):
    assert await (await handle.predict.remote(0)) == expected_result


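# Example invocation (hypothetical file name; flags match the click options
# declared below):
#   python long_chain_with_pure_handle.py --chain-length 8 --sync-handle false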
@click.command()
@click.option("--chain-length", type=int, default=DEFAULT_CHAIN_LENGTH)
@click.option("--init-delay-secs", type=int, default=0)
@click.option("--compute-delay-secs", type=int, default=0)
@click.option(
    "--num-requests-per-client",
    type=int,
    default=DEFAULT_NUM_REQUESTS_PER_CLIENT,
)
@click.option("--num-clients", type=int, default=DEFAULT_NUM_CLIENTS)
@click.option(
    "--throughput-trial-duration-secs",
    type=int,
    default=DEFAULT_THROUGHPUT_TRIAL_DURATION_SECS,
)
@click.option("--local-test", type=bool, default=True)
@click.option("--sync-handle", type=bool, default=True)
def main(
    chain_length: Optional[int],
    init_delay_secs: Optional[int],
    compute_delay_secs: Optional[int],
    num_requests_per_client: Optional[int],
    num_clients: Optional[int],
    throughput_trial_duration_secs: Optional[int],
    local_test: Optional[bool],
    sync_handle: Optional[bool],
):
    if local_test:
        setup_local_single_node_cluster(1, num_cpu_per_node=8)
    else:
        setup_anyscale_cluster()

    handle = construct_long_chain_graph_with_pure_handle(
        chain_length,
        sync_handle,
        init_delay_secs=init_delay_secs,
        compute_delay_secs=compute_delay_secs,
    )

    loop = asyncio.get_event_loop()
    if sync_handle:
        assert ray.get(handle.predict.remote(0)) == chain_length
    else:
        # The async sanity check is a coroutine, so it must be driven by the
        # event loop rather than called directly.
        loop.run_until_complete(
            sanity_check_graph_deployment_with_async_handle(handle, chain_length)
        )

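    # Benchmark end-to-end throughput (transactions per second) and latency
    # (milliseconds) through the full chain, using the helpers imported from
    # benchmark_utils.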
    throughput_mean_tps, throughput_std_tps = loop.run_until_complete(
        benchmark_throughput_tps(
            handle,
            chain_length,
            duration_secs=throughput_trial_duration_secs,
            num_clients=num_clients,
        )
    )
    latency_mean_ms, latency_std_ms = loop.run_until_complete(
        benchmark_latency_ms(
            handle,
            chain_length,
            num_requests=num_requests_per_client,
            num_clients=num_clients,
        )
    )

print(f"chain_length: {chain_length}, num_clients: {num_clients}")
|
||
|
print(f"latency_mean_ms: {latency_mean_ms}, " f"latency_std_ms: {latency_std_ms}")
|
||
|
print(
|
||
|
f"throughput_mean_tps: {throughput_mean_tps}, "
|
||
|
f"throughput_std_tps: {throughput_std_tps}"
|
||
|
)
|
||
|
|
||
|
    results = {
        "chain_length": chain_length,
        "init_delay_secs": init_delay_secs,
        "compute_delay_secs": compute_delay_secs,
        "local_test": local_test,
        "sync_handle": sync_handle,
    }
    results["perf_metrics"] = [
        {
            "perf_metric_name": "throughput_mean_tps",
            "perf_metric_value": throughput_mean_tps,
            "perf_metric_type": "THROUGHPUT",
        },
        {
            "perf_metric_name": "throughput_std_tps",
            "perf_metric_value": throughput_std_tps,
            "perf_metric_type": "THROUGHPUT",
        },
        {
            "perf_metric_name": "latency_mean_ms",
            "perf_metric_value": latency_mean_ms,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name": "latency_std_ms",
            "perf_metric_value": latency_std_ms,
            "perf_metric_type": "LATENCY",
        },
    ]
    save_test_results(results)


if __name__ == "__main__":
    main()

    import sys
    import pytest

    sys.exit(pytest.main(["-v", "-s", __file__]))