2021-10-11 18:33:20 -07:00
|
|
|
"""
|
|
|
|
Test that a serve deployment can recover from cluster failures by resuming
|
|
|
|
from checkpoints of external source, such as s3.
|
|
|
|
|
|
|
|
For product testing, we skip the part of actually starting new cluster as
|
|
|
|
it's Job Manager's responsibility, and only re-deploy to the same cluster
|
|
|
|
with remote checkpoint.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import click
|
|
|
|
import time
|
|
|
|
import requests
|
|
|
|
import uuid
|
|
|
|
import os
|
2021-11-12 09:02:27 -08:00
|
|
|
from pathlib import Path
|
2021-10-11 18:33:20 -07:00
|
|
|
|
|
|
|
from serve_test_cluster_utils import setup_local_single_node_cluster
|
|
|
|
|
2022-01-29 18:41:57 -08:00
|
|
|
from serve_test_utils import save_test_results
|
2021-10-11 18:33:20 -07:00
|
|
|
|
|
|
|
import ray
|
|
|
|
from ray import serve
|
|
|
|
from ray.serve.utils import logger
|
|
|
|
|
|
|
|
# Deployment configs
|
2021-11-12 09:02:27 -08:00
|
|
|
DEFAULT_NUM_REPLICAS = 2
|
2021-10-11 18:33:20 -07:00
|
|
|
DEFAULT_MAX_BATCH_SIZE = 16
|
|
|
|
|
|
|
|
|
|
|
|
def request_with_retries(endpoint, timeout=3):
    """GET ``http://127.0.0.1:8000<endpoint>``, retrying until it succeeds.

    Retries on any ``requests.RequestException`` (connection refused while
    replicas are still starting, per-request timeout, etc.), sleeping 0.1s
    between attempts.

    Args:
        endpoint: Path to append to the local Serve HTTP root, e.g. "/hello/".
        timeout: Seconds used both as the per-request timeout and as the
            overall retry deadline.

    Returns:
        The first successful ``requests.Response``.

    Raises:
        TimeoutError: If no request succeeds within ``timeout`` seconds.
    """
    start = time.time()
    while True:
        try:
            return requests.get("http://127.0.0.1:8000" + endpoint, timeout=timeout)
        except requests.RequestException as e:
            if time.time() - start > timeout:
                # Raise an instance with context instead of the bare class so
                # the failure is actionable in test logs.
                raise TimeoutError(
                    f"Could not reach endpoint {endpoint!r} within "
                    f"{timeout}s"
                ) from e
            time.sleep(0.1)
|
|
|
|
|
|
|
|
|
|
|
|
@click.command()
def main():
    """End-to-end Serve fault-tolerance test.

    Deploys two endpoints, kills the Serve controller and the cluster, starts
    a fresh cluster in the same namespace, and verifies the deployments are
    recovered from the external checkpoint with no new ``deploy()`` call.
    Saves a success record via ``save_test_results`` on completion.
    """
    # Setup local cluster; note this cluster setup is the same for both
    # local and product ray cluster env.
    # Each test uses a different ray namespace, thus the kv storage key for
    # each checkpoint is different to avoid collision.
    namespace = uuid.uuid4().hex

    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "1")
    if smoke_test == "1":
        # Local smoke test: checkpoint to a file, removing any stale copy so
        # the first deployment does not resume from a previous run.
        path = Path("checkpoint.db")
        checkpoint_path = f"file://{path}"
        if path.exists():
            path.unlink()
    else:
        checkpoint_path = (
            "s3://serve-nightly-tests/fault-tolerant-test-checkpoint"  # noqa: E501
        )

    _, cluster = setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace
    )

    # Deploy for the first time.
    @serve.deployment(num_replicas=DEFAULT_NUM_REPLICAS)
    def hello():
        return serve.get_replica_context().deployment

    for name in ["hello", "world"]:
        hello.options(name=name).deploy()

        for _ in range(5):
            response = request_with_retries(f"/{name}/", timeout=3)
            assert response.text == name

    logger.info("Initial deployment successful with working endpoint.")

    # Kill current cluster, recover from remote checkpoint and ensure endpoint
    # is still available with expected results.
    ray.kill(serve.api._global_client._controller, no_restart=True)
    ray.shutdown()
    cluster.shutdown()
    serve.api._set_global_client(None)

    # Start another ray cluster with same namespace to resume from previous
    # checkpoints with no new deploy() call.
    # Rebind `cluster` to the NEW cluster handle: previously the return value
    # was discarded, so the final shutdown below tore down the already-stopped
    # first cluster and leaked this one.
    _, cluster = setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace
    )

    for name in ["hello", "world"]:
        for _ in range(5):
            response = request_with_retries(f"/{name}/", timeout=3)
            assert response.text == name

    logger.info(
        "Deployment recovery from s3 checkpoint is successful " "with working endpoint."
    )

    # Delete dangling checkpoints. If the script failed before this step, it's
    # up to the TTL policy on s3 to clean up, but it won't lead to collision
    # with subsequent tests since each test runs in a different uuid namespace.
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()

    # Checkpoints in the S3 bucket are moved after 7 days with explicit
    # lifecycle rules. Each checkpoint is ~260 Bytes in size from this test.

    # Save results.
    save_test_results(
        {"result": "success"},
        default_output_file="/tmp/serve_cluster_fault_tolerance.json",
    )
|
2021-10-11 18:33:20 -07:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
main()
|
|
|
|
import pytest
|
|
|
|
import sys
|
2022-01-29 18:41:57 -08:00
|
|
|
|
2021-10-11 18:33:20 -07:00
|
|
|
sys.exit(pytest.main(["-v", "-s", __file__]))
|