ray/release/.buildkite/build_pipeline.py

import copy
import logging
import os
import sys

import yaml

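# Env variables:

# RAY_REPO           Repo to use for finding the wheel
# RAY_BRANCH         Branch to find the wheel
# RAY_TEST_REPO      Repo to use for test scripts (defaults to RAY_REPO)
# RAY_TEST_BRANCH    Branch for test scripts (defaults to RAY_BRANCH)
# FILTER_FILE        File filter (substring of the test config path)
# FILTER_TEST        Test name filter (substring of the test name)
# RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly)
# RELEASE_ALERT      Set to "1" or "stats" to emit the alert pipeline instead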


class ReleaseTest:
    """A release test name bundled with its scheduling options.

    For ``str()``, ``repr()``, the ``in`` operator, iteration and ``len()``
    an instance behaves like its ``name`` string, so it can be used in the
    same places as a plain test name.

    Attributes:
        name: Name of the test.
        smoke_test: Whether the test should run as a smoke test.
        retry: Number of automatic retries (0 disables retries).
    """

    def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
        self.name, self.smoke_test, self.retry = name, smoke_test, retry

    # --- string-like protocol, all delegating to ``self.name`` ---

    def __len__(self):
        return len(self.name)

    def __iter__(self):
        return iter(self.name)

    def __contains__(self, item):
        # Substring test against the test name.
        return item in self.name

    def __repr__(self):
        return self.name

    def __str__(self):
        return self.name


class SmokeTest(ReleaseTest):
    """A ``ReleaseTest`` whose ``smoke_test`` flag is always ``True``."""

    def __init__(self, name: str, retry: int = 0):
        super().__init__(name=name, smoke_test=True, retry=retry)


# Test spec for the "core-nightly" suite (see SUITES).
# Keys are test config files, given relative to the ~/ray checkout that each
# pipeline step clones; values list the tests to run from that file.
# Plain strings are scheduled as regular runs, SmokeTest entries get the
# --smoke-test flag (see build_pipeline).
CORE_NIGHTLY_TESTS = {
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "shuffle_10gb",
        "shuffle_50gb",
        "shuffle_50gb_large_partition",
        "shuffle_100gb",
        "non_streaming_shuffle_100gb",
        "non_streaming_shuffle_50gb_large_partition",
        "non_streaming_shuffle_50gb",
        "dask_on_ray_10gb_sort",
        "dask_on_ray_100gb_sort",
        "dask_on_ray_large_scale_test_no_spilling",
        "dask_on_ray_large_scale_test_spilling",
        "stress_test_placement_group",
        "shuffle_1tb_1000_partition",
        "non_streaming_shuffle_1tb_1000_partition",
        "shuffle_1tb_5000_partitions",
        "non_streaming_shuffle_1tb_5000_partitions",
        "decision_tree_autoscaling",
        "autoscaling_shuffle_1tb_1000_partitions",
        # These two run as full tests in WEEKLY_TESTS.
        SmokeTest("stress_test_many_tasks"),
        SmokeTest("stress_test_dead_actors"),
    ],
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "single_node",
        "object_store",
    ],
}

# Test spec for the "nightly" suite, which is the default suite when
# RELEASE_TEST_SUITE is not set. Same layout as the other suite specs:
# test config file -> list of test names or ReleaseTest/SmokeTest entries.
NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
        "dask_xgboost_test",
        "modin_xgboost_test",
        "torch_tune_serve_test",
    ],
    # Smoke-test variants of the long running tests; WEEKLY_TESTS lists the
    # same names as full runs.
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("actor_deaths"),
        SmokeTest("apex"),
        SmokeTest("impala"),
        SmokeTest("many_actor_tasks"),
        SmokeTest("many_drivers"),
        SmokeTest("many_ppo"),
        SmokeTest("many_tasks"),
        SmokeTest("many_tasks_serialized_ids"),
        SmokeTest("node_failures"),
        SmokeTest("pbt"),
        # The serve tests are disabled here; they are listed in MANUAL_TESTS.
        # SmokeTest("serve"),
        # SmokeTest("serve_failure"),
    ],
    "~/ray/release/microbenchmark/microbenchmark.yaml": [
        "microbenchmark",
    ],
    "~/ray/release/sgd_tests/sgd_tests.yaml": [
        "sgd_gpu",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "bookkeeping_overhead",
        "durable_trainable",
        SmokeTest("long_running_large_checkpoints"),
        SmokeTest("network_overhead"),
        "result_throughput_cluster",
        "result_throughput_single_node",
        "xgboost_sweep",
    ],
    "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
        "train_small",
        "train_moderate",
        "train_gpu",
        "tune_small",
        "tune_4x32",
        "tune_32x4",
        "ft_small_elastic",
        "ft_small_non_elastic",
        "distributed_api_test",
    ],
}

# Test spec for the "weekly" suite (see SUITES). Several tests that the
# nightly suites schedule as SmokeTest entries appear here as plain names,
# i.e. without the --smoke-test flag.
WEEKLY_TESTS = {
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "distributed",
    ],
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "stress_test_many_tasks",
        "stress_test_dead_actors",
    ],
    "~/ray/release/horovod_tests/horovod_tests.yaml": [
        "horovod_test",
    ],
    # The two adjacent string literals below are concatenated into a single
    # dictionary key (one config file path).
    "~/ray/release/long_running_distributed_tests"
    "/long_running_distributed.yaml": [
        "pytorch_pbt_failure",
    ],
    # Full long running tests (1 day runtime)
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        "actor_deaths",
        "apex",
        "impala",
        "many_actor_tasks",
        "many_drivers",
        "many_ppo",
        "many_tasks",
        "many_tasks_serialized_ids",
        "node_failures",
        "pbt",
        # "serve",
        # "serve_failure",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "network_overhead",
        "long_running_large_checkpoints",
    ],
}

# Test spec for the "manual" suite (see SUITES); it is only built when
# RELEASE_TEST_SUITE is set to "manual". It includes the serve smoke tests
# that are commented out in NIGHTLY_TESTS.
MANUAL_TESTS = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        "learning_tests",
        "example_scripts_on_gpu_tests",
        "stress_tests",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("serve"),
        SmokeTest("serve_failure"),
    ]
}

# Lookup table from the RELEASE_TEST_SUITE environment variable value to the
# test spec that build_pipeline() turns into pipeline steps.
SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "nightly": NIGHTLY_TESTS,
    "weekly": WEEKLY_TESTS,
    "manual": MANUAL_TESTS,
}

# Base configuration shared by every generated pipeline step.
# build_pipeline() and alert_pipeline() deep-copy this dict for each step and
# then fill in "commands" and "label" (plus "retry" where requested), so the
# template itself is never mutated.
DEFAULT_STEP_TEMPLATE = {
    # Environment entries attached to each step.
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2"
    },
    "agents": {
        "queue": "runner_queue_branch"
    },
    "plugins": [{
        "docker#v3.8.0": {
            "image": "rayproject/ray",
            "propagate-environment": True
        }
    }],
    # Placeholder; replaced per step with the actual command list.
    "commands": []
}


def build_pipeline(steps):
    """Turn a test spec into a list of pipeline step configurations.

    The following environment variables are read:

    * ``RAY_BRANCH`` (default ``"master"``) and ``RAY_REPO`` (default the
      ray-project GitHub repo): repo/branch to test.
    * ``RAY_TEST_BRANCH`` / ``RAY_TEST_REPO``: repo/branch that is cloned
      to ``~/ray`` for the test configs and scripts; default to
      ``RAY_BRANCH`` / ``RAY_REPO``.
    * ``FILTER_FILE`` / ``FILTER_TEST``: when non-empty, only test files /
      test names containing the given substring are kept.

    Args:
        steps: Mapping of test config file path to a list of tests. Each
            test is either a plain test name or a ``ReleaseTest`` instance
            carrying ``smoke_test`` and ``retry`` settings.

    Returns:
        A list of step dicts. Each one is a deep copy of
        ``DEFAULT_STEP_TEMPLATE`` with ``commands`` and ``label`` set, and
        with a ``retry`` section when the test requests retries.
    """
    ray_branch = os.environ.get("RAY_BRANCH", "master")
    ray_repo = os.environ.get("RAY_REPO",
                              "https://github.com/ray-project/ray.git")

    # The test scripts default to the same repo/branch that is under test.
    ray_test_branch = os.environ.get("RAY_TEST_BRANCH", ray_branch)
    ray_test_repo = os.environ.get("RAY_TEST_REPO", ray_repo)

    filter_file = os.environ.get("FILTER_FILE", "")
    filter_test = os.environ.get("FILTER_TEST", "")

    # Fix: the "scripts:" header previously lacked its trailing newline, so
    # the RAY_TEST_REPO line was glued onto it.
    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO   = {ray_repo}\n"
        f" RAY_BRANCH = {ray_branch}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f" RAY_TEST_REPO   = {ray_test_repo}\n"
        f" RAY_TEST_BRANCH = {ray_test_branch}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {filter_file}\n"
        f" FILTER_TEST = {filter_test}\n\n")

    all_steps = []
    for test_file, test_names in steps.items():
        if filter_file and filter_file not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            # Works for both plain strings and ReleaseTest instances, since
            # ReleaseTest delegates ``in`` to its name.
            if filter_test and filter_test not in test_name:
                continue

            # Normalize plain names so the options below are always present.
            if isinstance(test_name, ReleaseTest):
                test = test_name
            else:
                test = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test}")

            # The branch name is passed as both the branch and the category.
            cmd = (f"python release/e2e.py "
                   f"--ray-branch {ray_branch} "
                   f"--category {ray_branch} "
                   f"--test-config {test_file} "
                   f"--test-name {test}")

            if test.smoke_test:
                logging.info("This test will run as a smoke test.")
                cmd += " --smoke-test"

            # Deep copy so per-step changes never leak into the template.
            step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

            if test.retry:
                logging.info(f"This test will be retried up to "
                             f"{test.retry} times.")
                step_conf["retry"] = {
                    "automatic": [{
                        "exit_status": "*",
                        "limit": test.retry
                    }]
                }

            step_conf["commands"] = [
                "pip install -q -r release/requirements.txt",
                "pip install -U boto3 botocore",
                f"git clone -b {ray_test_branch} {ray_test_repo} ~/ray",
                cmd,
            ]

            step_conf["label"] = (f"{test} ({ray_branch}) - "
                                  f"{ray_test_branch}/{test_base}")
            all_steps.append(step_conf)

    return all_steps


def alert_pipeline(stats: bool = False):
    """Build the one-step pipeline that runs the release alert script.

    Args:
        stats: When truthy, ``--stats`` is appended to the alert command.

    Returns:
        A single-element list containing the step configuration, a deep
        copy of ``DEFAULT_STEP_TEMPLATE`` with ``commands`` and ``label``
        filled in.
    """
    alert_cmd = "python release/alert.py" + (" --stats" if stats else "")

    alert_step = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
    alert_step.update(
        commands=[
            "pip install -q -r release/requirements.txt",
            "pip install -U boto3 botocore",
            alert_cmd,
        ],
        label=f"Send periodic alert (stats_only = {stats})",
    )
    return [alert_step]


if __name__ == "__main__":
    # RELEASE_ALERT set to "1" or "stats" selects the alert pipeline; any
    # other value builds the test pipeline for the suite named by
    # RELEASE_TEST_SUITE (default "nightly").
    alert_mode = os.environ.get("RELEASE_ALERT", "0")

    if alert_mode not in ("1", "stats"):
        suite_name = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        pipeline_steps = build_pipeline(SUITES[suite_name])
    else:
        pipeline_steps = alert_pipeline(alert_mode == "stats")

    # The generated pipeline is written to stdout as YAML.
    yaml.dump({"steps": pipeline_steps}, sys.stdout)
[release] move release testing end to end script to main ray repo (#17070) 2021-07-14 21:39:07 +02:00			`import copy`
			`import logging`
			`import os`
			`import sys`

			`import yaml`

			`# Env variables:`

			`# RAY_REPO Repo to use for finding the wheel`
			`# RAY_BRANCH Branch to find the wheel`
			`# RAY_TEST_REPO Repo to use for test scripts`
			`# RAY_TEST_BRANCH Branch for test scripts`
			`# FILTER_FILE File filter`
			`# FILTER_TEST Test name filter`
			`# RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly)`


			`class ReleaseTest:`
			`def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):`
			`self.name = name`
			`self.smoke_test = smoke_test`
			`self.retry = retry`

			`def __str__(self):`
			`return self.name`

			`def __repr__(self):`
			`return self.name`

			`def __contains__(self, item):`
			`return self.name.__contains__(item)`

			`def __iter__(self):`
			`return iter(self.name)`

			`def __len__(self):`
			`return len(self.name)`


			`class SmokeTest(ReleaseTest):`
			`def __init__(self, name: str, retry: int = 0):`
			`super(SmokeTest, self).__init__(`
			`name=name, smoke_test=True, retry=retry)`


			`CORE_NIGHTLY_TESTS = {`
			`"~/ray/release/nightly_tests/nightly_tests.yaml": [`
			`"shuffle_10gb",`
			`"shuffle_50gb",`
			`"shuffle_50gb_large_partition",`
			`"shuffle_100gb",`
			`"non_streaming_shuffle_100gb",`
			`"non_streaming_shuffle_50gb_large_partition",`
			`"non_streaming_shuffle_50gb",`
			`"dask_on_ray_10gb_sort",`
			`"dask_on_ray_100gb_sort",`
			`"dask_on_ray_large_scale_test_no_spilling",`
			`"dask_on_ray_large_scale_test_spilling",`
			`"stress_test_placement_group",`
			`"shuffle_1tb_1000_partition",`
			`"non_streaming_shuffle_1tb_1000_partition",`
			`"shuffle_1tb_5000_partitions",`
			`"non_streaming_shuffle_1tb_5000_partitions",`
			`"decision_tree_autoscaling",`
			`"autoscaling_shuffle_1tb_1000_partitions",`
			`SmokeTest("stress_test_many_tasks"),`
			`SmokeTest("stress_test_dead_actors"),`
			`],`
			`"~/ray/benchmarks/benchmark_tests.yaml": [`
			`"single_node",`
			`"object_store",`
			`],`
			`}`

			`NIGHTLY_TESTS = {`
			`# "~/ray/release/horovod_tests/horovod_tests.yaml": [`
			`# SmokeTest("horovod_test"),`
			`# ], # Should we enable this?`
			`"~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [`
			`"dask_xgboost_test",`
			`"modin_xgboost_test",`
			`"torch_tune_serve_test",`
			`],`
			`"~/ray/release/long_running_tests/long_running_tests.yaml": [`
			`SmokeTest("actor_deaths"),`
			`SmokeTest("apex"),`
			`SmokeTest("impala"),`
			`SmokeTest("many_actor_tasks"),`
			`SmokeTest("many_drivers"),`
			`SmokeTest("many_ppo"),`
			`SmokeTest("many_tasks"),`
			`SmokeTest("many_tasks_serialized_ids"),`
			`SmokeTest("node_failures"),`
			`SmokeTest("pbt"),`
			`# SmokeTest("serve"),`
			`# SmokeTest("serve_failure"),`
			`],`
			`"~/ray/release/microbenchmark/microbenchmark.yaml": [`
			`"microbenchmark",`
			`],`
			`"~/ray/release/sgd_tests/sgd_tests.yaml": [`
			`"sgd_gpu",`
			`],`
			`"~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [`
			`"bookkeeping_overhead",`
			`"durable_trainable",`
			`SmokeTest("long_running_large_checkpoints"),`
			`SmokeTest("network_overhead"),`
			`"result_throughput_cluster",`
			`"result_throughput_single_node",`
			`"xgboost_sweep",`
			`],`
			`"~/ray/release/xgboost_tests/xgboost_tests.yaml": [`
			`"train_small",`
			`"train_moderate",`
			`"train_gpu",`
			`"tune_small",`
			`"tune_4x32",`
			`"tune_32x4",`
			`"ft_small_elastic",`
			`"ft_small_non_elastic",`
			`"distributed_api_test",`
			`],`
			`}`

			`WEEKLY_TESTS = {`
			`"~/ray/benchmarks/benchmark_tests.yaml": [`
			`"distributed",`
			`],`
			`"~/ray/release/nightly_tests/nightly_tests.yaml": [`
			`"stress_test_many_tasks",`
			`"stress_test_dead_actors",`
			`],`
			`"~/ray/release/horovod_tests/horovod_tests.yaml": [`
			`"horovod_test",`
			`],`
			`"~/ray/release/long_running_distributed_tests"`
			`"/long_running_distributed.yaml": [`
			`"pytorch_pbt_failure",`
			`],`
			`# Full long running tests (1 day runtime)`
			`"~/ray/release/long_running_tests/long_running_tests.yaml": [`
			`"actor_deaths",`
			`"apex",`
			`"impala",`
			`"many_actor_tasks",`
			`"many_drivers",`
			`"many_ppo",`
			`"many_tasks",`
			`"many_tasks_serialized_ids",`
			`"node_failures",`
			`"pbt",`
			`# "serve",`
			`# "serve_failure",`
			`],`
			`"~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [`
			`"network_overhead",`
			`"long_running_large_checkpoints",`
			`],`
			`}`

			`MANUAL_TESTS = {`
			`"~/ray/release/rllib_tests/rllib_tests.yaml": [`
			`"learning_tests",`
			`"example_scripts_on_gpu_tests",`
			`"stress_tests",`
			`],`
			`"~/ray/release/long_running_tests/long_running_tests.yaml": [`
			`SmokeTest("serve"),`
			`SmokeTest("serve_failure"),`
			`]`
			`}`

			`SUITES = {`
			`"core-nightly": CORE_NIGHTLY_TESTS,`
			`"nightly": NIGHTLY_TESTS,`
			`"weekly": WEEKLY_TESTS,`
			`"manual": MANUAL_TESTS,`
			`}`

			`DEFAULT_STEP_TEMPLATE = {`
			`"env": {`
			`"ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",`
			`"ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",`
			`"RELEASE_AWS_BUCKET": "ray-release-automation-results",`
			`"RELEASE_AWS_LOCATION": "dev",`
			`"RELEASE_AWS_DB_NAME": "ray_ci",`
			`"RELEASE_AWS_DB_TABLE": "release_test_result",`
			`"AWS_REGION": "us-west-2"`
			`},`
			`"agents": {`
			`"queue": "runner_queue_branch"`
			`},`
			`"plugins": [{`
			`"docker#v3.8.0": {`
			`"image": "rayproject/ray",`
			`"propagate-environment": True`
			`}`
			`}],`
			`"commands": []`
			`}`


			`def build_pipeline(steps):`
			`all_steps = []`

			`RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")`
			`RAY_REPO = os.environ.get("RAY_REPO",`
			`"https://github.com/ray-project/ray.git")`

			`RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)`
			`RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)`

			`FILTER_FILE = os.environ.get("FILTER_FILE", "")`
			`FILTER_TEST = os.environ.get("FILTER_TEST", "")`

			`logging.info(`
			`f"Building pipeline \n"`
			`f"Ray repo/branch to test:\n"`
			`f" RAY_REPO = {RAY_REPO}\n"`
			`f" RAY_BRANCH = {RAY_BRANCH}\n\n"`
			`f"Ray repo/branch containing the test configurations and scripts:"`
			`f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"`
			`f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"`
			`f"Filtering for these tests:\n"`
			`f" FILTER_FILE = {FILTER_FILE}\n"`
			`f" FILTER_TEST = {FILTER_TEST}\n\n")`

			`for test_file, test_names in steps.items():`
			`if FILTER_FILE and FILTER_FILE not in test_file:`
			`continue`

			`test_base = os.path.basename(test_file)`
			`for test_name in test_names:`
			`if FILTER_TEST and FILTER_TEST not in test_name:`
			`continue`

			`if not isinstance(test_name, ReleaseTest):`
			`test_name = ReleaseTest(name=test_name)`

			`logging.info(f"Adding test: {test_base}/{test_name}")`

			`cmd = str(f"python release/e2e.py "`
			`f"--ray-branch {RAY_BRANCH} "`
			`f"--category {RAY_BRANCH} "`
			`f"--test-config {test_file} "`
			`f"--test-name {test_name}")`

			`if test_name.smoke_test:`
			`logging.info("This test will run as a smoke test.")`
			`cmd += " --smoke-test"`

			`step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)`

			`if test_name.retry:`
			`logging.info(f"This test will be retried up to "`
			`f"{test_name.retry} times.")`
			`step_conf["retry"] = {`
			`"automatic": [{`
			`"exit_status": "*",`
			`"limit": test_name.retry`
			`}]`
			`}`

			`step_conf["commands"] = [`
			`"pip install -q -r release/requirements.txt",`
			`"pip install -U boto3 botocore",`
			`f"git clone -b {RAY_TEST_BRANCH} {RAY_TEST_REPO} ~/ray",`
			`cmd,`
			`]`

			`step_conf["label"] = f"{test_name} ({RAY_BRANCH}) - " \`
			`f"{RAY_TEST_BRANCH}/{test_base}"`
			`all_steps.append(step_conf)`

			`return all_steps`


			`def alert_pipeline(stats: bool = False):`
			`step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)`

			`cmd = "python release/alert.py"`
			`if stats:`
			`cmd += " --stats"`

			`step_conf["commands"] = [`
			`"pip install -q -r release/requirements.txt",`
			`"pip install -U boto3 botocore",`
			`cmd,`
			`]`
			`step_conf["label"] = f"Send periodic alert (stats_only = {stats})"`
			`return [step_conf]`


			`if __name__ == "__main__":`
			`alert = os.environ.get("RELEASE_ALERT", "0")`

			`if alert in ["1", "stats"]:`
			`steps = alert_pipeline(alert == "stats")`
			`else:`
			`TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")`
			`PIPELINE_SPEC = SUITES[TEST_SUITE]`

			`steps = build_pipeline(PIPELINE_SPEC)`

			`yaml.dump({"steps": steps}, sys.stdout)`