[release] LightGBM release tests (#17043)

2025-03-05 10:01:43 -05:00 · 2021-07-14 09:38:55 +02:00 · 2021-07-14 09:38:55 +02:00 · cfc5806c2d
commit cfc5806c2d
parent 0f79ebbd75
26 changed files with 1199 additions and 0 deletions
--- a/python/ray/util/lightgbm/BUILD
+++ b/python/ray/util/lightgbm/BUILD
@ -0,0 +1,36 @@
+# --------------------------------------------------------------------
+# Tests from the python/ray/util/lightgbm directory.
+# Please keep these sorted alphabetically.
+# --------------------------------------------------------------------
+py_test(
+ name = "simple_example",
+ size = "small",
+ srcs = ["simple_example.py"],
+ deps = [":lgbm_lib"],
+ tags = ["exclusive"],
+)
+
+py_test(
+ name = "simple_tune",
+ size="small",
+ srcs = ["simple_tune.py"],
+ deps = [":lgbm_lib"],
+ tags = ["exclusive"]
+)
+
+py_test(
+    name = "test_client",
+    size = "small",
+    srcs = ["tests/test_client.py"],
+    deps = [":lgbm_lib"],
+    tags = ["exclusive", "client"]
+)
+
+# This is a dummy test dependency that causes the above tests to be
+# re-run if any of these files changes.
+py_library(
+ name = "lgbm_lib",
+ srcs = glob(["**/*.py"]),
+)
+
+
--- a/python/ray/util/lightgbm/init.py
+++ b/python/ray/util/lightgbm/init.py
@ -0,0 +1,18 @@
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Placeholders so the public names always exist, even when the optional
+# lightgbm_ray package cannot be imported.
+train = predict = RayParams = RayDMatrix = RayFileType = None
+
+try:
+    # Rebind the placeholders to the real implementations when available.
+    from lightgbm_ray import (RayDMatrix, RayFileType, RayParams, predict,
+                              train)
+except ImportError:
+    logger.info(
+        "lightgbm_ray is not installed. Please run "
+        "`pip install git+https://github.com/ray-project/lightgbm_ray`.")
+
+__all__ = ["train", "predict", "RayParams", "RayDMatrix", "RayFileType"]
--- a/python/ray/util/lightgbm/release_test_util.py
+++ b/python/ray/util/lightgbm/release_test_util.py
@ -0,0 +1,149 @@
+import glob
+import os
+import time
+
+import ray
+
+from lightgbm_ray import train, RayDMatrix, RayFileType, \
+    RayParams, RayDeviceQuantileDMatrix
+from lightgbm_ray.tune import _TuneLGBMRank0Mixin
+from lightgbm.callback import CallbackEnv
+
+# Remove OMP_NUM_THREADS from this process's environment if it is set.
+os.environ.pop("OMP_NUM_THREADS", None)
+
+
+@ray.remote
+class FailureState:
+    """Ray actor remembering which failure ids were already triggered."""
+
+    def __init__(self):
+        self._failed_ids = set()
+
+    def set_failed(self, id):
+        """Mark ``id`` as failed; return True only for the first caller."""
+        newly_failed = id not in self._failed_ids
+        if newly_failed:
+            self._failed_ids.add(id)
+        return newly_failed
+
+    def has_failed(self, id):
+        """Return whether ``id`` was already marked via ``set_failed``."""
+        return id in self._failed_ids
+
+
+class FailureInjection(_TuneLGBMRank0Mixin):
+    """Callback that kills its own process once at a given iteration.
+
+    Args:
+        id: Key under which this failure is recorded in ``state``.
+        state: Actor handle exposing ``has_failed`` and ``set_failed``.
+        ranks: Ranks that should fail; ``None`` means no rank fails.
+        iteration: Value of ``env.iteration`` at which the kill happens.
+    """
+
+    # NOTE(review): presumably read by LightGBM to sort callbacks - confirm.
+    order = 2
+
+    def __init__(self, id, state, ranks, iteration):
+        self._id = id
+        self._state = state
+        self._ranks = ranks or []
+        self._iteration = iteration
+
+    def __call__(self, env: CallbackEnv):
+        if env.iteration != self._iteration:
+            return
+
+        # Every worker that is not rank 0 is treated as rank 1 here.
+        rank = 0 if self.is_rank_0 else 1
+        if rank not in self._ranks:
+            return
+
+        if ray.get(self._state.has_failed.remote(self._id)):
+            # This failure was injected before; do not fail again.
+            return
+
+        claimed = ray.get(self._state.set_failed.remote(self._id))
+        if not claimed:
+            # Another rank is already about to fail
+            return
+
+        pid = os.getpid()
+        print(f"Killing process: {pid} for actor rank {rank}")
+        time.sleep(1)
+        os.kill(pid, 9)
+
+
+class TrackingCallback(_TuneLGBMRank0Mixin):
+    """Callback that prints the current iteration on the rank 0 worker."""
+
+    order = 1
+
+    def __call__(self, env: CallbackEnv):
+        if not self.is_rank_0:
+            return
+        print(f"[Rank 0] I am at iteration {env.iteration}")
+
+
+def train_ray(path,
+              num_workers,
+              num_boost_rounds,
+              num_files=0,
+              regression=False,
+              use_gpu=False,
+              ray_params=None,
+              lightgbm_params=None,
+              **kwargs):
+    """Train a LightGBM model on parquet data with ``lightgbm_ray``.
+
+    Args:
+        path: Location of the parquet data; ``~`` is expanded.
+        num_workers: Number of actors used to load the data and, unless
+            ``ray_params`` is given, to train.
+        num_boost_rounds: Passed to ``train`` as ``num_boost_round``.
+        num_files: If non-zero, train on exactly this many parquet files
+            found below ``path``; the sorted file list is repeated as often
+            as needed when fewer files exist.
+        regression: Use the regression objective instead of binary
+            classification.
+        use_gpu: Train with ``device="gpu"``; the device quantile matrix is
+            only used when ``cupy`` can be imported.
+        ray_params: Optional ``RayParams``; a default with 2 CPUs per actor
+            is built when omitted.
+        lightgbm_params: Optional dict merged over the default config.
+        **kwargs: Forwarded to ``train``.
+
+    Returns:
+        Tuple of the trained model, the ``additional_results`` dict passed
+        to ``train``, and the training time in seconds.
+
+    Raises:
+        ValueError: If ``path`` does not exist, or if ``num_files`` is set
+            but no parquet files are found below ``path``.
+    """
+    path = os.path.expanduser(path)
+    if not os.path.exists(path):
+        raise ValueError(f"Path does not exist: {path}")
+
+    if num_files:
+        files = sorted(glob.glob(f"{path}/**/*.parquet"))
+        if not files:
+            # Doubling an empty list never reaches num_files, so fail
+            # loudly instead of looping forever.
+            raise ValueError(f"No parquet files found in: {path}")
+        while num_files > len(files):
+            files = files + files
+        path = files[:num_files]
+
+    use_device_matrix = False
+    if use_gpu:
+        try:
+            import cupy  # noqa: F401
+            use_device_matrix = True
+        except ImportError:
+            use_device_matrix = False
+
+    # Both matrix types take the same arguments, so only the class differs.
+    matrix_cls = RayDeviceQuantileDMatrix if use_device_matrix \
+        else RayDMatrix
+    dtrain = matrix_cls(
+        path,
+        num_actors=num_workers,
+        label="labels",
+        ignore=["partition"],
+        filetype=RayFileType.PARQUET)
+
+    config = {"device": "cpu" if not use_gpu else "gpu"}
+
+    if not regression:
+        # Classification
+        config.update({
+            "objective": "binary",
+            "metric": ["binary_logloss", "binary_error"],
+        })
+    else:
+        # Regression
+        config.update({
+            "objective": "regression",
+            "metric": ["l2", "rmse"],
+        })
+
+    # Caller-supplied parameters override the defaults chosen above.
+    if lightgbm_params:
+        config.update(lightgbm_params)
+
+    start = time.time()
+    evals_result = {}
+    additional_results = {}
+    bst = train(
+        config,
+        dtrain,
+        evals_result=evals_result,
+        additional_results=additional_results,
+        num_boost_round=num_boost_rounds,
+        ray_params=ray_params or RayParams(
+            max_actor_restarts=2,
+            num_actors=num_workers,
+            cpus_per_actor=2,
+            gpus_per_actor=0 if not use_gpu else 1),
+        evals=[(dtrain, "train")],
+        **kwargs)
+    taken = time.time() - start
+    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")
+
+    out_file = os.path.expanduser(
+        "~/benchmark_{}.lgbm".format("cpu" if not use_gpu else "gpu"))
+    bst.booster_.save_model(out_file)
+
+    print("Final training error: {:.4f}".format(evals_result["train"][
+        "binary_error" if not regression else "rmse"][-1]))
+    return bst, additional_results, taken
--- a/python/ray/util/lightgbm/simple_example.py
+++ b/python/ray/util/lightgbm/simple_example.py
@ -0,0 +1,44 @@
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+
+from ray.util.lightgbm import RayDMatrix, RayParams, train
+
+
+# __lightgbm_begin__
+def main():
+    """Train a binary classifier on the breast cancer data and save it."""
+    # Load the dataset and hold out a quarter of it for evaluation
+    features, targets = datasets.load_breast_cancer(return_X_y=True)
+    x_train, x_test, y_train, y_test = train_test_split(
+        features, targets, test_size=0.25)
+
+    dtrain = RayDMatrix(x_train, y_train)
+    dtest = RayDMatrix(x_test, y_test)
+
+    # LightGBM parameters
+    params = {
+        "objective": "binary",
+        "metric": ["binary_logloss", "binary_error"],
+        "max_depth": 3,
+    }
+
+    # Filled by train() with the metrics of every evaluation set
+    history = {}
+
+    # Train the classifier
+    model = train(
+        params,
+        dtrain,
+        evals=[(dtest, "eval")],
+        evals_result=history,
+        ray_params=RayParams(max_actor_restarts=1, num_actors=1),
+        verbose_eval=False)
+
+    model.booster_.save_model("simple.lgbm")
+    final_error = history["eval"]["binary_error"][-1]
+    print("Final validation error: {:.4f}".format(final_error))
+
+
+# __lightgbm_end__
+
+if __name__ == "__main__":
+    main()
--- a/python/ray/util/lightgbm/simple_tune.py
+++ b/python/ray/util/lightgbm/simple_tune.py
@ -0,0 +1,95 @@
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+
+from ray.util.lightgbm import RayDMatrix, RayParams, train
+
+# __train_begin__
+num_cpus_per_actor = 2
+num_actors = 1
+
+
+def train_model(config):
+    """Train one model with ``config`` and save it as ``model.lgbm``."""
+    # Load the dataset and hold out a quarter of it for evaluation
+    features, targets = datasets.load_breast_cancer(return_X_y=True)
+    x_train, x_test, y_train, y_test = train_test_split(
+        features, targets, test_size=0.25)
+
+    dtrain = RayDMatrix(x_train, y_train)
+    dtest = RayDMatrix(x_test, y_test)
+
+    history = {}
+    model = train(
+        params=config,
+        dtrain=dtrain,
+        evals=[(dtest, "eval")],
+        evals_result=history,
+        verbose_eval=False,
+        ray_params=RayParams(
+            num_actors=num_actors, cpus_per_actor=num_cpus_per_actor))
+    model.booster_.save_model("model.lgbm")
+
+
+# __train_end__
+
+
+# __load_begin__
+def load_best_model(best_logdir):
+    """Return the booster stored as ``model.lgbm`` inside ``best_logdir``."""
+    import os
+    import lightgbm as lgbm
+
+    model_path = os.path.join(best_logdir, "model.lgbm")
+    return lgbm.Booster(model_file=model_path)
+
+
+# __load_end__
+
+
+def main():
+    """Run a small Ray Tune search over ``train_model`` and report the best.
+
+    The ``# __tune_*__`` marker comments delimit snippets of this function;
+    keep them in place when editing.
+    """
+    # __tune_begin__
+    from ray import tune
+
+    # Set config
+    config = {
+        "objective": "binary",
+        "metric": ["binary_logloss", "binary_error"],
+        "eta": tune.loguniform(1e-4, 1e-1),
+        "subsample": tune.uniform(0.5, 1.0),
+        "max_depth": tune.randint(1, 9)
+    }
+    # __tune_end__
+
+    # __tune_run_begin__
+    # NOTE(review): "eval-binary_error" is presumably reported to Tune by
+    # lightgbm_ray for the "eval" set used in train_model - confirm.
+    analysis = tune.run(
+        train_model,
+        config=config,
+        metric="eval-binary_error",
+        mode="min",
+        num_samples=4,
+        # One CPU for the trial itself plus the CPUs of its training actors.
+        resources_per_trial={
+            "cpu": 1,
+            "extra_cpu": num_actors * num_cpus_per_actor
+        })
+
+    # Load in the best performing model.
+    best_bst = load_best_model(analysis.best_logdir)
+
+    # Use the following code block instead if using Ray Client.
+    # import ray
+    # if ray.util.client.ray.is_connected():
+    #     # If using Ray Client best_logdir is a directory on the server.
+    #     # So we want to make sure we wrap model loading in a task.
+    #     remote_load_fn = ray.remote(load_best_model)
+    #     best_bst = ray.get(remote_load_fn.remote(analysis.best_logdir))
+
+    # Do something with the best model.
+    _ = best_bst
+
+    # The tuned metric is an error rate, so accuracy is its complement.
+    accuracy = 1. - analysis.best_result["eval-binary_error"]
+    print(f"Best model parameters: {analysis.best_config}")
+    print(f"Best model total accuracy: {accuracy:.4f}")
+    # __tune_run_end__
+
+
+if __name__ == "__main__":
+    main()
--- a/python/ray/util/lightgbm/tests/init.py
+++ b/python/ray/util/lightgbm/tests/init.py
--- a/python/ray/util/lightgbm/tests/test_client.py
+++ b/python/ray/util/lightgbm/tests/test_client.py
@ -0,0 +1,28 @@
+import pytest
+import sys
+
+import ray
+from ray.util.client.ray_client_helpers import ray_start_client_server
+
+
+@pytest.fixture
+def start_client_server():
+    """Yield the client from ``ray_start_client_server`` for one test."""
+    with ray_start_client_server() as ray_client:
+        yield ray_client
+
+
+def test_simple_example(start_client_server):
+    """Run the simple training example while connected via Ray Client."""
+    assert ray.util.client.ray.is_connected()
+    from ray.util.lightgbm.simple_example import main as run_example
+    run_example()
+
+
+def test_simple_tune(start_client_server):
+    """Run the simple Tune example while connected via Ray Client."""
+    assert ray.util.client.ray.is_connected()
+    from ray.util.lightgbm.simple_tune import main as run_tune_example
+    run_tune_example()
+
+
+if __name__ == "__main__":
+    # pytest is already imported at module level; no local re-import needed.
+    sys.exit(pytest.main(["-v", __file__]))
--- a/release/lightgbm_tests/README.rst
+++ b/release/lightgbm_tests/README.rst
@ -0,0 +1,24 @@
+LightGBM on Ray tests
+=====================
+
+This directory contains various LightGBM on Ray release tests.
+
+You should run these tests with the `releaser <https://github.com/ray-project/releaser>`_ tool.
+
+Overview
+--------
+There are four kinds of tests:
+
+1. ``distributed_api_test`` - checks general API functionality and should finish very quickly (< 1 minute)
+2. ``train_*`` - checks single trial training on different setups.
+3. ``tune_*`` - checks multi trial training via Ray Tune.
+4. ``ft_*`` - checks fault tolerance.
+
+Generally the releaser tool will run all tests in parallel, but if you do
+it sequentially, be sure to do it in the order above. If ``train_*`` fails,
+``tune_*`` will fail, too.
+
+Acceptance criteria
+-------------------
+These tests are considered passing when they throw no error at the end of
+the output log.
--- a/release/lightgbm_tests/app_config.yaml
+++ b/release/lightgbm_tests/app_config.yaml
@ -0,0 +1,24 @@
+base_image: "anyscale/ray-ml:pinned-nightly-py37"
+env_vars: {}
+debian_packages:
+  - curl
+
+python:
+  pip_packages:
+    - pytest
+    - lightgbm_ray
+    - petastorm
+    - tblib
+  conda_packages: []
+
+post_build_cmds:
+  - pip uninstall -y numpy ray || true
+  - sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
+  - pip3 install numpy || true
+  - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
+  - pip3 install -U lightgbm_ray petastorm  # Install latest releases
+  - sudo mkdir -p /data || true
+  - sudo chown ray:1000 /data || true
+  - rm -rf /data/classification.parquet || true
+  - curl -o create_test_data.py https://raw.githubusercontent.com/ray-project/ray/releases/1.3.0/release/xgboost_tests/create_test_data.py  # XGBoost is intended
+  - python ./create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
--- a/release/lightgbm_tests/cluster_cpu_moderate.yaml
+++ b/release/lightgbm_tests/cluster_cpu_moderate.yaml
@ -0,0 +1,38 @@
+cluster_name: ray-lightgbm-release-cpu-moderate
+
+max_workers: 32
+
+upscaling_speed: 32
+
+idle_timeout_minutes: 15
+
+docker:
+    image: anyscale/ray:latest
+    container_name: ray_container
+    pull_before_run: true
+
+provider:
+    type: aws
+    region: us-west-2
+    availability_zone: us-west-2a
+    cache_stopped_nodes: false
+
+available_node_types:
+    cpu_4_ondemand:
+        node_config:
+            InstanceType: m5.xlarge
+        resources: {"CPU": 4}
+        min_workers: 31
+        max_workers: 31
+
+auth:
+    ssh_user: ubuntu
+
+head_node_type: cpu_4_ondemand
+worker_default_node_type: cpu_4_ondemand
+
+file_mounts: {
+    "~/lightgbm_tests": "."
+}
+
+file_mounts_sync_continuously: false
--- a/release/lightgbm_tests/cluster_cpu_small.yaml
+++ b/release/lightgbm_tests/cluster_cpu_small.yaml
@ -0,0 +1,38 @@
+cluster_name: ray-lightgbm-release-cpu-small
+
+max_workers: 4
+
+upscaling_speed: 32
+
+idle_timeout_minutes: 15
+
+docker:
+    image: anyscale/ray:latest
+    container_name: ray_container
+    pull_before_run: true
+
+provider:
+    type: aws
+    region: us-west-2
+    availability_zone: us-west-2a
+    cache_stopped_nodes: false
+
+available_node_types:
+    cpu_4_ondemand:
+        node_config:
+            InstanceType: m5.xlarge
+        resources: {"CPU": 4}
+        min_workers: 3
+        max_workers: 3
+
+auth:
+    ssh_user: ubuntu
+
+head_node_type: cpu_4_ondemand
+worker_default_node_type: cpu_4_ondemand
+
+file_mounts: {
+    "~/lightgbm_tests": "."
+}
+
+file_mounts_sync_continuously: false
--- a/release/lightgbm_tests/create_test_data.py
+++ b/release/lightgbm_tests/create_test_data.py
@ -0,0 +1,58 @@
+import argparse
+import numpy as np
+import os
+
+from xgboost_ray.tests.utils import create_parquet
+
+if __name__ == "__main__":
+    # Remove OMP_NUM_THREADS from the environment if it is set.
+    os.environ.pop("OMP_NUM_THREADS", None)
+
+    parser = argparse.ArgumentParser(description="Create fake data.")
+    # nargs="?" makes the positional optional; without it argparse ignores
+    # the default and always requires the argument.
+    parser.add_argument(
+        "filename",
+        type=str,
+        nargs="?",
+        default="/data/parted.parquet/",
+        help="output path of the parquet dataset")
+    parser.add_argument(
+        "-r",
+        "--num-rows",
+        required=False,
+        type=int,
+        # Integer default so the value matches type=int.
+        default=int(1e8),
+        help="num rows")
+    parser.add_argument(
+        "-p",
+        "--num-partitions",
+        required=False,
+        type=int,
+        default=100,
+        help="num partitions")
+    parser.add_argument(
+        "-c",
+        "--num-cols",
+        required=False,
+        type=int,
+        default=4,
+        help="num columns (features)")
+    parser.add_argument(
+        "-C",
+        "--num-classes",
+        required=False,
+        type=int,
+        default=2,
+        help="num classes")
+    parser.add_argument(
+        "-s",
+        "--seed",
+        required=False,
+        type=int,
+        default=1234,
+        help="random seed")
+
+    args = parser.parse_args()
+
+    # Seed numpy's global generator before generating the data.
+    np.random.seed(args.seed)
+    create_parquet(
+        args.filename,
+        num_rows=args.num_rows,
+        num_partitions=args.num_partitions,
+        num_features=args.num_cols,
+        num_classes=args.num_classes)
--- a/release/lightgbm_tests/lightgbm_tests.yaml
+++ b/release/lightgbm_tests/lightgbm_tests.yaml
@ -0,0 +1,83 @@
+- name: train_small
+  cluster:
+    app_config: app_config.yaml
+    compute_template: tpl_cpu_small.yaml
+
+  run:
+    use_connect: True
+    timeout: 600
+    prepare: python wait_cluster.py 4 600
+    script: python workloads/train_small.py
+
+- name: train_moderate
+  cluster:
+    app_config: app_config.yaml
+    compute_template: tpl_cpu_moderate.yaml
+
+  run:
+    timeout: 600
+    prepare: python wait_cluster.py 32 600
+    script: python workloads/train_moderate.py
+
+- name: train_gpu
+  cluster:
+    app_config: app_config_gpu.yaml
+    compute_template: tpl_gpu_small.yaml
+
+  run:
+    timeout: 600
+    prepare: python wait_cluster.py 5 600
+    script: python workloads/train_gpu.py
+
+- name: distributed_api_test
+  cluster:
+    app_config: app_config.yaml
+    compute_template: tpl_cpu_small.yaml
+    results: 
+
+  run:
+    timeout: 600
+    prepare: python wait_cluster.py 4 600
+    script: python workloads/distributed_api_test.py
+    results: ""
+
+- name: ft_small_non_elastic
+  cluster:
+    app_config: app_config.yaml
+    compute_template: tpl_cpu_small.yaml
+
+  run:
+    timeout: 900
+    prepare: python wait_cluster.py 4 600
+    script: python workloads/ft_small_non_elastic.py
+    results: ""
+
+- name: tune_small
+  cluster:
+    app_config: app_config.yaml
+    compute_template: tpl_cpu_small.yaml
+
+  run:
+    timeout: 600
+    prepare: python wait_cluster.py 4 600
+    script: python workloads/tune_small.py
+
+- name: tune_32x4
+  cluster:
+    app_config: app_config.yaml
+    compute_template: tpl_cpu_moderate.yaml
+
+  run:
+    timeout: 900
+    prepare: python wait_cluster.py 32 600
+    script: python workloads/tune_32x4.py
+
+- name: tune_4x32
+  cluster:
+    app_config: app_config.yaml
+    compute_template: tpl_cpu_moderate.yaml
+
+  run:
+    timeout: 900
+    prepare: python wait_cluster.py 32 600
+    script: python workloads/tune_4x32.py
--- a/release/lightgbm_tests/requirements.txt
+++ b/release/lightgbm_tests/requirements.txt
@ -0,0 +1,3 @@
+ray[tune]
+lightgbm_ray
+xgboost_ray
--- a/release/lightgbm_tests/setup_lightgbm.sh
+++ b/release/lightgbm_tests/setup_lightgbm.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+# Installs pytest and lightgbm_ray, then generates the parquet dataset at
+# /data/classification.parquet.
+
+pip install pytest
+# Uninstall any existing lightgbm_ray installation; failure is ignored
+pip uninstall -y lightgbm_ray || true
+
+# Install lightgbm_ray; LIGHTGBM_RAY_PACKAGE overrides the package spec
+pip install -U "${LIGHTGBM_RAY_PACKAGE:-lightgbm_ray}"
+
+# Create test dataset
+# The directory setup steps are best effort: failures are ignored.
+sudo mkdir -p /data || true
+sudo chown ray:1000 /data || true
+# Remove any previously generated dataset
+rm -rf /data/classification.parquet || true
+# A failed copy is only reported, it does not stop the script
+cp -R /tmp/ray_tmp_mount/lightgbm_tests ~/lightgbm_tests || echo "Copy failed"
+python ~/lightgbm_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
--- a/release/lightgbm_tests/tpl_cpu_moderate.yaml
+++ b/release/lightgbm_tests/tpl_cpu_moderate.yaml
@ -0,0 +1,15 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+max_workers: 31
+
+head_node_type:
+    name: head_node
+    instance_type: m5.xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: m5.xlarge
+      min_workers: 31
+      max_workers: 31
+      use_spot: false
--- a/release/lightgbm_tests/tpl_cpu_small.yaml
+++ b/release/lightgbm_tests/tpl_cpu_small.yaml
@ -0,0 +1,15 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+max_workers: 3
+
+head_node_type:
+    name: head_node
+    instance_type: m5.xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: m5.xlarge
+      min_workers: 3
+      max_workers: 3
+      use_spot: false
--- a/release/lightgbm_tests/wait_cluster.py
+++ b/release/lightgbm_tests/wait_cluster.py
@ -0,0 +1,49 @@
+import argparse
+import time
+
+import ray
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "num_nodes",
+    type=int,
+    help="Wait for this number of nodes (includes head)")
+
+parser.add_argument(
+    "max_time_s", type=int, help="Wait for this number of seconds")
+
+parser.add_argument(
+    "--feedback_interval_s",
+    type=int,
+    default=10,
+    help="Print a progress message every this number of seconds")
+
+args = parser.parse_args()
+
+# Connect only after the arguments were parsed, so --help and usage errors
+# do not require a running cluster.
+ray.init(address="auto")
+
+curr_nodes = 0
+start = time.time()
+next_feedback = start
+max_time = start + args.max_time_s
+while True:
+    # ray.nodes() also lists nodes that have died; count only live ones.
+    # Entries without an "Alive" field are treated as alive.
+    curr_nodes = sum(1 for node in ray.nodes() if node.get("Alive", True))
+    if curr_nodes >= args.num_nodes:
+        break
+
+    now = time.time()
+
+    if now >= max_time:
+        raise RuntimeError(
+            f"Maximum wait time reached, but only "
+            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting.")
+
+    if now >= next_feedback:
+        passed = now - start
+        print(f"Waiting for more nodes to come up: "
+              f"{curr_nodes}/{args.num_nodes} "
+              f"({passed:.0f} seconds passed)")
+        next_feedback = now + args.feedback_interval_s
+
+    time.sleep(5)
+
+passed = time.time() - start
+print(f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
+      f"{passed:.0f} seconds")
--- a/release/lightgbm_tests/workloads/distributed_api_test.py
+++ b/release/lightgbm_tests/workloads/distributed_api_test.py
@ -0,0 +1,26 @@
+"""Distributed LightGBM API test
+
+This test runs unit tests on a distributed cluster. This will confirm that
+LightGBM API features like custom metrics/objectives work with remote
+trainables.
+
+Test owner: Yard1 (primary), krfricke
+
+Acceptance criteria: Unit tests should pass (requires pytest).
+"""
+
+import ray
+
+from lightgbm_ray.tests.test_lightgbm_api import LightGBMAPITest
+
+
+class LightGBMDistributedAPITest(LightGBMAPITest):
+    """Runs the ``LightGBMAPITest`` cases against an existing Ray cluster."""
+
+    def _init_ray(self):
+        # Connect to the running cluster unless Ray is already initialized.
+        if ray.is_initialized():
+            return
+        ray.init(address="auto")
+
+
+if __name__ == "__main__":
+    import sys
+    import pytest
+
+    # Select only the distributed subclass defined in this file.
+    test_target = f"{__file__}::LightGBMDistributedAPITest"
+    sys.exit(pytest.main(["-v", test_target]))
--- a/release/lightgbm_tests/workloads/ft_small_non_elastic.py
+++ b/release/lightgbm_tests/workloads/ft_small_non_elastic.py
@ -0,0 +1,51 @@
+"""Fault tolerance test (small cluster, non-elastic training)
+
+In this run, two training actors will die after some time. It is expected that
+in both cases lightgbm_ray stops training, restarts the dead actors, and
+continues training with all four actors.
+
+Test owner: Yard1 (primary), krfricke
+
+Acceptance criteria: Should run through and report final results. Intermediate
+output should show that training halts when an actor dies and continues only
+when all four actors are available again. The test will fail if fault
+tolerance did not work correctly.
+
+Notes: This test seems to be somewhat flaky. This might be due to
+race conditions in handling dead actors. This is likely a problem of
+the lightgbm_ray implementation and not of this test.
+"""
+import ray
+
+from lightgbm_ray import RayParams
+
+
+from ray.util.lightgbm.release_test_util import train_ray, \
+    FailureState, FailureInjection, TrackingCallback
+
+if __name__ == "__main__":
+    ray.init(address="auto")
+
+    # Shared actor that lets each injected failure fire only once.
+    failure_state = FailureState.remote()
+
+    ray_params = RayParams(
+        max_actor_restarts=2, num_actors=4, cpus_per_actor=4, gpus_per_actor=0)
+
+    # Rank 1 fails at iteration 14, rank 0 at iteration 34.
+    first_failure = FailureInjection(
+        id="first_fail", state=failure_state, ranks=[1], iteration=14)
+    second_failure = FailureInjection(
+        id="second_fail", state=failure_state, ranks=[0], iteration=34)
+
+    _, additional_results, _ = train_ray(
+        path="/data/classification.parquet",
+        num_workers=4,
+        num_boost_rounds=100,
+        num_files=200,
+        regression=False,
+        use_gpu=False,
+        ray_params=ray_params,
+        lightgbm_params=None,
+        callbacks=[TrackingCallback(), first_failure, second_failure])
+
+    print("PASSED.")
--- a/release/lightgbm_tests/workloads/train_moderate.py
+++ b/release/lightgbm_tests/workloads/train_moderate.py
@ -0,0 +1,49 @@
+"""Moderate cluster training
+
+This training run will start 32 workers on 32 nodes (including head node).
+
+Test owner: Yard1 (primary), krfricke
+
+Acceptance criteria: Should run through and report final results.
+"""
+import json
+import os
+import time
+
+import ray
+from lightgbm_ray import RayParams
+
+from ray.util.lightgbm.release_test_util import train_ray
+
+if __name__ == "__main__":
+    ray.init(address="auto")
+
+    params = RayParams(
+        elastic_training=False,
+        max_actor_restarts=2,
+        num_actors=32,
+        cpus_per_actor=4,
+        gpus_per_actor=0)
+
+    # Time the whole train_ray call.
+    started_at = time.time()
+    train_ray(
+        path="/data/classification.parquet",
+        num_workers=32,
+        num_boost_rounds=100,
+        num_files=128,
+        regression=False,
+        use_gpu=False,
+        ray_params=params,
+        lightgbm_params=None,
+    )
+    elapsed = time.time() - started_at
+
+    # Write the timing to TEST_OUTPUT_JSON, or to a /tmp file if it is unset.
+    out_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/train_moderate.json")
+    with open(out_path, "wt") as out_file:
+        json.dump({"time_taken": elapsed}, out_file)
+
+    print("PASSED.")
--- a/release/lightgbm_tests/workloads/train_small.py
+++ b/release/lightgbm_tests/workloads/train_small.py
@ -0,0 +1,67 @@
+"""Small cluster training
+
+This training run will start 4 workers on 4 nodes (including head node).
+
+Test owner: Yard1 (primary), krfricke
+
+Acceptance criteria: Should run through and report final results.
+"""
+import json
+import os
+import time
+
+import ray
+from ray.test_utils import wait_for_num_nodes
+from lightgbm_ray import RayParams
+
+from ray.util.lightgbm.release_test_util import train_ray
+
+if __name__ == "__main__":
+    # Default to "" so an unset RAY_ADDRESS falls through to the local
+    # cluster instead of raising AttributeError on None.startswith().
+    addr = os.environ.get("RAY_ADDRESS", "")
+    job_name = os.environ.get("RAY_JOB_NAME", "train_small")
+    if addr.startswith("anyscale://"):
+        ray.client(address=addr).job_name(job_name).connect()
+    else:
+        ray.init(address="auto")
+
+    # Wait for the head node plus RAY_RELEASE_MIN_WORKERS workers.
+    wait_for_num_nodes(
+        int(os.environ.get("RAY_RELEASE_MIN_WORKERS", 0)) + 1, 600)
+
+    # NOTE(review): these lookups raise KeyError when the variables are
+    # unset, which makes the default used for test_output_json below
+    # unreachable - confirm which behavior is intended.
+    output = os.environ["TEST_OUTPUT_JSON"]
+    state = os.environ["TEST_STATE_JSON"]
+    ray_params = RayParams(
+        elastic_training=False,
+        max_actor_restarts=2,
+        num_actors=4,
+        cpus_per_actor=4,
+        gpus_per_actor=0)
+
+    start = time.time()
+
+    @ray.remote
+    def train():
+        # Set the same variables in the process that runs this task.
+        os.environ["TEST_OUTPUT_JSON"] = output
+        os.environ["TEST_STATE_JSON"] = state
+        train_ray(
+            path="/data/classification.parquet",
+            num_workers=4,
+            num_boost_rounds=100,
+            num_files=25,
+            regression=False,
+            use_gpu=False,
+            ray_params=ray_params,
+            lightgbm_params=None,
+        )
+
+    ray.get(train.remote())
+    taken = time.time() - start
+
+    result = {
+        "time_taken": taken,
+    }
+    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
+                                      "/tmp/train_small.json")
+    with open(test_output_json, "wt") as f:
+        json.dump(result, f)
+
+    print("PASSED.")
--- a/release/lightgbm_tests/workloads/train_small_connect.py
+++ b/release/lightgbm_tests/workloads/train_small_connect.py
@ -0,0 +1,58 @@
+"""Small cluster training
+
+This training run will start 4 workers on 4 nodes (including head node).
+
+Test owner: Yard1 (primary), krfricke
+
+Acceptance criteria: Should run through and report final results.
+"""
+import json
+import os
+import time
+
+import ray
+from lightgbm_ray import RayParams
+
+from ray.util.lightgbm.release_test_util import train_ray
+
+if __name__ == "__main__":
+    # Default to "" so an unset RAY_ADDRESS falls through to the local
+    # cluster instead of raising AttributeError on None.startswith().
+    addr = os.environ.get("RAY_ADDRESS", "")
+    job_name = os.environ.get("RAY_JOB_NAME", "train_small")
+    if addr.startswith("anyscale://"):
+        ray.client(address=addr).job_name(job_name).connect()
+    else:
+        ray.init(address="auto")
+
+    ray_params = RayParams(
+        elastic_training=False,
+        max_actor_restarts=2,
+        num_actors=4,
+        cpus_per_actor=4,
+        gpus_per_actor=0)
+
+    @ray.remote
+    def train():
+        # Runs as a Ray task rather than in this driver process.
+        train_ray(
+            path="/data/classification.parquet",
+            num_workers=4,
+            num_boost_rounds=100,
+            num_files=25,
+            regression=False,
+            use_gpu=False,
+            ray_params=ray_params,
+            lightgbm_params=None,
+        )
+
+    start = time.time()
+    ray.get(train.remote())
+    taken = time.time() - start
+
+    result = {
+        "time_taken": taken,
+    }
+    # Write the timing to TEST_OUTPUT_JSON, or to a /tmp file if it is unset.
+    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
+                                      "/tmp/train_small.json")
+    with open(test_output_json, "wt") as f:
+        json.dump(result, f)
+
+    print("PASSED.")
--- a/release/lightgbm_tests/workloads/tune_32x4.py
+++ b/release/lightgbm_tests/workloads/tune_32x4.py
@ -0,0 +1,72 @@
+"""Moderate Ray Tune run (32 trials, 4 actors).
+
+This training run will start 32 Ray Tune trials, each starting 4 actors.
+The cluster comprises 32 nodes.
+
+Test owner: Yard1 (primary), krfricke
+
+Acceptance criteria: Should run through and report final results, as well
+as the Ray Tune results table. No trials should error. All trials should
+run in parallel.
+"""
+from collections import Counter
+import json
+import os
+import time
+
+import ray
+from ray import tune
+
+from lightgbm_ray import RayParams
+
+from ray.util.lightgbm.release_test_util import train_ray
+
+
+def train_wrapper(config, ray_params):
+    """Tune trainable: run ``train_ray`` with ``config`` as LightGBM params."""
+    settings = {
+        "path": "/data/classification.parquet",
+        "num_workers": 4,
+        "num_boost_rounds": 100,
+        "num_files": 64,
+        "regression": False,
+        "use_gpu": False,
+        "ray_params": ray_params,
+        "lightgbm_params": config,
+    }
+    train_ray(**settings)
+
+
+if __name__ == "__main__":
+    # Hyperparameters sampled by Ray Tune for every trial.
+    search_space = {
+        "eta": tune.loguniform(1e-4, 1e-1),
+        "subsample": tune.uniform(0.5, 1.0),
+        "max_depth": tune.randint(1, 9)
+    }
+
+    ray.init(address="auto")
+
+    ray_params = RayParams(
+        elastic_training=False,
+        max_actor_restarts=2,
+        num_actors=4,
+        cpus_per_actor=1,
+        gpus_per_actor=0)
+    trainable = tune.with_parameters(train_wrapper, ray_params=ray_params)
+
+    started_at = time.time()
+    analysis = tune.run(
+        trainable,
+        config=search_space,
+        num_samples=32,
+        resources_per_trial=ray_params.get_tune_resources())
+    elapsed = time.time() - started_at
+
+    # Number of trials per final status.
+    status_counts = Counter(trial.status for trial in analysis.trials)
+    result = {"time_taken": elapsed, "trial_states": dict(status_counts)}
+
+    out_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/tune_32x4.json")
+    with open(out_path, "wt") as out_file:
+        json.dump(result, out_file)
+
+    print("PASSED.")
--- a/release/lightgbm_tests/workloads/tune_4x32.py
+++ b/release/lightgbm_tests/workloads/tune_4x32.py
@ -0,0 +1,72 @@
+"""Moderate Ray Tune run (4 trials, 32 actors).
+
+This training run will start 4 Ray Tune trials, each starting 32 actors.
+The cluster comprises 32 nodes.
+
+Test owner: Yard1 (primary), krfricke
+
+Acceptance criteria: Should run through and report final results, as well
+as the Ray Tune results table. No trials should error. All trials should
+run in parallel.
+"""
+from collections import Counter
+import json
+import os
+import time
+
+import ray
+from ray import tune
+
+from lightgbm_ray import RayParams
+
+from ray.util.lightgbm.release_test_util import train_ray
+
+
+def train_wrapper(config, ray_params):
+    """Tune trainable: run ``train_ray`` with ``config`` as LightGBM params."""
+    settings = {
+        "path": "/data/classification.parquet",
+        "num_workers": 32,
+        "num_boost_rounds": 100,
+        "num_files": 128,
+        "regression": False,
+        "use_gpu": False,
+        "ray_params": ray_params,
+        "lightgbm_params": config,
+    }
+    train_ray(**settings)
+
+
+if __name__ == "__main__":
+    # Hyperparameters sampled by Ray Tune for every trial.
+    search_space = {
+        "eta": tune.loguniform(1e-4, 1e-1),
+        "subsample": tune.uniform(0.5, 1.0),
+        "max_depth": tune.randint(1, 9)
+    }
+
+    ray.init(address="auto")
+
+    ray_params = RayParams(
+        elastic_training=False,
+        max_actor_restarts=2,
+        num_actors=32,
+        cpus_per_actor=1,
+        gpus_per_actor=0)
+    trainable = tune.with_parameters(train_wrapper, ray_params=ray_params)
+
+    started_at = time.time()
+    analysis = tune.run(
+        trainable,
+        config=search_space,
+        num_samples=4,
+        resources_per_trial=ray_params.get_tune_resources())
+    elapsed = time.time() - started_at
+
+    # Number of trials per final status.
+    status_counts = Counter(trial.status for trial in analysis.trials)
+    result = {"time_taken": elapsed, "trial_states": dict(status_counts)}
+
+    out_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/tune_4x32.json")
+    with open(out_path, "wt") as out_file:
+        json.dump(result, out_file)
+
+    print("PASSED.")
--- a/release/lightgbm_tests/workloads/tune_small.py
+++ b/release/lightgbm_tests/workloads/tune_small.py
@ -0,0 +1,72 @@
+"""Small Ray Tune run (4 trials, 4 actors).
+
+This training run will start 4 Ray Tune Trials, each starting 4 actors.
+The cluster comprises 4 nodes.
+
+Test owner: Yard1 (primary), krfricke
+
+Acceptance criteria: Should run through and report final results, as well
+as the Ray Tune results table. No trials should error. All trials should
+run in parallel.
+"""
+from collections import Counter
+import json
+import os
+import time
+
+import ray
+from ray import tune
+
+from lightgbm_ray import RayParams
+
+from ray.util.lightgbm.release_test_util import train_ray
+
+
+def train_wrapper(config, ray_params):
+    """Tune trainable: run ``train_ray`` with ``config`` as LightGBM params."""
+    settings = {
+        "path": "/data/classification.parquet",
+        "num_workers": 4,
+        "num_boost_rounds": 100,
+        "num_files": 25,
+        "regression": False,
+        "use_gpu": False,
+        "ray_params": ray_params,
+        "lightgbm_params": config,
+    }
+    train_ray(**settings)
+
+
+if __name__ == "__main__":
+    # Hyperparameters sampled by Ray Tune for every trial.
+    search_space = {
+        "eta": tune.loguniform(1e-4, 1e-1),
+        "subsample": tune.uniform(0.5, 1.0),
+        "max_depth": tune.randint(1, 9)
+    }
+
+    ray.init(address="auto")
+
+    ray_params = RayParams(
+        elastic_training=False,
+        max_actor_restarts=2,
+        num_actors=4,
+        cpus_per_actor=1,
+        gpus_per_actor=0)
+    trainable = tune.with_parameters(train_wrapper, ray_params=ray_params)
+
+    started_at = time.time()
+    analysis = tune.run(
+        trainable,
+        config=search_space,
+        num_samples=4,
+        resources_per_trial=ray_params.get_tune_resources())
+    elapsed = time.time() - started_at
+
+    # Number of trials per final status.
+    status_counts = Counter(trial.status for trial in analysis.trials)
+    result = {"time_taken": elapsed, "trial_states": dict(status_counts)}
+
+    out_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/tune_small.json")
+    with open(out_path, "wt") as out_file:
+        json.dump(result, out_file)
+
+    print("PASSED.")