Mirror of https://github.com/vale981/ray, synced 2025-03-05 18:11:42 -05:00
[xgboost] Update XGBoost release test configs (#13941)
* Update XGBoost release test configs
* Use GPU containers
* Fix elastic check
* Use spot instances for GPU
* Add debugging output
* Fix success check, failure checking, outputs, sync behavior
* Update release checklist, rename mounts
Parent: 58d7398246
Commit: a0f73cf3f7
18 changed files with 280 additions and 74 deletions
@@ -79,8 +79,8 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d
 - [ ] tune_small
 - [ ] tune_4x32
 - [ ] tune_32x4
-- [ ] ft_small_non_elastic (flaky!)
-- [ ] ft_small_elastic (flaky!)
+- [ ] ft_small_non_elastic
+- [ ] ft_small_elastic

 ## Final Steps

 - [ ] Wheels uploaded to Test PyPI
@@ -12,20 +12,12 @@ There are four kinds of tests:
 1. ``distributed_api_test`` - checks general API functionality and should finish very quickly (< 1 minute)
 2. ``train_*`` - checks single trial training on different setups.
 3. ``tune_*`` - checks multi trial training via Ray Tune.
-4. ``ft_*`` - checks fault tolerance. **These tests are currently flaky**
+4. ``ft_*`` - checks fault tolerance.

 Generally the releaser tool will run all tests in parallel, but if you do
 it sequentially, be sure to do it in the order above. If ``train_*`` fails,
 ``tune_*`` will fail, too.

-Flaky fault tolerance tests
----------------------------
-The fault tolerance tests are currently flaky. In some runs, more nodes die
-than expected, causing the test to fail. In other cases, the re-scheduled
-actors become available too soon after crashing, causing the assertions to
-fail. Please consider re-running the test a couple of times or contact the
-test owner with outputs from the tests for further questions.
-
 Acceptance criteria
 -------------------
 These tests are considered passing when they throw no error at the end of
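The ordering requirement above is worth making concrete. Below is a rough sketch of a sequential run, assuming each test is a standalone script under ~/xgboost_tests named after its checklist entry (the names and the invocation are placeholders; the releaser tool normally drives the suite, usually in parallel):

import os
import subprocess
import sys

# Placeholder names; the real suite uses the checklist entries
# (tune_small, tune_4x32, tune_32x4, ft_small_non_elastic, ft_small_elastic, ...).
ORDERED_TESTS = [
    "distributed_api_test",   # fast API sanity check first
    "train_small",            # train_* before tune_*: if train fails, tune fails too
    "tune_small",
    "ft_small_non_elastic",   # fault tolerance tests last
    "ft_small_elastic",
]

for name in ORDERED_TESTS:
    script = os.path.expanduser(f"~/xgboost_tests/{name}.py")
    result = subprocess.run([sys.executable, script], check=False)
    if result.returncode != 0:
        print(f"{name} failed; skipping the remaining tests")
        break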
@@ -1,7 +1,8 @@
 cluster_name: ray-xgboost-release-cpu-moderate

-min_workers: 31
-max_workers: 31
+max_workers: 32

+upscaling_speed: 32

 idle_timeout_minutes: 15
@@ -16,20 +17,25 @@ provider:
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 31
        max_workers: 31

auth:
    ssh_user: ubuntu

head_node:
    # 64 CPUs
    InstanceType: m5.xlarge
head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

worker_nodes:
    # 64 CPUs
    InstanceType: m5.xlarge
file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - pip install pytest xgboost_ray
    - sudo mkdir -p /data || true
    - sudo chown ray:1000 /data || true
    - rm -rf /data/classification.parquet || true
    - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
@@ -1,7 +1,8 @@
 cluster_name: ray-xgboost-release-cpu-small

-min_workers: 3
-max_workers: 3
+max_workers: 4

+upscaling_speed: 32

 idle_timeout_minutes: 15
@@ -16,20 +17,25 @@ provider:
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 3
        max_workers: 3

auth:
    ssh_user: ubuntu

head_node:
    # 64 CPUs
    InstanceType: m5.xlarge
head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

worker_nodes:
    # 64 CPUs
    InstanceType: m5.xlarge
file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - pip install pytest xgboost_ray
    - sudo mkdir -p /data || true
    - sudo chown ray:1000 /data || true
    - rm -rf /data/classification.parquet || true
    - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
@@ -1,7 +1,8 @@
 cluster_name: ray-xgboost-release-gpu-small

-min_workers: 4
-max_workers: 4
+max_workers: 5

+upscaling_speed: 32

 idle_timeout_minutes: 15
@@ -16,20 +17,33 @@ provider:
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 0
        max_workers: 0
    gpu_1_spot:
        node_config:
            InstanceType: p2.xlarge
            InstanceMarketOptions:
                MarketType: spot
        resources: {"CPU": 4, "GPU": 1}
        min_workers: 4
        max_workers: 4

auth:
    ssh_user: ubuntu

head_node:
    # 64 CPUs
    InstanceType: m5.xlarge
head_node_type: cpu_4_ondemand
worker_default_node_type: gpu_1_spot

worker_nodes:
    # 64 CPUs
    InstanceType: p2.xlarge
file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - pip install pytest xgboost_ray
    - sudo mkdir -p /data || true
    - sudo chown ray:1000 /data || true
    - rm -rf /data/classification.parquet || true
    - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
release/xgboost_tests/oss_cluster_cpu_moderate.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@
cluster_name: ray-xgboost-release-cpu-moderate

max_workers: 32

upscaling_speed: 32

idle_timeout_minutes: 15

docker:
    image: rayproject/ray:latest
    container_name: ray_container
    pull_before_run: true

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 31
        max_workers: 31

auth:
    ssh_user: ubuntu

head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
release/xgboost_tests/oss_cluster_cpu_small.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@
cluster_name: ray-xgboost-release-cpu-small

max_workers: 4

upscaling_speed: 32

idle_timeout_minutes: 15

docker:
    image: rayproject/ray:latest
    container_name: ray_container
    pull_before_run: true

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 3
        max_workers: 3

auth:
    ssh_user: ubuntu

head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
release/xgboost_tests/oss_cluster_gpu_small.yaml (new file, 49 lines)
@@ -0,0 +1,49 @@
cluster_name: ray-xgboost-release-gpu-small

max_workers: 5

upscaling_speed: 32

idle_timeout_minutes: 15

docker:
    image: rayproject/ray:latest-gpu
    container_name: ray_container
    pull_before_run: true

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 0
        max_workers: 0
    gpu_1_spot:
        node_config:
            InstanceType: p2.xlarge
            InstanceMarketOptions:
                MarketType: spot
        resources: {"CPU": 4, "GPU": 1}
        min_workers: 4
        max_workers: 4

auth:
    ssh_user: ubuntu

head_node_type: cpu_4_ondemand
worker_default_node_type: gpu_1_spot

file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
release/xgboost_tests/setup_xgboost.sh (new executable file, 8 lines)
@@ -0,0 +1,8 @@
#!/bin/bash

pip install pytest xgboost_ray
sudo mkdir -p /data || true
sudo chown ray:1000 /data || true
rm -rf /data/classification.parquet || true
cp -R /tmp/ray_tmp_mount/xgboost_tests ~/xgboost_tests || echo "Copy failed"
python ~/xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
@@ -19,6 +19,7 @@ def train_ray(path,
               ray_params=None,
               xgboost_params=None,
               **kwargs):
+    path = os.path.expanduser(path)
     if not os.path.exists(path):
         raise ValueError(f"Path does not exist: {path}")
@@ -88,7 +89,10 @@ def train_ray(path,
     taken = time.time() - start
     print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

-    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
+    out_file = os.path.expanduser(
+        "~/benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
+    bst.save_model(out_file)
+
     print("Final training error: {:.4f}".format(
         evals_result["train"]["error"][-1]))
     return bst, additional_results, taken
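The hunk above switches the benchmark model from a relative file name to an absolute path under the home directory, so the artifact is easy to locate regardless of the working directory inside the container. A minimal sketch of the same pattern (the helper name here is illustrative, not part of the diff):

import os

import xgboost as xgb


def save_benchmark_model(bst: xgb.Booster, use_gpu: bool) -> str:
    # Expand "~" so the model lands in the home directory, independent of cwd.
    out_file = os.path.expanduser(
        "~/benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    bst.save_model(out_file)
    return out_file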
@@ -16,9 +16,13 @@ Notes: This test seems to be somewhat flaky. This might be due to
 race conditions in handling dead actors. This is likely a problem of
 the xgboost_ray implementation and not of this test.
 """
+import warnings
+from unittest.mock import patch
+
 import ray

 from xgboost_ray import RayParams
+from xgboost_ray.main import _train as unmocked_train

 from _train import train_ray
 from ft_small_non_elastic import FailureState, FailureInjection, \
@@ -26,6 +30,8 @@ from ft_small_non_elastic import FailureState, FailureInjection, \

 if __name__ == "__main__":
     ray.init(address="auto")
+    from xgboost_ray.main import logger
+    logger.setLevel(10)

     failure_state = FailureState.remote()
@@ -37,25 +43,48 @@ if __name__ == "__main__":
         cpus_per_actor=4,
         gpus_per_actor=0)

-    _, additional_results, _ = train_ray(
-        path="/data/classification.parquet",
-        num_workers=4,
-        num_boost_rounds=100,
-        num_files=200,
-        regression=False,
-        use_gpu=False,
-        ray_params=ray_params,
-        xgboost_params=None,
-        callbacks=[
-            TrackingCallback(),
-            FailureInjection(
-                id="first_fail", state=failure_state, ranks=[2], iteration=14),
-            FailureInjection(
-                id="second_fail", state=failure_state, ranks=[0], iteration=34)
-        ])
+    world_sizes = []
+    start_actors = []
+
+    def _mock_train(*args, _training_state, **kwargs):
+        world_sizes.append(len([a for a in _training_state.actors if a]))
+        start_actors.append(len(_training_state.failed_actor_ranks))
+
+        return unmocked_train(*args, _training_state=_training_state, **kwargs)
+
+    with patch("xgboost_ray.main._train") as mocked:
+        mocked.side_effect = _mock_train
+        _, additional_results, _ = train_ray(
+            path="/data/classification.parquet",
+            num_workers=4,
+            num_boost_rounds=100,
+            num_files=200,
+            regression=False,
+            use_gpu=False,
+            ray_params=ray_params,
+            xgboost_params=None,
+            callbacks=[
+                TrackingCallback(),
+                FailureInjection(
+                    id="first_fail",
+                    state=failure_state,
+                    ranks=[2],
+                    iteration=14),
+                FailureInjection(
+                    id="second_fail",
+                    state=failure_state,
+                    ranks=[0],
+                    iteration=34)
+            ])

     actor_1_world_size = set(additional_results["callback_returns"][1])
-    assert 3 in actor_1_world_size, \
-        "No training with only 3 actors observed, but this was elastic " \
-        "training. Please check if additional actors died (e.g. via " \
-        "node failure), run test again, and report to test owner otherwise."
+
+    if 3 not in actor_1_world_size and 3 not in world_sizes and \
+            1 not in world_sizes:
+        warnings.warn(
+            "No training with only 3 actors observed, but this was elastic "
+            "training. Please check the output to see if data loading was "
+            "too fast so that the training actors were re-integrated directly "
+            "after restarting.")

     print("PASSED.")
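The elastic check above patches xgboost_ray.main._train with a side effect that records the number of live actors at every (re)start of training and then delegates to the unmocked implementation. Below is a minimal, self-contained sketch of that record-then-delegate pattern, using a stand-in function instead of the real xgboost_ray internals (the stand-in's signature and the dictionary-based training state are assumptions for illustration):

from unittest.mock import patch

world_sizes = []  # live actor count observed at each (re)start of training


def _train(data, *, _training_state):
    """Stand-in for xgboost_ray.main._train; the real signature differs."""
    return sum(1 for a in _training_state["actors"] if a)


unmocked_train = _train  # keep a handle on the original before patching


def _mock_train(*args, _training_state, **kwargs):
    # Record how many actors are alive, then delegate to the real function.
    world_sizes.append(len([a for a in _training_state["actors"] if a]))
    return unmocked_train(*args, _training_state=_training_state, **kwargs)


if __name__ == "__main__":
    state = {"actors": ["a0", "a1", None, "a3"]}  # one of four actors has died
    with patch(f"{__name__}._train") as mocked:
        mocked.side_effect = _mock_train
        _train(["dummy_shard"], _training_state=state)
    assert world_sizes == [3]  # training ran with only three live actors
    print(world_sizes)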
@@ -61,8 +61,8 @@ class FailureInjection(TrainingCallback):
         if epoch == self._iteration:
             rank = get_actor_rank()
             if rank in self._ranks:
-                if not ray.get(self._state.has_failed.remote(id)):
-                    success = ray.get(self._state.set_failed.remote(id))
+                if not ray.get(self._state.has_failed.remote(self._id)):
+                    success = ray.get(self._state.set_failed.remote(self._id))
                     if not success:
                         # Another rank is already about to fail
                         return
@@ -74,7 +74,9 @@ class FailureInjection(TrainingCallback):


 class TrackingCallback(TrainingCallback):
-    def after_iteration(self, model, epoch, evals_log):
+    def before_iteration(self, model, epoch, evals_log):
+        if get_actor_rank() == 3:
+            print(f"[Rank {get_actor_rank()}] I am at iteration {epoch}")
         put_queue(get_world_size())
@@ -111,3 +113,5 @@ if __name__ == "__main__":
     assert len(actor_1_world_size) == 1 and 4 in actor_1_world_size, \
         "Training with fewer than 4 actors observed, but this was " \
         "non-elastic training. Please report to test owner."
+
+    print("PASSED.")
@@ -44,3 +44,5 @@ if __name__ == "__main__":
         ray_params=ray_params,
         xgboost_params=None,
     )
+
+    print("PASSED.")
@@ -31,3 +31,5 @@ if __name__ == "__main__":
         ray_params=ray_params,
         xgboost_params=None,
     )
+
+    print("PASSED.")
@@ -31,3 +31,5 @@ if __name__ == "__main__":
         ray_params=ray_params,
         xgboost_params=None,
     )
+
+    print("PASSED.")
@@ -54,3 +54,5 @@ if __name__ == "__main__":
            "cpu": 1,
            "extra_cpu": 3
        })
+
+    print("PASSED.")
@@ -54,3 +54,5 @@ if __name__ == "__main__":
            "cpu": 1,
            "extra_cpu": 31
        })
+
+    print("PASSED.")
@@ -54,3 +54,5 @@ if __name__ == "__main__":
            "cpu": 1,
            "extra_cpu": 3
        })
+
+    print("PASSED.")