Mirror of https://github.com/vale981/ray, synced 2025-03-05 18:11:42 -05:00
[xgboost] Update XGBoost release test configs (#13941)
* Update XGBoost release test configs
* Use GPU containers
* Fix elastic check
* Use spot instances for GPU
* Add debugging output
* Fix success check, failure checking, outputs, sync behavior
* Update release checklist, rename mounts
Parent: 58d7398246
Commit: a0f73cf3f7
18 changed files with 280 additions and 74 deletions
@@ -79,8 +79,8 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d
 - [ ] tune_small
 - [ ] tune_4x32
 - [ ] tune_32x4
-- [ ] ft_small_non_elastic (flaky!)
-- [ ] ft_small_elastic (flaky!)
+- [ ] ft_small_non_elastic
+- [ ] ft_small_elastic

 ## Final Steps

 - [ ] Wheels uploaded to Test PyPI
@@ -12,20 +12,12 @@ There are four kinds of tests:
 1. ``distributed_api_test`` - checks general API functionality and should finish very quickly (< 1 minute)
 2. ``train_*`` - checks single trial training on different setups.
 3. ``tune_*`` - checks multi trial training via Ray Tune.
-4. ``ft_*`` - checks fault tolerance. **These tests are currently flaky**
+4. ``ft_*`` - checks fault tolerance.

 Generally the releaser tool will run all tests in parallel, but if you do
 it sequentially, be sure to do it in the order above. If ``train_*`` fails,
 ``tune_*`` will fail, too.

-Flaky fault tolerance tests
----------------------------
-The fault tolerance tests are currently flaky. In some runs, more nodes die
-than expected, causing the test to fail. In other cases, the re-scheduled
-actors become available too soon after crashing, causing the assertions to
-fail. Please consider re-running the test a couple of times or contact the
-test owner with outputs from the tests for further questions.
-
 Acceptance criteria
 -------------------
 These tests are considered passing when they throw no error at the end of
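The ordering requirement above is worth making concrete. Below is a rough sketch of a sequential run, assuming each test is a standalone script under ~/xgboost_tests named after its checklist entry (the names and the invocation are placeholders; the releaser tool normally drives the suite, usually in parallel):

import os
import subprocess
import sys

# Placeholder names; the real suite uses the checklist entries
# (tune_small, tune_4x32, tune_32x4, ft_small_non_elastic, ft_small_elastic, ...).
ORDERED_TESTS = [
    "distributed_api_test",   # fast API sanity check first
    "train_small",            # train_* before tune_*: if train fails, tune fails too
    "tune_small",
    "ft_small_non_elastic",   # fault tolerance tests last
    "ft_small_elastic",
]

for name in ORDERED_TESTS:
    script = os.path.expanduser(f"~/xgboost_tests/{name}.py")
    result = subprocess.run([sys.executable, script], check=False)
    if result.returncode != 0:
        print(f"{name} failed; skipping the remaining tests")
        break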
@@ -1,7 +1,8 @@
 cluster_name: ray-xgboost-release-cpu-moderate

-min_workers: 31
-max_workers: 31
+max_workers: 32

+upscaling_speed: 32

 idle_timeout_minutes: 15
@@ -16,20 +17,25 @@ provider:
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 31
        max_workers: 31

auth:
    ssh_user: ubuntu

head_node:
    # 64 CPUs
    InstanceType: m5.xlarge
head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

worker_nodes:
    # 64 CPUs
    InstanceType: m5.xlarge
file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - pip install pytest xgboost_ray
    - sudo mkdir -p /data || true
    - sudo chown ray:1000 /data || true
    - rm -rf /data/classification.parquet || true
    - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
@@ -1,7 +1,8 @@
 cluster_name: ray-xgboost-release-cpu-small

-min_workers: 3
-max_workers: 3
+max_workers: 4

+upscaling_speed: 32

 idle_timeout_minutes: 15
@@ -16,20 +17,25 @@ provider:
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 3
        max_workers: 3

auth:
    ssh_user: ubuntu

head_node:
    # 64 CPUs
    InstanceType: m5.xlarge
head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

worker_nodes:
    # 64 CPUs
    InstanceType: m5.xlarge
file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - pip install pytest xgboost_ray
    - sudo mkdir -p /data || true
    - sudo chown ray:1000 /data || true
    - rm -rf /data/classification.parquet || true
    - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
@@ -1,7 +1,8 @@
 cluster_name: ray-xgboost-release-gpu-small

-min_workers: 4
-max_workers: 4
+max_workers: 5

+upscaling_speed: 32

 idle_timeout_minutes: 15
@@ -16,20 +17,33 @@ provider:
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 0
        max_workers: 0
    gpu_1_spot:
        node_config:
            InstanceType: p2.xlarge
            InstanceMarketOptions:
                MarketType: spot
        resources: {"CPU": 4, "GPU": 1}
        min_workers: 4
        max_workers: 4

auth:
    ssh_user: ubuntu

head_node:
    # 64 CPUs
    InstanceType: m5.xlarge
head_node_type: cpu_4_ondemand
worker_default_node_type: gpu_1_spot

worker_nodes:
    # 64 CPUs
    InstanceType: p2.xlarge
file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - pip install pytest xgboost_ray
    - sudo mkdir -p /data || true
    - sudo chown ray:1000 /data || true
    - rm -rf /data/classification.parquet || true
    - python ~/release-automation-xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
release/xgboost_tests/oss_cluster_cpu_moderate.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@
cluster_name: ray-xgboost-release-cpu-moderate

max_workers: 32

upscaling_speed: 32

idle_timeout_minutes: 15

docker:
    image: rayproject/ray:latest
    container_name: ray_container
    pull_before_run: true

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 31
        max_workers: 31

auth:
    ssh_user: ubuntu

head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
release/xgboost_tests/oss_cluster_cpu_small.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@
cluster_name: ray-xgboost-release-cpu-small

max_workers: 4

upscaling_speed: 32

idle_timeout_minutes: 15

docker:
    image: rayproject/ray:latest
    container_name: ray_container
    pull_before_run: true

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 3
        max_workers: 3

auth:
    ssh_user: ubuntu

head_node_type: cpu_4_ondemand
worker_default_node_type: cpu_4_ondemand

file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
release/xgboost_tests/oss_cluster_gpu_small.yaml (new file, 49 lines)
@@ -0,0 +1,49 @@
cluster_name: ray-xgboost-release-gpu-small

max_workers: 5

upscaling_speed: 32

idle_timeout_minutes: 15

docker:
    image: rayproject/ray:latest-gpu
    container_name: ray_container
    pull_before_run: true

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a
    cache_stopped_nodes: false

available_node_types:
    cpu_4_ondemand:
        node_config:
            InstanceType: m5.xlarge
        resources: {"CPU": 4}
        min_workers: 0
        max_workers: 0
    gpu_1_spot:
        node_config:
            InstanceType: p2.xlarge
            InstanceMarketOptions:
                MarketType: spot
        resources: {"CPU": 4, "GPU": 1}
        min_workers: 4
        max_workers: 4

auth:
    ssh_user: ubuntu

head_node_type: cpu_4_ondemand
worker_default_node_type: gpu_1_spot

file_mounts: {
    "~/xgboost_tests": "."
}

file_mounts_sync_continuously: false

setup_commands:
    - /bin/bash ~/xgboost_tests/setup_xgboost.sh
release/xgboost_tests/setup_xgboost.sh (new executable file, 8 lines)
@@ -0,0 +1,8 @@
#!/bin/bash

pip install pytest xgboost_ray
sudo mkdir -p /data || true
sudo chown ray:1000 /data || true
rm -rf /data/classification.parquet || true
cp -R /tmp/ray_tmp_mount/xgboost_tests ~/xgboost_tests || echo "Copy failed"
python ~/xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
@@ -19,6 +19,7 @@ def train_ray(path,
               ray_params=None,
               xgboost_params=None,
               **kwargs):
+    path = os.path.expanduser(path)
     if not os.path.exists(path):
         raise ValueError(f"Path does not exist: {path}")
@@ -88,7 +89,10 @@ def train_ray(path,
     taken = time.time() - start
     print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

-    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
+    out_file = os.path.expanduser(
+        "~/benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
+    bst.save_model(out_file)
+
     print("Final training error: {:.4f}".format(
         evals_result["train"]["error"][-1]))
     return bst, additional_results, taken
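The hunk above switches the benchmark model from a relative file name to an absolute path under the home directory, so the artifact is easy to locate regardless of the working directory inside the container. A minimal sketch of the same pattern (the helper name here is illustrative, not part of the diff):

import os

import xgboost as xgb


def save_benchmark_model(bst: xgb.Booster, use_gpu: bool) -> str:
    # Expand "~" so the model lands in the home directory, independent of cwd.
    out_file = os.path.expanduser(
        "~/benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    bst.save_model(out_file)
    return out_file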
@@ -16,9 +16,13 @@ Notes: This test seems to be somewhat flaky. This might be due to
 race conditions in handling dead actors. This is likely a problem of
 the xgboost_ray implementation and not of this test.
 """
+import warnings
+from unittest.mock import patch
+
 import ray

 from xgboost_ray import RayParams
+from xgboost_ray.main import _train as unmocked_train

 from _train import train_ray
 from ft_small_non_elastic import FailureState, FailureInjection, \
@@ -26,6 +30,8 @@ from ft_small_non_elastic import FailureState, FailureInjection, \

 if __name__ == "__main__":
     ray.init(address="auto")
+    from xgboost_ray.main import logger
+    logger.setLevel(10)

     failure_state = FailureState.remote()
@@ -37,25 +43,48 @@ if __name__ == "__main__":
         cpus_per_actor=4,
         gpus_per_actor=0)

-    _, additional_results, _ = train_ray(
-        path="/data/classification.parquet",
-        num_workers=4,
-        num_boost_rounds=100,
-        num_files=200,
-        regression=False,
-        use_gpu=False,
-        ray_params=ray_params,
-        xgboost_params=None,
-        callbacks=[
-            TrackingCallback(),
-            FailureInjection(
-                id="first_fail", state=failure_state, ranks=[2], iteration=14),
-            FailureInjection(
-                id="second_fail", state=failure_state, ranks=[0], iteration=34)
-        ])
+    world_sizes = []
+    start_actors = []
+
+    def _mock_train(*args, _training_state, **kwargs):
+        world_sizes.append(len([a for a in _training_state.actors if a]))
+        start_actors.append(len(_training_state.failed_actor_ranks))
+
+        return unmocked_train(*args, _training_state=_training_state, **kwargs)
+
+    with patch("xgboost_ray.main._train") as mocked:
+        mocked.side_effect = _mock_train
+        _, additional_results, _ = train_ray(
+            path="/data/classification.parquet",
+            num_workers=4,
+            num_boost_rounds=100,
+            num_files=200,
+            regression=False,
+            use_gpu=False,
+            ray_params=ray_params,
+            xgboost_params=None,
+            callbacks=[
+                TrackingCallback(),
+                FailureInjection(
+                    id="first_fail",
+                    state=failure_state,
+                    ranks=[2],
+                    iteration=14),
+                FailureInjection(
+                    id="second_fail",
+                    state=failure_state,
+                    ranks=[0],
+                    iteration=34)
+            ])

     actor_1_world_size = set(additional_results["callback_returns"][1])
-    assert 3 in actor_1_world_size, \
-        "No training with only 3 actors observed, but this was elastic " \
-        "training. Please check if additional actors died (e.g. via " \
-        "node failure), run test again, and report to test owner otherwise."
+
+    if 3 not in actor_1_world_size and 3 not in world_sizes and \
+            1 not in world_sizes:
+        warnings.warn(
+            "No training with only 3 actors observed, but this was elastic "
+            "training. Please check the output to see if data loading was "
+            "too fast so that the training actors were re-integrated directly "
+            "after restarting.")

     print("PASSED.")
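The elastic check above patches xgboost_ray.main._train with a side effect that records the number of live actors at every (re)start of training and then delegates to the unmocked implementation. Below is a minimal, self-contained sketch of that record-then-delegate pattern, using a stand-in function instead of the real xgboost_ray internals (the stand-in's signature and the dictionary-based training state are assumptions for illustration):

from unittest.mock import patch

world_sizes = []  # live actor count observed at each (re)start of training


def _train(data, *, _training_state):
    """Stand-in for xgboost_ray.main._train; the real signature differs."""
    return sum(1 for a in _training_state["actors"] if a)


unmocked_train = _train  # keep a handle on the original before patching


def _mock_train(*args, _training_state, **kwargs):
    # Record how many actors are alive, then delegate to the real function.
    world_sizes.append(len([a for a in _training_state["actors"] if a]))
    return unmocked_train(*args, _training_state=_training_state, **kwargs)


if __name__ == "__main__":
    state = {"actors": ["a0", "a1", None, "a3"]}  # one of four actors has died
    with patch(f"{__name__}._train") as mocked:
        mocked.side_effect = _mock_train
        _train(["dummy_shard"], _training_state=state)
    assert world_sizes == [3]  # training ran with only three live actors
    print(world_sizes)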
@@ -61,8 +61,8 @@ class FailureInjection(TrainingCallback):
         if epoch == self._iteration:
             rank = get_actor_rank()
             if rank in self._ranks:
-                if not ray.get(self._state.has_failed.remote(id)):
-                    success = ray.get(self._state.set_failed.remote(id))
+                if not ray.get(self._state.has_failed.remote(self._id)):
+                    success = ray.get(self._state.set_failed.remote(self._id))
                     if not success:
                         # Another rank is already about to fail
                         return
@@ -74,7 +74,9 @@ class FailureInjection(TrainingCallback):


 class TrackingCallback(TrainingCallback):
-    def after_iteration(self, model, epoch, evals_log):
+    def before_iteration(self, model, epoch, evals_log):
+        if get_actor_rank() == 3:
+            print(f"[Rank {get_actor_rank()}] I am at iteration {epoch}")
         put_queue(get_world_size())
@@ -111,3 +113,5 @@ if __name__ == "__main__":
     assert len(actor_1_world_size) == 1 and 4 in actor_1_world_size, \
         "Training with fewer than 4 actors observed, but this was " \
         "non-elastic training. Please report to test owner."
+
+    print("PASSED.")
@@ -44,3 +44,5 @@ if __name__ == "__main__":
         ray_params=ray_params,
         xgboost_params=None,
     )
+
+    print("PASSED.")
@@ -31,3 +31,5 @@ if __name__ == "__main__":
         ray_params=ray_params,
         xgboost_params=None,
     )
+
+    print("PASSED.")
@@ -31,3 +31,5 @@ if __name__ == "__main__":
         ray_params=ray_params,
         xgboost_params=None,
     )
+
+    print("PASSED.")
@@ -54,3 +54,5 @@ if __name__ == "__main__":
            "cpu": 1,
            "extra_cpu": 3
        })
+
+    print("PASSED.")
@@ -54,3 +54,5 @@ if __name__ == "__main__":
            "cpu": 1,
            "extra_cpu": 31
        })
+
+    print("PASSED.")
@@ -54,3 +54,5 @@ if __name__ == "__main__":
            "cpu": 1,
            "extra_cpu": 3
        })
+
+    print("PASSED.")