mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
[release] LightGBM release tests (#17043)
This commit is contained in:
parent
0f79ebbd75
commit
cfc5806c2d
26 changed files with 1199 additions and 0 deletions
36
python/ray/util/lightgbm/BUILD
Normal file
36
python/ray/util/lightgbm/BUILD
Normal file
|
@ -0,0 +1,36 @@
|
|||
# --------------------------------------------------------------------
|
||||
# Tests from the python/ray/util/lightgbm directory.
|
||||
# Please keep these sorted alphabetically.
|
||||
# --------------------------------------------------------------------
|
||||
py_test(
|
||||
name = "simple_example",
|
||||
size = "small",
|
||||
srcs = ["simple_example.py"],
|
||||
deps = [":lgbm_lib"],
|
||||
tags = ["exclusive"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "simple_tune",
|
||||
size="small",
|
||||
srcs = ["simple_tune.py"],
|
||||
deps = [":lgbm_lib"],
|
||||
tags = ["exclusive"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_client",
|
||||
size = "small",
|
||||
srcs = ["tests/test_client.py"],
|
||||
deps = [":lgbm_lib"],
|
||||
tags = ["exclusive", "client"]
|
||||
)
|
||||
|
||||
# This is a dummy test dependency that causes the above tests to be
|
||||
# re-run if any of these files changes.
|
||||
py_library(
|
||||
name = "lgbm_lib",
|
||||
srcs = glob(["**/*.py"]),
|
||||
)
|
||||
|
||||
|
18
python/ray/util/lightgbm/__init__.py
Normal file
18
python/ray/util/lightgbm/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
import logging
|
||||
|
||||
# Names re-exported from the optional ``lightgbm_ray`` package.
__all__ = ["train", "predict", "RayParams", "RayDMatrix", "RayFileType"]

logger = logging.getLogger(__name__)

# Bind every exported name up front so that importing this module always
# succeeds, even when ``lightgbm_ray`` is not installed.
train = predict = RayParams = RayDMatrix = RayFileType = None

try:
    from lightgbm_ray import (train, predict, RayParams, RayDMatrix,
                              RayFileType)
except ImportError:
    logger.info(
        "lightgbm_ray is not installed. Please run "
        "`pip install git+https://github.com/ray-project/lightgbm_ray`.")
|
149
python/ray/util/lightgbm/release_test_util.py
Normal file
149
python/ray/util/lightgbm/release_test_util.py
Normal file
|
@ -0,0 +1,149 @@
|
|||
import glob
|
||||
import os
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
from lightgbm_ray import train, RayDMatrix, RayFileType, \
|
||||
RayParams, RayDeviceQuantileDMatrix
|
||||
from lightgbm_ray.tune import _TuneLGBMRank0Mixin
|
||||
from lightgbm.callback import CallbackEnv
|
||||
|
||||
# Drop OMP_NUM_THREADS from this process' environment if it is set.
os.environ.pop("OMP_NUM_THREADS", None)
|
||||
|
||||
|
||||
@ray.remote
class FailureState:
    """Remote bookkeeping object that remembers which failure ids fired."""

    def __init__(self):
        self._failed_ids = set()

    def set_failed(self, id):
        """Record ``id`` as failed.

        Returns:
            True if this call recorded the id, False if it was already
            recorded by an earlier call.
        """
        newly_recorded = id not in self._failed_ids
        if newly_recorded:
            self._failed_ids.add(id)
        return newly_recorded

    def has_failed(self, id):
        """Return whether ``id`` has already been recorded as failed."""
        return id in self._failed_ids
|
||||
|
||||
|
||||
class FailureInjection(_TuneLGBMRank0Mixin):
    """Training callback that kills its own process at a given iteration.

    The kill happens at most once per ``id``: the shared ``state`` actor is
    consulted first, so only the first caller to claim the id dies.

    Args:
        id: Identifier of this failure, used as the key in ``state``.
        state: Handle to a ``FailureState`` actor.
        ranks: Ranks that should fail; ``None`` means no rank.
        iteration: Boosting iteration at which the failure is triggered.
    """

    # NOTE(review): presumably the key LightGBM sorts callbacks by -
    # confirm against the LightGBM callback implementation.
    order = 2

    def __init__(self, id, state, ranks, iteration):
        self._id = id
        self._state = state
        self._iteration = iteration
        self._ranks = ranks or []

    def __call__(self, env: CallbackEnv):
        if env.iteration != self._iteration:
            return

        # Only "rank 0" vs. "not rank 0" is distinguished; every other
        # rank is reported as rank 1.
        rank = 0 if self.is_rank_0 else 1
        if rank not in self._ranks:
            return

        if ray.get(self._state.has_failed.remote(self._id)):
            return

        claimed = ray.get(self._state.set_failed.remote(self._id))
        if not claimed:
            # Another rank is already about to fail
            return

        pid = os.getpid()
        print(f"Killing process: {pid} for actor rank {rank}")
        time.sleep(1)
        os.kill(pid, 9)
|
||||
|
||||
|
||||
class TrackingCallback(_TuneLGBMRank0Mixin):
    """Training callback that prints the current iteration on rank 0."""

    # NOTE(review): presumably the key LightGBM sorts callbacks by -
    # confirm against the LightGBM callback implementation.
    order = 1

    def __call__(self, env: CallbackEnv):
        if not self.is_rank_0:
            return
        print(f"[Rank 0] I am at iteration {env.iteration}")
|
||||
|
||||
|
||||
def train_ray(path,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              ray_params=None,
              lightgbm_params=None,
              **kwargs):
    """Run one distributed LightGBM training job and report its timing.

    Args:
        path: Location of the training data; ``~`` is expanded.
        num_workers: Number of actors the data matrix is built for. Also
            used as ``num_actors`` when ``ray_params`` is not given.
        num_boost_rounds: Passed to ``train`` as ``num_boost_round``.
        num_files: If non-zero, train on exactly this many parquet files
            found below ``path``. The sorted file list is repeated as often
            as needed to reach the requested count.
        regression: Use the regression objective and metrics instead of
            binary classification.
        use_gpu: Train with ``device="gpu"`` and request one GPU per actor
            in the default ``RayParams``.
        ray_params: Optional ``RayParams``; a default with two CPUs per
            actor and two restarts is created when omitted.
        lightgbm_params: Optional dict merged over the generated config.
        **kwargs: Forwarded unchanged to ``train``.

    Returns:
        Tuple ``(bst, additional_results, taken)``: the object returned by
        ``train``, the dict filled via ``additional_results`` and the
        training wall-clock time in seconds.

    Raises:
        ValueError: If ``path`` does not exist, or if ``num_files`` is set
            but no parquet file was found.
    """
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        raise ValueError(f"Path does not exist: {path}")

    if num_files:
        # NOTE(review): glob() is called without recursive=True, so "**"
        # matches exactly one directory level - confirm this is intended.
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        if not files:
            # Doubling an empty list never grows it, so the loop below
            # would spin forever without this guard.
            raise ValueError(f"No parquet files found in: {path}")
        while num_files > len(files):
            files = files + files
        path = files[0:num_files]

    # The device matrix is only used when cupy is importable.
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401
            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    matrix_cls = RayDeviceQuantileDMatrix if use_device_matrix \
        else RayDMatrix
    dtrain = matrix_cls(
        path,
        num_actors=num_workers,
        label="labels",
        ignore=["partition"],
        filetype=RayFileType.PARQUET)

    config = {"device": "cpu" if not use_gpu else "gpu"}

    if not regression:
        # Classification
        config.update({
            "objective": "binary",
            "metric": ["binary_logloss", "binary_error"],
        })
    else:
        # Regression
        config.update({
            "objective": "regression",
            "metric": ["l2", "rmse"],
        })

    # Caller-supplied parameters win over the generated defaults.
    if lightgbm_params:
        config.update(lightgbm_params)

    start = time.time()
    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=2,
            gpus_per_actor=0 if not use_gpu else 1),
        evals=[(dtrain, "train")],
        **kwargs)
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    out_file = os.path.expanduser(
        "~/benchmark_{}.lgbm".format("cpu" if not use_gpu else "gpu"))
    bst.booster_.save_model(out_file)

    print("Final training error: {:.4f}".format(evals_result["train"][
        "binary_error" if not regression else "rmse"][-1]))
    return bst, additional_results, taken
|
44
python/ray/util/lightgbm/simple_example.py
Normal file
44
python/ray/util/lightgbm/simple_example.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
from sklearn import datasets
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from ray.util.lightgbm import RayDMatrix, RayParams, train
|
||||
|
||||
|
||||
# __lightgbm_begin__
|
||||
def main():
    """Train a binary classifier on the breast cancer dataset and save it."""
    # Load dataset
    features, targets = datasets.load_breast_cancer(return_X_y=True)
    # Split into train and test set
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.25)

    train_set = RayDMatrix(x_train, y_train)
    test_set = RayDMatrix(x_test, y_test)

    # Set config
    config = {
        "objective": "binary",
        "metric": ["binary_logloss", "binary_error"],
        "max_depth": 3,
    }

    # Filled by train() with the per-iteration evaluation metrics.
    evals_result = {}
    ray_params = RayParams(max_actor_restarts=1, num_actors=1)

    # Train the classifier
    bst = train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        ray_params=ray_params,
        verbose_eval=False)

    bst.booster_.save_model("simple.lgbm")
    final_error = evals_result["eval"]["binary_error"][-1]
    print("Final validation error: {:.4f}".format(final_error))
|
||||
|
||||
|
||||
# __lightgbm_end__
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
95
python/ray/util/lightgbm/simple_tune.py
Normal file
95
python/ray/util/lightgbm/simple_tune.py
Normal file
|
@ -0,0 +1,95 @@
|
|||
from sklearn import datasets
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from ray.util.lightgbm import RayDMatrix, RayParams, train
|
||||
|
||||
# __train_begin__
|
||||
num_cpus_per_actor = 2
|
||||
num_actors = 1
|
||||
|
||||
|
||||
def train_model(config):
    """Train one model with the given LightGBM ``config`` and save it.

    Uses the module-level ``num_actors`` and ``num_cpus_per_actor`` for the
    Ray resources and writes the result to ``model.lgbm`` in the current
    working directory.
    """
    # Load dataset
    features, targets = datasets.load_breast_cancer(return_X_y=True)
    # Split into train and test set
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.25)

    train_set = RayDMatrix(x_train, y_train)
    test_set = RayDMatrix(x_test, y_test)

    evals_result = {}
    ray_params = RayParams(
        num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)
    bst = train(
        params=config,
        dtrain=train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        verbose_eval=False,
        ray_params=ray_params)
    bst.booster_.save_model("model.lgbm")
|
||||
|
||||
|
||||
# __train_end__
|
||||
|
||||
|
||||
# __load_begin__
|
||||
def load_best_model(best_logdir):
    """Load the ``model.lgbm`` booster stored inside ``best_logdir``."""
    import os
    import lightgbm as lgbm

    model_path = os.path.join(best_logdir, "model.lgbm")
    return lgbm.Booster(model_file=model_path)
|
||||
|
||||
|
||||
# __load_end__
|
||||
|
||||
|
||||
def main():
    """Tune ``train_model`` with Ray Tune and report the best trial."""
    # __tune_begin__
    from ray import tune

    # Set config
    config = {
        "objective": "binary",
        "metric": ["binary_logloss", "binary_error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }
    # __tune_end__

    # __tune_run_begin__
    # One CPU for the trial itself plus the CPUs of its training actors.
    trial_resources = {
        "cpu": 1,
        "extra_cpu": num_actors * num_cpus_per_actor
    }
    analysis = tune.run(
        train_model,
        config=config,
        metric="eval-binary_error",
        mode="min",
        num_samples=4,
        resources_per_trial=trial_resources)

    # Load in the best performing model.
    best_bst = load_best_model(analysis.best_logdir)

    # Use the following code block instead if using Ray Client.
    # import ray
    # if ray.util.client.ray.is_connected():
    #     # If using Ray Client best_logdir is a directory on the server.
    #     # So we want to make sure we wrap model loading in a task.
    #     remote_load_fn = ray.remote(load_best_model)
    #     best_bst = ray.get(remote_load_fn.remote(analysis.best_logdir))

    # Do something with the best model.
    _ = best_bst

    best_error = analysis.best_result["eval-binary_error"]
    accuracy = 1. - best_error
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
    # __tune_run_end__
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
0
python/ray/util/lightgbm/tests/__init__.py
Normal file
0
python/ray/util/lightgbm/tests/__init__.py
Normal file
28
python/ray/util/lightgbm/tests/test_client.py
Normal file
28
python/ray/util/lightgbm/tests/test_client.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
import pytest
|
||||
import sys
|
||||
|
||||
import ray
|
||||
from ray.util.client.ray_client_helpers import ray_start_client_server
|
||||
|
||||
|
||||
@pytest.fixture
def start_client_server():
    """Yield the object produced by ``ray_start_client_server()``."""
    with ray_start_client_server() as connection:
        yield connection
|
||||
|
||||
|
||||
def test_simple_example(start_client_server):
    """Run the simple example's ``main`` while connected via Ray Client."""
    assert ray.util.client.ray.is_connected()
    from ray.util.lightgbm.simple_example import main as run_example
    run_example()
|
||||
|
||||
|
||||
def test_simple_tune(start_client_server):
    """Run the Tune example's ``main`` while connected via Ray Client."""
    assert ray.util.client.ray.is_connected()
    from ray.util.lightgbm.simple_tune import main as run_tune
    run_tune()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
24
release/lightgbm_tests/README.rst
Normal file
24
release/lightgbm_tests/README.rst
Normal file
|
@ -0,0 +1,24 @@
|
|||
LightGBM on Ray tests
|
||||
=====================
|
||||
|
||||
This directory contains various LightGBM on Ray release tests.
|
||||
|
||||
You should run these tests with the `releaser <https://github.com/ray-project/releaser>`_ tool.
|
||||
|
||||
Overview
|
||||
--------
|
||||
There are four kinds of tests:
|
||||
|
||||
1. ``distributed_api_test`` - checks general API functionality and should finish very quickly (< 1 minute)
|
||||
2. ``train_*`` - checks single trial training on different setups.
|
||||
3. ``tune_*`` - checks multi trial training via Ray Tune.
|
||||
4. ``ft_*`` - checks fault tolerance.
|
||||
|
||||
Generally the releaser tool will run all tests in parallel, but if you do
|
||||
it sequentially, be sure to do it in the order above. If ``train_*`` fails,
|
||||
``tune_*`` will fail, too.
|
||||
|
||||
Acceptance criteria
|
||||
-------------------
|
||||
These tests are considered passing when they throw no error at the end of
|
||||
the output log.
|
24
release/lightgbm_tests/app_config.yaml
Executable file
24
release/lightgbm_tests/app_config.yaml
Executable file
|
@ -0,0 +1,24 @@
|
|||
base_image: "anyscale/ray-ml:pinned-nightly-py37"
|
||||
env_vars: {}
|
||||
debian_packages:
|
||||
- curl
|
||||
|
||||
python:
|
||||
pip_packages:
|
||||
- pytest
|
||||
- lightgbm_ray
|
||||
- petastorm
|
||||
- tblib
|
||||
conda_packages: []
|
||||
|
||||
post_build_cmds:
|
||||
- pip uninstall -y numpy ray || true
|
||||
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
|
||||
- pip3 install numpy || true
|
||||
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
|
||||
- pip3 install -U lightgbm_ray petastorm # Install latest releases
|
||||
- sudo mkdir -p /data || true
|
||||
- sudo chown ray:1000 /data || true
|
||||
- rm -rf /data/classification.parquet || true
|
||||
- curl -o create_test_data.py https://raw.githubusercontent.com/ray-project/ray/releases/1.3.0/release/xgboost_tests/create_test_data.py # XGBoost is intended
|
||||
- python ./create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
|
38
release/lightgbm_tests/cluster_cpu_moderate.yaml
Normal file
38
release/lightgbm_tests/cluster_cpu_moderate.yaml
Normal file
|
@ -0,0 +1,38 @@
|
|||
cluster_name: ray-lightgbm-release-cpu-moderate
|
||||
|
||||
max_workers: 32
|
||||
|
||||
upscaling_speed: 32
|
||||
|
||||
idle_timeout_minutes: 15
|
||||
|
||||
docker:
|
||||
image: anyscale/ray:latest
|
||||
container_name: ray_container
|
||||
pull_before_run: true
|
||||
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
availability_zone: us-west-2a
|
||||
cache_stopped_nodes: false
|
||||
|
||||
available_node_types:
|
||||
cpu_4_ondemand:
|
||||
node_config:
|
||||
InstanceType: m5.xlarge
|
||||
resources: {"CPU": 4}
|
||||
min_workers: 31
|
||||
max_workers: 31
|
||||
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
head_node_type: cpu_4_ondemand
|
||||
worker_default_node_type: cpu_4_ondemand
|
||||
|
||||
file_mounts: {
|
||||
"~/lightgbm_tests": "."
|
||||
}
|
||||
|
||||
file_mounts_sync_continuously: false
|
38
release/lightgbm_tests/cluster_cpu_small.yaml
Normal file
38
release/lightgbm_tests/cluster_cpu_small.yaml
Normal file
|
@ -0,0 +1,38 @@
|
|||
cluster_name: ray-lightgbm-release-cpu-small
|
||||
|
||||
max_workers: 4
|
||||
|
||||
upscaling_speed: 32
|
||||
|
||||
idle_timeout_minutes: 15
|
||||
|
||||
docker:
|
||||
image: anyscale/ray:latest
|
||||
container_name: ray_container
|
||||
pull_before_run: true
|
||||
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
availability_zone: us-west-2a
|
||||
cache_stopped_nodes: false
|
||||
|
||||
available_node_types:
|
||||
cpu_4_ondemand:
|
||||
node_config:
|
||||
InstanceType: m5.xlarge
|
||||
resources: {"CPU": 4}
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
head_node_type: cpu_4_ondemand
|
||||
worker_default_node_type: cpu_4_ondemand
|
||||
|
||||
file_mounts: {
|
||||
"~/lightgbm_tests": "."
|
||||
}
|
||||
|
||||
file_mounts_sync_continuously: false
|
58
release/lightgbm_tests/create_test_data.py
Normal file
58
release/lightgbm_tests/create_test_data.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
import argparse
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
from xgboost_ray.tests.utils import create_parquet
|
||||
|
||||
if __name__ == "__main__":
    if "OMP_NUM_THREADS" in os.environ:
        del os.environ["OMP_NUM_THREADS"]

    parser = argparse.ArgumentParser(description="Create fake data.")
    parser.add_argument(
        "filename",
        type=str,
        # nargs="?" makes the positional optional. Without it argparse
        # always requires a value and the default is never used.
        nargs="?",
        default="/data/parted.parquet/",
        help="output path for the generated parquet data")
    parser.add_argument(
        "-r",
        "--num-rows",
        required=False,
        type=int,
        # Integer literal so the default has the declared type (was 1e8).
        default=100000000,
        help="num rows")
    parser.add_argument(
        "-p",
        "--num-partitions",
        required=False,
        type=int,
        default=100,
        help="num partitions")
    parser.add_argument(
        "-c",
        "--num-cols",
        required=False,
        type=int,
        default=4,
        help="num columns (features)")
    parser.add_argument(
        "-C",
        "--num-classes",
        required=False,
        type=int,
        default=2,
        help="num classes")
    parser.add_argument(
        "-s",
        "--seed",
        required=False,
        type=int,
        default=1234,
        help="random seed")

    args = parser.parse_args()

    # Seed numpy's global RNG before generating the data.
    np.random.seed(args.seed)
    create_parquet(
        args.filename,
        num_rows=int(args.num_rows),
        num_partitions=int(args.num_partitions),
        num_features=int(args.num_cols),
        num_classes=int(args.num_classes))
|
83
release/lightgbm_tests/lightgbm_tests.yaml
Normal file
83
release/lightgbm_tests/lightgbm_tests.yaml
Normal file
|
@ -0,0 +1,83 @@
|
|||
- name: train_small
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/train_small.py
|
||||
|
||||
- name: train_moderate
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/train_moderate.py
|
||||
|
||||
- name: train_gpu
|
||||
cluster:
|
||||
app_config: app_config_gpu.yaml
|
||||
compute_template: tpl_gpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python workloads/train_gpu.py
|
||||
|
||||
- name: distributed_api_test
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
results:
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/distributed_api_test.py
|
||||
results: ""
|
||||
|
||||
- name: ft_small_non_elastic
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/ft_small_non_elastic.py
|
||||
results: ""
|
||||
|
||||
- name: tune_small
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/tune_small.py
|
||||
|
||||
- name: tune_32x4
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/tune_32x4.py
|
||||
|
||||
- name: tune_4x32
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/tune_4x32.py
|
3
release/lightgbm_tests/requirements.txt
Normal file
3
release/lightgbm_tests/requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
ray[tune]
|
||||
lightgbm_ray
|
||||
xgboost_ray
|
15
release/lightgbm_tests/setup_lightgbm.sh
Executable file
15
release/lightgbm_tests/setup_lightgbm.sh
Executable file
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
|
||||
|
||||
pip install pytest
|
||||
# Uninstall any existing lightgbm_ray repositories
|
||||
pip uninstall -y lightgbm_ray || true
|
||||
|
||||
# Install lightgbm package
|
||||
pip install -U "${LIGHTGBM_RAY_PACKAGE:-lightgbm_ray}"
|
||||
|
||||
# Create test dataset
|
||||
sudo mkdir -p /data || true
|
||||
sudo chown ray:1000 /data || true
|
||||
rm -rf /data/classification.parquet || true
|
||||
cp -R /tmp/ray_tmp_mount/lightgbm_tests ~/lightgbm_tests || echo "Copy failed"
|
||||
python ~/lightgbm_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2
|
15
release/lightgbm_tests/tpl_cpu_moderate.yaml
Normal file
15
release/lightgbm_tests/tpl_cpu_moderate.yaml
Normal file
|
@ -0,0 +1,15 @@
|
|||
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
|
||||
region: us-west-2
|
||||
|
||||
max_workers: 31
|
||||
|
||||
head_node_type:
|
||||
name: head_node
|
||||
instance_type: m5.xlarge
|
||||
|
||||
worker_node_types:
|
||||
- name: worker_node
|
||||
instance_type: m5.xlarge
|
||||
min_workers: 31
|
||||
max_workers: 31
|
||||
use_spot: false
|
15
release/lightgbm_tests/tpl_cpu_small.yaml
Normal file
15
release/lightgbm_tests/tpl_cpu_small.yaml
Normal file
|
@ -0,0 +1,15 @@
|
|||
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
|
||||
region: us-west-2
|
||||
|
||||
max_workers: 3
|
||||
|
||||
head_node_type:
|
||||
name: head_node
|
||||
instance_type: m5.xlarge
|
||||
|
||||
worker_node_types:
|
||||
- name: worker_node
|
||||
instance_type: m5.xlarge
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
use_spot: false
|
49
release/lightgbm_tests/wait_cluster.py
Normal file
49
release/lightgbm_tests/wait_cluster.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
# Block until the requested number of nodes has joined the Ray cluster,
# or abort with a RuntimeError once the time budget is used up.
parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes",
    type=int,
    help="Wait for this number of nodes (includes head)")

parser.add_argument(
    "max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Print a progress message every this number of seconds")

args = parser.parse_args()

# Connect only after the arguments were parsed, so that usage errors and
# --help do not need a running cluster.
ray.init(address="auto")

start = time.time()
next_feedback = start
max_time = start + args.max_time_s

# Query the cluster before the first sleep: an already complete cluster
# returns immediately and progress messages never report a bogus 0.
# NOTE(review): len(ray.nodes()) may also count nodes that are no longer
# alive - confirm against the Ray API documentation.
curr_nodes = len(ray.nodes())
while curr_nodes < args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting.")

    if now >= next_feedback:
        passed = now - start
        print(f"Waiting for more nodes to come up: "
              f"{curr_nodes}/{args.num_nodes} "
              f"({passed:.0f} seconds passed)")
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
      f"{passed:.0f} seconds")
|
26
release/lightgbm_tests/workloads/distributed_api_test.py
Normal file
26
release/lightgbm_tests/workloads/distributed_api_test.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
"""Distributed LightGBM API test
|
||||
|
||||
This test runs unit tests on a distributed cluster. This will confirm that
|
||||
LightGBM API features like custom metrics/objectives work with remote
|
||||
trainables.
|
||||
|
||||
Test owner: Yard1 (primary), krfricke
|
||||
|
||||
Acceptance criteria: Unit tests should pass (requires pytest).
|
||||
"""
|
||||
|
||||
import ray
|
||||
|
||||
from lightgbm_ray.tests.test_lightgbm_api import LightGBMAPITest
|
||||
|
||||
|
||||
class LightGBMDistributedAPITest(LightGBMAPITest):
    """``LightGBMAPITest`` variant that attaches to an existing cluster."""

    def _init_ray(self):
        # Nothing to do if this process is already connected to Ray.
        if ray.is_initialized():
            return
        ray.init(address="auto")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", f"{__file__}::LightGBMDistributedAPITest"]))
|
51
release/lightgbm_tests/workloads/ft_small_non_elastic.py
Normal file
51
release/lightgbm_tests/workloads/ft_small_non_elastic.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
"""Fault tolerance test (small cluster, non-elastic training)
|
||||
|
||||
In this run, two training actors will die after some time. It is expected that
|
||||
in both cases lightgbm_ray stops training, restarts the dead actors, and
|
||||
continues training with all four actors.
|
||||
|
||||
Test owner: Yard1 (primary), krfricke
|
||||
|
||||
Acceptance criteria: Should run through and report final results. Intermediate
|
||||
output should show that training halts when an actor dies and continues only
|
||||
when all four actors are available again. The test will fail if fault
|
||||
tolerance did not work correctly.
|
||||
|
||||
Notes: This test seems to be somewhat flaky. This might be due to
|
||||
race conditions in handling dead actors. This is likely a problem of
|
||||
the lightgbm_ray implementation and not of this test.
|
||||
"""
|
||||
import ray
|
||||
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
|
||||
from ray.util.lightgbm.release_test_util import train_ray, \
|
||||
FailureState, FailureInjection, TrackingCallback
|
||||
|
||||
if __name__ == "__main__":
    ray.init(address="auto")

    # Shared actor that makes sure each injected failure fires only once.
    failure_state = FailureState.remote()

    ray_params = RayParams(
        max_actor_restarts=2, num_actors=4, cpus_per_actor=4, gpus_per_actor=0)

    # Kill a non-zero rank at iteration 14 and rank 0 at iteration 34.
    training_callbacks = [
        TrackingCallback(),
        FailureInjection(
            id="first_fail", state=failure_state, ranks=[1], iteration=14),
        FailureInjection(
            id="second_fail", state=failure_state, ranks=[0], iteration=34),
    ]

    _, additional_results, _ = train_ray(
        path="/data/classification.parquet",
        num_workers=4,
        num_boost_rounds=100,
        num_files=200,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        lightgbm_params=None,
        callbacks=training_callbacks)

    print("PASSED.")
|
49
release/lightgbm_tests/workloads/train_moderate.py
Normal file
49
release/lightgbm_tests/workloads/train_moderate.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
"""Moderate cluster training
|
||||
|
||||
This training run will start 32 workers on 32 nodes (including head node).
|
||||
|
||||
Test owner: Yard1 (primary), krfricke
|
||||
|
||||
Acceptance criteria: Should run through and report final results.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import ray
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
from ray.util.lightgbm.release_test_util import train_ray
|
||||
|
||||
if __name__ == "__main__":
    ray.init(address="auto")

    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=32,
        cpus_per_actor=4,
        gpus_per_actor=0)

    # Time the whole train_ray() call.
    started_at = time.time()
    train_ray(
        path="/data/classification.parquet",
        num_workers=32,
        num_boost_rounds=100,
        num_files=128,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        lightgbm_params=None,
    )
    elapsed = time.time() - started_at

    # Write the result where TEST_OUTPUT_JSON points, else to /tmp.
    output_path = os.environ.get("TEST_OUTPUT_JSON",
                                 "/tmp/train_moderate.json")
    with open(output_path, "wt") as out:
        json.dump({"time_taken": elapsed}, out)

    print("PASSED.")
|
67
release/lightgbm_tests/workloads/train_small.py
Normal file
67
release/lightgbm_tests/workloads/train_small.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
"""Small cluster training
|
||||
|
||||
This training run will start 4 workers on 4 nodes (including head node).
|
||||
|
||||
Test owner: Yard1 (primary), krfricke
|
||||
|
||||
Acceptance criteria: Should run through and report final results.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.test_utils import wait_for_num_nodes
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
from ray.util.lightgbm.release_test_util import train_ray
|
||||
|
||||
if __name__ == "__main__":
    # Default to "" so that a missing RAY_ADDRESS falls through to
    # ray.init() below instead of raising AttributeError on None.
    addr = os.environ.get("RAY_ADDRESS", "")
    job_name = os.environ.get("RAY_JOB_NAME", "train_small")
    if addr.startswith("anyscale://"):
        ray.client(address=addr).job_name(job_name).connect()
    else:
        ray.init(address="auto")

    # RAY_RELEASE_MIN_WORKERS counts workers only; add one for the head.
    wait_for_num_nodes(
        int(os.environ.get("RAY_RELEASE_MIN_WORKERS", 0)) + 1, 600)

    # Both variables are required; they are re-exported inside the remote
    # task below.
    output = os.environ["TEST_OUTPUT_JSON"]
    state = os.environ["TEST_STATE_JSON"]
    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=4,
        cpus_per_actor=4,
        gpus_per_actor=0)

    start = time.time()

    @ray.remote
    def train():
        # Set the variables in the process that executes the task.
        os.environ["TEST_OUTPUT_JSON"] = output
        os.environ["TEST_STATE_JSON"] = state
        train_ray(
            path="/data/classification.parquet",
            num_workers=4,
            num_boost_rounds=100,
            num_files=25,
            regression=False,
            use_gpu=False,
            ray_params=ray_params,
            lightgbm_params=None,
        )

    ray.get(train.remote())
    taken = time.time() - start

    result = {
        "time_taken": taken,
    }
    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/train_small.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("PASSED.")
|
58
release/lightgbm_tests/workloads/train_small_connect.py
Normal file
58
release/lightgbm_tests/workloads/train_small_connect.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
"""Small cluster training
|
||||
|
||||
This training run will start 4 workers on 4 nodes (including head node).
|
||||
|
||||
Test owner: Yard1 (primary), krfricke
|
||||
|
||||
Acceptance criteria: Should run through and report final results.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import ray
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
from ray.util.lightgbm.release_test_util import train_ray
|
||||
|
||||
if __name__ == "__main__":
    # Default to "" so that a missing RAY_ADDRESS falls through to
    # ray.init() below instead of raising AttributeError on None.
    addr = os.environ.get("RAY_ADDRESS", "")
    job_name = os.environ.get("RAY_JOB_NAME", "train_small")
    if addr.startswith("anyscale://"):
        ray.client(address=addr).job_name(job_name).connect()
    else:
        ray.init(address="auto")

    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=4,
        cpus_per_actor=4,
        gpus_per_actor=0)

    @ray.remote
    def train():
        # Runs as a Ray task rather than in this driver process.
        train_ray(
            path="/data/classification.parquet",
            num_workers=4,
            num_boost_rounds=100,
            num_files=25,
            regression=False,
            use_gpu=False,
            ray_params=ray_params,
            lightgbm_params=None,
        )

    start = time.time()
    ray.get(train.remote())
    taken = time.time() - start

    result = {
        "time_taken": taken,
    }
    # Write the result where TEST_OUTPUT_JSON points, else to /tmp.
    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/train_small.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("PASSED.")
|
72
release/lightgbm_tests/workloads/tune_32x4.py
Normal file
72
release/lightgbm_tests/workloads/tune_32x4.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
"""Moderate Ray Tune run (32 trials, 4 actors).
|
||||
|
||||
This training run will start 32 Ray Tune trials, each starting 4 actors.
|
||||
The cluster comprises 32 nodes.
|
||||
|
||||
Test owner: Yard1 (primary), krfricke
|
||||
|
||||
Acceptance criteria: Should run through and report final results, as well
|
||||
as the Ray Tune results table. No trials should error. All trials should
|
||||
run in parallel.
|
||||
"""
|
||||
from collections import Counter
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
from ray.util.lightgbm.release_test_util import train_ray
|
||||
|
||||
|
||||
def train_wrapper(config, ray_params):
    """Run one LightGBM-on-Ray training job as a Ray Tune trainable.

    Args:
        config: Hyperparameter dict supplied by Ray Tune for this trial;
            forwarded to ``train_ray`` as ``lightgbm_params``.
        ray_params: Distributed training settings, forwarded to
            ``train_ray`` unchanged.
    """
    # Fixed workload settings for this release test; only the
    # hyperparameters and ray_params vary between invocations.
    train_kwargs = dict(
        path="/data/classification.parquet",
        num_workers=4,
        num_boost_rounds=100,
        num_files=64,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        lightgbm_params=config,
    )
    train_ray(**train_kwargs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hyperparameter distributions that Ray Tune samples from per trial.
    param_space = dict(
        eta=tune.loguniform(1e-4, 1e-1),
        subsample=tune.uniform(0.5, 1.0),
        max_depth=tune.randint(1, 9),
    )

    ray.init(address="auto")

    actor_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=4,
        cpus_per_actor=1,
        gpus_per_actor=0,
    )

    # The timer starts before the trainable is wrapped so the measured
    # window covers the same work as passing these expressions inline.
    started_at = time.time()
    trainable = tune.with_parameters(train_wrapper, ray_params=actor_params)
    trial_resources = actor_params.get_tune_resources()
    analysis = tune.run(
        trainable,
        config=param_space,
        num_samples=32,
        resources_per_trial=trial_resources,
    )
    elapsed = time.time() - started_at

    # Tally how many trials ended in each status.
    status_counts = Counter(trial.status for trial in analysis.trials)
    summary = {
        "time_taken": elapsed,
        "trial_states": dict(status_counts),
    }

    # The result file location can be overridden through TEST_OUTPUT_JSON.
    output_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/tune_32x4.json")
    with open(output_path, "wt") as out_file:
        json.dump(summary, out_file)

    print("PASSED.")
|
72
release/lightgbm_tests/workloads/tune_4x32.py
Normal file
72
release/lightgbm_tests/workloads/tune_4x32.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
"""Moderate Ray Tune run (4 trials, 32 actors).
|
||||
|
||||
This training run will start 4 Ray Tune trials, each starting 32 actors.
|
||||
The cluster comprises 32 nodes.
|
||||
|
||||
Test owner: Yard1 (primary), krfricke
|
||||
|
||||
Acceptance criteria: Should run through and report final results, as well
|
||||
as the Ray Tune results table. No trials should error. All trials should
|
||||
run in parallel.
|
||||
"""
|
||||
from collections import Counter
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
from ray.util.lightgbm.release_test_util import train_ray
|
||||
|
||||
|
||||
def train_wrapper(config, ray_params):
    """Run one LightGBM-on-Ray training job as a Ray Tune trainable.

    Args:
        config: Hyperparameter dict supplied by Ray Tune for this trial;
            forwarded to ``train_ray`` as ``lightgbm_params``.
        ray_params: Distributed training settings, forwarded to
            ``train_ray`` unchanged.
    """
    # Fixed workload settings for this release test; only the
    # hyperparameters and ray_params vary between invocations.
    train_kwargs = dict(
        path="/data/classification.parquet",
        num_workers=32,
        num_boost_rounds=100,
        num_files=128,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        lightgbm_params=config,
    )
    train_ray(**train_kwargs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hyperparameter distributions that Ray Tune samples from per trial.
    param_space = dict(
        eta=tune.loguniform(1e-4, 1e-1),
        subsample=tune.uniform(0.5, 1.0),
        max_depth=tune.randint(1, 9),
    )

    ray.init(address="auto")

    actor_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=32,
        cpus_per_actor=1,
        gpus_per_actor=0,
    )

    # The timer starts before the trainable is wrapped so the measured
    # window covers the same work as passing these expressions inline.
    started_at = time.time()
    trainable = tune.with_parameters(train_wrapper, ray_params=actor_params)
    trial_resources = actor_params.get_tune_resources()
    analysis = tune.run(
        trainable,
        config=param_space,
        num_samples=4,
        resources_per_trial=trial_resources,
    )
    elapsed = time.time() - started_at

    # Tally how many trials ended in each status.
    status_counts = Counter(trial.status for trial in analysis.trials)
    summary = {
        "time_taken": elapsed,
        "trial_states": dict(status_counts),
    }

    # The result file location can be overridden through TEST_OUTPUT_JSON.
    output_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/tune_4x32.json")
    with open(output_path, "wt") as out_file:
        json.dump(summary, out_file)

    print("PASSED.")
|
72
release/lightgbm_tests/workloads/tune_small.py
Normal file
72
release/lightgbm_tests/workloads/tune_small.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
"""Small Ray Tune run (4 trials, 4 actors).
|
||||
|
||||
This training run will start 4 Ray Tune Trials, each starting 4 actors.
|
||||
The cluster comprises 4 nodes.
|
||||
|
||||
Test owner: Yard1 (primary), krfricke
|
||||
|
||||
Acceptance criteria: Should run through and report final results, as well
|
||||
as the Ray Tune results table. No trials should error. All trials should
|
||||
run in parallel.
|
||||
"""
|
||||
from collections import Counter
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
from ray.util.lightgbm.release_test_util import train_ray
|
||||
|
||||
|
||||
def train_wrapper(config, ray_params):
    """Run one LightGBM-on-Ray training job as a Ray Tune trainable.

    Args:
        config: Hyperparameter dict supplied by Ray Tune for this trial;
            forwarded to ``train_ray`` as ``lightgbm_params``.
        ray_params: Distributed training settings, forwarded to
            ``train_ray`` unchanged.
    """
    # Fixed workload settings for this release test; only the
    # hyperparameters and ray_params vary between invocations.
    train_kwargs = dict(
        path="/data/classification.parquet",
        num_workers=4,
        num_boost_rounds=100,
        num_files=25,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        lightgbm_params=config,
    )
    train_ray(**train_kwargs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hyperparameter distributions that Ray Tune samples from per trial.
    param_space = dict(
        eta=tune.loguniform(1e-4, 1e-1),
        subsample=tune.uniform(0.5, 1.0),
        max_depth=tune.randint(1, 9),
    )

    ray.init(address="auto")

    actor_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=4,
        cpus_per_actor=1,
        gpus_per_actor=0,
    )

    # The timer starts before the trainable is wrapped so the measured
    # window covers the same work as passing these expressions inline.
    started_at = time.time()
    trainable = tune.with_parameters(train_wrapper, ray_params=actor_params)
    trial_resources = actor_params.get_tune_resources()
    analysis = tune.run(
        trainable,
        config=param_space,
        num_samples=4,
        resources_per_trial=trial_resources,
    )
    elapsed = time.time() - started_at

    # Tally how many trials ended in each status.
    status_counts = Counter(trial.status for trial in analysis.trials)
    summary = {
        "time_taken": elapsed,
        "trial_states": dict(status_counts),
    }

    # The result file location can be overridden through TEST_OUTPUT_JSON.
    output_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/tune_small.json")
    with open(output_path, "wt") as out_file:
        json.dump(summary, out_file)

    print("PASSED.")
|
Loading…
Add table
Reference in a new issue