ray/release/xgboost_tests/workloads/release_test_util.py
Amog Kamsetty 862d10c162
[AIR] Remove ML code from ray.util (#27005)
Removes all ML-related code from `ray.util`

Removes:
- `ray.util.xgboost`
- `ray.util.lightgbm`
- `ray.util.horovod`
- `ray.util.ray_lightning`

Moves `ray.util.ml_utils` to other locations

Closes #23900

Signed-off-by: Amog Kamsetty <amogkamsetty@yahoo.com>
Signed-off-by: Kai Fricke <kai@anyscale.com>
Co-authored-by: Kai Fricke <kai@anyscale.com>
2022-07-27 14:24:19 +01:00


import glob
import os
import time

import ray
from xgboost_ray import (
train,
RayDMatrix,
RayFileType,
RayDeviceQuantileDMatrix,
RayParams,
)
from xgboost_ray.session import get_actor_rank, put_queue
from xgboost.callback import TrainingCallback
from xgboost.rabit import get_world_size
if "OMP_NUM_THREADS" in os.environ:
del os.environ["OMP_NUM_THREADS"]


@ray.remote
class FailureState:
    """Actor tracking which failure injection ids have already fired."""

    def __init__(self):
        self._failed_ids = set()

    def set_failed(self, id):
        # Only the first caller for a given id gets True, so concurrent
        # ranks cannot trigger the same injection twice.
        if id in self._failed_ids:
            return False
        self._failed_ids.add(id)
        return True

    def has_failed(self, id):
        return id in self._failed_ids


class FailureInjection(TrainingCallback):
    """XGBoost callback that kills a given actor rank at a given iteration."""

    def __init__(self, id, state, ranks, iteration):
        self._id = id
        self._state = state
        self._ranks = ranks or []
        self._iteration = iteration
        super().__init__()

    def after_iteration(self, model, epoch, evals_log):
        if epoch == self._iteration:
            rank = get_actor_rank()
            if rank in self._ranks:
                if not ray.get(self._state.has_failed.remote(self._id)):
                    success = ray.get(self._state.set_failed.remote(self._id))
                    if not success:
                        # Another rank is already about to fail
                        return
                    pid = os.getpid()
                    print(f"Killing process: {pid} for actor rank {rank}")
                    time.sleep(1)
                    os.kill(pid, 9)
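

# Usage sketch (hypothetical id/ranks/iteration values): a shared
# FailureState actor ensures each injection id kills at most one process,
# and the callback is forwarded to xgboost_ray.train() through train_ray's
# **kwargs, e.g.:
#
#   failure_state = FailureState.remote()
#   train_ray(
#       path="...",
#       num_workers=4,
#       num_boost_rounds=100,
#       callbacks=[
#           FailureInjection(
#               id="first_fail", state=failure_state, ranks=[1], iteration=14
#           )
#       ],
#   )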


class TrackingCallback(TrainingCallback):
    """Reports progress: each rank puts the current world size on the
    results queue every iteration; rank 3 additionally logs its epoch."""

    def before_iteration(self, model, epoch, evals_log):
        if get_actor_rank() == 3:
            print(f"[Rank {get_actor_rank()}] I am at iteration {epoch}")
        put_queue(get_world_size())
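

# Note: values sent via put_queue() are collected on the driver while
# training runs; xgboost_ray is expected to return them per actor under
# additional_results["callback_returns"], which lets the elastic training
# tests check how many iterations each actor participated in.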


def get_parquet_files(path, num_files=0):
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        raise ValueError(f"Path does not exist: {path}")

    files = sorted(glob.glob(f"{path}/**/*.parquet"))
    while num_files > len(files):
        # Repeat the file list until it can serve the requested count.
        files = files + files
    return files[0:num_files]
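

# For example (hypothetical layout): if the directory holds four parquet
# files and num_files=10, the list doubles from 4 to 8 to 16 entries and
# the first 10 are returned, so some files are read more than once.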


def train_ray(
    path,
    num_workers,
    num_boost_rounds,
    num_files=0,
    regression=False,
    use_gpu=False,
    ray_params=None,
    xgboost_params=None,
    **kwargs,
):
    if not isinstance(path, list):
        path = get_parquet_files(path, num_files=num_files)

    # Use a device quantile matrix only if CuPy is available on the GPU nodes.
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401

            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )
    else:
        dtrain = RayDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )

    config = {"tree_method": "hist" if not use_gpu else "gpu_hist"}

    if not regression:
        # Classification
        config.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )
    else:
        # Regression
        config.update(
            {
                "objective": "reg:squarederror",
                "eval_metric": ["logloss", "rmse"],
            }
        )

    if xgboost_params:
        config.update(xgboost_params)

    start = time.time()
    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params
        or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=1,
            gpus_per_actor=0 if not use_gpu else 1,
        ),
        evals=[(dtrain, "train")],
        **kwargs,
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    out_file = os.path.expanduser(
        "~/benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")
    )
    bst.save_model(out_file)

    # "error" is only tracked for classification runs; use rmse for regression.
    final_metric = "error" if not regression else "rmse"
    print(
        "Final training {}: {:.4f}".format(
            final_metric, evals_result["train"][final_metric][-1]
        )
    )
    return bst, additional_results, taken
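

if __name__ == "__main__":
    # Minimal local sketch, not part of the release test harness: the path
    # below is hypothetical and assumes parquet files with a "labels" column
    # and a "partition" column, matching the schema expected above.
    ray.init()
    bst, additional_results, taken = train_ray(
        path="~/data/classification.parquet",
        num_workers=2,
        num_boost_rounds=10,
        num_files=2,
    )
    print(f"Finished in {taken:.2f} seconds")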