2021-01-20 18:40:23 +01:00
|
|
|
"""Training on a GPU cluster.
|
|
|
|
|
|
|
|
This will train an XGBoost model on a small dataset using a distributed GPU cluster.
|
|
|
|
|
|
|
|
Test owner: krfricke
|
|
|
|
|
|
|
|
Acceptance criteria: Should run through and report final results.
|
|
|
|
|
|
|
|
Notes: The test will report output such as this:
|
|
|
|
```
|
|
|
|
[05:14:49] WARNING: ../src/gbm/gbtree.cc:350: Loading from a raw memory buffer
|
|
|
|
on CPU only machine. Changing tree_method to hist.
|
|
|
|
[05:14:49] WARNING: ../src/learner.cc:222: No visible GPU is found, setting
|
|
|
|
`gpu_id` to -1
|
|
|
|
```
|
|
|
|
|
|
|
|
This is _not_ an error. This is due to the checkpoints being loaded on the
|
|
|
|
XGBoost driver, and since the driver lives on the head node (which has no
|
|
|
|
GPU), XGBoost warns that it can't use the GPU. Training still happened using
|
|
|
|
the GPUs.
|
|
|
|
"""
|
2021-05-08 21:38:39 +02:00
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import time
|
|
|
|
|
2021-01-20 18:40:23 +01:00
|
|
|
import ray
|
|
|
|
from xgboost_ray import RayParams
|
|
|
|
|
2021-06-01 20:19:15 +02:00
|
|
|
from ray.util.xgboost.release_test_util import train_ray
|
2021-01-20 18:40:23 +01:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # NCCL must be pinned to the "ens3" network interface for training to
    # work on anyscale_default_cloud; background is in
    # https://github.com/pytorch/pytorch/issues/68893.
    # Handing the variable to ray.init() through runtime_env also sets it
    # for all the workers.
    nccl_env_vars = {"NCCL_SOCKET_IFNAME": "ens3"}
    ray.init(address="auto", runtime_env={"env_vars": nccl_env_vars})

    # Actor layout for this run: 4 actors with 4 CPUs and 1 GPU each.
    actor_config = RayParams(
        num_actors=4,
        cpus_per_actor=4,
        gpus_per_actor=1,
        elastic_training=False,
        max_actor_restarts=2,
    )

    # Time only the training call itself.
    started_at = time.time()
    train_ray(
        path="/data/classification.parquet",
        num_workers=None,
        num_boost_rounds=100,
        num_files=25,
        regression=False,
        use_gpu=True,
        ray_params=actor_config,
        xgboost_params=None,
    )
    elapsed = time.time() - started_at

    # Write the elapsed seconds as JSON to the path named by
    # TEST_OUTPUT_JSON, falling back to /tmp/train_gpu.json when unset.
    output_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/train_gpu.json")
    with open(output_path, "wt") as out_file:
        json.dump({"time_taken": elapsed}, out_file)

    print("PASSED.")
|