2021-01-20 18:40:23 +01:00
|
|
|
"""Small cluster training
|
|
|
|
|
|
|
|
This training run will start 4 workers on 4 nodes (including head node).
|
|
|
|
|
|
|
|
Test owner: krfricke
|
|
|
|
|
|
|
|
Acceptance criteria: Should run through and report final results.
|
|
|
|
"""
|
2021-05-08 21:38:39 +02:00
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import time
|
|
|
|
|
2021-01-20 18:40:23 +01:00
|
|
|
import ray
|
2021-08-18 20:56:33 -07:00
|
|
|
from ray._private.test_utils import wait_for_num_nodes
|
2021-01-20 18:40:23 +01:00
|
|
|
from xgboost_ray import RayParams
|
|
|
|
|
2021-06-01 20:19:15 +02:00
|
|
|
from ray.util.xgboost.release_test_util import train_ray
|
2021-01-20 18:40:23 +01:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Connect to the cluster. RAY_ADDRESS is optional, so it may be None;
    # check for that before inspecting the scheme, otherwise an unset
    # variable raises AttributeError instead of falling back to "auto".
    addr = os.environ.get("RAY_ADDRESS")
    job_name = os.environ.get("RAY_JOB_NAME", "train_small")
    if addr is not None and addr.startswith("anyscale://"):
        ray.client(address=addr).job_name(job_name).connect()
    else:
        ray.init(address="auto")

    # Block until the expected node count is reached: the configured
    # RAY_RELEASE_MIN_WORKERS plus one (presumably the head node, per the
    # module docstring). The second argument is presumably a timeout in
    # seconds -- confirm against wait_for_num_nodes.
    wait_for_num_nodes(
        int(os.environ.get("RAY_RELEASE_MIN_WORKERS", 0)) + 1, 600)

    # Both variables are required; a missing one raises KeyError here.
    output = os.environ["TEST_OUTPUT_JSON"]
    state = os.environ["TEST_STATE_JSON"]

    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=4,
        cpus_per_actor=4,
        gpus_per_actor=0)

    start = time.time()

    @ray.remote
    def train():
        """Run the training job inside a Ray task.

        The driver's TEST_OUTPUT_JSON / TEST_STATE_JSON values are copied
        into this process's environment before calling ``train_ray``
        (presumably because ``train_ray`` reads them from the environment
        -- confirm against its implementation).
        """
        os.environ["TEST_OUTPUT_JSON"] = output
        os.environ["TEST_STATE_JSON"] = state
        train_ray(
            path="/data/classification.parquet",
            num_workers=4,
            num_boost_rounds=100,
            num_files=25,
            regression=False,
            use_gpu=False,
            ray_params=ray_params,
            xgboost_params=None,
        )

    # Wait for the remote training task to finish before stopping the clock.
    ray.get(train.remote())
    taken = time.time() - start

    result = {
        "time_taken": taken,
    }
    # NOTE: TEST_OUTPUT_JSON is already required above (KeyError if unset),
    # so the "/tmp/train_small.json" default is effectively never used.
    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/train_small.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("PASSED.")
|