mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[Release/Horovod] Add user test for Horovod (#19661)
* infra * wip * add test * typo * typo * update * rename * fix * full path * formatting * reorder * update * update * Update release/horovod_tests/workloads/horovod_user_test.py Co-authored-by: matthewdeng <matthew.j.deng@gmail.com> * bump num_workers * update installs * try * add pip_packages * min_workers * fix * bump pg timeout * Fix symlink * fix * fix * cmake * fix * pin filelock * final * update * fix * Update release/horovod_tests/workloads/horovod_user_test.py * fix * fix * separate compute template * test latest and master Co-authored-by: matthewdeng <matthew.j.deng@gmail.com>
This commit is contained in:
parent
e1e4a45b8d
commit
474e44f7e0
13 changed files with 171 additions and 20 deletions
|
@ -1 +1 @@
|
||||||
../../../../release/horovod_tests/workloads/horovod_test.py
|
../../../../release/horovod_tests/workloads/horovod_tune_test.py
|
|
@ -115,11 +115,20 @@ def train_fn(data_dir=None,
|
||||||
100. * batch_idx / len(train_loader), loss.item()))
|
100. * batch_idx / len(train_loader), loss.item()))
|
||||||
|
|
||||||
|
|
||||||
def main(num_workers, use_gpu, **kwargs):
|
def main(num_workers,
|
||||||
settings = RayExecutor.create_settings(timeout_s=30)
|
use_gpu,
|
||||||
|
timeout_s=30,
|
||||||
|
placement_group_timeout_s=100,
|
||||||
|
kwargs=None):
|
||||||
|
kwargs = kwargs or {}
|
||||||
|
if use_gpu:
|
||||||
|
kwargs["use_cuda"] = True
|
||||||
|
settings = RayExecutor.create_settings(
|
||||||
|
timeout_s=timeout_s,
|
||||||
|
placement_group_timeout_s=placement_group_timeout_s)
|
||||||
executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
|
executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
|
||||||
executor.start()
|
executor.start()
|
||||||
executor.run(train_fn, **kwargs)
|
executor.run(train_fn, kwargs=kwargs)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -20,7 +20,12 @@ import yaml
|
||||||
|
|
||||||
|
|
||||||
class ReleaseTest:
|
class ReleaseTest:
|
||||||
def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
|
def __init__(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
smoke_test: bool = False,
|
||||||
|
retry: int = 0,
|
||||||
|
):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.smoke_test = smoke_test
|
self.smoke_test = smoke_test
|
||||||
self.retry = retry
|
self.retry = retry
|
||||||
|
@ -243,6 +248,19 @@ MANUAL_TESTS = {
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
HOROVOD_INSTALL_ENV_VARS = [
|
||||||
|
"HOROVOD_WITH_GLOO", "HOROVOD_WITHOUT_MPI", "HOROVOD_WITHOUT_TENSORFLOW",
|
||||||
|
"HOROVOD_WITHOUT_MXNET", "HOROVOD_WITH_PYTORCH"
|
||||||
|
]
|
||||||
|
|
||||||
|
HOROVOD_SETUP_COMMANDS = [
|
||||||
|
"sudo apt update", "sudo apt -y install build-essential",
|
||||||
|
"pip install cmake"
|
||||||
|
] + [
|
||||||
|
f"export {horovod_env_var}=1"
|
||||||
|
for horovod_env_var in HOROVOD_INSTALL_ENV_VARS
|
||||||
|
]
|
||||||
|
|
||||||
# This test suite holds "user" tests to test important user workflows
|
# This test suite holds "user" tests to test important user workflows
|
||||||
# in a particular environment.
|
# in a particular environment.
|
||||||
# All workloads in this test suite should:
|
# All workloads in this test suite should:
|
||||||
|
@ -251,6 +269,17 @@ MANUAL_TESTS = {
|
||||||
# 3. Use GPUs if applicable
|
# 3. Use GPUs if applicable
|
||||||
# 4. Have the `use_connect` flag set.
|
# 4. Have the `use_connect` flag set.
|
||||||
USER_TESTS = {
|
USER_TESTS = {
|
||||||
|
"~/ray/release/horovod_tests/horovod_tests.yaml": [
|
||||||
|
ConnectTest(
|
||||||
|
"horovod_user_test_latest",
|
||||||
|
setup_commands=HOROVOD_SETUP_COMMANDS,
|
||||||
|
requirements_file="release/horovod_tests/driver_requirements.txt"),
|
||||||
|
ConnectTest(
|
||||||
|
"horovod_user_test_master",
|
||||||
|
setup_commands=HOROVOD_SETUP_COMMANDS,
|
||||||
|
requirements_file="release/horovod_tests"
|
||||||
|
"/driver_requirements_master.txt")
|
||||||
|
],
|
||||||
"~/ray/release/train_tests/train_tests.yaml": [
|
"~/ray/release/train_tests/train_tests.yaml": [
|
||||||
ConnectTest(
|
ConnectTest(
|
||||||
"train_tensorflow_mnist_test",
|
"train_tensorflow_mnist_test",
|
||||||
|
@ -260,7 +289,7 @@ USER_TESTS = {
|
||||||
"train_torch_linear_test",
|
"train_torch_linear_test",
|
||||||
requirements_file="release/train_tests"
|
requirements_file="release/train_tests"
|
||||||
"/driver_requirements.txt")
|
"/driver_requirements.txt")
|
||||||
]
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
SUITES = {
|
SUITES = {
|
||||||
|
@ -484,22 +513,21 @@ def create_test_step(
|
||||||
}]
|
}]
|
||||||
}
|
}
|
||||||
|
|
||||||
step_conf["commands"] = [
|
|
||||||
"pip install -q -r release/requirements.txt",
|
|
||||||
"pip install -U boto3 botocore",
|
|
||||||
f"git clone -b {ray_test_branch} {ray_test_repo} ~/ray", cmd,
|
|
||||||
"sudo cp -rf /tmp/artifacts/* /tmp/ray_release_test_artifacts "
|
|
||||||
"|| true"
|
|
||||||
]
|
|
||||||
|
|
||||||
if isinstance(test_name, ConnectTest):
|
if isinstance(test_name, ConnectTest):
|
||||||
# Add driver side setup commands to the step.
|
# Add driver side setup commands to the step.
|
||||||
pip_requirements_command = [f"pip install -U -r "
|
pip_requirements_command = [f"pip install -U -r "
|
||||||
f"{test_name.requirements_file}"] if \
|
f"{test_name.requirements_file}"] if \
|
||||||
test_name.requirements_file else []
|
test_name.requirements_file else []
|
||||||
step_conf["commands"] = test_name.setup_commands \
|
step_conf["commands"] = test_name.setup_commands \
|
||||||
+ pip_requirements_command \
|
+ pip_requirements_command
|
||||||
+ step_conf["commands"]
|
|
||||||
|
step_conf["commands"] += [
|
||||||
|
"pip install -q -r release/requirements.txt",
|
||||||
|
"pip install -U boto3 botocore",
|
||||||
|
f"git clone -b {ray_test_branch} {ray_test_repo} ~/ray", cmd,
|
||||||
|
"sudo cp -rf /tmp/artifacts/* /tmp/ray_release_test_artifacts "
|
||||||
|
"|| true"
|
||||||
|
]
|
||||||
|
|
||||||
step_conf["label"] = (
|
step_conf["label"] = (
|
||||||
f"{test_name} "
|
f"{test_name} "
|
||||||
|
|
|
@ -14,7 +14,7 @@ post_build_cmds:
|
||||||
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
|
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
|
||||||
- pip3 install numpy || true
|
- pip3 install numpy || true
|
||||||
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
|
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
|
||||||
- pip3 install 'ray[rllib]'
|
- pip3 install 'ray[tune]'
|
||||||
- pip3 install torch torchvision
|
- pip3 install torch torchvision
|
||||||
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U git+https://github.com/horovod/horovod.git
|
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U horovod
|
||||||
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
|
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
|
||||||
|
|
20
release/horovod_tests/app_config_master.yaml
Normal file
20
release/horovod_tests/app_config_master.yaml
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
base_image: "anyscale/ray-ml:pinned-nightly-py37-gpu"
|
||||||
|
env_vars: {}
|
||||||
|
debian_packages:
|
||||||
|
- curl
|
||||||
|
|
||||||
|
python:
|
||||||
|
pip_packages:
|
||||||
|
- pytest
|
||||||
|
- awscli
|
||||||
|
conda_packages: []
|
||||||
|
|
||||||
|
post_build_cmds:
|
||||||
|
- pip uninstall -y numpy ray || true
|
||||||
|
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
|
||||||
|
- pip3 install numpy || true
|
||||||
|
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
|
||||||
|
- pip3 install 'ray[tune]'
|
||||||
|
- pip3 install torch torchvision
|
||||||
|
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U git+https://github.com/horovod/horovod.git
|
||||||
|
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
|
8
release/horovod_tests/base_driver_reqs.txt
Normal file
8
release/horovod_tests/base_driver_reqs.txt
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
# Make sure the driver versions are the same as cluster versions.
|
||||||
|
# The cluster uses ray-ml Docker image.
|
||||||
|
# The ray-ml Docker image installs dependencies from the ray/python/requirements/ml/ directory.
|
||||||
|
# We constrain on this requirements file so that the same versions are installed.
|
||||||
|
-c ../../python/requirements/ml/requirements_dl.txt
|
||||||
|
|
||||||
|
torch
|
||||||
|
torchvision
|
|
@ -10,8 +10,8 @@ head_node_type:
|
||||||
worker_node_types:
|
worker_node_types:
|
||||||
- name: worker_node
|
- name: worker_node
|
||||||
instance_type: g3.8xlarge
|
instance_type: g3.8xlarge
|
||||||
min_workers: 3
|
|
||||||
max_workers: 3
|
max_workers: 3
|
||||||
|
min_workers: 3
|
||||||
use_spot: false
|
use_spot: false
|
||||||
|
|
||||||
aws:
|
aws:
|
||||||
|
|
24
release/horovod_tests/compute_tpl_autoscaling.yaml
Normal file
24
release/horovod_tests/compute_tpl_autoscaling.yaml
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
|
||||||
|
region: us-west-2
|
||||||
|
|
||||||
|
max_workers: 3
|
||||||
|
|
||||||
|
head_node_type:
|
||||||
|
name: head_node
|
||||||
|
instance_type: g3.8xlarge
|
||||||
|
|
||||||
|
worker_node_types:
|
||||||
|
- name: worker_node
|
||||||
|
instance_type: g3.8xlarge
|
||||||
|
max_workers: 3
|
||||||
|
min_workers: 0
|
||||||
|
use_spot: false
|
||||||
|
|
||||||
|
aws:
|
||||||
|
TagSpecifications:
|
||||||
|
- ResourceType: "instance"
|
||||||
|
Tags:
|
||||||
|
- Key: anyscale-user
|
||||||
|
Value: '{{env["ANYSCALE_USER"]}}'
|
||||||
|
- Key: anyscale-expiration
|
||||||
|
Value: '{{env["EXPIRATION_1D"]}}'
|
3
release/horovod_tests/driver_requirements.txt
Normal file
3
release/horovod_tests/driver_requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
-r ./base_driver_reqs.txt
|
||||||
|
|
||||||
|
horovod
|
4
release/horovod_tests/driver_requirements_master.txt
Normal file
4
release/horovod_tests/driver_requirements_master.txt
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
-r ./base_driver_reqs.txt
|
||||||
|
|
||||||
|
# Horovod master.
|
||||||
|
git+https://github.com/horovod/horovod.git
|
|
@ -1,6 +1,6 @@
|
||||||
- name: horovod_test
|
- name: horovod_test
|
||||||
cluster:
|
cluster:
|
||||||
app_config: app_config.yaml
|
app_config: app_config_master.yaml
|
||||||
compute_template: compute_tpl.yaml
|
compute_template: compute_tpl.yaml
|
||||||
|
|
||||||
run:
|
run:
|
||||||
|
@ -12,3 +12,25 @@
|
||||||
smoke_test:
|
smoke_test:
|
||||||
run:
|
run:
|
||||||
timeout: 1800
|
timeout: 1800
|
||||||
|
|
||||||
|
- name: horovod_user_test_latest
|
||||||
|
cluster:
|
||||||
|
app_config: app_config.yaml
|
||||||
|
compute_template: compute_tpl_autoscaling.yaml
|
||||||
|
|
||||||
|
run:
|
||||||
|
use_connect: True
|
||||||
|
autosuspend_mins: 10
|
||||||
|
timeout: 1200
|
||||||
|
script: python workloads/horovod_user_test.py
|
||||||
|
|
||||||
|
- name: horovod_user_test_master
|
||||||
|
cluster:
|
||||||
|
app_config: app_config_master.yaml
|
||||||
|
compute_template: compute_tpl_autoscaling.yaml
|
||||||
|
|
||||||
|
run:
|
||||||
|
use_connect: True
|
||||||
|
autosuspend_mins: 10
|
||||||
|
timeout: 1200
|
||||||
|
script: python workloads/horovod_user_test.py
|
||||||
|
|
33
release/horovod_tests/workloads/horovod_user_test.py
Normal file
33
release/horovod_tests/workloads/horovod_user_test.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
import ray
|
||||||
|
from ray.util.horovod.horovod_example import main
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
start = time.time()
|
||||||
|
|
||||||
|
addr = os.environ.get("RAY_ADDRESS")
|
||||||
|
job_name = os.environ.get("RAY_JOB_NAME", "horovod_user_test")
|
||||||
|
if addr is not None and addr.startswith("anyscale://"):
|
||||||
|
ray.init(address=addr, job_name=job_name)
|
||||||
|
else:
|
||||||
|
ray.init(address="auto")
|
||||||
|
|
||||||
|
main(
|
||||||
|
num_workers=6,
|
||||||
|
use_gpu=True,
|
||||||
|
placement_group_timeout_s=900,
|
||||||
|
kwargs={"num_epochs": 20})
|
||||||
|
|
||||||
|
taken = time.time() - start
|
||||||
|
result = {
|
||||||
|
"time_taken": taken,
|
||||||
|
}
|
||||||
|
test_output_json = os.environ.get("TEST_OUTPUT_JSON",
|
||||||
|
"/tmp/horovod_user_test.json")
|
||||||
|
with open(test_output_json, "wt") as f:
|
||||||
|
json.dump(result, f)
|
||||||
|
|
||||||
|
print("Test Successful!")
|
Loading…
Add table
Reference in a new issue