[Release/Horovod] Add user test for Horovod (#19661)

* infra

* wip

* add test

* typo

* typo

* update

* rename

* fix

* full path

* formatting

* reorder

* update

* update

* Update release/horovod_tests/workloads/horovod_user_test.py

Co-authored-by: matthewdeng <matthew.j.deng@gmail.com>

* bump num_workers

* update installs

* try

* add pip_packages

* min_workers

* fix

* bump pg timeout

* Fix symlink

* fix

* fix

* cmake

* fix

* pin filelock

* final

* update

* fix

* Update release/horovod_tests/workloads/horovod_user_test.py

* fix

* fix

* separate compute template

* test latest and master

Co-authored-by: matthewdeng <matthew.j.deng@gmail.com>
Amog Kamsetty 2021-11-01 18:28:07 -07:00 committed by GitHub
parent e1e4a45b8d
commit 474e44f7e0
13 changed files with 171 additions and 20 deletions


@@ -1 +1 @@
../../../../release/horovod_tests/workloads/horovod_test.py
../../../../release/horovod_tests/workloads/horovod_tune_test.py


@@ -115,11 +115,20 @@ def train_fn(data_dir=None,
100. * batch_idx / len(train_loader), loss.item()))
def main(num_workers, use_gpu, **kwargs):
settings = RayExecutor.create_settings(timeout_s=30)
def main(num_workers,
use_gpu,
timeout_s=30,
placement_group_timeout_s=100,
kwargs=None):
kwargs = kwargs or {}
if use_gpu:
kwargs["use_cuda"] = True
settings = RayExecutor.create_settings(
timeout_s=timeout_s,
placement_group_timeout_s=placement_group_timeout_s)
executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
executor.start()
executor.run(train_fn, **kwargs)
executor.run(train_fn, kwargs=kwargs)
if __name__ == "__main__":
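
(Illustrative, not part of the diff: a minimal sketch of how the reworked entry point is exercised. The values mirror the new user test added later in this commit, and the calls follow the hunk above.)

# Hypothetical driver sketch; parameter values taken from horovod_user_test.py below.
from ray.util.horovod.horovod_example import main

main(
    num_workers=6,                    # one Horovod worker per requested GPU slot
    use_gpu=True,                     # main() translates this into kwargs["use_cuda"] = True
    placement_group_timeout_s=900,    # allow time for autoscaled GPU nodes to come up
    kwargs={"num_epochs": 20})        # forwarded to train_fn via executor.run(train_fn, kwargs=kwargs)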


@@ -20,7 +20,12 @@ import yaml
class ReleaseTest:
def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
def __init__(
self,
name: str,
smoke_test: bool = False,
retry: int = 0,
):
self.name = name
self.smoke_test = smoke_test
self.retry = retry
@@ -243,6 +248,19 @@ MANUAL_TESTS = {
],
}
HOROVOD_INSTALL_ENV_VARS = [
"HOROVOD_WITH_GLOO", "HOROVOD_WITHOUT_MPI", "HOROVOD_WITHOUT_TENSORFLOW",
"HOROVOD_WITHOUT_MXNET", "HOROVOD_WITH_PYTORCH"
]
HOROVOD_SETUP_COMMANDS = [
"sudo apt update", "sudo apt -y install build-essential",
"pip install cmake"
] + [
f"export {horovod_env_var}=1"
for horovod_env_var in HOROVOD_INSTALL_ENV_VARS
]
# This test suite holds "user" tests to test important user workflows
# in a particular environment.
# All workloads in this test suite should:
@@ -251,6 +269,17 @@ MANUAL_TESTS = {
# 3. Use GPUs if applicable
# 4. Have the `use_connect` flag set.
USER_TESTS = {
"~/ray/release/horovod_tests/horovod_tests.yaml": [
ConnectTest(
"horovod_user_test_latest",
setup_commands=HOROVOD_SETUP_COMMANDS,
requirements_file="release/horovod_tests/driver_requirements.txt"),
ConnectTest(
"horovod_user_test_master",
setup_commands=HOROVOD_SETUP_COMMANDS,
requirements_file="release/horovod_tests"
"/driver_requirements_master.txt")
],
"~/ray/release/train_tests/train_tests.yaml": [
ConnectTest(
"train_tensorflow_mnist_test",
@@ -260,7 +289,7 @@ USER_TESTS = {
"train_torch_linear_test",
requirements_file="release/train_tests"
"/driver_requirements.txt")
]
],
}
SUITES = {
@@ -484,22 +513,21 @@ def create_test_step(
}]
}
step_conf["commands"] = [
"pip install -q -r release/requirements.txt",
"pip install -U boto3 botocore",
f"git clone -b {ray_test_branch} {ray_test_repo} ~/ray", cmd,
"sudo cp -rf /tmp/artifacts/* /tmp/ray_release_test_artifacts "
"|| true"
]
if isinstance(test_name, ConnectTest):
# Add driver side setup commands to the step.
pip_requirements_command = [f"pip install -U -r "
f"{test_name.requirements_file}"] if \
test_name.requirements_file else []
step_conf["commands"] = test_name.setup_commands \
+ pip_requirements_command \
+ step_conf["commands"]
+ pip_requirements_command
step_conf["commands"] += [
"pip install -q -r release/requirements.txt",
"pip install -U boto3 botocore",
f"git clone -b {ray_test_branch} {ray_test_repo} ~/ray", cmd,
"sudo cp -rf /tmp/artifacts/* /tmp/ray_release_test_artifacts "
"|| true"
]
step_conf["label"] = (
f"{test_name} "


@@ -14,7 +14,7 @@ post_build_cmds:
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
- pip3 install numpy || true
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
- pip3 install 'ray[rllib]'
- pip3 install 'ray[tune]'
- pip3 install torch torchvision
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U git+https://github.com/horovod/horovod.git
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U horovod
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}


@@ -0,0 +1,20 @@
base_image: "anyscale/ray-ml:pinned-nightly-py37-gpu"
env_vars: {}
debian_packages:
- curl
python:
pip_packages:
- pytest
- awscli
conda_packages: []
post_build_cmds:
- pip uninstall -y numpy ray || true
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
- pip3 install numpy || true
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
- pip3 install 'ray[tune]'
- pip3 install torch torchvision
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U git+https://github.com/horovod/horovod.git
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}


@@ -0,0 +1,8 @@
# Make sure the driver versions are the same as the cluster versions.
# The cluster uses the ray-ml Docker image.
# The ray-ml Docker image installs dependencies from the ray/python/requirements/ml/ directory.
# We constrain against this requirements file so that the same versions are installed.
-c ../../python/requirements/ml/requirements_dl.txt
torch
torchvision
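
(Why the constraint matters, as a hedged illustration rather than anything in the commit: with Ray Client the driver runs outside the cluster, so packages such as torch have to resolve to the same versions the ray-ml image already ships. A hypothetical parity check:)

import ray
import torch

# Ray Client connection, as in the user tests; the cluster name is made up.
ray.init(address="anyscale://horovod-user-test")

@ray.remote
def cluster_torch_version():
    import torch
    return torch.__version__

# Driver-side torch should match the version installed on the cluster.
assert torch.__version__ == ray.get(cluster_torch_version.remote())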


@@ -10,8 +10,8 @@ head_node_type:
worker_node_types:
- name: worker_node
instance_type: g3.8xlarge
min_workers: 3
max_workers: 3
min_workers: 3
use_spot: false
aws:


@@ -0,0 +1,24 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2
max_workers: 3
head_node_type:
name: head_node
instance_type: g3.8xlarge
worker_node_types:
- name: worker_node
instance_type: g3.8xlarge
max_workers: 3
min_workers: 0
use_spot: false
aws:
TagSpecifications:
- ResourceType: "instance"
Tags:
- Key: anyscale-user
Value: '{{env["ANYSCALE_USER"]}}'
- Key: anyscale-expiration
Value: '{{env["EXPIRATION_1D"]}}'


@@ -0,0 +1,3 @@
-r ./base_driver_reqs.txt
horovod


@@ -0,0 +1,4 @@
-r ./base_driver_reqs.txt
# Horovod master.
git+https://github.com/horovod/horovod.git


@@ -1,6 +1,6 @@
- name: horovod_test
cluster:
app_config: app_config.yaml
app_config: app_config_master.yaml
compute_template: compute_tpl.yaml
run:
@@ -12,3 +12,25 @@
smoke_test:
run:
timeout: 1800
- name: horovod_user_test_latest
cluster:
app_config: app_config.yaml
compute_template: compute_tpl_autoscaling.yaml
run:
use_connect: True
autosuspend_mins: 10
timeout: 1200
script: python workloads/horovod_user_test.py
- name: horovod_user_test_master
cluster:
app_config: app_config_master.yaml
compute_template: compute_tpl_autoscaling.yaml
run:
use_connect: True
autosuspend_mins: 10
timeout: 1200
script: python workloads/horovod_user_test.py


@@ -0,0 +1,33 @@
import json
import os
import time
import ray
from ray.util.horovod.horovod_example import main
if __name__ == "__main__":
start = time.time()
addr = os.environ.get("RAY_ADDRESS")
job_name = os.environ.get("RAY_JOB_NAME", "horovod_user_test")
if addr is not None and addr.startswith("anyscale://"):
ray.init(address=addr, job_name=job_name)
else:
ray.init(address="auto")
main(
num_workers=6,
use_gpu=True,
placement_group_timeout_s=900,
kwargs={"num_epochs": 20})
taken = time.time() - start
result = {
"time_taken": taken,
}
test_output_json = os.environ.get("TEST_OUTPUT_JSON",
"/tmp/horovod_user_test.json")
with open(test_output_json, "wt") as f:
json.dump(result, f)
print("Test Successful!")