mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[Release/Horovod] Add user test for Horovod (#19661)
* infra * wip * add test * typo * typo * update * rename * fix * full path * formatting * reorder * update * update * Update release/horovod_tests/workloads/horovod_user_test.py Co-authored-by: matthewdeng <matthew.j.deng@gmail.com> * bump num_workers * update installs * try * add pip_packages * min_workers * fix * bump pg timeout * Fix symlink * fix * fix * cmake * fix * pin filelock * final * update * fix * Update release/horovod_tests/workloads/horovod_user_test.py * fix * fix * separate compute template * test latest and master Co-authored-by: matthewdeng <matthew.j.deng@gmail.com>
This commit is contained in:
parent
e1e4a45b8d
commit
474e44f7e0
13 changed files with 171 additions and 20 deletions
|
@ -1 +1 @@
|
|||
../../../../release/horovod_tests/workloads/horovod_test.py
|
||||
../../../../release/horovod_tests/workloads/horovod_tune_test.py
|
|
@ -115,11 +115,20 @@ def train_fn(data_dir=None,
|
|||
100. * batch_idx / len(train_loader), loss.item()))
|
||||
|
||||
|
||||
def main(num_workers, use_gpu, **kwargs):
|
||||
settings = RayExecutor.create_settings(timeout_s=30)
|
||||
def main(num_workers,
         use_gpu,
         timeout_s=30,
         placement_group_timeout_s=100,
         kwargs=None):
    """Start a ``RayExecutor`` and run ``train_fn`` on it.

    Args:
        num_workers: Number of workers passed to ``RayExecutor``.
        use_gpu: Passed to ``RayExecutor``. When truthy, ``use_cuda=True``
            is also added to the keyword arguments given to ``train_fn``.
        timeout_s: Passed to ``RayExecutor.create_settings``.
        placement_group_timeout_s: Passed to
            ``RayExecutor.create_settings``.
        kwargs: Optional dict of keyword arguments for ``train_fn``. The
            dict is copied, so the caller's object is never modified.
    """
    # Copy before inserting ``use_cuda`` so a dict supplied by the caller
    # is left untouched.
    kwargs = dict(kwargs) if kwargs else {}
    if use_gpu:
        kwargs["use_cuda"] = True
    settings = RayExecutor.create_settings(
        timeout_s=timeout_s,
        placement_group_timeout_s=placement_group_timeout_s)
    executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
    executor.start()
    executor.run(train_fn, kwargs=kwargs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -20,7 +20,12 @@ import yaml
|
|||
|
||||
|
||||
class ReleaseTest:
|
||||
def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
smoke_test: bool = False,
|
||||
retry: int = 0,
|
||||
):
|
||||
self.name = name
|
||||
self.smoke_test = smoke_test
|
||||
self.retry = retry
|
||||
|
@ -243,6 +248,19 @@ MANUAL_TESTS = {
|
|||
],
|
||||
}
|
||||
|
||||
# Names of the environment variables that the setup commands below export
# with the value 1.
HOROVOD_INSTALL_ENV_VARS = [
    "HOROVOD_WITH_GLOO",
    "HOROVOD_WITHOUT_MPI",
    "HOROVOD_WITHOUT_TENSORFLOW",
    "HOROVOD_WITHOUT_MXNET",
    "HOROVOD_WITH_PYTORCH",
]

# Setup commands for the Horovod user tests: package installs first, then
# one `export` per variable listed in HOROVOD_INSTALL_ENV_VARS.
HOROVOD_SETUP_COMMANDS = [
    "sudo apt update",
    "sudo apt -y install build-essential",
    "pip install cmake",
    *(f"export {var_name}=1" for var_name in HOROVOD_INSTALL_ENV_VARS),
]
|
||||
|
||||
# This test suite holds "user" tests to test important user workflows
|
||||
# in a particular environment.
|
||||
# All workloads in this test suite should:
|
||||
|
@ -251,6 +269,17 @@ MANUAL_TESTS = {
|
|||
# 3. Use GPUs if applicable
|
||||
# 4. Have the `use_connect` flag set.
|
||||
USER_TESTS = {
|
||||
"~/ray/release/horovod_tests/horovod_tests.yaml": [
|
||||
ConnectTest(
|
||||
"horovod_user_test_latest",
|
||||
setup_commands=HOROVOD_SETUP_COMMANDS,
|
||||
requirements_file="release/horovod_tests/driver_requirements.txt"),
|
||||
ConnectTest(
|
||||
"horovod_user_test_master",
|
||||
setup_commands=HOROVOD_SETUP_COMMANDS,
|
||||
requirements_file="release/horovod_tests"
|
||||
"/driver_requirements_master.txt")
|
||||
],
|
||||
"~/ray/release/train_tests/train_tests.yaml": [
|
||||
ConnectTest(
|
||||
"train_tensorflow_mnist_test",
|
||||
|
@ -260,7 +289,7 @@ USER_TESTS = {
|
|||
"train_torch_linear_test",
|
||||
requirements_file="release/train_tests"
|
||||
"/driver_requirements.txt")
|
||||
]
|
||||
],
|
||||
}
|
||||
|
||||
SUITES = {
|
||||
|
@ -484,22 +513,21 @@ def create_test_step(
|
|||
}]
|
||||
}
|
||||
|
||||
step_conf["commands"] = [
|
||||
"pip install -q -r release/requirements.txt",
|
||||
"pip install -U boto3 botocore",
|
||||
f"git clone -b {ray_test_branch} {ray_test_repo} ~/ray", cmd,
|
||||
"sudo cp -rf /tmp/artifacts/* /tmp/ray_release_test_artifacts "
|
||||
"|| true"
|
||||
]
|
||||
|
||||
if isinstance(test_name, ConnectTest):
|
||||
# Add driver side setup commands to the step.
|
||||
pip_requirements_command = [f"pip install -U -r "
|
||||
f"{test_name.requirements_file}"] if \
|
||||
test_name.requirements_file else []
|
||||
step_conf["commands"] = test_name.setup_commands \
|
||||
+ pip_requirements_command \
|
||||
+ step_conf["commands"]
|
||||
+ pip_requirements_command
|
||||
|
||||
step_conf["commands"] += [
|
||||
"pip install -q -r release/requirements.txt",
|
||||
"pip install -U boto3 botocore",
|
||||
f"git clone -b {ray_test_branch} {ray_test_repo} ~/ray", cmd,
|
||||
"sudo cp -rf /tmp/artifacts/* /tmp/ray_release_test_artifacts "
|
||||
"|| true"
|
||||
]
|
||||
|
||||
step_conf["label"] = (
|
||||
f"{test_name} "
|
||||
|
|
|
@ -14,7 +14,7 @@ post_build_cmds:
|
|||
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
|
||||
- pip3 install numpy || true
|
||||
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
|
||||
- pip3 install 'ray[rllib]'
|
||||
- pip3 install 'ray[tune]'
|
||||
- pip3 install torch torchvision
|
||||
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U git+https://github.com/horovod/horovod.git
|
||||
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U horovod
|
||||
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
|
||||
|
|
20
release/horovod_tests/app_config_master.yaml
Normal file
20
release/horovod_tests/app_config_master.yaml
Normal file
|
@ -0,0 +1,20 @@
|
|||
base_image: "anyscale/ray-ml:pinned-nightly-py37-gpu"
|
||||
env_vars: {}
|
||||
debian_packages:
|
||||
- curl
|
||||
|
||||
python:
|
||||
pip_packages:
|
||||
- pytest
|
||||
- awscli
|
||||
conda_packages: []
|
||||
|
||||
post_build_cmds:
|
||||
- pip uninstall -y numpy ray || true
|
||||
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
|
||||
- pip3 install numpy || true
|
||||
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
|
||||
- pip3 install 'ray[tune]'
|
||||
- pip3 install torch torchvision
|
||||
- HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip3 install -U git+https://github.com/horovod/horovod.git
|
||||
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
|
8
release/horovod_tests/base_driver_reqs.txt
Normal file
8
release/horovod_tests/base_driver_reqs.txt
Normal file
|
@ -0,0 +1,8 @@
|
|||
# Make sure the driver versions are the same as cluster versions.
|
||||
# The cluster uses ray-ml Docker image.
|
||||
# ray-ml Docker image installs dependencies from ray/python/requirements/ml/ directory.
|
||||
# We constrain on this requirements file so that the same versions are installed.
|
||||
-c ../../python/requirements/ml/requirements_dl.txt
|
||||
|
||||
torch
|
||||
torchvision
|
|
@ -10,8 +10,8 @@ head_node_type:
|
|||
worker_node_types:
|
||||
- name: worker_node
|
||||
instance_type: g3.8xlarge
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
min_workers: 3
|
||||
use_spot: false
|
||||
|
||||
aws:
|
||||
|
|
24
release/horovod_tests/compute_tpl_autoscaling.yaml
Normal file
24
release/horovod_tests/compute_tpl_autoscaling.yaml
Normal file
|
@ -0,0 +1,24 @@
|
|||
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
|
||||
region: us-west-2
|
||||
|
||||
max_workers: 3
|
||||
|
||||
head_node_type:
|
||||
name: head_node
|
||||
instance_type: g3.8xlarge
|
||||
|
||||
worker_node_types:
|
||||
- name: worker_node
|
||||
instance_type: g3.8xlarge
|
||||
max_workers: 3
|
||||
min_workers: 0
|
||||
use_spot: false
|
||||
|
||||
aws:
|
||||
TagSpecifications:
|
||||
- ResourceType: "instance"
|
||||
Tags:
|
||||
- Key: anyscale-user
|
||||
Value: '{{env["ANYSCALE_USER"]}}'
|
||||
- Key: anyscale-expiration
|
||||
Value: '{{env["EXPIRATION_1D"]}}'
|
3
release/horovod_tests/driver_requirements.txt
Normal file
3
release/horovod_tests/driver_requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
-r ./base_driver_reqs.txt
|
||||
|
||||
horovod
|
4
release/horovod_tests/driver_requirements_master.txt
Normal file
4
release/horovod_tests/driver_requirements_master.txt
Normal file
|
@ -0,0 +1,4 @@
|
|||
-r ./base_driver_reqs.txt
|
||||
|
||||
# Horovod master.
|
||||
git+https://github.com/horovod/horovod.git
|
|
@ -1,6 +1,6 @@
|
|||
- name: horovod_test
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
app_config: app_config_master.yaml
|
||||
compute_template: compute_tpl.yaml
|
||||
|
||||
run:
|
||||
|
@ -12,3 +12,25 @@
|
|||
smoke_test:
|
||||
run:
|
||||
timeout: 1800
|
||||
|
||||
- name: horovod_user_test_latest
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_autoscaling.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python workloads/horovod_user_test.py
|
||||
|
||||
- name: horovod_user_test_master
|
||||
cluster:
|
||||
app_config: app_config_master.yaml
|
||||
compute_template: compute_tpl_autoscaling.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python workloads/horovod_user_test.py
|
||||
|
|
33
release/horovod_tests/workloads/horovod_user_test.py
Normal file
33
release/horovod_tests/workloads/horovod_user_test.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.util.horovod.horovod_example import main
|
||||
|
||||
if __name__ == "__main__":
    # Wall-clock timer covering the Ray connection and the training run.
    began_at = time.time()

    ray_address = os.environ.get("RAY_ADDRESS")
    ray_job_name = os.environ.get("RAY_JOB_NAME", "horovod_user_test")
    is_anyscale = (ray_address is not None
                   and ray_address.startswith("anyscale://"))
    if not is_anyscale:
        # No anyscale:// address was provided: connect with address="auto".
        ray.init(address="auto")
    else:
        ray.init(address=ray_address, job_name=ray_job_name)

    train_kwargs = {"num_epochs": 20}
    main(
        num_workers=6,
        use_gpu=True,
        placement_group_timeout_s=900,
        kwargs=train_kwargs)

    elapsed = time.time() - began_at

    # Write the elapsed time as JSON to the path named by TEST_OUTPUT_JSON,
    # falling back to a fixed path under /tmp.
    output_path = os.environ.get("TEST_OUTPUT_JSON",
                                 "/tmp/horovod_user_test.json")
    with open(output_path, "wt") as out_file:
        json.dump({"time_taken": elapsed}, out_file)

    print("Test Successful!")
|
Loading…
Add table
Reference in a new issue