[Release] Use NCCL backend for release tests (#20677)

* use nccl for release tests

* link issue
This commit is contained in:
Amog Kamsetty 2021-11-29 12:42:13 -08:00 committed by GitHub
parent d7b14ad9b8
commit 99ed623371
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 21 additions and 13 deletions

View file

@ -5,7 +5,6 @@ import torch
import torch.nn as nn
import ray.train as train
from ray.train import Trainer
from ray.train.torch import TorchConfig
from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback
@ -89,9 +88,7 @@ def train_func(config):
def train_linear(num_workers=2, use_gpu=False, epochs=3):
trainer = Trainer(
backend=TorchConfig(backend="gloo"),
num_workers=num_workers,
use_gpu=use_gpu)
backend="torch", num_workers=num_workers, use_gpu=use_gpu)
config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
trainer.start()
results = trainer.run(

View file

@ -1,6 +1,4 @@
base_image: "anyscale/ray-ml:nightly-py37-gpu"
env_vars:
PL_TORCH_DISTRIBUTED_BACKEND: gloo
debian_packages:
- curl

View file

@ -1,6 +1,4 @@
base_image: "anyscale/ray-ml:nightly-py37-gpu"
env_vars:
PL_TORCH_DISTRIBUTED_BACKEND: gloo
debian_packages:
- curl

View file

@ -9,11 +9,19 @@ if __name__ == "__main__":
start = time.time()
addr = os.environ.get("RAY_ADDRESS")
job_name = os.environ.get("RAY_JOB_NAME", "horovod_user_test")
job_name = os.environ.get("RAY_JOB_NAME", "ray_lightning_user_test")
# Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
# anyscale_default_cloud.
# See https://github.com/pytorch/pytorch/issues/68893 for more details.
# Passing runtime_env to ray.init() also sets this environment variable
# for all the workers.
runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
if addr is not None and addr.startswith("anyscale://"):
ray.init(address=addr, job_name=job_name)
ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
else:
ray.init(address="auto")
ray.init(address="auto", runtime_env=runtime_env)
main(num_workers=6, use_gpu=True, max_steps=50)

View file

@ -12,10 +12,17 @@ if __name__ == "__main__":
addr = os.environ.get("RAY_ADDRESS")
job_name = os.environ.get("RAY_JOB_NAME", "train_torch_linear_test")
# Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
# anyscale_default_cloud.
# See https://github.com/pytorch/pytorch/issues/68893 for more details.
# Passing runtime_env to ray.init() also sets this environment variable
# for all the workers.
runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
if addr is not None and addr.startswith("anyscale://"):
ray.init(address=addr, job_name=job_name)
ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
else:
ray.init(address="auto")
ray.init(address="auto", runtime_env=runtime_env)
results = train_linear(num_workers=6, use_gpu=True, epochs=20)