[Release] Use NCCL backend for release tests (#20677)

* use nccl for release tests

* link issue
Authored by Amog Kamsetty on 2021-11-29 12:42:13 -08:00, committed by GitHub
parent d7b14ad9b8
commit 99ed623371
5 changed files with 21 additions and 13 deletions


@@ -5,7 +5,6 @@ import torch
 import torch.nn as nn
 import ray.train as train
 from ray.train import Trainer
-from ray.train.torch import TorchConfig
 from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback
@@ -89,9 +88,7 @@ def train_func(config):

 def train_linear(num_workers=2, use_gpu=False, epochs=3):
     trainer = Trainer(
-        backend=TorchConfig(backend="gloo"),
-        num_workers=num_workers,
-        use_gpu=use_gpu)
+        backend="torch", num_workers=num_workers, use_gpu=use_gpu)
     config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
     trainer.start()
     results = trainer.run(
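With the explicit TorchConfig(backend="gloo") removed, passing backend="torch" lets Ray Train fall back to its default process-group backend, which is NCCL when GPU workers are used and Gloo otherwise. The following is a minimal sketch of the (pre-2.0) Trainer API this test relies on, under that assumption; the train_func body is a placeholder, not the test's actual training loop:

    import ray.train as train
    from ray.train import Trainer
    from ray.train.torch import TorchConfig

    def train_func(config):
        # Placeholder training function; the real test trains a linear model.
        return config

    # Default: backend="torch" resolves to NCCL on GPU workers, Gloo on CPU.
    trainer = Trainer(backend="torch", num_workers=2, use_gpu=True)

    # To pin a backend explicitly (the pattern this commit removes), pass a TorchConfig:
    # trainer = Trainer(backend=TorchConfig(backend="gloo"), num_workers=2, use_gpu=True)

    trainer.start()
    results = trainer.run(train_func, config={"epochs": 1})
    trainer.shutdown()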


@@ -1,6 +1,4 @@
 base_image: "anyscale/ray-ml:nightly-py37-gpu"
-env_vars:
-  PL_TORCH_DISTRIBUTED_BACKEND: gloo
 debian_packages:
   - curl


@@ -1,6 +1,4 @@
 base_image: "anyscale/ray-ml:nightly-py37-gpu"
-env_vars:
-  PL_TORCH_DISTRIBUTED_BACKEND: gloo
 debian_packages:
   - curl
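Both cluster env files drop the PL_TORCH_DISTRIBUTED_BACKEND override. PyTorch Lightning (1.x) reads that variable to decide which torch.distributed backend DDP should use, so with it removed, GPU training falls back to NCCL. Below is a rough sketch of that env-var override pattern, using a hypothetical helper named pick_backend rather than Lightning's actual internals:

    import os

    def pick_backend(use_gpu: bool) -> str:
        # Hypothetical helper: an explicit PL_TORCH_DISTRIBUTED_BACKEND wins,
        # otherwise default to NCCL on GPU and Gloo on CPU.
        override = os.environ.get("PL_TORCH_DISTRIBUTED_BACKEND")
        if override:
            return override
        return "nccl" if use_gpu else "gloo"

    # With the cluster env no longer exporting the variable, GPU runs get NCCL.
    assert pick_backend(use_gpu=True) == "nccl"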


@@ -9,11 +9,19 @@ if __name__ == "__main__":
     start = time.time()
     addr = os.environ.get("RAY_ADDRESS")
-    job_name = os.environ.get("RAY_JOB_NAME", "horovod_user_test")
+    job_name = os.environ.get("RAY_JOB_NAME", "ray_lightning_user_test")
+
+    # Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
+    # anyscale_default_cloud.
+    # See https://github.com/pytorch/pytorch/issues/68893 for more details.
+    # Passing in runtime_env to ray.init() will also set it for all the
+    # workers.
+    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
+
     if addr is not None and addr.startswith("anyscale://"):
-        ray.init(address=addr, job_name=job_name)
+        ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
     else:
-        ray.init(address="auto")
+        ray.init(address="auto", runtime_env=runtime_env)

     main(num_workers=6, use_gpu=True, max_steps=50)
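As the comment block above notes, env_vars set in the job-level runtime_env are inherited by every Ray worker process, so NCCL inside each training worker binds to the "ens3" interface. A small sketch of that propagation, with a hypothetical check_nccl_ifname task used only to illustrate that the variable reaches the workers:

    import os
    import ray

    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
    ray.init(address="auto", runtime_env=runtime_env)

    @ray.remote
    def check_nccl_ifname():
        # Runs in a worker process; the runtime_env env var is visible here.
        return os.environ.get("NCCL_SOCKET_IFNAME")

    assert ray.get(check_nccl_ifname.remote()) == "ens3"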


@@ -12,10 +12,17 @@ if __name__ == "__main__":
     addr = os.environ.get("RAY_ADDRESS")
     job_name = os.environ.get("RAY_JOB_NAME", "train_torch_linear_test")
+
+    # Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
+    # anyscale_default_cloud.
+    # See https://github.com/pytorch/pytorch/issues/68893 for more details.
+    # Passing in runtime_env to ray.init() will also set it for all the
+    # workers.
+    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
+
     if addr is not None and addr.startswith("anyscale://"):
-        ray.init(address=addr, job_name=job_name)
+        ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
     else:
-        ray.init(address="auto")
+        ray.init(address="auto", runtime_env=runtime_env)

     results = train_linear(num_workers=6, use_gpu=True, epochs=20)