mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[Release] Use NCCL backend for release tests (#20677)
* use nccl for release tests
* link issue
This commit is contained in:
parent
d7b14ad9b8
commit
99ed623371
5 changed files with 21 additions and 13 deletions
|
@ -5,7 +5,6 @@ import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import ray.train as train
|
import ray.train as train
|
||||||
from ray.train import Trainer
|
from ray.train import Trainer
|
||||||
from ray.train.torch import TorchConfig
|
|
||||||
from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback
|
from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback
|
||||||
|
|
||||||
|
|
||||||
|
@ -89,9 +88,7 @@ def train_func(config):
|
||||||
|
|
||||||
def train_linear(num_workers=2, use_gpu=False, epochs=3):
|
def train_linear(num_workers=2, use_gpu=False, epochs=3):
|
||||||
trainer = Trainer(
|
trainer = Trainer(
|
||||||
backend=TorchConfig(backend="gloo"),
|
backend="torch", num_workers=num_workers, use_gpu=use_gpu)
|
||||||
num_workers=num_workers,
|
|
||||||
use_gpu=use_gpu)
|
|
||||||
config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
|
config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
|
||||||
trainer.start()
|
trainer.start()
|
||||||
results = trainer.run(
|
results = trainer.run(
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
base_image: "anyscale/ray-ml:nightly-py37-gpu"
|
base_image: "anyscale/ray-ml:nightly-py37-gpu"
|
||||||
env_vars:
|
|
||||||
PL_TORCH_DISTRIBUTED_BACKEND: gloo
|
|
||||||
|
|
||||||
debian_packages:
|
debian_packages:
|
||||||
- curl
|
- curl
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
base_image: "anyscale/ray-ml:nightly-py37-gpu"
|
base_image: "anyscale/ray-ml:nightly-py37-gpu"
|
||||||
env_vars:
|
|
||||||
PL_TORCH_DISTRIBUTED_BACKEND: gloo
|
|
||||||
|
|
||||||
debian_packages:
|
debian_packages:
|
||||||
- curl
|
- curl
|
||||||
|
|
|
@ -9,11 +9,19 @@ if __name__ == "__main__":
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
|
||||||
addr = os.environ.get("RAY_ADDRESS")
|
addr = os.environ.get("RAY_ADDRESS")
|
||||||
job_name = os.environ.get("RAY_JOB_NAME", "horovod_user_test")
|
job_name = os.environ.get("RAY_JOB_NAME", "ray_lightning_user_test")
|
||||||
|
|
||||||
|
# Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
|
||||||
|
# anyscale_default_cloud.
|
||||||
|
# See https://github.com/pytorch/pytorch/issues/68893 for more details.
|
||||||
|
# Passing in runtime_env to ray.init() will also set it for all the
|
||||||
|
# workers.
|
||||||
|
runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
|
||||||
|
|
||||||
if addr is not None and addr.startswith("anyscale://"):
|
if addr is not None and addr.startswith("anyscale://"):
|
||||||
ray.init(address=addr, job_name=job_name)
|
ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
|
||||||
else:
|
else:
|
||||||
ray.init(address="auto")
|
ray.init(address="auto", runtime_env=runtime_env)
|
||||||
|
|
||||||
main(num_workers=6, use_gpu=True, max_steps=50)
|
main(num_workers=6, use_gpu=True, max_steps=50)
|
||||||
|
|
||||||
|
|
|
@ -12,10 +12,17 @@ if __name__ == "__main__":
|
||||||
addr = os.environ.get("RAY_ADDRESS")
|
addr = os.environ.get("RAY_ADDRESS")
|
||||||
job_name = os.environ.get("RAY_JOB_NAME", "train_torch_linear_test")
|
job_name = os.environ.get("RAY_JOB_NAME", "train_torch_linear_test")
|
||||||
|
|
||||||
|
# Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
|
||||||
|
# anyscale_default_cloud.
|
||||||
|
# See https://github.com/pytorch/pytorch/issues/68893 for more details.
|
||||||
|
# Passing in runtime_env to ray.init() will also set it for all the
|
||||||
|
# workers.
|
||||||
|
runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
|
||||||
|
|
||||||
if addr is not None and addr.startswith("anyscale://"):
|
if addr is not None and addr.startswith("anyscale://"):
|
||||||
ray.init(address=addr, job_name=job_name)
|
ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
|
||||||
else:
|
else:
|
||||||
ray.init(address="auto")
|
ray.init(address="auto", runtime_env=runtime_env)
|
||||||
|
|
||||||
results = train_linear(num_workers=6, use_gpu=True, epochs=20)
|
results = train_linear(num_workers=6, use_gpu=True, epochs=20)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue