mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[Release] Use NCCL backend for release tests (#20677)
* Use NCCL for release tests
* Link issue
This commit is contained in:
parent
d7b14ad9b8
commit
99ed623371
5 changed files with 21 additions and 13 deletions
|
@ -5,7 +5,6 @@ import torch
|
|||
import torch.nn as nn
|
||||
import ray.train as train
|
||||
from ray.train import Trainer
|
||||
from ray.train.torch import TorchConfig
|
||||
from ray.train.callbacks import JsonLoggerCallback, TBXLoggerCallback
|
||||
|
||||
|
||||
|
@ -89,9 +88,7 @@ def train_func(config):
|
|||
|
||||
def train_linear(num_workers=2, use_gpu=False, epochs=3):
|
||||
trainer = Trainer(
|
||||
backend=TorchConfig(backend="gloo"),
|
||||
num_workers=num_workers,
|
||||
use_gpu=use_gpu)
|
||||
backend="torch", num_workers=num_workers, use_gpu=use_gpu)
|
||||
config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
|
||||
trainer.start()
|
||||
results = trainer.run(
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
base_image: "anyscale/ray-ml:nightly-py37-gpu"
|
||||
env_vars:
|
||||
PL_TORCH_DISTRIBUTED_BACKEND: gloo
|
||||
|
||||
debian_packages:
|
||||
- curl
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
base_image: "anyscale/ray-ml:nightly-py37-gpu"
|
||||
env_vars:
|
||||
PL_TORCH_DISTRIBUTED_BACKEND: gloo
|
||||
|
||||
debian_packages:
|
||||
- curl
|
||||
|
|
|
@ -9,11 +9,19 @@ if __name__ == "__main__":
|
|||
start = time.time()
|
||||
|
||||
addr = os.environ.get("RAY_ADDRESS")
|
||||
job_name = os.environ.get("RAY_JOB_NAME", "horovod_user_test")
|
||||
job_name = os.environ.get("RAY_JOB_NAME", "ray_lightning_user_test")
|
||||
|
||||
# Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
|
||||
# anyscale_default_cloud.
|
||||
# See https://github.com/pytorch/pytorch/issues/68893 for more details.
|
||||
# Passing in runtime_env to ray.init() will also set it for all the
|
||||
# workers.
|
||||
runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
|
||||
|
||||
if addr is not None and addr.startswith("anyscale://"):
|
||||
ray.init(address=addr, job_name=job_name)
|
||||
ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
|
||||
else:
|
||||
ray.init(address="auto")
|
||||
ray.init(address="auto", runtime_env=runtime_env)
|
||||
|
||||
main(num_workers=6, use_gpu=True, max_steps=50)
|
||||
|
||||
|
|
|
@ -12,10 +12,17 @@ if __name__ == "__main__":
|
|||
addr = os.environ.get("RAY_ADDRESS")
|
||||
job_name = os.environ.get("RAY_JOB_NAME", "train_torch_linear_test")
|
||||
|
||||
# Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
|
||||
# anyscale_default_cloud.
|
||||
# See https://github.com/pytorch/pytorch/issues/68893 for more details.
|
||||
# Passing in runtime_env to ray.init() will also set it for all the
|
||||
# workers.
|
||||
runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
|
||||
|
||||
if addr is not None and addr.startswith("anyscale://"):
|
||||
ray.init(address=addr, job_name=job_name)
|
||||
ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
|
||||
else:
|
||||
ray.init(address="auto")
|
||||
ray.init(address="auto", runtime_env=runtime_env)
|
||||
|
||||
results = train_linear(num_workers=6, use_gpu=True, epochs=20)
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue