[train] remove default num_workers (#19518)

* [train] remove default num_workers

* fix tests
matthewdeng 2021-10-19 13:53:23 -07:00 committed by GitHub
parent 7e10f6a876
commit 19eabd7a55
6 changed files with 15 additions and 17 deletions
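
With this change, num_workers no longer defaults to 1 and must be passed explicitly when constructing a Trainer. A minimal sketch of the new calling convention, assuming the torch backend and a user-defined train_func (the body of train_func is illustrative only, not part of this commit):

    from ray.train import Trainer

    def train_func():
        # User-defined training loop goes here.
        ...

    # num_workers is now a required argument.
    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()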


@@ -24,7 +24,7 @@ class TrainingLogdirMixin:
         logdir_path = Path(logdir)
         if not logdir_path.is_dir():
-            raise ValueError(f"logdir '{logdir}' must be a directory.")
+            raise ValueError(f"logdir '{logdir_path}' must be a directory.")
         self._logdir_path = logdir_path


@@ -6,7 +6,7 @@ from ray.train import Trainer
 from ray.train.examples.train_fashion_mnist_example import train_func
-def main(num_workers=1, use_gpu=False):
+def main(num_workers=2, use_gpu=False):
     mlflow.set_experiment("train_torch_fashion_mnist")
     trainer = Trainer(
@@ -38,7 +38,7 @@ if __name__ == "__main__":
         "--num-workers",
         "-n",
         type=int,
-        default=1,
+        default=2,
         help="Sets number of workers for training.")
     parser.add_argument(
         "--use-gpu",


@@ -120,7 +120,7 @@ def train_func(config: Dict):
     return loss_results
-def train_fashion_mnist(num_workers=1, use_gpu=False):
+def train_fashion_mnist(num_workers=2, use_gpu=False):
     trainer = Trainer(
         backend="torch", num_workers=num_workers, use_gpu=use_gpu)
     trainer.start()
@@ -131,7 +131,7 @@ def train_fashion_mnist(num_workers=1, use_gpu=False):
             "batch_size": 64,
             "epochs": 4
         },
-        callbacks=[JsonLoggerCallback("./train_results")])
+        callbacks=[JsonLoggerCallback()])
     trainer.shutdown()
     print(f"Loss results: {result}")
@@ -147,15 +147,13 @@ if __name__ == "__main__":
         "--num-workers",
         "-n",
         type=int,
-        default=1,
+        default=2,
         help="Sets number of workers for training.")
     parser.add_argument(
         "--use-gpu",
         action="store_true",
         default=False,
         help="Enables GPU training")
-    parser.add_argument(
-        "--tune", action="store_true", default=False, help="Tune training")
     parser.add_argument(
         "--smoke-test",
         action="store_true",


@@ -592,10 +592,10 @@ def test_horovod_torch_mnist_stateful(ray_start_2_cpus):
 def test_init_failure(ray_start_2_cpus):
     with pytest.raises(TypeError):
-        Trainer(5)
+        Trainer(5, num_workers=2)
     with pytest.raises(ValueError):
-        Trainer("invalid")
+        Trainer("invalid", num_workers=2)
 def test_start_failure(ray_start_2_cpus):


@@ -97,7 +97,7 @@ def test_tune_error(ray_start_2_cpus):
     def train_func(config):
         raise RuntimeError("Error in training function!")
-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)
     with pytest.raises(TuneError):
@@ -110,7 +110,7 @@ def test_tune_checkpoint(ray_start_2_cpus):
             train.report(test=i)
         train.save_checkpoint(hello="world")
-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)
     [trial] = tune.run(TestTrainable).trials
@@ -133,7 +133,7 @@ def test_reuse_checkpoint(ray_start_2_cpus):
             train.save_checkpoint(iter=i)
             train.report(test=i, training_iteration=i)
-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)
     [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials
@@ -163,7 +163,7 @@ def test_retry(ray_start_2_cpus):
             train.save_checkpoint(iter=i)
             train.report(test=i, training_iteration=i)
-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)
     analysis = tune.run(TestTrainable, max_failures=3)


@@ -64,8 +64,8 @@ class Trainer:
             a subclass of ``BackendConfig`` can be passed in.
             Supported ``str`` values: {"torch", "tensorflow", "horovod"}.
         num_workers (int): The number of workers (Ray actors) to launch.
-            Defaults to 1. Each worker will reserve 1 CPU by default. The
-            number of CPUs reserved by each worker can be overridden with the
+            Each worker will reserve 1 CPU by default. The number of CPUs
+            reserved by each worker can be overridden with the
             ``resources_per_worker`` argument.
         use_gpu (bool): If True, training will be done on GPUs (1 per
             worker). Defaults to False. The number of GPUs reserved by each
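
As the updated docstring notes, each worker reserves 1 CPU unless overridden via ``resources_per_worker``. A sketch of such an override, assuming the standard Ray resource keys "CPU" and "GPU" and a cluster with enough of each (the specific values are illustrative, not from this commit):

    # Reserve 2 CPUs and 1 GPU per worker instead of the 1-CPU default.
    trainer = Trainer(
        backend="torch",
        num_workers=4,
        use_gpu=True,
        resources_per_worker={"CPU": 2, "GPU": 1})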
@@ -85,7 +85,7 @@ class Trainer:
     def __init__(
             self,
             backend: Union[str, BackendConfig],
-            num_workers: int = 1,
+            num_workers: int,
             use_gpu: bool = False,
             resources_per_worker: Optional[Dict[str, float]] = None,
             logdir: Optional[str] = None,
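
Because ``num_workers`` now has no default in the signature above, omitting it fails at construction time with Python's standard missing-argument error, before any backend validation runs. A minimal sketch of that behavior:

    import pytest
    from ray.train import Trainer

    # num_workers omitted: Python raises TypeError for the missing
    # required argument.
    with pytest.raises(TypeError):
        Trainer(backend="torch")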