From 19eabd7a558cf690a4454dc6e1f1cf552b8f18ad Mon Sep 17 00:00:00 2001
From: matthewdeng
Date: Tue, 19 Oct 2021 13:53:23 -0700
Subject: [PATCH] [train] remove default num_workers (#19518)

* [train] remove default num_workers

* fix tests
---
 python/ray/train/callbacks/logging.py                     | 2 +-
 python/ray/train/examples/mlflow_fashion_mnist_example.py | 4 ++--
 python/ray/train/examples/train_fashion_mnist_example.py  | 8 +++-----
 python/ray/train/tests/test_trainer.py                    | 4 ++--
 python/ray/train/tests/test_tune.py                       | 8 ++++----
 python/ray/train/trainer.py                               | 6 +++---
 6 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/python/ray/train/callbacks/logging.py b/python/ray/train/callbacks/logging.py
index eacd7bb4e..41ead4db3 100644
--- a/python/ray/train/callbacks/logging.py
+++ b/python/ray/train/callbacks/logging.py
@@ -24,7 +24,7 @@ class TrainingLogdirMixin:
         logdir_path = Path(logdir)
 
         if not logdir_path.is_dir():
-            raise ValueError(f"logdir '{logdir}' must be a directory.")
+            raise ValueError(f"logdir '{logdir_path}' must be a directory.")
 
         self._logdir_path = logdir_path
 
diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py
index 0b7dd1503..5205b5e94 100644
--- a/python/ray/train/examples/mlflow_fashion_mnist_example.py
+++ b/python/ray/train/examples/mlflow_fashion_mnist_example.py
@@ -6,7 +6,7 @@ from ray.train import Trainer
 from ray.train.examples.train_fashion_mnist_example import train_func
 
 
-def main(num_workers=1, use_gpu=False):
+def main(num_workers=2, use_gpu=False):
     mlflow.set_experiment("train_torch_fashion_mnist")
 
     trainer = Trainer(
@@ -38,7 +38,7 @@ if __name__ == "__main__":
         "--num-workers",
         "-n",
         type=int,
-        default=1,
+        default=2,
         help="Sets number of workers for training.")
     parser.add_argument(
         "--use-gpu",
diff --git a/python/ray/train/examples/train_fashion_mnist_example.py b/python/ray/train/examples/train_fashion_mnist_example.py
index 72261b4e8..823717c56 100644
--- a/python/ray/train/examples/train_fashion_mnist_example.py
+++ b/python/ray/train/examples/train_fashion_mnist_example.py
@@ -120,7 +120,7 @@ def train_func(config: Dict):
     return loss_results
 
 
-def train_fashion_mnist(num_workers=1, use_gpu=False):
+def train_fashion_mnist(num_workers=2, use_gpu=False):
     trainer = Trainer(
         backend="torch", num_workers=num_workers, use_gpu=use_gpu)
     trainer.start()
@@ -131,7 +131,7 @@ def train_fashion_mnist(num_workers=1, use_gpu=False):
             "batch_size": 64,
             "epochs": 4
         },
-        callbacks=[JsonLoggerCallback("./train_results")])
+        callbacks=[JsonLoggerCallback()])
     trainer.shutdown()
     print(f"Loss results: {result}")
 
@@ -147,15 +147,13 @@ if __name__ == "__main__":
         "--num-workers",
         "-n",
         type=int,
-        default=1,
+        default=2,
         help="Sets number of workers for training.")
     parser.add_argument(
         "--use-gpu",
         action="store_true",
         default=False,
         help="Enables GPU training")
-    parser.add_argument(
-        "--tune", action="store_true", default=False, help="Tune training")
     parser.add_argument(
         "--smoke-test",
         action="store_true",
diff --git a/python/ray/train/tests/test_trainer.py b/python/ray/train/tests/test_trainer.py
index 94ec8c3c8..bcd6f6b81 100644
--- a/python/ray/train/tests/test_trainer.py
+++ b/python/ray/train/tests/test_trainer.py
@@ -592,10 +592,10 @@ def test_horovod_torch_mnist_stateful(ray_start_2_cpus):
 
 def test_init_failure(ray_start_2_cpus):
     with pytest.raises(TypeError):
-        Trainer(5)
+        Trainer(5, num_workers=2)
 
     with pytest.raises(ValueError):
-        Trainer("invalid")
+        Trainer("invalid", num_workers=2)
 
 
 def test_start_failure(ray_start_2_cpus):
diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py
index cbb95de24..39ec177da 100644
--- a/python/ray/train/tests/test_tune.py
+++ b/python/ray/train/tests/test_tune.py
@@ -97,7 +97,7 @@ def test_tune_error(ray_start_2_cpus):
     def train_func(config):
         raise RuntimeError("Error in training function!")
 
-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)
 
     with pytest.raises(TuneError):
@@ -110,7 +110,7 @@ def test_tune_checkpoint(ray_start_2_cpus):
             train.report(test=i)
         train.save_checkpoint(hello="world")
 
-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)
 
     [trial] = tune.run(TestTrainable).trials
@@ -133,7 +133,7 @@ def test_reuse_checkpoint(ray_start_2_cpus):
             train.save_checkpoint(iter=i)
             train.report(test=i, training_iteration=i)
 
-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)
 
     [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials
@@ -163,7 +163,7 @@ def test_retry(ray_start_2_cpus):
             train.save_checkpoint(iter=i)
             train.report(test=i, training_iteration=i)
 
-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)
 
     analysis = tune.run(TestTrainable, max_failures=3)
diff --git a/python/ray/train/trainer.py b/python/ray/train/trainer.py
index edd8b73a9..4e7589b46 100644
--- a/python/ray/train/trainer.py
+++ b/python/ray/train/trainer.py
@@ -64,8 +64,8 @@ class Trainer:
             a subclass of ``BackendConfig`` can be passed in.
             Supported ``str`` values: {"torch", "tensorflow", "horovod"}.
         num_workers (int): The number of workers (Ray actors) to launch.
-            Defaults to 1. Each worker will reserve 1 CPU by default. The
-            number of CPUs reserved by each worker can be overridden with the
+            Each worker will reserve 1 CPU by default. The number of CPUs
+            reserved by each worker can be overridden with the
             ``resources_per_worker`` argument.
         use_gpu (bool): If True, training will be done on GPUs (1 per
             worker). Defaults to False. The number of GPUs reserved by each
@@ -85,7 +85,7 @@
     def __init__(
             self,
             backend: Union[str, BackendConfig],
-            num_workers: int = 1,
+            num_workers: int,
            use_gpu: bool = False,
            resources_per_worker: Optional[Dict[str, float]] = None,
            logdir: Optional[str] = None,
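
Note (not part of the patch): a minimal usage sketch of the behavior change, assuming the Ray Train Trainer API of this era. With the default removed, callers must now pass num_workers explicitly when constructing a Trainer; omitting it raises a TypeError.

    import ray
    from ray.train import Trainer

    def train_func():
        # Trivial training function for illustration; any callable accepted
        # by Trainer.run() works here.
        return 1

    ray.init()
    # num_workers no longer defaults to 1 and must be supplied explicitly.
    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    results = trainer.run(train_func)  # one result per worker
    trainer.shutdown()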