[train] remove default num_workers (#19518)

* [train] remove default num_workers
* fix tests
parent 7e10f6a876
commit 19eabd7a55
6 changed files with 15 additions and 17 deletions
@@ -24,7 +24,7 @@ class TrainingLogdirMixin:
         logdir_path = Path(logdir)

         if not logdir_path.is_dir():
-            raise ValueError(f"logdir '{logdir}' must be a directory.")
+            raise ValueError(f"logdir '{logdir_path}' must be a directory.")

         self._logdir_path = logdir_path
@@ -6,7 +6,7 @@ from ray.train import Trainer
 from ray.train.examples.train_fashion_mnist_example import train_func


-def main(num_workers=1, use_gpu=False):
+def main(num_workers=2, use_gpu=False):
     mlflow.set_experiment("train_torch_fashion_mnist")

     trainer = Trainer(
@@ -38,7 +38,7 @@ if __name__ == "__main__":
         "--num-workers",
         "-n",
         type=int,
-        default=1,
+        default=2,
         help="Sets number of workers for training.")
     parser.add_argument(
         "--use-gpu",
@@ -120,7 +120,7 @@ def train_func(config: Dict):
     return loss_results


-def train_fashion_mnist(num_workers=1, use_gpu=False):
+def train_fashion_mnist(num_workers=2, use_gpu=False):
     trainer = Trainer(
         backend="torch", num_workers=num_workers, use_gpu=use_gpu)
     trainer.start()
@@ -131,7 +131,7 @@ def train_fashion_mnist(num_workers=1, use_gpu=False):
             "batch_size": 64,
             "epochs": 4
         },
-        callbacks=[JsonLoggerCallback("./train_results")])
+        callbacks=[JsonLoggerCallback()])
     trainer.shutdown()
     print(f"Loss results: {result}")
@@ -147,15 +147,13 @@ if __name__ == "__main__":
         "--num-workers",
         "-n",
         type=int,
-        default=1,
+        default=2,
         help="Sets number of workers for training.")
     parser.add_argument(
         "--use-gpu",
         action="store_true",
         default=False,
         help="Enables GPU training")
     parser.add_argument(
         "--tune", action="store_true", default=False, help="Tune training")
     parser.add_argument(
         "--smoke-test",
         action="store_true",
@@ -592,10 +592,10 @@ def test_horovod_torch_mnist_stateful(ray_start_2_cpus):

 def test_init_failure(ray_start_2_cpus):
     with pytest.raises(TypeError):
-        Trainer(5)
+        Trainer(5, num_workers=2)

     with pytest.raises(ValueError):
-        Trainer("invalid")
+        Trainer("invalid", num_workers=2)


 def test_start_failure(ray_start_2_cpus):
@@ -97,7 +97,7 @@ def test_tune_error(ray_start_2_cpus):
     def train_func(config):
         raise RuntimeError("Error in training function!")

-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)

     with pytest.raises(TuneError):
@@ -110,7 +110,7 @@ def test_tune_checkpoint(ray_start_2_cpus):
         train.report(test=i)
         train.save_checkpoint(hello="world")

-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)

     [trial] = tune.run(TestTrainable).trials
@@ -133,7 +133,7 @@ def test_reuse_checkpoint(ray_start_2_cpus):
         train.save_checkpoint(iter=i)
         train.report(test=i, training_iteration=i)

-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)

     [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials
@@ -163,7 +163,7 @@ def test_retry(ray_start_2_cpus):
         train.save_checkpoint(iter=i)
         train.report(test=i, training_iteration=i)

-    trainer = Trainer(TestConfig())
+    trainer = Trainer(TestConfig(), num_workers=1)
     TestTrainable = trainer.to_tune_trainable(train_func)

     analysis = tune.run(TestTrainable, max_failures=3)
@@ -64,8 +64,8 @@ class Trainer:
             a subclass of ``BackendConfig`` can be passed in.
             Supported ``str`` values: {"torch", "tensorflow", "horovod"}.
         num_workers (int): The number of workers (Ray actors) to launch.
-            Defaults to 1. Each worker will reserve 1 CPU by default. The
-            number of CPUs reserved by each worker can be overridden with the
+            Each worker will reserve 1 CPU by default. The number of CPUs
+            reserved by each worker can be overridden with the
             ``resources_per_worker`` argument.
         use_gpu (bool): If True, training will be done on GPUs (1 per
             worker). Defaults to False. The number of GPUs reserved by each
@@ -85,7 +85,7 @@ class Trainer:
     def __init__(
             self,
             backend: Union[str, BackendConfig],
-            num_workers: int = 1,
+            num_workers: int,
             use_gpu: bool = False,
             resources_per_worker: Optional[Dict[str, float]] = None,
             logdir: Optional[str] = None,
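With the default removed, every Trainer construction site now has to state its worker count. A minimal sketch of the resulting call pattern, loosely following the fashion-mnist example above (the backend string, worker count, and toy train_func are illustrative, not taken from the commit):

from ray.train import Trainer


def train_func(config):
    # Toy per-worker function used only to illustrate the API shape.
    return config["epochs"]


# num_workers no longer has a default, so it must be passed explicitly;
# leaving it out now fails at construction time instead of silently
# running on a single worker.
trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(train_func, config={"epochs": 4})
trainer.shutdown()
print(results)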