Mirror of https://github.com/vale981/ray, synced 2025-03-06 10:31:39 -05:00
[tune] Limit maximum number of pending trials. Add convergence test. (#14835)
Parent: 5d763b3f49
Commit: 898243d538
17 changed files with 257 additions and 118 deletions
@@ -759,7 +759,7 @@ These are the environment variables Ray Tune currently considers:
 * **TUNE_MAX_LEN_IDENTIFIER**: Maximum length of trial subdirectory names (those
   with the parameter values in them)
 * **TUNE_MAX_PENDING_TRIALS_PG**: Maximum number of pending trials when placement groups are used. Defaults
-  to ``1000``.
+  to ``auto``, which will be updated to ``1000`` for random/grid search and ``1`` for any other search algorithms.
 * **TUNE_PLACEMENT_GROUP_AUTO_DISABLED**: Ray Tune automatically uses placement groups
   instead of the legacy resource requests. Setting this to 1 enables legacy placement.
 * **TUNE_PLACEMENT_GROUP_CLEANUP_DISABLED**: Ray Tune cleans up existing placement groups
@@ -767,6 +767,9 @@ These are the environment variables Ray Tune currently considers:
   that scheduled placement groups are removed when multiple calls to ``tune.run()`` are
   done in the same script. You might want to disable this if you run multiple Tune runs in
   parallel from different scripts. Set to 1 to disable.
+* **TUNE_PLACEMENT_GROUP_PREFIX**: Prefix for placement groups created by Ray Tune. This prefix is used
+  e.g. to identify placement groups that should be cleaned up on start/stop of the tuning run. This is
+  initialized to a unique name at the start of the first run.
 * **TUNE_PLACEMENT_GROUP_WAIT_S**: Default time the trial executor waits for placement
   groups to be placed before continuing the tuning loop. Setting this to a float
   will block for that many seconds. This is mostly used for testing purposes. Defaults
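Illustrative note: the limit can be pinned from a user script before the run starts. A minimal sketch, assuming a toy objective (only the environment variable name comes from the docs above; everything else is hypothetical):

    import os
    # Must be set before tune.run() constructs the TrialRunner.
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "8"

    from ray import tune

    def objective(config):
        tune.report(score=config["x"] ** 2)  # hypothetical toy objective

    tune.run(objective, config={"x": tune.uniform(0, 1)}, num_samples=20)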
@@ -50,9 +50,9 @@ py_test(
 )
 
 py_test(
-    name = "test_convergence_gaussian_process",
-    size = "small",
-    srcs = ["tests/test_convergence_gaussian_process.py"],
+    name = "test_convergence",
+    size = "medium",
+    srcs = ["tests/test_convergence.py"],
     deps = [":tune_lib"],
     tags = ["exclusive"],
 )
@@ -19,7 +19,8 @@ from ray.tune.error import AbortTrialExecution, TuneError
 from ray.tune.logger import NoopLogger
 from ray.tune.result import TRIAL_INFO, STDOUT_FILE, STDERR_FILE
 from ray.tune.resources import Resources
-from ray.tune.utils.placement_groups import PlacementGroupManager
+from ray.tune.utils.placement_groups import PlacementGroupManager, \
+    get_tune_pg_prefix
 from ray.tune.utils.trainable import TrainableUtil
 from ray.tune.trial import Trial, Checkpoint, Location, TrialInfo
 from ray.tune.trial_executor import TrialExecutor
@@ -160,7 +161,7 @@ class RayTrialExecutor(TrialExecutor):
 
         self._avail_resources = Resources(cpu=0, gpu=0)
         self._committed_resources = Resources(cpu=0, gpu=0)
-        self._pg_manager = PlacementGroupManager()
+        self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
         self._staged_trials = set()
         self._just_staged_trials = set()
         self._trial_just_finished = False
@@ -197,6 +198,9 @@ class RayTrialExecutor(TrialExecutor):
         """Returns True if trials have recently been staged."""
         return self._pg_manager.in_staging_grace_period()
 
+    def set_max_pending_trials(self, max_pending: int):
+        self._pg_manager.set_max_staging(max_pending)
+
     def stage_and_update_status(self, trials: List[Trial]):
         """Check and update statuses of scheduled placement groups.
 
@@ -783,7 +787,9 @@ class RayTrialExecutor(TrialExecutor):
 
         """
         if trial.uses_placement_groups:
-            return trial in self._staged_trials or self._pg_manager.can_stage()
+            return trial in self._staged_trials or self._pg_manager.can_stage(
+            ) or self._pg_manager.has_ready(
+                trial, update=True)
 
         return self.has_resources(trial.resources)
 
@@ -213,6 +213,7 @@ class ExperimentPlateauStopper(Stopper):
         return self.has_plateaued() and self._iterations >= self._patience
 
 
+# Deprecate: 1.4
 class EarlyStopping(ExperimentPlateauStopper):
     def __init__(self, *args, **kwargs):
         warnings.warn(
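Editorial note: since EarlyStopping is now slated for deprecation in 1.4, a hedged migration sketch (the constructor arguments mirror those used in the new convergence test below):

    from ray.tune.stopper import ExperimentPlateauStopper

    # Drop-in replacement for the deprecated EarlyStopping alias.
    stopper = ExperimentPlateauStopper(metric="loss", top=3, patience=20)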
@@ -17,10 +17,11 @@ except ImportError:
 
 # This exception only exists in newer Ax releases for python 3.7
 try:
+    from ax.exceptions.core import DataRequiredError
     from ax.exceptions.generation_strategy import \
         MaxParallelismReachedException
 except ImportError:
-    MaxParallelismReachedException = Exception
+    MaxParallelismReachedException = DataRequiredError = Exception
 
 import logging
 
|
@ -262,7 +263,7 @@ class AxSearch(Searcher):
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
parameters, trial_index = self._ax.get_next_trial()
|
parameters, trial_index = self._ax.get_next_trial()
|
||||||
except MaxParallelismReachedException:
|
except (MaxParallelismReachedException, DataRequiredError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self._live_trial_mapping[trial_id] = trial_index
|
self._live_trial_mapping[trial_id] = trial_index
|
||||||
|
|
|
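For context, a searcher that returns None from suggest() tells Tune that no configuration is available yet; the runner simply retries later instead of failing the experiment. A schematic sketch of that contract (SketchSearcher, BackendBusy and _ask are hypothetical placeholders, not Ax API):

    from ray.tune.suggest import Searcher

    class BackendBusy(Exception):
        """Hypothetical stand-in for errors like DataRequiredError."""

    class SketchSearcher(Searcher):
        def suggest(self, trial_id):
            try:
                return self._ask()  # hypothetical backend call
            except BackendBusy:
                return None  # "nothing to run yet, ask again later"

        def _ask(self):
            return {"x": 0.5}

        def on_trial_complete(self, trial_id, result=None, error=False):
            pass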
@@ -4,7 +4,6 @@ from __future__ import print_function
 
 import inspect
 import logging
-import pickle
 from typing import Dict, List, Optional, Union
 
 from ray.tune.result import DEFAULT_METRIC
@@ -331,17 +330,6 @@ class DragonflySearch(Searcher):
         self._opt.tell([(trial_info,
                          self._metric_op * result[self._metric])])
 
-    def save(self, checkpoint_path: str):
-        trials_object = (self._initial_points, self._opt)
-        with open(checkpoint_path, "wb") as outputFile:
-            pickle.dump(trials_object, outputFile)
-
-    def restore(self, checkpoint_dir: str):
-        with open(checkpoint_dir, "rb") as inputFile:
-            trials_object = pickle.load(inputFile)
-        self._initial_points = trials_object[0]
-        self._opt = trials_object[1]
-
     @staticmethod
     def convert_search_space(spec: Dict) -> List[Dict]:
         resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec)
@@ -1,6 +1,7 @@
+import inspect
 import logging
 import pickle
-from typing import Dict, Optional, Union, List, Sequence
+from typing import Dict, Optional, Type, Union, List, Sequence
 
 from ray.tune.result import DEFAULT_METRIC
 from ray.tune.sample import Categorical, Domain, Float, Integer, LogUniform, \
@@ -108,7 +109,8 @@ class NevergradSearch(Searcher):
     """
 
     def __init__(self,
-                 optimizer: Union[None, Optimizer, ConfiguredOptimizer] = None,
+                 optimizer: Union[None, Optimizer, Type[Optimizer],
+                                  ConfiguredOptimizer] = None,
                  space: Optional[Union[Dict, Parameter]] = None,
                  metric: Optional[str] = None,
                  mode: Optional[str] = None,
@@ -154,7 +156,9 @@ class NevergradSearch(Searcher):
                     "parameter.")
             self._parameters = space
             self._nevergrad_opt = optimizer
-        elif isinstance(optimizer, ConfiguredOptimizer):
+        elif (inspect.isclass(optimizer)
+              and issubclass(optimizer, Optimizer)) or isinstance(
+                  optimizer, ConfiguredOptimizer):
             self._opt_factory = optimizer
             self._parameters = None
             self._space = space
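A short usage sketch of what the widened type admits; ng.optimizers.PSO is an Optimizer subclass, the same one the new convergence test passes below:

    import nevergrad as ng
    from ray.tune.suggest.nevergrad import NevergradSearch

    # An optimizer class (not just an instance or a ConfiguredOptimizer)
    # is now accepted and instantiated internally via the factory path.
    searcher = NevergradSearch(
        optimizer=ng.optimizers.PSO, metric="loss", mode="min")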
@@ -364,11 +364,11 @@ def test_trial_migration(start_connected_emptyhead_cluster, trainable_id):
 
 @pytest.mark.parametrize("trainable_id", ["__fake", "__fake_durable"])
 @pytest.mark.parametrize("with_pg", [True, False])
-@patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
-@patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
 def test_trial_requeue(start_connected_emptyhead_cluster, trainable_id,
                        with_pg):
     """Removing a node in full cluster causes Trial to be requeued."""
+    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
+
     if not with_pg:
         os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"
 
python/ray/tune/tests/test_convergence.py (new file, 156 lines)
@@ -0,0 +1,156 @@
+import math
+import numpy as np
+
+import ray
+from ray import tune
+from ray.tune.stopper import ExperimentPlateauStopper
+from ray.tune.suggest import ConcurrencyLimiter
+import unittest
+
+
+def loss(config, reporter):
+    x = config.get("x")
+    reporter(loss=x**2)  # A simple function to optimize
+
+
+class ConvergenceTest(unittest.TestCase):
+    """Test convergence in gaussian process."""
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        ray.init(local_mode=False, num_cpus=1, num_gpus=0)
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        ray.shutdown()
+
+    def _testConvergence(self, searcher, top=3, patience=20):
+        # This is the space of parameters to explore
+        space = {"x": tune.uniform(0, 20)}
+
+        resources_per_trial = {"cpu": 1, "gpu": 0}
+
+        analysis = tune.run(
+            loss,
+            metric="loss",
+            mode="min",
+            stop=ExperimentPlateauStopper(
+                metric="loss", top=top, patience=patience),
+            search_alg=searcher,
+            config=space,
+            num_samples=100,  # Number of iterations
+            resources_per_trial=resources_per_trial,
+            raise_on_failed_trial=False,
+            fail_fast=True,
+            reuse_actors=True,
+            verbose=1)
+        print(f"Num trials: {len(analysis.trials)}. "
+              f"Best result: {analysis.best_config['x']}")
+
+        return analysis
+
+    def testConvergenceAx(self):
+        from ray.tune.suggest.ax import AxSearch
+
+        np.random.seed(0)
+
+        searcher = AxSearch()
+        analysis = self._testConvergence(searcher, patience=10)
+
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-5)
+
+    def testConvergenceBayesOpt(self):
+        from ray.tune.suggest.bayesopt import BayesOptSearch
+
+        np.random.seed(0)
+
+        # Following bayesian optimization
+        searcher = BayesOptSearch(random_search_steps=10)
+        searcher.repeat_float_precision = 5
+        searcher = ConcurrencyLimiter(searcher, 1)
+
+        analysis = self._testConvergence(searcher, patience=100)
+
+        assert len(analysis.trials) < 50
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-5)
+
+    def testConvergenceDragonfly(self):
+        from ray.tune.suggest.dragonfly import DragonflySearch
+
+        np.random.seed(0)
+        searcher = DragonflySearch(domain="euclidean", optimizer="bandit")
+        analysis = self._testConvergence(searcher)
+
+        assert len(analysis.trials) < 100
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-5)
+
+    def testConvergenceHEBO(self):
+        from ray.tune.suggest.hebo import HEBOSearch
+
+        np.random.seed(0)
+        searcher = HEBOSearch()
+        analysis = self._testConvergence(searcher)
+
+        assert len(analysis.trials) < 100
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-2)
+
+    def testConvergenceHyperopt(self):
+        from ray.tune.suggest.hyperopt import HyperOptSearch
+
+        np.random.seed(0)
+        searcher = HyperOptSearch(random_state_seed=1234)
+        analysis = self._testConvergence(searcher, patience=50, top=5)
+
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-2)
+
+    def testConvergenceNevergrad(self):
+        from ray.tune.suggest.nevergrad import NevergradSearch
+        import nevergrad as ng
+
+        np.random.seed(0)
+        searcher = NevergradSearch(optimizer=ng.optimizers.PSO)
+        analysis = self._testConvergence(searcher, patience=50, top=5)
+
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-3)
+
+    def testConvergenceOptuna(self):
+        from ray.tune.suggest.optuna import OptunaSearch
+
+        np.random.seed(1)
+        searcher = OptunaSearch()
+        analysis = self._testConvergence(
+            searcher,
+            top=5,
+        )
+
+        # This assertion is much weaker than in the BO case, but TPE
+        # don't converge too close. It is still unlikely to get to this
+        # tolerance with random search (~0.01% chance)
+        assert len(analysis.trials) < 100
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-2)
+
+    def testConvergenceSkOpt(self):
+        from ray.tune.suggest.skopt import SkOptSearch
+
+        np.random.seed(0)
+        searcher = SkOptSearch()
+        analysis = self._testConvergence(searcher)
+
+        assert len(analysis.trials) < 100
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-3)
+
+    def testConvergenceZoopt(self):
+        from ray.tune.suggest.zoopt import ZOOptSearch
+
+        np.random.seed(0)
+        searcher = ZOOptSearch(budget=100)
+        analysis = self._testConvergence(searcher)
+
+        assert len(analysis.trials) < 100
+        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-3)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
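As the __main__ block suggests, the suite can be run directly (python test_convergence.py) or through pytest; each test imports its search library lazily, so a missing optional dependency only affects that one test.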
python/ray/tune/tests/test_convergence_gaussian_process.py (deleted; this is the file named in the BUILD hunk above)

@@ -1,56 +0,0 @@
-import math
-import numpy as np
-
-import ray
-from ray import tune
-from ray.tune.suggest.bayesopt import BayesOptSearch
-from ray.tune.suggest import ConcurrencyLimiter
-import unittest
-
-
-def loss(config, reporter):
-    x = config.get("x")
-    reporter(loss=x**2)  # A simple function to optimize
-
-
-class ConvergenceTest(unittest.TestCase):
-    """Test convergence in gaussian process."""
-
-    def shutDown(self):
-        ray.shutdown()
-
-    def test_convergence_gaussian_process(self):
-        np.random.seed(0)
-        ray.init(local_mode=True, num_cpus=1, num_gpus=1)
-
-        # This is the space of parameters to explore
-        space = {"x": tune.uniform(0, 20)}
-
-        resources_per_trial = {"cpu": 1, "gpu": 0}
-
-        # Following bayesian optimization
-        gp = BayesOptSearch(random_search_steps=10)
-        gp.repeat_float_precision = 5
-        gp = ConcurrencyLimiter(gp, 1)
-
-        # Execution of the BO.
-        analysis = tune.run(
-            loss,
-            metric="loss",
-            mode="min",
-            # stop=EarlyStopping("loss", mode="min", patience=5),
-            search_alg=gp,
-            config=space,
-            num_samples=100,  # Number of iterations
-            resources_per_trial=resources_per_trial,
-            raise_on_failed_trial=False,
-            fail_fast=True,
-            verbose=1)
-        assert len(analysis.trials) in {13, 40, 43}  # it is 43 on the cluster?
-        assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-5)
-
-
-if __name__ == "__main__":
-    import pytest
-    import sys
-    sys.exit(pytest.main(["-v", __file__]))
@@ -1,7 +1,6 @@
 import os
 import sys
 import unittest
-from unittest.mock import patch
 
 import ray
 from ray.rllib import _register_all
@@ -294,9 +293,9 @@ class TrialRunnerTest(unittest.TestCase):
         self.assertEqual(trials[0].status, Trial.RUNNING)
         self.assertEqual(runner.trial_executor._committed_resources.cpu, 2)
 
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
     def testQueueFilling(self):
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
+
         ray.init(num_cpus=4)
 
         def f1(config):
@@ -31,6 +31,8 @@ class TrialRunnerTest3(unittest.TestCase):
         # Block for results even when placement groups are pending
         os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
 
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto"  # Reset default
+
         self.tmpdir = tempfile.mkdtemp()
 
     def tearDown(self):
@@ -114,11 +116,10 @@ class TrialRunnerTest3(unittest.TestCase):
         self.assertEqual(trials[2].status, Trial.RUNNING)
         self.assertEqual(trials[-1].status, Trial.TERMINATED)
 
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
     def testSearchAlgNotification(self):
         """Checks notification of trial to the Search Algorithm."""
         os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
 
         ray.init(num_cpus=4, num_gpus=2)
         experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
|
||||||
self.assertTrue(search_alg.is_finished())
|
self.assertTrue(search_alg.is_finished())
|
||||||
self.assertTrue(runner.is_finished())
|
self.assertTrue(runner.is_finished())
|
||||||
|
|
||||||
@patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
|
|
||||||
@patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
|
|
||||||
def testSearchAlgFinishes(self):
|
def testSearchAlgFinishes(self):
|
||||||
"""Empty SearchAlg changing state in `next_trials` does not crash."""
|
"""Empty SearchAlg changing state in `next_trials` does not crash."""
|
||||||
|
os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
|
||||||
|
|
||||||
class FinishFastAlg(_MockSuggestionAlgorithm):
|
class FinishFastAlg(_MockSuggestionAlgorithm):
|
||||||
_index = 0
|
_index = 0
|
||||||
|
@@ -295,7 +295,7 @@ class TrialRunnerTest3(unittest.TestCase):
             def __init__(self, index):
                 self.index = index
                 self.returned_result = []
-                super().__init__(metric="result", mode="max")
+                super().__init__(metric="episode_reward_mean", mode="max")
 
             def suggest(self, trial_id):
                 self.index += 1
@@ -506,10 +506,10 @@ class TrialRunnerTest3(unittest.TestCase):
         runner2.step()  # Process save
         self.assertRaises(TuneError, runner2.step)
 
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
     def testTrialNoSave(self):
         """Check that non-checkpointing trials are not saved."""
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
+
         ray.init(num_cpus=3)
 
         runner = TrialRunner(
@@ -635,10 +635,9 @@ class TrialRunnerTest3(unittest.TestCase):
         self.assertEqual(trial.last_result[TRAINING_ITERATION], 9)
         self.assertEqual(num_checkpoints(trial), 3)
 
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
     def testUserCheckpoint(self):
         os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
 
         ray.init(num_cpus=3)
         runner = TrialRunner(
@@ -3,7 +3,6 @@ import os
 import time
 import numpy as np
 import unittest
-from unittest.mock import patch
 
 import ray
 from ray import tune
@@ -19,6 +18,7 @@ from ray.rllib import _register_all
 class TrialRunnerPlacementGroupTest(unittest.TestCase):
     def setUp(self):
         os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "10000"
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto"  # Reset default
         self.head_cpus = 8
         self.head_gpus = 4
         self.head_custom = 16
@@ -154,15 +154,13 @@ class TrialRunnerPlacementGroupTest(unittest.TestCase):
         """Assert that reuse actors doesn't leak placement groups"""
         self.testPlacementGroupRequests(reuse_actors=True)
 
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 6)
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 6)
     def testPlacementGroupLimitedRequests(self):
         """Assert that maximum number of placement groups is enforced."""
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "6"
         self.testPlacementGroupRequests(scheduled=6)
 
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 6)
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 6)
     def testPlacementGroupLimitedRequestsWithActorReuse(self):
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "6"
         self.testPlacementGroupRequests(reuse_actors=True, scheduled=6)
 
     def testPlacementGroupDistributedTraining(self, reuse_actors=False):
@@ -6,7 +6,6 @@ import shutil
 import tempfile
 import time
 import unittest
-from unittest.mock import patch
 
 import skopt
 import numpy as np
@@ -206,9 +205,9 @@ class TuneFailResumeGridTest(unittest.TestCase):
         shutil.rmtree(self.logdir)
         ray.shutdown()
 
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
     def testFailResumeGridSearch(self):
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
+
         config = dict(
             num_samples=3,
             fail_fast=True,
@@ -237,9 +236,9 @@ class TuneFailResumeGridTest(unittest.TestCase):
         test2_counter = Counter([t.config["test2"] for t in analysis.trials])
         assert all(v == 9 for v in test2_counter.values())
 
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
     def testFailResumeWithPreset(self):
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
+
         search_alg = BasicVariantGenerator(points_to_evaluate=[{
             "test": -1,
             "test2": -1
@@ -280,9 +279,9 @@ class TuneFailResumeGridTest(unittest.TestCase):
         assert test2_counter.pop(-1) == 4
         assert all(v == 10 for v in test2_counter.values())
 
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
     def testFailResumeAfterPreset(self):
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
+
         search_alg = BasicVariantGenerator(points_to_evaluate=[{
             "test": -1,
             "test2": -1
@@ -324,9 +323,9 @@ class TuneFailResumeGridTest(unittest.TestCase):
         assert test2_counter.pop(-1) == 4
         assert all(v == 10 for v in test2_counter.values())
 
-    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
-    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
     def testMultiExperimentFail(self):
+        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
+
         experiments = []
         for i in range(3):
             experiments.append(
@@ -285,3 +285,7 @@ class TrialExecutor:
     def in_staging_grace_period(self) -> bool:
         """Returns True if trials have recently been staged."""
         return False
+
+    def set_max_pending_trials(self, max_pending: int):
+        """Set the maximum number of allowed pending trials."""
+        pass
@@ -22,7 +22,6 @@ from ray.tune.schedulers import FIFOScheduler, TrialScheduler
 from ray.tune.suggest import BasicVariantGenerator, SearchAlgorithm
 from ray.tune.utils import warn_if_slow, flatten_dict
 from ray.tune.utils.log import Verbosity, has_verbosity
-from ray.tune.utils.placement_groups import TUNE_MAX_PENDING_TRIALS_PG
 from ray.tune.utils.serialization import TuneFunctionDecoder, \
     TuneFunctionEncoder
 from ray.tune.web_server import TuneServer
@@ -215,9 +214,18 @@ class TrialRunner:
         self.trial_executor = trial_executor or RayTrialExecutor()
         self._pending_trial_queue_times = {}
 
-        # Setting this to 0 still allows adding one new (pending) trial,
-        # but it will prevent us from trying to fill the trial list
-        self._max_pending_trials = 0  # Can be updated in `self.add_trial()`
+        # Set the number of maximum pending trials
+        max_pending_trials = os.getenv("TUNE_MAX_PENDING_TRIALS_PG", "auto")
+        if max_pending_trials == "auto":
+            # Auto detect
+            if isinstance(self._search_alg, BasicVariantGenerator):
+                self._max_pending_trials = 1000
+            else:
+                self._max_pending_trials = 1
+        else:
+            # Manual override
+            self._max_pending_trials = int(max_pending_trials)
+        self.trial_executor.set_max_pending_trials(self._max_pending_trials)
 
         self._metric = metric
 
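For clarity, the resolution order this block implements, isolated as a standalone sketch (the helper name is illustrative, not part of the diff):

    import os

    def resolve_max_pending_trials(is_random_or_grid_search: bool) -> int:
        value = os.getenv("TUNE_MAX_PENDING_TRIALS_PG", "auto")
        if value == "auto":
            # Random/grid search (BasicVariantGenerator) can cheaply keep a
            # deep queue; sequential searchers want tight feedback, hence 1.
            return 1000 if is_random_or_grid_search else 1
        return int(value)  # explicit env var always wins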
@@ -557,9 +565,6 @@ class TrialRunner:
         Args:
             trial (Trial): Trial to queue.
         """
-        if trial.uses_placement_groups:
-            self._max_pending_trials = TUNE_MAX_PENDING_TRIALS_PG
-
         self._trials.append(trial)
         with warn_if_slow("scheduler.on_trial_add"):
             self._scheduler_alg.on_trial_add(self, trial)
@@ -16,9 +16,36 @@ from ray.util.placement_group import PlacementGroup, get_placement_group, \
 if TYPE_CHECKING:
     from ray.tune.trial import Trial
 
-TUNE_MAX_PENDING_TRIALS_PG = int(os.getenv("TUNE_MAX_PENDING_TRIALS_PG", 1000))
 TUNE_PLACEMENT_GROUP_REMOVAL_DELAY = 2.
 
+_tune_pg_prefix = None
+
+
+def get_tune_pg_prefix():
+    """Get the tune placement group name prefix.
+
+    This will store the prefix in a global variable so that subsequent runs
+    can use this identifier to clean up placement groups before starting their
+    run.
+
+    Can be overwritten with the ``TUNE_PLACEMENT_GROUP_PREFIX`` env variable.
+    """
+    global _tune_pg_prefix
+
+    if _tune_pg_prefix:
+        return _tune_pg_prefix
+
+    # Else: check env variable
+    env_prefix = os.getenv("TUNE_PLACEMENT_GROUP_PREFIX", "")
+
+    if env_prefix:
+        _tune_pg_prefix = env_prefix
+        return _tune_pg_prefix
+
+    # Else: create and store unique prefix
+    _tune_pg_prefix = f"__tune_{uuid.uuid4().hex[:8]}__"
+    return _tune_pg_prefix
+
 
 class PlacementGroupFactory:
     """Wrapper class that creates placement groups for trials.
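A hedged usage sketch of the new prefix helper; the concrete hex suffix is random per process:

    from ray.tune.utils.placement_groups import get_tune_pg_prefix

    prefix = get_tune_pg_prefix()  # e.g. "__tune_3fa1b2c4__"
    # Memoized: later calls in the same process return the same prefix,
    # which is what lets a subsequent run clean up stale placement groups.
    assert get_tune_pg_prefix() == prefix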
@@ -187,7 +214,7 @@ class PlacementGroupManager:
         prefix (str): Prefix for the placement group names that are created.
     """
 
-    def __init__(self, prefix: str = "_tune__"):
+    def __init__(self, prefix: str = "__tune__", max_staging: int = 1000):
         self._prefix = prefix
 
         # Sets of staged placement groups by factory
@@ -220,6 +247,11 @@ class PlacementGroupManager:
         self._grace_period = float(
             os.getenv("TUNE_TRIAL_STARTUP_GRACE_PERIOD", 10.))
 
+        self._max_staging = max_staging
+
+    def set_max_staging(self, max_staging: int):
+        self._max_staging = max_staging
+
     def remove_pg(self, pg: PlacementGroup):
         """Schedule placement group for (delayed) removal.
 
@@ -314,7 +346,7 @@ class PlacementGroupManager:
 
     def can_stage(self):
         """Return True if we can stage another placement group."""
-        return len(self._staging_futures) < TUNE_MAX_PENDING_TRIALS_PG
+        return len(self._staging_futures) < self._max_staging
 
     def update_status(self):
         """Update placement group status.
 
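Taken together: TrialRunner now resolves the limit once at construction, pushes it through TrialExecutor.set_max_pending_trials() into PlacementGroupManager.set_max_staging(), and can_stage() enforces it; this replaces the module-level TUNE_MAX_PENDING_TRIALS_PG constant that the tests previously had to monkey-patch from two different import paths.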