[tune] Fixup exception messages (#5238)
commit b0c0de49a2
parent d58b986858
5 changed files with 18 additions and 9 deletions
@@ -50,8 +50,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--smoke-test", action="store_true", help="Finish quickly for testing")
+    parser.add_argument(
+        "--ray-redis-address",
+        help="Address of Ray cluster for seamless distributed execution.")
     args, _ = parser.parse_known_args()
-    ray.init()
+    ray.init(redis_address=args.ray_redis_address)
 
     # asynchronous hyperband early stopping, configured with
     # `episode_reward_mean` as the
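This hunk lets the example attach to an existing Ray cluster instead of always starting a local instance. A minimal sketch of the resulting pattern (the script name and address are illustrative; leaving the flag off passes redis_address=None, which starts local Ray exactly as before):

    # Hypothetical invocation against a cluster head node:
    #   python example.py --ray-redis-address=192.0.2.10:6379
    import argparse

    import ray

    parser = argparse.ArgumentParser()
    parser.add_argument("--ray-redis-address", default=None)
    args, _ = parser.parse_known_args()
    ray.init(redis_address=args.ray_redis_address)  # None -> local Ray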
@@ -55,7 +55,7 @@ class NodeSyncMixin():
 
     def _check_valid_worker_ip(self):
         if not self.worker_ip:
-            logger.info("Worker ip unknown, skipping log sync for {}".format(
+            logger.debug("Worker ip unknown, skipping log sync for {}".format(
                 self._local_dir))
             return False
         if self.worker_ip == self.local_ip:
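The worker-ip message is demoted from info to debug because it fires routinely on single-node runs, where there is nothing to sync. A sketch for surfacing demoted messages again while debugging log sync, using only the standard library (the logger name follows Python's module-path convention):

    import logging

    # Show debug-level output from Tune, including
    # "Worker ip unknown, skipping log sync for ...".
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("ray.tune").setLevel(logging.DEBUG)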
@@ -180,11 +180,8 @@ class RayTrialExecutor(TrialExecutor):
                     logger.debug("Reusing actor for {}".format(trial.runner))
                     self._cached_actor = trial.runner
                 else:
-                    logger.info(
-                        "Destroying actor for trial {}. If your trainable is "
-                        "slow to initialize, consider setting "
-                        "reuse_actors=True to reduce actor creation "
-                        "overheads.".format(trial))
+                    logger.debug(
+                        "Destroying actor for trial {}.".format(trial))
                     trial.runner.stop.remote()
                     trial.runner.__ray_terminate__.remote()
         except Exception:
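Actor teardown is routine, so the message drops to debug; the reuse_actors advice moves to Trainable setup (next hunks), where it appears only when initialization is actually slow. For context, a sketch of opting into actor reuse, assuming the tune.run API of this period and a hypothetical minimal trainable:

    from ray import tune
    from ray.tune import Trainable

    class MyTrainable(Trainable):  # hypothetical, for illustration
        def _setup(self, config):
            pass  # imagine expensive initialization here

        def _train(self):
            return {"episode_reward_mean": 0.0}

    # reuse_actors=True keeps trainable actors alive across trials,
    # skipping repeated actor creation and _setup calls.
    tune.run(MyTrainable, num_samples=20,
             stop={"training_iteration": 1}, reuse_actors=True)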
@@ -26,6 +26,8 @@ from ray.tune.util import UtilMonitor
 
 logger = logging.getLogger(__name__)
 
+SETUP_TIME_THRESHOLD = 10
+
 
 class Trainable(object):
     """Abstract class for trainable models, functions, etc.
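SETUP_TIME_THRESHOLD is measured in seconds; it gates the slow-setup hint added in the next hunk.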
@@ -93,7 +95,14 @@ class Trainable(object):
         self._timesteps_since_restore = 0
         self._iterations_since_restore = 0
         self._restored = False
+        start_time = time.time()
         self._setup(copy.deepcopy(self.config))
+        setup_time = time.time() - start_time
+        if setup_time > SETUP_TIME_THRESHOLD:
+            logger.info("_setup took {:.3f} seconds. If your trainable is "
+                        "slow to initialize, consider setting "
+                        "reuse_actors=True to reduce actor creation "
+                        "overheads.".format(setup_time))
         self._local_ip = ray.services.get_node_ip_address()
         self._monitor = UtilMonitor(start=log_sys_usage)
 
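The hint removed from RayTrialExecutor now fires only when _setup is measurably slow. A self-contained sketch of the same timing guard (plain Python; the function name is illustrative):

    import logging
    import time

    logger = logging.getLogger(__name__)
    SETUP_TIME_THRESHOLD = 10  # seconds

    def timed_setup(setup_fn, config):
        # Time user setup and suggest actor reuse only when it is slow.
        start_time = time.time()
        setup_fn(config)
        setup_time = time.time() - start_time
        if setup_time > SETUP_TIME_THRESHOLD:
            logger.info("_setup took {:.3f} seconds. Consider setting "
                        "reuse_actors=True.".format(setup_time))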
@@ -43,10 +43,10 @@ class UtilMonitor(Thread):
 
     def __init__(self, start=True, delay=0.7):
         self.stopped = True
-        if GPUtil is None:
+        if GPUtil is None and start:
             logger.warning("Install gputil for GPU system monitoring.")
 
-        if psutil is None:
+        if psutil is None and start:
             logger.warning("Install psutil to monitor system performance.")
 
         if GPUtil is None and psutil is None:
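With the start guard, a monitor that is constructed but never started (log_sys_usage off) no longer warns about missing optional dependencies. A behavior sketch, assuming the constructor signature shown above:

    from ray.tune.util import UtilMonitor

    UtilMonitor(start=False)  # silent even if psutil/gputil are absent
    UtilMonitor(start=True)   # warns when psutil or gputil is missing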