[tune] Clean up result logging: move out of /tmp, add timestamp (#1297)

Eric Liang, 2017-12-15 14:19:08 -08:00, committed by GitHub
parent 12fdb3f53a
commit fbf1806b8a
11 changed files with 64 additions and 26 deletions

========================================

@@ -153,6 +153,6 @@ workers, we can train the agent in around 25 minutes.
 You can visualize performance by running
 :code:`tensorboard --logdir [directory]` in a separate screen, where
-:code:`[directory]` is defaulted to :code:`/tmp/ray/`. If you are running
+:code:`[directory]` is defaulted to :code:`~/ray_results/`. If you are running
 multiple experiments, be sure to vary the directory to which Tensorflow saves
 its progress (found in :code:`a3c.py`).

========================================

@@ -28,7 +28,7 @@ TensorBoard to the log output directory as follows.
 .. code-block:: bash

-    tensorboard --logdir=/tmp/ray
+    tensorboard --logdir=~/ray_results

 Many of the TensorBoard metrics are also printed to the console, but you might
 find it easier to visualize and compare between runs using the TensorBoard UI.

========================================

@@ -59,7 +59,7 @@ You can train a simple DQN agent with the following command

     python ray/python/ray/rllib/train.py --run DQN --env CartPole-v0

-By default, the results will be logged to a subdirectory of ``/tmp/ray``.
+By default, the results will be logged to a subdirectory of ``~/ray_results``.
 This subdirectory will contain a file ``params.json`` which contains the
 hyperparameters, a file ``result.json`` which contains a training summary
 for each episode and a TensorBoard file that can be used to visualize

@@ -67,7 +67,7 @@ training process with TensorBoard by running

 ::

-    tensorboard --logdir=/tmp/ray
+    tensorboard --logdir=~/ray_results

 The ``train.py`` script has a number of options you can show by running

========================================

@@ -50,7 +50,7 @@ This script runs a small grid search over the ``my_func`` function using ray.tune
     == Status ==
     Using FIFO scheduling algorithm.
     Resources used: 4/8 CPUs, 0/0 GPUs
-    Result logdir: /tmp/ray/my_experiment
+    Result logdir: ~/ray_results/my_experiment
     - my_func_0_alpha=0.2,beta=1: RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc
     - my_func_1_alpha=0.4,beta=1: RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc
     - my_func_2_alpha=0.6,beta=1: TERMINATED [pid=6789], 21 s, 2190 ts, 101 acc

@@ -63,14 +63,14 @@ In order to report incremental progress, ``my_func`` periodically calls the ``reporter``
 Visualizing Results
 -------------------

-Ray.tune logs trial results to a unique directory per experiment, e.g. ``/tmp/ray/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:
+Ray.tune logs trial results to a unique directory per experiment, e.g. ``~/ray_results/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:

 To visualize learning in tensorboard, run:

 ::

     $ pip install tensorboard
-    $ tensorboard --logdir=/tmp/ray/my_experiment
+    $ tensorboard --logdir=~/ray_results/my_experiment

 .. image:: ray-tune-tensorboard.png

@@ -79,7 +79,7 @@ To use rllab's VisKit (you may have to install some dependencies), run:

 ::

     $ git clone https://github.com/rll/rllab.git
-    $ python rllab/rllab/viskit/frontend.py /tmp/ray/my_experiment
+    $ python rllab/rllab/viskit/frontend.py ~/ray_results/my_experiment

 .. image:: ray-tune-viskit.png

========================================

@@ -18,7 +18,7 @@ import uuid
 import tensorflow as tf

 from ray.tune.logger import UnifiedLogger
 from ray.tune.registry import ENV_CREATOR
-from ray.tune.result import TrainingResult
+from ray.tune.result import DEFAULT_RESULTS_DIR, TrainingResult
 from ray.tune.trainable import Trainable

 logger = logging.getLogger(__name__)

@@ -72,7 +72,6 @@ class Agent(Trainable):
     _allow_unknown_configs = False
     _allow_unknown_subkeys = []
-    _default_logdir = "/tmp/ray"

     def __init__(
             self, config={}, env=None, registry=None, logger_creator=None):

@@ -111,10 +110,10 @@ class Agent(Trainable):
             logdir_suffix = "{}_{}_{}".format(
                 env, self._agent_name,
                 datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
-            if not os.path.exists(self._default_logdir):
-                os.makedirs(self._default_logdir)
+            if not os.path.exists(DEFAULT_RESULTS_DIR):
+                os.makedirs(DEFAULT_RESULTS_DIR)
             self.logdir = tempfile.mkdtemp(
-                prefix=logdir_suffix, dir=self._default_logdir)
+                prefix=logdir_suffix, dir=DEFAULT_RESULTS_DIR)
             self._result_logger = UnifiedLogger(self.config, self.logdir, None)

         self._iteration = 0

@@ -155,8 +154,11 @@ class Agent(Trainable):
         self._time_total += time_this_iter
         self._timesteps_total += result.timesteps_this_iter

+        now = datetime.today()
         result = result._replace(
             experiment_id=self._experiment_id,
+            date=now.strftime("%Y-%m-%d_%H-%M-%S"),
+            timestamp=int(time.mktime(now.timetuple())),
             training_iteration=self._iteration,
             timesteps_total=self._timesteps_total,
             time_this_iter_s=time_this_iter,
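
Note: both new auto-filled fields are derived from a single ``datetime.today()`` call, so ``date`` and ``timestamp`` always describe the same instant. A minimal standalone sketch of that computation (the values in the comments are illustrative):

    import time
    from datetime import datetime

    # Same computation as in Agent.train above: one wall-clock sample,
    # rendered two ways.
    now = datetime.today()
    date = now.strftime("%Y-%m-%d_%H-%M-%S")       # e.g. "2017-12-15_14-19-08"
    timestamp = int(time.mktime(now.timetuple()))  # e.g. 1513376348, local time
    print(date, timestamp)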

========================================

@@ -57,7 +57,7 @@ if __name__ == "__main__":
     else:
         # Note: keep this in sync with tune/config_parser.py
         experiments = {
-            args.experiment_name: {  # i.e. log to /tmp/ray/default
+            args.experiment_name: {  # i.e. log to ~/ray_results/default
                 "run": args.run,
                 "checkpoint_freq": args.checkpoint_freq,
                 "local_dir": args.local_dir,

========================================

@@ -24,6 +24,7 @@
     },
     "outputs": [],
     "source": [
+     "import os\n",
      "import pandas as pd\n",
      "from ray.tune.visual_utils import load_results_to_df, generate_plotly_dim_dict\n",
      "import plotly\n",

@@ -46,7 +47,7 @@
     },
     "outputs": [],
     "source": [
-     "RESULTS_DIR = \"/tmp/ray/\"\n",
+     "RESULTS_DIR = os.path.expanduser(\"~/ray_results\")\n",
      "df = load_results_to_df(RESULTS_DIR)\n",
      "[key for key in df]"
     ]
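
Note: for readers who want the raw records rather than ``load_results_to_df``, here is a rough sketch of loading them by hand. It assumes each trial subdirectory under ``~/ray_results`` holds a ``result.json`` with one JSON record per line; that layout is inferred from the docs above, not guaranteed by this commit:

    import json
    import os

    import pandas as pd

    # Collect every result record from every trial subdirectory
    # (assumed layout: <results_dir>/<trial>/result.json, one JSON per line).
    results_dir = os.path.expanduser("~/ray_results")
    records = []
    for trial in sorted(os.listdir(results_dir)):
        path = os.path.join(results_dir, trial, "result.json")
        if os.path.isfile(path):
            with open(path) as f:
                records += [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(records)
    print(df.columns.tolist())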

========================================

@@ -7,6 +7,7 @@ import argparse
 import json

 from ray.tune import TuneError
+from ray.tune.result import DEFAULT_RESULTS_DIR
 from ray.tune.trial import Resources

@@ -63,8 +64,9 @@ def make_parser(**kwargs):
         "--repeat", default=1, type=int,
         help="Number of times to repeat each trial.")
     parser.add_argument(
-        "--local-dir", default="/tmp/ray", type=str,
-        help="Local dir to save training results to. Defaults to '/tmp/ray'.")
+        "--local-dir", default=DEFAULT_RESULTS_DIR, type=str,
+        help="Local dir to save training results to. Defaults to '{}'.".format(
+            DEFAULT_RESULTS_DIR))
     parser.add_argument(
         "--upload-dir", default="", type=str,
         help="Optional URI to upload training results to.")

========================================

@@ -4,6 +4,7 @@ from __future__ import print_function

 from collections import namedtuple
 import json
+import os

 try:
     import yaml

@@ -20,6 +21,9 @@ Most of the fields are optional, the only required one is timesteps_total.
 In RLlib, the supplied algorithms fill in TrainingResult for you.
 """

+# Where ray.tune writes result files by default
+DEFAULT_RESULTS_DIR = os.path.expanduser("~/ray_results")
+
 TrainingResult = namedtuple("TrainingResult", [
     # (Required) Accumulated timesteps for this entire experiment.

@@ -40,9 +44,12 @@ TrainingResult = namedtuple("TrainingResult", [
     # (Optional) The number of episodes total.
     "episodes_total",

-    # (Optional) The current training accuracy if applicable>
+    # (Optional) The current training accuracy if applicable.
     "mean_accuracy",

+    # (Optional) The current validation accuracy if applicable.
+    "mean_validation_accuracy",
+
     # (Optional) The current training loss if applicable.
     "mean_loss",

@@ -69,6 +76,12 @@ TrainingResult = namedtuple("TrainingResult", [
     # (Auto-filled) The pid of the training process.
     "pid",

+    # (Auto-filled) A formatted date of when the result was processed.
+    "date",
+
+    # (Auto-filled) A UNIX timestamp of when the result was processed.
+    "timestamp",
+
     # (Auto-filled) The hostname of the machine hosting the training process.
     "hostname",
 ])
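
Note: the new ``date`` and ``timestamp`` fields are filled in via ``namedtuple._replace``, as the agent.py hunk above shows. A toy sketch with a cut-down tuple (the three-field ``Result`` here is hypothetical, standing in for the full ``TrainingResult``):

    from collections import namedtuple

    # Namedtuples are immutable; _replace returns a copy with the given
    # fields updated, which is how Agent.train stamps each result.
    Result = namedtuple("Result", ["date", "timestamp", "training_iteration"])

    r = Result(date=None, timestamp=None, training_iteration=3)
    r = r._replace(date="2017-12-15_14-19-08", timestamp=1513376348)
    print(r)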

========================================

@@ -2,6 +2,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from datetime import datetime
 import tempfile
 import traceback
 import ray

@@ -10,7 +11,7 @@ import os
 from collections import namedtuple
 from ray.tune import TuneError
 from ray.tune.logger import NoopLogger, UnifiedLogger
-from ray.tune.result import TrainingResult
+from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
 from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS

@@ -62,7 +63,7 @@ class Trial(object):
     ERROR = "ERROR"

     def __init__(
-            self, trainable_name, config={}, local_dir='/tmp/ray',
+            self, trainable_name, config={}, local_dir=DEFAULT_RESULTS_DIR,
             experiment_tag=None, resources=Resources(cpu=1, gpu=0),
             stopping_criterion={}, checkpoint_freq=0,
             restore_path=None, upload_dir=None):

@@ -295,16 +296,22 @@ class Trial(object):
         if not os.path.exists(self.local_dir):
             os.makedirs(self.local_dir)
         self.logdir = tempfile.mkdtemp(
-            prefix=str(self), dir=self.local_dir)
+            prefix=str(self), dir=self.local_dir,
+            suffix=datetime.today().strftime("_%Y-%m-%d_%H-%M-%S"))
         self.result_logger = UnifiedLogger(
             self.config, self.logdir, self.upload_dir)
         remote_logdir = self.logdir

+        def logger_creator(config):
+            # Set the working dir in the remote process, for user file writes
+            os.chdir(remote_logdir)
+            return NoopLogger(config, remote_logdir)
+
         # Logging for trials is handled centrally by TrialRunner, so
         # configure the remote runner to use a noop-logger.
         self.runner = cls.remote(
-            config=self.config,
-            registry=get_registry(),
-            logger_creator=lambda config: NoopLogger(config, remote_logdir))
+            config=self.config, registry=get_registry(),
+            logger_creator=logger_creator)

     def __str__(self):
         if "env" in self.config:

========================================

@@ -12,6 +12,7 @@ from ray.rllib import _register_all
 from ray.tune import Trainable, TuneError
 from ray.tune import register_env, register_trainable, run_experiments
 from ray.tune.registry import _default_registry, TRAINABLE_CLASS
+from ray.tune.result import DEFAULT_RESULTS_DIR
 from ray.tune.trial import Trial, Resources
 from ray.tune.trial_runner import TrialRunner
 from ray.tune.variant_generator import generate_trials, grid_search, \

@@ -63,6 +64,17 @@ class TrainableFunctionApiTest(unittest.TestCase):
             "config": {"a": "b"},
         }})

+    def testLogdir(self):
+        def train(config, reporter):
+            assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
+            reporter(timesteps_total=1)
+        register_trainable("f1", train)
+        run_experiments({"foo": {
+            "run": "f1",
+            "local_dir": "/tmp/logdir",
+            "config": {"a": "b"},
+        }})
+
     def testBadParams(self):
         def f():
             run_experiments({"foo": {}})

@@ -191,7 +203,9 @@ class VariantGeneratorTest(unittest.TestCase):
         self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
         self.assertEqual(trials[0].trainable_name, "PPO")
         self.assertEqual(trials[0].experiment_tag, "0")
-        self.assertEqual(trials[0].local_dir, "/tmp/ray/tune-pong")
+        self.assertEqual(
+            trials[0].local_dir,
+            os.path.join(DEFAULT_RESULTS_DIR, "tune-pong"))
         self.assertEqual(trials[1].experiment_tag, "1")

@@ -207,7 +221,6 @@ class VariantGeneratorTest(unittest.TestCase):
         self.assertEqual(len(trials), 1)
         self.assertEqual(trials[0].config, {"foo": 4})
         self.assertEqual(trials[0].experiment_tag, "0_foo=4")
-        self.assertEqual(trials[0].local_dir, "/tmp/ray/")

     def testGridSearch(self):
         trials = generate_trials({