mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[tune] Clean up result logging: move out of /tmp, add timestamp (#1297)
This commit is contained in:
parent
12fdb3f53a
commit
fbf1806b8a
11 changed files with 64 additions and 26 deletions
|
@ -153,6 +153,6 @@ workers, we can train the agent in around 25 minutes.
|
||||||
|
|
||||||
You can visualize performance by running
|
You can visualize performance by running
|
||||||
:code:`tensorboard --logdir [directory]` in a separate screen, where
|
:code:`tensorboard --logdir [directory]` in a separate screen, where
|
||||||
:code:`[directory]` is defaulted to :code:`/tmp/ray/`. If you are running
|
:code:`[directory]` defaults to :code:`~/ray_results/`. If you are running
|
||||||
multiple experiments, be sure to vary the directory to which Tensorflow saves
|
multiple experiments, be sure to vary the directory to which TensorFlow saves
|
||||||
its progress (found in :code:`a3c.py`).
|
its progress (found in :code:`a3c.py`).
|
||||||
|
|
|
@ -28,7 +28,7 @@ TensorBoard to the log output directory as follows.
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
tensorboard --logdir=/tmp/ray
|
tensorboard --logdir=~/ray_results
|
||||||
|
|
||||||
Many of the TensorBoard metrics are also printed to the console, but you might
|
Many of the TensorBoard metrics are also printed to the console, but you might
|
||||||
find it easier to visualize and compare between runs using the TensorBoard UI.
|
find it easier to visualize and compare between runs using the TensorBoard UI.
|
||||||
|
|
|
@ -59,7 +59,7 @@ You can train a simple DQN agent with the following command
|
||||||
|
|
||||||
python ray/python/ray/rllib/train.py --run DQN --env CartPole-v0
|
python ray/python/ray/rllib/train.py --run DQN --env CartPole-v0
|
||||||
|
|
||||||
By default, the results will be logged to a subdirectory of ``/tmp/ray``.
|
By default, the results will be logged to a subdirectory of ``~/ray_results``.
|
||||||
This subdirectory will contain a file ``params.json`` which contains the
|
This subdirectory will contain a file ``params.json`` which contains the
|
||||||
hyperparameters, a file ``result.json`` which contains a training summary
|
hyperparameters, a file ``result.json`` which contains a training summary
|
||||||
for each episode and a TensorBoard file that can be used to visualize
|
for each episode and a TensorBoard file that can be used to visualize
|
||||||
|
@ -67,7 +67,7 @@ training process with TensorBoard by running
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
tensorboard --logdir=/tmp/ray
|
tensorboard --logdir=~/ray_results
|
||||||
|
|
||||||
|
|
||||||
The ``train.py`` script has a number of options you can show by running
|
The ``train.py`` script has a number of options you can show by running
|
||||||
|
|
|
@ -50,7 +50,7 @@ This script runs a small grid search over the ``my_func`` function using ray.tun
|
||||||
== Status ==
|
== Status ==
|
||||||
Using FIFO scheduling algorithm.
|
Using FIFO scheduling algorithm.
|
||||||
Resources used: 4/8 CPUs, 0/0 GPUs
|
Resources used: 4/8 CPUs, 0/0 GPUs
|
||||||
Result logdir: /tmp/ray/my_experiment
|
Result logdir: ~/ray_results/my_experiment
|
||||||
- my_func_0_alpha=0.2,beta=1: RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc
|
- my_func_0_alpha=0.2,beta=1: RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc
|
||||||
- my_func_1_alpha=0.4,beta=1: RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc
|
- my_func_1_alpha=0.4,beta=1: RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc
|
||||||
- my_func_2_alpha=0.6,beta=1: TERMINATED [pid=6789], 21 s, 2190 ts, 101 acc
|
- my_func_2_alpha=0.6,beta=1: TERMINATED [pid=6789], 21 s, 2190 ts, 101 acc
|
||||||
|
@ -63,14 +63,14 @@ In order to report incremental progress, ``my_func`` periodically calls the ``re
|
||||||
Visualizing Results
|
Visualizing Results
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Ray.tune logs trial results to a unique directory per experiment, e.g. ``/tmp/ray/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:
|
Ray.tune logs trial results to a unique directory per experiment, e.g. ``~/ray_results/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:
|
||||||
|
|
||||||
To visualize learning in tensorboard, run:
|
To visualize learning in tensorboard, run:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
$ pip install tensorboard
|
$ pip install tensorboard
|
||||||
$ tensorboard --logdir=/tmp/ray/my_experiment
|
$ tensorboard --logdir=~/ray_results/my_experiment
|
||||||
|
|
||||||
.. image:: ray-tune-tensorboard.png
|
.. image:: ray-tune-tensorboard.png
|
||||||
|
|
||||||
|
@ -79,7 +79,7 @@ To use rllab's VisKit (you may have to install some dependencies), run:
|
||||||
::
|
::
|
||||||
|
|
||||||
$ git clone https://github.com/rll/rllab.git
|
$ git clone https://github.com/rll/rllab.git
|
||||||
$ python rllab/rllab/viskit/frontend.py /tmp/ray/my_experiment
|
$ python rllab/rllab/viskit/frontend.py ~/ray_results/my_experiment
|
||||||
|
|
||||||
.. image:: ray-tune-viskit.png
|
.. image:: ray-tune-viskit.png
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ import uuid
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from ray.tune.logger import UnifiedLogger
|
from ray.tune.logger import UnifiedLogger
|
||||||
from ray.tune.registry import ENV_CREATOR
|
from ray.tune.registry import ENV_CREATOR
|
||||||
from ray.tune.result import TrainingResult
|
from ray.tune.result import DEFAULT_RESULTS_DIR, TrainingResult
|
||||||
from ray.tune.trainable import Trainable
|
from ray.tune.trainable import Trainable
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -72,7 +72,6 @@ class Agent(Trainable):
|
||||||
|
|
||||||
_allow_unknown_configs = False
|
_allow_unknown_configs = False
|
||||||
_allow_unknown_subkeys = []
|
_allow_unknown_subkeys = []
|
||||||
_default_logdir = "/tmp/ray"
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, config={}, env=None, registry=None, logger_creator=None):
|
self, config={}, env=None, registry=None, logger_creator=None):
|
||||||
|
@ -111,10 +110,10 @@ class Agent(Trainable):
|
||||||
logdir_suffix = "{}_{}_{}".format(
|
logdir_suffix = "{}_{}_{}".format(
|
||||||
env, self._agent_name,
|
env, self._agent_name,
|
||||||
datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
|
datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
|
||||||
if not os.path.exists(self._default_logdir):
|
if not os.path.exists(DEFAULT_RESULTS_DIR):
|
||||||
os.makedirs(self._default_logdir)
|
os.makedirs(DEFAULT_RESULTS_DIR)
|
||||||
self.logdir = tempfile.mkdtemp(
|
self.logdir = tempfile.mkdtemp(
|
||||||
prefix=logdir_suffix, dir=self._default_logdir)
|
prefix=logdir_suffix, dir=DEFAULT_RESULTS_DIR)
|
||||||
self._result_logger = UnifiedLogger(self.config, self.logdir, None)
|
self._result_logger = UnifiedLogger(self.config, self.logdir, None)
|
||||||
|
|
||||||
self._iteration = 0
|
self._iteration = 0
|
||||||
|
@ -155,8 +154,11 @@ class Agent(Trainable):
|
||||||
self._time_total += time_this_iter
|
self._time_total += time_this_iter
|
||||||
self._timesteps_total += result.timesteps_this_iter
|
self._timesteps_total += result.timesteps_this_iter
|
||||||
|
|
||||||
|
now = datetime.today()
|
||||||
result = result._replace(
|
result = result._replace(
|
||||||
experiment_id=self._experiment_id,
|
experiment_id=self._experiment_id,
|
||||||
|
date=now.strftime("%Y-%m-%d_%H-%M-%S"),
|
||||||
|
timestamp=int(time.mktime(now.timetuple())),
|
||||||
training_iteration=self._iteration,
|
training_iteration=self._iteration,
|
||||||
timesteps_total=self._timesteps_total,
|
timesteps_total=self._timesteps_total,
|
||||||
time_this_iter_s=time_this_iter,
|
time_this_iter_s=time_this_iter,
|
||||||
|
|
|
@ -57,7 +57,7 @@ if __name__ == "__main__":
|
||||||
else:
|
else:
|
||||||
# Note: keep this in sync with tune/config_parser.py
|
# Note: keep this in sync with tune/config_parser.py
|
||||||
experiments = {
|
experiments = {
|
||||||
args.experiment_name: { # i.e. log to /tmp/ray/default
|
args.experiment_name: { # i.e. log to ~/ray_results/default
|
||||||
"run": args.run,
|
"run": args.run,
|
||||||
"checkpoint_freq": args.checkpoint_freq,
|
"checkpoint_freq": args.checkpoint_freq,
|
||||||
"local_dir": args.local_dir,
|
"local_dir": args.local_dir,
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import os\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"from ray.tune.visual_utils import load_results_to_df, generate_plotly_dim_dict\n",
|
"from ray.tune.visual_utils import load_results_to_df, generate_plotly_dim_dict\n",
|
||||||
"import plotly\n",
|
"import plotly\n",
|
||||||
|
@ -46,7 +47,7 @@
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"RESULTS_DIR = \"/tmp/ray/\"\n",
|
"RESULTS_DIR = os.path.expanduser(\"~/ray_results\")\n",
|
||||||
"df = load_results_to_df(RESULTS_DIR)\n",
|
"df = load_results_to_df(RESULTS_DIR)\n",
|
||||||
"[key for key in df]"
|
"[key for key in df]"
|
||||||
]
|
]
|
||||||
|
|
|
@ -7,6 +7,7 @@ import argparse
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from ray.tune import TuneError
|
from ray.tune import TuneError
|
||||||
|
from ray.tune.result import DEFAULT_RESULTS_DIR
|
||||||
from ray.tune.trial import Resources
|
from ray.tune.trial import Resources
|
||||||
|
|
||||||
|
|
||||||
|
@ -63,8 +64,9 @@ def make_parser(**kwargs):
|
||||||
"--repeat", default=1, type=int,
|
"--repeat", default=1, type=int,
|
||||||
help="Number of times to repeat each trial.")
|
help="Number of times to repeat each trial.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--local-dir", default="/tmp/ray", type=str,
|
"--local-dir", default=DEFAULT_RESULTS_DIR, type=str,
|
||||||
help="Local dir to save training results to. Defaults to '/tmp/ray'.")
|
help="Local dir to save training results to. Defaults to '{}'.".format(
|
||||||
|
DEFAULT_RESULTS_DIR))
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--upload-dir", default="", type=str,
|
"--upload-dir", default="", type=str,
|
||||||
help="Optional URI to upload training results to.")
|
help="Optional URI to upload training results to.")
|
||||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import print_function
|
||||||
|
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import yaml
|
import yaml
|
||||||
|
@ -20,6 +21,9 @@ Most of the fields are optional, the only required one is timesteps_total.
|
||||||
In RLlib, the supplied algorithms fill in TrainingResult for you.
|
In RLlib, the supplied algorithms fill in TrainingResult for you.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Where ray.tune writes result files by default
|
||||||
|
DEFAULT_RESULTS_DIR = os.path.expanduser("~/ray_results")
|
||||||
|
|
||||||
|
|
||||||
TrainingResult = namedtuple("TrainingResult", [
|
TrainingResult = namedtuple("TrainingResult", [
|
||||||
# (Required) Accumulated timesteps for this entire experiment.
|
# (Required) Accumulated timesteps for this entire experiment.
|
||||||
|
@ -40,9 +44,12 @@ TrainingResult = namedtuple("TrainingResult", [
|
||||||
# (Optional) The number of episodes total.
|
# (Optional) The number of episodes total.
|
||||||
"episodes_total",
|
"episodes_total",
|
||||||
|
|
||||||
# (Optional) The current training accuracy if applicable>
|
# (Optional) The current training accuracy if applicable.
|
||||||
"mean_accuracy",
|
"mean_accuracy",
|
||||||
|
|
||||||
|
# (Optional) The current validation accuracy if applicable.
|
||||||
|
"mean_validation_accuracy",
|
||||||
|
|
||||||
# (Optional) The current training loss if applicable.
|
# (Optional) The current training loss if applicable.
|
||||||
"mean_loss",
|
"mean_loss",
|
||||||
|
|
||||||
|
@ -69,6 +76,12 @@ TrainingResult = namedtuple("TrainingResult", [
|
||||||
# (Auto-filled) The pid of the training process.
|
# (Auto-filled) The pid of the training process.
|
||||||
"pid",
|
"pid",
|
||||||
|
|
||||||
|
# (Auto-filled) A formatted date of when the result was processed.
|
||||||
|
"date",
|
||||||
|
|
||||||
|
# (Auto-filled) A UNIX timestamp of when the result was processed.
|
||||||
|
"timestamp",
|
||||||
|
|
||||||
# (Auto-filled) The hostname of the machine hosting the training process.
|
# (Auto-filled) The hostname of the machine hosting the training process.
|
||||||
"hostname",
|
"hostname",
|
||||||
])
|
])
|
||||||
|
|
|
@ -2,6 +2,7 @@ from __future__ import absolute_import
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
import tempfile
|
import tempfile
|
||||||
import traceback
|
import traceback
|
||||||
import ray
|
import ray
|
||||||
|
@ -10,7 +11,7 @@ import os
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from ray.tune import TuneError
|
from ray.tune import TuneError
|
||||||
from ray.tune.logger import NoopLogger, UnifiedLogger
|
from ray.tune.logger import NoopLogger, UnifiedLogger
|
||||||
from ray.tune.result import TrainingResult
|
from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
|
||||||
from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS
|
from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS
|
||||||
|
|
||||||
|
|
||||||
|
@ -62,7 +63,7 @@ class Trial(object):
|
||||||
ERROR = "ERROR"
|
ERROR = "ERROR"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, trainable_name, config={}, local_dir='/tmp/ray',
|
self, trainable_name, config={}, local_dir=DEFAULT_RESULTS_DIR,
|
||||||
experiment_tag=None, resources=Resources(cpu=1, gpu=0),
|
experiment_tag=None, resources=Resources(cpu=1, gpu=0),
|
||||||
stopping_criterion={}, checkpoint_freq=0,
|
stopping_criterion={}, checkpoint_freq=0,
|
||||||
restore_path=None, upload_dir=None):
|
restore_path=None, upload_dir=None):
|
||||||
|
@ -295,16 +296,22 @@ class Trial(object):
|
||||||
if not os.path.exists(self.local_dir):
|
if not os.path.exists(self.local_dir):
|
||||||
os.makedirs(self.local_dir)
|
os.makedirs(self.local_dir)
|
||||||
self.logdir = tempfile.mkdtemp(
|
self.logdir = tempfile.mkdtemp(
|
||||||
prefix=str(self), dir=self.local_dir)
|
prefix=str(self), dir=self.local_dir,
|
||||||
|
suffix=datetime.today().strftime("_%Y-%m-%d_%H-%M-%S"))
|
||||||
self.result_logger = UnifiedLogger(
|
self.result_logger = UnifiedLogger(
|
||||||
self.config, self.logdir, self.upload_dir)
|
self.config, self.logdir, self.upload_dir)
|
||||||
remote_logdir = self.logdir
|
remote_logdir = self.logdir
|
||||||
|
|
||||||
|
def logger_creator(config):
|
||||||
|
# Set the working dir in the remote process, for user file writes
|
||||||
|
os.chdir(remote_logdir)
|
||||||
|
return NoopLogger(config, remote_logdir)
|
||||||
|
|
||||||
# Logging for trials is handled centrally by TrialRunner, so
|
# Logging for trials is handled centrally by TrialRunner, so
|
||||||
# configure the remote runner to use a noop-logger.
|
# configure the remote runner to use a noop-logger.
|
||||||
self.runner = cls.remote(
|
self.runner = cls.remote(
|
||||||
config=self.config,
|
config=self.config, registry=get_registry(),
|
||||||
registry=get_registry(),
|
logger_creator=logger_creator)
|
||||||
logger_creator=lambda config: NoopLogger(config, remote_logdir))
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if "env" in self.config:
|
if "env" in self.config:
|
||||||
|
|
|
@ -12,6 +12,7 @@ from ray.rllib import _register_all
|
||||||
from ray.tune import Trainable, TuneError
|
from ray.tune import Trainable, TuneError
|
||||||
from ray.tune import register_env, register_trainable, run_experiments
|
from ray.tune import register_env, register_trainable, run_experiments
|
||||||
from ray.tune.registry import _default_registry, TRAINABLE_CLASS
|
from ray.tune.registry import _default_registry, TRAINABLE_CLASS
|
||||||
|
from ray.tune.result import DEFAULT_RESULTS_DIR
|
||||||
from ray.tune.trial import Trial, Resources
|
from ray.tune.trial import Trial, Resources
|
||||||
from ray.tune.trial_runner import TrialRunner
|
from ray.tune.trial_runner import TrialRunner
|
||||||
from ray.tune.variant_generator import generate_trials, grid_search, \
|
from ray.tune.variant_generator import generate_trials, grid_search, \
|
||||||
|
@ -63,6 +64,17 @@ class TrainableFunctionApiTest(unittest.TestCase):
|
||||||
"config": {"a": "b"},
|
"config": {"a": "b"},
|
||||||
}})
|
}})
|
||||||
|
|
||||||
|
def testLogdir(self):
|
||||||
|
def train(config, reporter):
|
||||||
|
assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
|
||||||
|
reporter(timesteps_total=1)
|
||||||
|
register_trainable("f1", train)
|
||||||
|
run_experiments({"foo": {
|
||||||
|
"run": "f1",
|
||||||
|
"local_dir": "/tmp/logdir",
|
||||||
|
"config": {"a": "b"},
|
||||||
|
}})
|
||||||
|
|
||||||
def testBadParams(self):
|
def testBadParams(self):
|
||||||
def f():
|
def f():
|
||||||
run_experiments({"foo": {}})
|
run_experiments({"foo": {}})
|
||||||
|
@ -191,7 +203,9 @@ class VariantGeneratorTest(unittest.TestCase):
|
||||||
self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
|
self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
|
||||||
self.assertEqual(trials[0].trainable_name, "PPO")
|
self.assertEqual(trials[0].trainable_name, "PPO")
|
||||||
self.assertEqual(trials[0].experiment_tag, "0")
|
self.assertEqual(trials[0].experiment_tag, "0")
|
||||||
self.assertEqual(trials[0].local_dir, "/tmp/ray/tune-pong")
|
self.assertEqual(
|
||||||
|
trials[0].local_dir,
|
||||||
|
os.path.join(DEFAULT_RESULTS_DIR, "tune-pong"))
|
||||||
self.assertEqual(trials[1].experiment_tag, "1")
|
self.assertEqual(trials[1].experiment_tag, "1")
|
||||||
|
|
||||||
def testEval(self):
|
def testEval(self):
|
||||||
|
@ -207,7 +221,6 @@ class VariantGeneratorTest(unittest.TestCase):
|
||||||
self.assertEqual(len(trials), 1)
|
self.assertEqual(len(trials), 1)
|
||||||
self.assertEqual(trials[0].config, {"foo": 4})
|
self.assertEqual(trials[0].config, {"foo": 4})
|
||||||
self.assertEqual(trials[0].experiment_tag, "0_foo=4")
|
self.assertEqual(trials[0].experiment_tag, "0_foo=4")
|
||||||
self.assertEqual(trials[0].local_dir, "/tmp/ray/")
|
|
||||||
|
|
||||||
def testGridSearch(self):
|
def testGridSearch(self):
|
||||||
trials = generate_trials({
|
trials = generate_trials({
|
||||||
|
|
Loading…
Add table
Reference in a new issue