Add a web dashboard for monitoring node resource usage (#4066)

This commit is contained in:
Daniel Edgecumbe 2019-02-21 08:10:04 +00:00 committed by Robert Nishihara
parent 3ac8fd7ee8
commit 2e30f7ba38
13 changed files with 1239 additions and 44 deletions

View file

@ -8,7 +8,7 @@ else
base_commit="$TRAVIS_BRANCH"
echo "Running clang-format against branch $base_commit, with hash $(git rev-parse $base_commit)"
fi
output="$(.travis/git-clang-format --binary clang-format-3.8 --commit $base_commit --diff --exclude '(.*thirdparty/|.*redismodule.h|.*webui*)')"
output="$(.travis/git-clang-format --binary clang-format-3.8 --commit $base_commit --diff --exclude '(.*thirdparty/|.*redismodule.h|.*.js)')"
if [ "$output" == "no modified files to format" ] || [ "$output" == "clang-format did not modify any files" ] ; then
echo "clang-format passed."
exit 0

View file

@ -127,30 +127,34 @@ def kill_node(config_file, yes, override_cluster_name):
confirm("This will kill a node in your cluster", yes)
provider = get_node_provider(config["provider"], config["cluster_name"])
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
node = random.choice(nodes)
logger.info("kill_node: Terminating worker {}".format(node))
try:
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
node = random.choice(nodes)
logger.info("kill_node: Terminating worker {}".format(node))
updater = NodeUpdaterThread(
node_id=node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=[],
setup_commands=[],
runtime_hash="")
updater = NodeUpdaterThread(
node_id=node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=[],
setup_commands=[],
runtime_hash="")
_exec(updater, "ray stop", False, False)
_exec(updater, "ray stop", False, False)
time.sleep(5)
time.sleep(5)
iip = provider.internal_ip(node)
if iip:
return iip
if config.get("provider", {}).get("use_internal_ips", False) is True:
node_ip = provider.internal_ip(node)
else:
node_ip = provider.external_ip(node)
finally:
provider.cleanup()
return provider.external_ip(node)
return node_ip
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
@ -246,14 +250,18 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
# Refresh the node cache so we see the external ip if available
provider.nodes(head_node_tags)
if config.get("provider", {}).get("use_internal_ips", False) is True:
head_node_ip = provider.internal_ip(head_node)
else:
head_node_ip = provider.external_ip(head_node)
if updater.exitcode != 0:
logger.error("get_or_create_head_node: "
"Updating {} failed".format(
provider.external_ip(head_node)))
"Updating {} failed".format(head_node_ip))
sys.exit(1)
logger.info("get_or_create_head_node: "
"Head node up-to-date, IP address is: {}".format(
provider.external_ip(head_node)))
logger.info(
"get_or_create_head_node: "
"Head node up-to-date, IP address is: {}".format(head_node_ip))
monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
use_docker = bool(config["docker"]["container_name"])
@ -268,10 +276,11 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
quote(monitor_str), modifiers))
print("To open a console on the cluster:\n\n"
" ray attach {}{}\n".format(config_file, modifiers))
print("To ssh manually to the cluster, run:\n\n"
" ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
config["auth"]["ssh_user"],
provider.external_ip(head_node)))
head_node_ip))
finally:
provider.cleanup()
@ -456,11 +465,14 @@ def get_head_node_ip(config_file, override_cluster_name):
provider = get_node_provider(config["provider"], config["cluster_name"])
try:
head_node = _get_head_node(config, config_file, override_cluster_name)
ip = provider.external_ip(head_node)
if config.get("provider", {}).get("use_internal_ips", False) is True:
head_node_ip = provider.internal_ip(head_node)
else:
head_node_ip = provider.external_ip(head_node)
finally:
provider.cleanup()
return ip
return head_node_ip
def get_worker_node_ips(config_file, override_cluster_name):
@ -469,9 +481,17 @@ def get_worker_node_ips(config_file, override_cluster_name):
config = yaml.load(open(config_file).read())
if override_cluster_name is not None:
config["cluster_name"] = override_cluster_name
provider = get_node_provider(config["provider"], config["cluster_name"])
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
return [provider.external_ip(node) for node in nodes]
try:
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
if config.get("provider", {}).get("use_internal_ips", False) is True:
return [provider.internal_ip(node) for node in nodes]
else:
return [provider.external_ip(node) for node in nodes]
finally:
provider.cleanup()
def _get_head_node(config,

View file

@ -0,0 +1,348 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# The dashboard is built on aiohttp; fail fast with a clear message if it
# is not installed rather than crashing later with an obscure NameError.
try:
    import aiohttp.web
except ModuleNotFoundError:
    # Bug fix: the message previously said "reporter", but this module is
    # the dashboard (the reporter has its own, separate guard).
    print("The dashboard requires aiohttp to run.")
    import sys
    sys.exit(1)
import argparse
import datetime
import json
import logging
import os
import threading
import traceback
import yaml
from pathlib import Path
from collections import Counter
from operator import itemgetter
from typing import Dict
import ray.ray_constants as ray_constants
import ray.utils
# Logger for this module. It should be configured at the entry point
# into the program using Ray. Ray provides a default configuration at
# entry/init points.
logger = logging.getLogger(__name__)
def to_unix_time(dt):
    """Convert a naive UTC datetime into seconds since the Unix epoch."""
    epoch = datetime.datetime(1970, 1, 1)
    delta = dt - epoch
    return delta.total_seconds()
class Dashboard(object):
    """A dashboard process for monitoring Ray nodes.

    This dashboard is made up of a REST API which collates data published by
    Reporter processes on nodes into a json structure, and a webserver
    which polls said API for display purposes.

    Attributes:
        redis_client: A client used to communicate with the Redis server.
    """

    def __init__(self,
                 redis_address,
                 http_port,
                 token,
                 temp_dir,
                 redis_password=None):
        """Initialize the dashboard object.

        Args:
            redis_address (str): Address of the Redis instance holding
                cluster state.
            http_port (int): Port the HTTP server listens on.
            token (str): Shared secret; requests must present it as a query
                parameter or cookie to be served.
            temp_dir (str): Ray session temp directory; the dashboard URL
                is written there.
            redis_password (str): Optional password for the Redis server.
        """
        self.ip = ray.services.get_node_ip_address()
        self.port = http_port
        self.token = token
        self.temp_dir = temp_dir
        # Background thread aggregating per-node stats from Redis pub/sub.
        self.node_stats = NodeStats(redis_address, redis_password)

        # Every request is gated by the token-checking middleware below.
        self.app = aiohttp.web.Application(middlewares=[self.auth_middleware])
        self.setup_routes()

    @aiohttp.web.middleware
    async def auth_middleware(self, req, handler):
        """Reject any request that does not carry the expected token."""

        def valid_token(req):
            # If the cookie token is correct, accept that.
            try:
                if req.cookies["token"] == self.token:
                    return True
            except KeyError:
                pass

            # If the query token is correct, accept that.
            try:
                if req.query["token"] == self.token:
                    return True
            except KeyError:
                pass

            # Reject.
            logger.warning("Dashboard: rejected an invalid token")
            return False

        # Check that the token is present, either in query or as cookie.
        if not valid_token(req):
            return aiohttp.web.Response(status=401, text="401 Unauthorized")

        resp = await handler(req)
        # Persist the token as a cookie so the follow-up requests made by
        # the page's JS do not need the query parameter.
        resp.cookies["token"] = self.token
        return resp

    def setup_routes(self):
        """Register HTTP handlers for the static UI and the JSON API."""

        def forbidden() -> aiohttp.web.Response:
            return aiohttp.web.Response(status=403, text="403 Forbidden")

        def get_forbidden(_) -> aiohttp.web.Response:
            return forbidden()

        async def get_index(req) -> aiohttp.web.Response:
            # Serve index.html from the directory containing this module.
            return aiohttp.web.FileResponse(
                os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "index.html"))

        async def get_resource(req) -> aiohttp.web.Response:
            try:
                path = req.match_info["x"]
            except KeyError:
                return forbidden()

            # Whitelist the static assets so this route cannot be abused
            # to read arbitrary files from the machine.
            if path not in ["main.css", "main.js"]:
                return forbidden()

            return aiohttp.web.FileResponse(
                os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    "res/{}".format(path)))

        async def json_response(result=None, error=None,
                                ts=None) -> aiohttp.web.Response:
            # Wrap every API reply in a common envelope with a timestamp,
            # so the frontend can detect stale data.
            if ts is None:
                ts = datetime.datetime.utcnow()

            return aiohttp.web.json_response({
                "result": result,
                "timestamp": to_unix_time(ts),
                "error": error,
            })

        async def ray_config(_) -> aiohttp.web.Response:
            # NOTE(review): presumably this file is written by the
            # autoscaler on cluster launch — absent on manually started
            # clusters, in which case an error payload is returned.
            try:
                with open(
                        Path("~/ray_bootstrap_config.yaml").expanduser()) as f:
                    cfg = yaml.load(f)
            except Exception:
                return await json_response(error="No config")

            D = {
                "min_workers": cfg["min_workers"],
                "max_workers": cfg["max_workers"],
                "initial_workers": cfg["initial_workers"],
                "idle_timeout_minutes": cfg["idle_timeout_minutes"],
            }

            # Instance-type fields look AWS-specific; fall back gracefully
            # for other providers.
            try:
                D["head_type"] = cfg["head_node"]["InstanceType"]
            except KeyError:
                D["head_type"] = "unknown"

            try:
                D["worker_type"] = cfg["worker_nodes"]["InstanceType"]
            except KeyError:
                D["worker_type"] = "unknown"

            return await json_response(result=D)

        async def node_info(req) -> aiohttp.web.Response:
            now = datetime.datetime.utcnow()
            D = self.node_stats.get_node_stats()
            return await json_response(result=D, ts=now)

        self.app.router.add_get("/", get_index)
        self.app.router.add_get("/index.html", get_index)
        self.app.router.add_get("/index.htm", get_index)
        self.app.router.add_get("/res/{x}", get_resource)
        self.app.router.add_get("/api/node_info", node_info)
        self.app.router.add_get("/api/super_client_table", node_info)
        self.app.router.add_get("/api/ray_config", ray_config)
        # Catch-all: any other path is forbidden.
        self.app.router.add_get("/{_}", get_forbidden)

    def log_dashboard_url(self):
        """Write the tokenized dashboard URL into temp_dir and log it."""
        url = "http://{}:{}?token={}".format(self.ip, self.port, self.token)
        with open(os.path.join(self.temp_dir, "dashboard_url"), "w") as f:
            f.write(url)
        logger.info("Dashboard running on {}".format(url))

    def run(self):
        """Start the stats-collector thread and the blocking web server."""
        self.log_dashboard_url()
        self.node_stats.start()
        aiohttp.web.run_app(self.app, host=self.ip, port=self.port)
class NodeStats(threading.Thread):
    """Aggregates per-node stats published by Reporter processes.

    Subscribes to every per-host reporter channel on Redis and keeps the
    most recent stats payload for each hostname; stale entries are purged
    when a snapshot is requested.
    """

    def __init__(self, redis_address, redis_password=None):
        # Pattern covering the per-host reporter channels.
        self.redis_key = "{}.*".format(ray.gcs_utils.REPORTER_CHANNEL)
        self.redis_client = ray.services.create_redis_client(
            redis_address, password=redis_password)

        # hostname -> latest stats dict. Written by run() (the subscriber
        # thread) and read by the web server, hence the lock.
        self._node_stats = {}
        self._node_stats_lock = threading.Lock()

        super().__init__()

    def calculate_totals(self) -> Dict:
        """Sum the per-node stats into cluster-wide totals.

        Caller must hold _node_stats_lock.
        """
        total_boot_time = 0
        total_cpus = 0
        total_workers = 0
        total_load = [0.0, 0.0, 0.0]
        total_storage_avail = 0
        total_storage_total = 0
        total_ram_avail = 0
        total_ram_total = 0
        total_sent = 0
        total_recv = 0

        for v in self._node_stats.values():
            total_boot_time += v["boot_time"]
            total_cpus += v["cpus"][0]
            total_workers += len(v["workers"])
            # load_avg is (raw_load, per_cpu_load); sum the raw triple.
            total_load[0] += v["load_avg"][0][0]
            total_load[1] += v["load_avg"][0][1]
            total_load[2] += v["load_avg"][0][2]
            total_storage_avail += v["disk"]["/"]["free"]
            total_storage_total += v["disk"]["/"]["total"]
            # mem is (total, available, percent) from the reporter.
            total_ram_avail += v["mem"][1]
            total_ram_total += v["mem"][0]
            total_sent += v["net"][0]
            total_recv += v["net"][1]

        return {
            "boot_time": total_boot_time,
            "n_workers": total_workers,
            "n_cores": total_cpus,
            "m_avail": total_ram_avail,
            "m_total": total_ram_total,
            "d_avail": total_storage_avail,
            "d_total": total_storage_total,
            "load": total_load,
            "n_sent": total_sent,
            "n_recv": total_recv,
        }

    def calculate_tasks(self) -> Counter:
        """Count worker processes by name across all nodes.

        Caller must hold _node_stats_lock.
        """
        return Counter(
            (x["name"]
             for y in (v["workers"] for v in self._node_stats.values())
             for x in y))

    def purge_outdated_stats(self):
        """Drop nodes whose last report is more than 5 seconds old."""

        def current(then, now):
            if (now - then) > 5:
                return False
            return True

        now = to_unix_time(datetime.datetime.utcnow())
        self._node_stats = {
            k: v
            for k, v in self._node_stats.items() if current(v["now"], now)
        }

    def get_node_stats(self) -> Dict:
        """Return a snapshot with totals, task counts and per-node stats."""
        with self._node_stats_lock:
            self.purge_outdated_stats()
            # Stable ordering for the UI: longest-running node first.
            node_stats = sorted(
                (v for v in self._node_stats.values()),
                key=itemgetter("boot_time"))
            return {
                "totals": self.calculate_totals(),
                "tasks": self.calculate_tasks(),
                "clients": node_stats,
            }

    def run(self):
        """Consume reporter messages forever, updating _node_stats."""
        p = self.redis_client.pubsub(ignore_subscribe_messages=True)
        p.psubscribe(self.redis_key)
        logger.info("NodeStats: subscribed to {}".format(self.redis_key))

        for x in p.listen():
            try:
                D = json.loads(x["data"])
                with self._node_stats_lock:
                    self._node_stats[D["hostname"]] = D
            except Exception:
                # A malformed message must not kill the collector thread.
                logger.exception(traceback.format_exc())
                continue
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=("Parse Redis server for the "
                     "dashboard to connect to."))
    parser.add_argument(
        "--http-port",
        required=True,
        type=int,
        help="The port to use for the HTTP server.")
    parser.add_argument(
        "--token",
        required=True,
        type=str,
        help="The token to use for the HTTP server.")
    parser.add_argument(
        "--redis-address",
        required=True,
        type=str,
        help="The address to use for Redis.")
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="the password to use for Redis")
    parser.add_argument(
        "--logging-level",
        required=False,
        type=str,
        default=ray_constants.LOGGER_LEVEL,
        choices=ray_constants.LOGGER_LEVEL_CHOICES,
        help=ray_constants.LOGGER_LEVEL_HELP)
    parser.add_argument(
        "--logging-format",
        required=False,
        type=str,
        default=ray_constants.LOGGER_FORMAT,
        help=ray_constants.LOGGER_FORMAT_HELP)
    parser.add_argument(
        "--temp-dir",
        required=False,
        type=str,
        default=None,
        help="Specify the path of the temporary directory use by Ray process.")
    args = parser.parse_args()
    ray.utils.setup_logger(args.logging_level, args.logging_format)

    dashboard = Dashboard(
        args.redis_address,
        args.http_port,
        args.token,
        args.temp_dir,
        redis_password=args.redis_password,
    )
    try:
        # Blocks until the web server exits.
        dashboard.run()
    except Exception as e:
        # Something went wrong, so push an error to all drivers.
        redis_client = ray.services.create_redis_client(
            args.redis_address, password=args.redis_password)
        traceback_str = ray.utils.format_error_message(traceback.format_exc())
        message = ("The dashboard on node {} failed with the following "
                   "error:\n{}".format(os.uname()[1], traceback_str))
        ray.utils.push_error_to_driver_through_redis(
            redis_client, ray_constants.DASHBOARD_DIED_ERROR, message)
        raise e

View file

@ -0,0 +1,99 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>ray dashboard</title>
<meta name="description" content="ray dashboard">
<link rel="stylesheet" href="res/main.css">
<meta name="referrer" content="same-origin">
<!--
<script src="https://cdnjs.cloudflare.com/ajax/libs/vue/2.6.4/vue.min.js"
integrity="sha384-rldcjlIPDkF0mEihgyEOIFhd2NW5YL717okjKC5YF2LrqoiBeMk4tpcgbRrlDHj5"
crossorigin="anonymous"></script>
-->
<script src="https://cdnjs.cloudflare.com/ajax/libs/vue/2.6.4/vue.js"
integrity="sha384-94H2I+MU5hfBDUinQG+/Y9JbHALTPlQmHO26R3Jv60MT6WWkOD5hlYNyT9ciiLsR"
crossorigin="anonymous"></script>
</head>
<body>
<div id="dashboard">
<table v-if="clients && !error" class="ray_node_grid">
<thead>
<tr>
<th class="hostname">Hostname</th>
<th class="uptime">Uptime</th>
<th class="workers">Workers</th>
<th class="mem">RAM</th>
<th class="storage">Disk</th>
<th class="load">Load (1m, 5m, 15m)</th>
<th class="netsent">Sent (M/s)</th>
<th class="netrecv">Recv (M/s)</th>
</tr>
</thead>
<tbody is="node"
v-for="v in clients"
:key="v.hostname"
:now="now"
:hostname="v.hostname"
:boot_time="v.boot_time"
:n_workers="v.workers.length"
:n_cores="v.cpus[0]"
:m_avail="v.mem[1]"
:m_total="v.mem[0]"
:d_avail="v.disk['/'].free"
:d_total="v.disk['/'].total"
:load="v.load_avg[0]"
:n_sent="v.net[0]"
:n_recv="v.net[1]"
:workers="v.workers"
></tbody>
<tbody is="node"
class="totals"
v-if="totals"
:now="now"
:hostname="Object.keys(clients).length"
:boot_time="totals.boot_time"
:n_workers="totals.n_workers"
:n_cores="totals.n_cores"
:m_avail="totals.m_avail"
:m_total="totals.m_total"
:d_avail="totals.d_avail"
:d_total="totals.d_total"
:load="totals.load"
:n_sent="totals.n_sent"
:n_recv="totals.n_recv"
:workers="[]"
></tbody>
</table>
<template v-if="error">
<h2>{{error}}</h2>
</template>
<h2 v-if="last_update" :class="outdated_cls">Last updated {{age}} ago</h2>
<div class="cols">
<div class="tasks">
<template v-if="tasks && !error">
<h2>tasks</h2>
<ul>
<li v-for="v, k, _ in tasks">{{k}}: {{v}}</li>
</ul>
</template>
</div>
<div class="ray_config">
<template v-if="ray_config">
<h2>ray config</h2>
<pre>{{ray_config}}</pre>
</template>
</div>
</div>
</div>
</body>
<script src="res/main.js"></script>

View file

@ -0,0 +1,58 @@
* { font-family: monospace; margin: 0; padding: 0; }
h1, h2 { text-align: center; margin: 1rem 0; }
h1 { font-size: 3rem; margin: 0.5rem auto; text-align: center; }
h2 { font-size: 2rem; margin: 1rem auto; text-align: center; }
div#dashboard {
width: 116rem; margin: 1rem auto;
}
table, tbody, thead {
width: 100%;
margin: 0;
padding: 0;
}
tr {
cursor: pointer;
}
tr.workerlist td, tr.workerlist th {
font-size: 0.8rem;
}
tr:hover {
filter: brightness(0.85);
}
td, th {
padding: 0.3rem;
font-size: 1.4rem;
font-family: monospace;
text-align: center;
background-color: white;
}
th {
background-color: #eeeeee;
border: 1px solid black;
}
tbody.totals {
font-weight: bold;
font-size: 1.5rem;
}
ul { list-style-position: inside; }
.critical { background-color: magenta; }
.bad { background-color: red; }
.high { background-color: orange; }
.average { background-color: limegreen; }
.low { background-color: aquamarine; }
div.cols {
width: 100%;
margin: 1rem 0;
display: grid;
grid-template-columns: 1fr 1fr;
}
div.cols div {
padding: 1rem 0;
}
.outdated { background-color: red; }

View file

@ -0,0 +1,265 @@
// Root Vue instance driving the dashboard page.
let dashboard = new Vue({
    el: "#dashboard",
    data: {
        // Current wall-clock time in seconds; ticked once per second.
        now: (new Date()).getTime() / 1000,
        shown: {},
        // Error message displayed until the first successful fetch.
        error: "loading...",
        // Server-side timestamp of the last node_info payload.
        last_update: undefined,
        clients: undefined,
        totals: undefined,
        tasks: undefined,
        ray_config: undefined,
    },
    methods: {
        // Poll /api/node_info every 500ms, mirroring the payload into
        // component state; on any failure clear the stats so stale data
        // is never displayed.
        updateNodeInfo: function() {
            var self = this;
            fetch("/api/node_info").then(function (resp) {
                return resp.json();
            }).then(function(data) {
                self.error = data.error;
                if (data.error) {
                    self.clients = undefined;
                    self.tasks = undefined;
                    self.totals = undefined;
                    return;
                }
                self.last_update = data.timestamp;
                self.clients = data.result.clients;
                self.tasks = data.result.tasks;
                self.totals = data.result.totals;
            }).catch(function() {
                self.error = "request error"
                self.clients = undefined;
                self.tasks = undefined;
                self.totals = undefined;
            }).finally(function() {
                // Re-arm unconditionally so polling never stops.
                setTimeout(self.updateNodeInfo, 500);
            });
        },
        // Poll /api/ray_config every 10s (it rarely changes).
        updateRayConfig: function() {
            var self = this;
            fetch("/api/ray_config").then(function (resp) {
                return resp.json();
            }).then(function(data) {
                if (data.error) {
                    self.ray_config = undefined;
                    return;
                }
                self.ray_config = data.result;
            }).catch(function() {
                self.error = "request error"
                self.ray_config = undefined;
            }).finally(function() {
                setTimeout(self.updateRayConfig, 10000);
            });
        },
        // Kick off both polling loops.
        updateAll: function() {
            this.updateNodeInfo();
            this.updateRayConfig();
        },
        tickClock: function() {
            this.now = (new Date()).getTime() / 1000;
        }
    },
    computed: {
        // CSS class flagging the page when data is >5s old.
        outdated_cls: function(ts) {
            if ((this.now - this.last_update) > 5) {
                return "outdated";
            }
            return "";
        },
        // Whole-second age of the last update, e.g. "3s".
        age: function(ts) {
            return (this.now - this.last_update | 0) + "s";
        },
    },
    filters: {
        // Format a byte count with a binary SI-style suffix (B/K/M/G/T).
        si: function(x) {
            let prefixes = ["B", "K", "M", "G", "T"]
            let i = 0;
            while (x > 1024) {
                x /= 1024;
                i += 1;
            }
            return `${x.toFixed(1)}${prefixes[i]}`;
        },
    },
});
// Worker-count cell: workers/cores ratio, color-coded by oversubscription.
Vue.component("worker-usage", {
    props: ['cores', 'workers'],
    computed: {
        frac: function() {
            return this.workers / this.cores;
        },
        // Severity class: >1 worker per core starts escalating the color.
        cls: function() {
            if (this.frac > 3) { return "critical"; }
            if (this.frac > 2) { return "bad"; }
            if (this.frac > 1.5) { return "high"; }
            if (this.frac > 1) { return "average"; }
            return "low";
        },
    },
    template: `
    <td class="workers" :class="cls">
        {{workers}}/{{cores}} {{(frac*100).toFixed(0)}}%
    </td>
    `,
});
// One <tbody> per cluster node (also reused for the totals row).
// Clicking the row toggles the per-worker detail rows.
Vue.component("node", {
    props: [
        "now",
        "hostname",
        "boot_time",
        "n_workers",
        "n_cores",
        "m_avail",
        "m_total",
        "d_avail",
        "d_total",
        "load",
        "n_sent",
        "n_recv",
        "workers",
    ],
    data: function() {
        return {
            // Worker detail rows start collapsed.
            hidden: true,
        };
    },
    computed: {
        // Human-readable uptime derived from boot_time.
        age: function() {
            if (this.boot_time) {
                let n = this.now;
                if (this.boot_time > 2840140800) {
                    // Hack. It's a sum of multiple nodes.
                    n *= this.hostname;
                }
                let rs = n - this.boot_time | 0;
                let s = rs % 60;
                let m = ((rs / 60) % 60) | 0;
                let h = (rs / 3600) | 0;
                if (h) {
                    return `${h}h ${m}m ${s}s`;
                }
                if (m) {
                    return `${m}m ${s}s`;
                }
                return `${s}s`;
            }
            return "?"
        },
    },
    methods: {
        toggleHide: function() {
            this.hidden = !this.hidden;
        }
    },
    filters: {
        // Bytes -> mebibytes with 3 decimals.
        mib(x) {
            return `${(x/(1024**2)).toFixed(3)}M`;
        },
        // The totals row passes the node count as "hostname"; label it.
        hostnamefilter(x) {
            if (isNaN(x)) {
                return x;
            }
            return `Totals: ${x} nodes`;
        },
    },
    template: `
    <tbody v-on:click="toggleHide()">
        <tr class="ray_node">
            <td class="hostname">{{hostname | hostnamefilter}}</td>
            <td class="uptime">{{age}}</td>
            <worker-usage
                :workers="n_workers"
                :cores="n_cores"
            ></worker-usage>
            <usagebar
                :avail="m_avail" :total="m_total"
                stat="mem"
            ></usagebar>
            <usagebar
                :avail="d_avail" :total="d_total"
                stat="storage"
            ></usagebar>
            <loadbar
                :cores="n_cores"
                :onem="load[0]"
                :fivem="load[1]"
                :fifteenm="load[2]"
            >
            </loadbar>
            <td class="netsent">{{n_sent | mib}}/s</td>
            <td class="netrecv">{{n_recv | mib}}/s</td>
        </tr>
        <template v-if="!hidden && workers">
            <tr class="workerlist">
                <th>time</th>
                <th>name</th>
                <th>pid</th>
                <th>uss</th>
            </tr>
            <tr class="workerlist" v-for="x in workers">
                <td>user: {{x.cpu_times.user}}s</td>
                <td>{{x.name}}</td>
                <td>{{x.pid}}</td>
                <td>{{(x.memory_full_info.uss/1048576).toFixed(0)}}MiB</td>
            </tr>
        </template>
    </tbody>
    `,
});
// Generic capacity cell (RAM or disk): "used/total pct", color-coded by
// fill fraction.
Vue.component("usagebar", {
    props: ['stat', 'avail', 'total'], // e.g. free -m avail
    computed: {
        used: function() { return this.total - this.avail; },
        frac: function() { return (this.total - this.avail)/this.total; },
        // Severity class from the fill fraction.
        cls: function() {
            if (this.frac > 0.95) { return "critical"; }
            if (this.frac > 0.9) { return "bad"; }
            if (this.frac > 0.8) { return "high"; }
            if (this.frac > 0.5) { return "average"; }
            return "low";
        },
        // Combine the stat name (used as a CSS hook) with the severity.
        tcls: function() {
            return `${this.stat} ${this.cls}`;
        }
    },
    filters: {
        // Bytes -> gibibytes.
        gib(x) {
            return `${(x/(1024**3)).toFixed(1)}G`;
        },
        // Fraction -> integer percentage.
        pct(x) {
            return `${(x*100).toFixed(0)}%`;
        },
    },
    template: `
    <td class="usagebar" :class="tcls">
        {{used | gib}}/{{total | gib}} {{ frac | pct }}
    </td>
    `,
});
// Load-average cell, color-coded by the 1-minute load per core.
Vue.component("loadbar", {
    props: ['cores', 'onem', 'fivem', 'fifteenm'],
    computed: {
        frac: function() { return this.onem/this.cores; },
        cls: function() {
            if (this.frac > 3) { return "critical"; }
            if (this.frac > 2.5) { return "bad"; }
            if (this.frac > 2) { return "high"; }
            if (this.frac > 1.5) { return "average"; }
            return "low";
        },
    },
    template: `
    <td class="load loadbar" :class="cls">
        {{onem.toFixed(2)}}, {{fivem.toFixed(2)}}, {{fifteenm.toFixed(2)}}
    </td>
    `,
});

// Tick the clock every second and start both polling loops.
setInterval(dashboard.tickClock, 1000);
dashboard.updateAll();

View file

@ -39,6 +39,7 @@ __all__ = [
FUNCTION_PREFIX = "RemoteFunction:"
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
REPORTER_CHANNEL = "RAY_REPORTER"
# xray heartbeats
XRAY_HEARTBEAT_CHANNEL = str(TablePubsub.HEARTBEAT).encode("ascii")

View file

@ -10,6 +10,7 @@ import json
import os
import logging
import signal
import sys
import tempfile
import threading
import time
@ -24,6 +25,8 @@ from ray.utils import try_to_create_directory
# using logging.basicConfig in its entry/init points.
logger = logging.getLogger(__name__)
PY3 = sys.version_info.major >= 3
class Node(object):
"""An encapsulation of the Ray processes on a single node.
@ -81,6 +84,7 @@ class Node(object):
self._plasma_store_socket_name = None
self._raylet_socket_name = None
self._webui_url = None
self._dashboard_url = None
else:
self._plasma_store_socket_name = (
ray_params.plasma_store_socket_name)
@ -284,6 +288,35 @@ class Node(object):
process_info
]
def start_reporter(self):
    """Start the reporter.

    If the reporter could not be started (ray.services.start_reporter
    returns None, e.g. when psutil is missing), no process is registered.
    """
    stdout_file, stderr_file = self.new_log_files("reporter", True)
    process_info = ray.services.start_reporter(
        self.redis_address,
        stdout_file=stdout_file,
        stderr_file=stderr_file,
        redis_password=self._ray_params.redis_password)
    # Starting the reporter twice on one node is a programming error.
    assert ray_constants.PROCESS_TYPE_REPORTER not in self.all_processes
    if process_info is not None:
        self.all_processes[ray_constants.PROCESS_TYPE_REPORTER] = [
            process_info
        ]
def start_dashboard(self):
    """Start the dashboard.

    Records the dashboard URL on self._dashboard_url. If the dashboard
    process could not be started, no process is registered.
    """
    stdout_file, stderr_file = self.new_log_files("dashboard", True)
    self._dashboard_url, process_info = ray.services.start_dashboard(
        self.redis_address,
        self._temp_dir,
        stdout_file=stdout_file,
        stderr_file=stderr_file,
        redis_password=self._ray_params.redis_password)
    # Starting the dashboard twice on one node is a programming error.
    assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes
    if process_info is not None:
        self.all_processes[ray_constants.PROCESS_TYPE_DASHBOARD] = [
            process_info
        ]
def start_ui(self):
"""Start the web UI."""
stdout_file, stderr_file = self.new_log_files("webui")
@ -408,14 +441,16 @@ class Node(object):
self.start_redis()
self.start_monitor()
self.start_raylet_monitor()
if PY3 and self._ray_params.include_webui:
self.start_dashboard()
self.start_plasma_store()
self.start_raylet()
if PY3 and self._ray_params.include_webui:
self.start_reporter()
if self._ray_params.include_log_monitor:
self.start_log_monitor()
if self._ray_params.include_webui:
self.start_ui()
def _kill_process_type(self,
process_type,
@ -545,6 +580,26 @@ class Node(object):
self._kill_process_type(
ray_constants.PROCESS_TYPE_LOG_MONITOR, check_alive=check_alive)
def kill_reporter(self, check_alive=True):
    """Kill the reporter.

    Args:
        check_alive (bool): Raise an exception if the process was already
            dead.
    """
    # Delegates to the generic per-type process killer.
    self._kill_process_type(
        ray_constants.PROCESS_TYPE_REPORTER, check_alive=check_alive)
def kill_dashboard(self, check_alive=True):
    """Kill the dashboard.

    Args:
        check_alive (bool): Raise an exception if the process was already
            dead.
    """
    # Delegates to the generic per-type process killer.
    self._kill_process_type(
        ray_constants.PROCESS_TYPE_DASHBOARD, check_alive=check_alive)
def kill_monitor(self, check_alive=True):
"""Kill the monitor.

View file

@ -54,6 +54,8 @@ INFEASIBLE_TASK_ERROR = "infeasible_task"
REMOVED_NODE_ERROR = "node_removed"
MONITOR_DIED_ERROR = "monitor_died"
LOG_MONITOR_DIED_ERROR = "log_monitor_died"
REPORTER_DIED_ERROR = "reporter_died"
DASHBOARD_DIED_ERROR = "dashboard_died"
# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
@ -76,6 +78,9 @@ AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)
AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S",
30)
# The reporter will report its statistics this often (milliseconds).
REPORTER_UPDATE_INTERVAL_MS = env_integer("REPORTER_UPDATE_INTERVAL_MS", 500)
# Max number of retries to AWS (default is 5, time increases exponentially)
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
@ -96,6 +101,8 @@ INFINITE_RECONSTRUCTION = 2**30
PROCESS_TYPE_MONITOR = "monitor"
PROCESS_TYPE_RAYLET_MONITOR = "raylet_monitor"
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
PROCESS_TYPE_REPORTER = "reporter"
PROCESS_TYPE_DASHBOARD = "dashboard"
PROCESS_TYPE_WORKER = "worker"
PROCESS_TYPE_RAYLET = "raylet"
PROCESS_TYPE_PLASMA_STORE = "plasma_store"

228
python/ray/reporter.py Normal file
View file

@ -0,0 +1,228 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import logging
import json
import os
import traceback
import time
import datetime
from socket import AddressFamily
try:
import psutil
except ModuleNotFoundError:
print("The reporter requires psutil to run.")
import sys
sys.exit(1)
import ray.ray_constants as ray_constants
import ray.utils
# Logger for this module. It should be configured at the entry point
# into the program using Ray. Ray provides a default configuration at
# entry/init points.
logger = logging.getLogger(__name__)
def recursive_asdict(o):
    """Recursively convert namedtuples, tuples, lists and dicts into
    plain JSON-serializable lists and dicts."""
    if isinstance(o, tuple) and hasattr(o, "_asdict"):
        # A namedtuple: flatten to a dict first, then recurse on values.
        return recursive_asdict(o._asdict())

    if isinstance(o, (tuple, list)):
        return [recursive_asdict(item) for item in o]

    if isinstance(o, dict):
        return {key: recursive_asdict(value) for key, value in o.items()}

    return o
def jsonify_asdict(o):
    """Serialize *o* to a JSON string, flattening namedtuples to dicts."""
    plain = recursive_asdict(o)
    return json.dumps(plain)
def running_worker(s):
    """Return True if process name *s* contains "ray_worker" but is not
    exactly "ray_worker" (which presumably denotes an idle worker)."""
    return "ray_worker" in s and s != "ray_worker"
def determine_ip_address():
    """Return the first IP address for an ethernet interface on the system."""
    candidates = []
    # Interfaces starting with "e" (eth0, enp0s3, ...) are treated as
    # ethernet; collect their IPv4 addresses in interface order.
    for name, addresses in psutil.net_if_addrs().items():
        if name[0] != "e":
            continue
        candidates.extend(addr.address for addr in addresses
                          if addr.family == AddressFamily.AF_INET)
    return candidates[0]
def to_posix_time(dt):
    """Convert a naive UTC datetime into seconds since the Unix epoch."""
    epoch_start = datetime.datetime(1970, 1, 1)
    elapsed = dt - epoch_start
    return elapsed.total_seconds()
class Reporter(object):
    """A monitor process for monitoring Ray nodes.

    Periodically publishes a JSON snapshot of this machine's resource
    usage to a per-host Redis channel, consumed by the dashboard.

    Attributes:
        host (str): The hostname of this machine. Used to improve the log
            messages published to Redis.
        redis_client: A client used to communicate with the Redis server.
    """

    def __init__(self, redis_address, redis_password=None):
        """Initialize the reporter object."""
        # (logical cpu count, physical cpu count).
        self.cpu_counts = (psutil.cpu_count(), psutil.cpu_count(logical=False))
        self.ip_addr = determine_ip_address()
        self.hostname = os.uname().nodename

        _ = psutil.cpu_percent()  # For initialization

        # One Redis channel per host, keyed by hostname.
        self.redis_key = "{}.{}".format(ray.gcs_utils.REPORTER_CHANNEL,
                                        self.hostname)
        self.redis_client = ray.services.create_redis_client(
            redis_address, password=redis_password)

        # Sliding window of (timestamp, (bytes_sent, bytes_recv)) samples,
        # used to turn cumulative counters into rates.
        self.network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)

    @staticmethod
    def get_cpu_percent():
        """Return system-wide CPU utilization as a percentage."""
        return psutil.cpu_percent()

    @staticmethod
    def get_boot_time():
        """Return the system boot time as a Unix timestamp."""
        return psutil.boot_time()

    @staticmethod
    def get_network_stats():
        """Return cumulative (bytes_sent, bytes_recv) summed over all
        interfaces whose name starts with "e" (ethernet heuristic)."""
        ifaces = [
            v for k, v in psutil.net_io_counters(pernic=True).items()
            if k[0] == "e"
        ]

        sent = sum((iface.bytes_sent for iface in ifaces))
        recv = sum((iface.bytes_recv for iface in ifaces))
        return sent, recv

    @staticmethod
    def get_mem_usage():
        """Return virtual memory as (total, available, percent_used)."""
        vm = psutil.virtual_memory()
        return vm.total, vm.available, vm.percent

    @staticmethod
    def get_disk_usage():
        """Return disk usage stats for the "/" and "/tmp" mount points."""
        return {x: psutil.disk_usage(x) for x in ["/", "/tmp"]}

    @staticmethod
    def get_workers():
        """Return psutil info dicts for all active Ray worker processes."""
        return [
            x.as_dict(attrs=[
                "pid", "create_time", "cpu_times", "name", "memory_full_info"
            ]) for x in psutil.process_iter() if running_worker(x.name())
        ]

    def get_load_avg(self):
        """Return (raw 1/5/15-min load averages, load per logical CPU)."""
        load = os.getloadavg()
        per_cpu_load = tuple((round(x / self.cpu_counts[0], 2) for x in load))
        return load, per_cpu_load

    def get_all_stats(self):
        """Collect all stats for this node into a single dict."""
        now = to_posix_time(datetime.datetime.utcnow())
        network_stats = self.get_network_stats()

        # Keep the last 7 samples (~3s at the 500ms default interval) and
        # compute the byte rate over the whole window to smooth spikes.
        self.network_stats_hist.append((now, network_stats))
        self.network_stats_hist = self.network_stats_hist[-7:]
        then, prev_network_stats = self.network_stats_hist[0]
        netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then),
                    (network_stats[1] - prev_network_stats[1]) / (now - then))

        return {
            "now": now,
            "hostname": self.hostname,
            "ip": self.ip_addr,
            "cpu": self.get_cpu_percent(),
            "cpus": self.cpu_counts,
            "mem": self.get_mem_usage(),
            "workers": self.get_workers(),
            "boot_time": self.get_boot_time(),
            "load_avg": self.get_load_avg(),
            "disk": self.get_disk_usage(),
            "net": netstats,
        }

    def perform_iteration(self):
        """Collect this node's stats and publish them to Redis."""
        stats = self.get_all_stats()

        self.redis_client.publish(
            self.redis_key,
            jsonify_asdict(stats),
        )

    def run(self):
        """Run the reporter."""
        while True:
            try:
                self.perform_iteration()
            except Exception:
                # Best effort: keep reporting even if one iteration fails.
                traceback.print_exc()
                pass

            time.sleep(ray_constants.REPORTER_UPDATE_INTERVAL_MS / 1000)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=("Parse Redis server for the "
                     "reporter to connect to."))
    parser.add_argument(
        "--redis-address",
        required=True,
        type=str,
        help="The address to use for Redis.")
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="the password to use for Redis")
    parser.add_argument(
        "--logging-level",
        required=False,
        type=str,
        default=ray_constants.LOGGER_LEVEL,
        choices=ray_constants.LOGGER_LEVEL_CHOICES,
        help=ray_constants.LOGGER_LEVEL_HELP)
    parser.add_argument(
        "--logging-format",
        required=False,
        type=str,
        default=ray_constants.LOGGER_FORMAT,
        help=ray_constants.LOGGER_FORMAT_HELP)
    args = parser.parse_args()
    ray.utils.setup_logger(args.logging_level, args.logging_format)

    reporter = Reporter(args.redis_address, redis_password=args.redis_password)

    try:
        # Blocks forever, publishing stats at the configured interval.
        reporter.run()
    except Exception as e:
        # Something went wrong, so push an error to all drivers.
        redis_client = ray.services.create_redis_client(
            args.redis_address, password=args.redis_password)
        traceback_str = ray.utils.format_error_message(traceback.format_exc())
        message = ("The reporter on node {} failed with the following "
                   "error:\n{}".format(os.uname()[1], traceback_str))
        ray.utils.push_error_to_driver_through_redis(
            redis_client, ray_constants.REPORTER_DIED_ERROR, message)
        raise e

View file

@ -827,6 +827,117 @@ def start_log_monitor(redis_address,
return process_info
def start_reporter(redis_address,
                   stdout_file=None,
                   stderr_file=None,
                   redis_password=None):
    """Start a reporter process.

    Args:
        redis_address (str): The address of the Redis instance.
        stdout_file: A file handle opened for writing to redirect stdout to. If
            no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to. If
            no redirection should happen, then this should be None.
        redis_password (str): The password of the redis server.

    Returns:
        ProcessInfo for the process that was started, or None if psutil
        is not installed.
    """
    # The reporter depends on psutil; check availability up front and
    # bail out with an install hint before doing any other work.
    try:
        import psutil  # noqa: F401
    except ImportError:
        logger.warning("Failed to start the reporter. The reporter requires "
                       "'pip install psutil'.")
        return None

    reporter_filepath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "reporter.py")
    command = [
        sys.executable, "-u", reporter_filepath,
        "--redis-address={}".format(redis_address)
    ]
    if redis_password:
        command += ["--redis-password", redis_password]
    return start_ray_process(
        command,
        ray_constants.PROCESS_TYPE_REPORTER,
        stdout_file=stdout_file,
        stderr_file=stderr_file)
def start_dashboard(redis_address,
                    temp_dir,
                    stdout_file=None,
                    stderr_file=None,
                    redis_password=None):
    """Start a dashboard process.

    Args:
        redis_address (str): The address of the Redis instance.
        temp_dir (str): The temporary directory used for log files and
            information for this Ray session.
        stdout_file: A file handle opened for writing to redirect stdout to. If
            no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to. If
            no redirection should happen, then this should be None.
        redis_password (str): The password of the redis server.

    Returns:
        A (dashboard_url, ProcessInfo) tuple for the process that was
        started, or (None, None) if the dashboard's requirements
        (Python 3, aiohttp, psutil) are not met.
    """
    # Check prerequisites first so we do no port scanning or token
    # generation when the dashboard cannot start anyway.
    if sys.version_info <= (3, 0):
        return None, None
    try:
        import aiohttp  # noqa: F401
        import psutil  # noqa: F401
    except ImportError:
        logger.warning(
            "Failed to start the dashboard. The dashboard requires Python 3 "
            "as well as 'pip install aiohttp psutil'.")
        return None, None

    # Scan upward from 8080 for a free port. The probe socket is closed
    # on every path; the previous version leaked one socket per failed
    # bind attempt.
    port = 8080
    while True:
        port_test_socket = socket.socket()
        try:
            port_test_socket.bind(("127.0.0.1", port))
            break
        except socket.error:
            port += 1
        finally:
            port_test_socket.close()

    # Random access token so that only the user who started the cluster
    # can reach the dashboard.
    token = ray.utils.decode(binascii.hexlify(os.urandom(24)))

    dashboard_filepath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "dashboard/dashboard.py")
    command = [
        sys.executable,
        "-u",
        dashboard_filepath,
        "--redis-address={}".format(redis_address),
        "--http-port={}".format(port),
        "--token={}".format(token),
        "--temp-dir={}".format(temp_dir),
    ]
    if redis_password:
        command += ["--redis-password", redis_password]

    process_info = start_ray_process(
        command,
        ray_constants.PROCESS_TYPE_DASHBOARD,
        stdout_file=stdout_file,
        stderr_file=stderr_file)
    dashboard_url = "http://{}:{}/?token={}".format(
        ray.services.get_node_ip_address(), port, token)
    print("\n" + "=" * 70)
    print("View the dashboard at {}".format(dashboard_url))
    print("=" * 70 + "\n")
    return dashboard_url, process_info
def start_ui(redis_address, notebook_name, stdout_file=None, stderr_file=None):
"""Start a UI process.

View file

@ -66,6 +66,7 @@ optional_ray_files += ray_autoscaler_files
extras = {
"rllib": ["pyyaml", "gym[atari]", "opencv-python", "lz4", "scipy"],
"debug": ["psutil", "setproctitle", "py-spy"],
"dashboard": ["psutil", "aiohttp"],
}

View file

@ -1,3 +1,7 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
import time
@ -67,15 +71,14 @@ def test_raylet_tempfiles():
ray.init(num_cpus=0)
node = ray.worker._global_node
top_levels = set(os.listdir(node.get_temp_dir_path()))
assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
assert top_levels.issuperset({"sockets", "logs"})
log_files = set(os.listdir(node.get_logs_dir_path()))
assert log_files == {
assert log_files.issuperset({
"log_monitor.out", "log_monitor.err", "plasma_store.out",
"plasma_store.err", "webui.out", "webui.err", "monitor.out",
"monitor.err", "raylet_monitor.out", "raylet_monitor.err",
"redis-shard_0.out", "redis-shard_0.err", "redis.out", "redis.err",
"raylet.out", "raylet.err"
} # with raylet logs
"plasma_store.err", "monitor.out", "monitor.err", "raylet_monitor.out",
"raylet_monitor.err", "redis-shard_0.out", "redis-shard_0.err",
"redis.out", "redis.err", "raylet.out", "raylet.err"
}) # with raylet logs
socket_files = set(os.listdir(node.get_sockets_dir_path()))
assert socket_files == {"plasma_store", "raylet"}
ray.shutdown()
@ -83,15 +86,14 @@ def test_raylet_tempfiles():
ray.init(num_cpus=2)
node = ray.worker._global_node
top_levels = set(os.listdir(node.get_temp_dir_path()))
assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
assert top_levels.issuperset({"sockets", "logs"})
time.sleep(3) # wait workers to start
log_files = set(os.listdir(node.get_logs_dir_path()))
assert log_files.issuperset({
"log_monitor.out", "log_monitor.err", "plasma_store.out",
"plasma_store.err", "webui.out", "webui.err", "monitor.out",
"monitor.err", "raylet_monitor.out", "raylet_monitor.err",
"redis-shard_0.out", "redis-shard_0.err", "redis.out", "redis.err",
"raylet.out", "raylet.err"
"plasma_store.err", "monitor.out", "monitor.err", "raylet_monitor.out",
"raylet_monitor.err", "redis-shard_0.out", "redis-shard_0.err",
"redis.out", "redis.err", "raylet.out", "raylet.err"
}) # with raylet logs
# Check numbers of worker log file.