mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
Add a web dashboard for monitoring node resource usage (#4066)
This commit is contained in:
parent
3ac8fd7ee8
commit
2e30f7ba38
13 changed files with 1239 additions and 44 deletions
|
@ -8,7 +8,7 @@ else
|
||||||
base_commit="$TRAVIS_BRANCH"
|
base_commit="$TRAVIS_BRANCH"
|
||||||
echo "Running clang-format against branch $base_commit, with hash $(git rev-parse $base_commit)"
|
echo "Running clang-format against branch $base_commit, with hash $(git rev-parse $base_commit)"
|
||||||
fi
|
fi
|
||||||
output="$(.travis/git-clang-format --binary clang-format-3.8 --commit $base_commit --diff --exclude '(.*thirdparty/|.*redismodule.h|.*webui*)')"
|
output="$(.travis/git-clang-format --binary clang-format-3.8 --commit $base_commit --diff --exclude '(.*thirdparty/|.*redismodule.h|.*.js)')"
|
||||||
if [ "$output" == "no modified files to format" ] || [ "$output" == "clang-format did not modify any files" ] ; then
|
if [ "$output" == "no modified files to format" ] || [ "$output" == "clang-format did not modify any files" ] ; then
|
||||||
echo "clang-format passed."
|
echo "clang-format passed."
|
||||||
exit 0
|
exit 0
|
||||||
|
|
|
@ -127,6 +127,7 @@ def kill_node(config_file, yes, override_cluster_name):
|
||||||
confirm("This will kill a node in your cluster", yes)
|
confirm("This will kill a node in your cluster", yes)
|
||||||
|
|
||||||
provider = get_node_provider(config["provider"], config["cluster_name"])
|
provider = get_node_provider(config["provider"], config["cluster_name"])
|
||||||
|
try:
|
||||||
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
|
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
|
||||||
node = random.choice(nodes)
|
node = random.choice(nodes)
|
||||||
logger.info("kill_node: Terminating worker {}".format(node))
|
logger.info("kill_node: Terminating worker {}".format(node))
|
||||||
|
@ -146,11 +147,14 @@ def kill_node(config_file, yes, override_cluster_name):
|
||||||
|
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
|
|
||||||
iip = provider.internal_ip(node)
|
if config.get("provider", {}).get("use_internal_ips", False) is True:
|
||||||
if iip:
|
node_ip = provider.internal_ip(node)
|
||||||
return iip
|
else:
|
||||||
|
node_ip = provider.external_ip(node)
|
||||||
|
finally:
|
||||||
|
provider.cleanup()
|
||||||
|
|
||||||
return provider.external_ip(node)
|
return node_ip
|
||||||
|
|
||||||
|
|
||||||
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
|
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
|
||||||
|
@ -246,14 +250,18 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
|
||||||
# Refresh the node cache so we see the external ip if available
|
# Refresh the node cache so we see the external ip if available
|
||||||
provider.nodes(head_node_tags)
|
provider.nodes(head_node_tags)
|
||||||
|
|
||||||
|
if config.get("provider", {}).get("use_internal_ips", False) is True:
|
||||||
|
head_node_ip = provider.internal_ip(head_node)
|
||||||
|
else:
|
||||||
|
head_node_ip = provider.external_ip(head_node)
|
||||||
|
|
||||||
if updater.exitcode != 0:
|
if updater.exitcode != 0:
|
||||||
logger.error("get_or_create_head_node: "
|
logger.error("get_or_create_head_node: "
|
||||||
"Updating {} failed".format(
|
"Updating {} failed".format(head_node_ip))
|
||||||
provider.external_ip(head_node)))
|
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
logger.info("get_or_create_head_node: "
|
logger.info(
|
||||||
"Head node up-to-date, IP address is: {}".format(
|
"get_or_create_head_node: "
|
||||||
provider.external_ip(head_node)))
|
"Head node up-to-date, IP address is: {}".format(head_node_ip))
|
||||||
|
|
||||||
monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
|
monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
|
||||||
use_docker = bool(config["docker"]["container_name"])
|
use_docker = bool(config["docker"]["container_name"])
|
||||||
|
@ -268,10 +276,11 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
|
||||||
quote(monitor_str), modifiers))
|
quote(monitor_str), modifiers))
|
||||||
print("To open a console on the cluster:\n\n"
|
print("To open a console on the cluster:\n\n"
|
||||||
" ray attach {}{}\n".format(config_file, modifiers))
|
" ray attach {}{}\n".format(config_file, modifiers))
|
||||||
|
|
||||||
print("To ssh manually to the cluster, run:\n\n"
|
print("To ssh manually to the cluster, run:\n\n"
|
||||||
" ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
|
" ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
|
||||||
config["auth"]["ssh_user"],
|
config["auth"]["ssh_user"],
|
||||||
provider.external_ip(head_node)))
|
head_node_ip))
|
||||||
finally:
|
finally:
|
||||||
provider.cleanup()
|
provider.cleanup()
|
||||||
|
|
||||||
|
@ -456,11 +465,14 @@ def get_head_node_ip(config_file, override_cluster_name):
|
||||||
provider = get_node_provider(config["provider"], config["cluster_name"])
|
provider = get_node_provider(config["provider"], config["cluster_name"])
|
||||||
try:
|
try:
|
||||||
head_node = _get_head_node(config, config_file, override_cluster_name)
|
head_node = _get_head_node(config, config_file, override_cluster_name)
|
||||||
ip = provider.external_ip(head_node)
|
if config.get("provider", {}).get("use_internal_ips", False) is True:
|
||||||
|
head_node_ip = provider.internal_ip(head_node)
|
||||||
|
else:
|
||||||
|
head_node_ip = provider.external_ip(head_node)
|
||||||
finally:
|
finally:
|
||||||
provider.cleanup()
|
provider.cleanup()
|
||||||
|
|
||||||
return ip
|
return head_node_ip
|
||||||
|
|
||||||
|
|
||||||
def get_worker_node_ips(config_file, override_cluster_name):
|
def get_worker_node_ips(config_file, override_cluster_name):
|
||||||
|
@ -469,9 +481,17 @@ def get_worker_node_ips(config_file, override_cluster_name):
|
||||||
config = yaml.load(open(config_file).read())
|
config = yaml.load(open(config_file).read())
|
||||||
if override_cluster_name is not None:
|
if override_cluster_name is not None:
|
||||||
config["cluster_name"] = override_cluster_name
|
config["cluster_name"] = override_cluster_name
|
||||||
|
|
||||||
provider = get_node_provider(config["provider"], config["cluster_name"])
|
provider = get_node_provider(config["provider"], config["cluster_name"])
|
||||||
|
try:
|
||||||
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
|
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
|
||||||
|
|
||||||
|
if config.get("provider", {}).get("use_internal_ips", False) is True:
|
||||||
|
return [provider.internal_ip(node) for node in nodes]
|
||||||
|
else:
|
||||||
return [provider.external_ip(node) for node in nodes]
|
return [provider.external_ip(node) for node in nodes]
|
||||||
|
finally:
|
||||||
|
provider.cleanup()
|
||||||
|
|
||||||
|
|
||||||
def _get_head_node(config,
|
def _get_head_node(config,
|
||||||
|
|
348
python/ray/dashboard/dashboard.py
Normal file
348
python/ray/dashboard/dashboard.py
Normal file
|
@ -0,0 +1,348 @@
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
try:
|
||||||
|
import aiohttp.web
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
print("The reporter requires aiohttp to run.")
|
||||||
|
import sys
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
import traceback
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import Counter
|
||||||
|
from operator import itemgetter
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import ray.ray_constants as ray_constants
|
||||||
|
import ray.utils
|
||||||
|
|
||||||
|
# Logger for this module. It should be configured at the entry point
|
||||||
|
# into the program using Ray. Ray provides a default configuration at
|
||||||
|
# entry/init points.
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def to_unix_time(dt):
|
||||||
|
return (dt - datetime.datetime(1970, 1, 1)).total_seconds()
|
||||||
|
|
||||||
|
|
||||||
|
class Dashboard(object):
|
||||||
|
"""A dashboard process for monitoring Ray nodes.
|
||||||
|
|
||||||
|
This dashboard is made up of a REST API which collates data published by
|
||||||
|
Reporter processes on nodes into a json structure, and a webserver
|
||||||
|
which polls said API for display purposes.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
redis_client: A client used to communicate with the Redis server.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
redis_address,
|
||||||
|
http_port,
|
||||||
|
token,
|
||||||
|
temp_dir,
|
||||||
|
redis_password=None):
|
||||||
|
"""Initialize the dashboard object."""
|
||||||
|
self.ip = ray.services.get_node_ip_address()
|
||||||
|
self.port = http_port
|
||||||
|
self.token = token
|
||||||
|
self.temp_dir = temp_dir
|
||||||
|
self.node_stats = NodeStats(redis_address, redis_password)
|
||||||
|
|
||||||
|
self.app = aiohttp.web.Application(middlewares=[self.auth_middleware])
|
||||||
|
self.setup_routes()
|
||||||
|
|
||||||
|
@aiohttp.web.middleware
|
||||||
|
async def auth_middleware(self, req, handler):
|
||||||
|
def valid_token(req):
|
||||||
|
# If the cookie token is correct, accept that.
|
||||||
|
try:
|
||||||
|
if req.cookies["token"] == self.token:
|
||||||
|
return True
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# If the query token is correct, accept that.
|
||||||
|
try:
|
||||||
|
if req.query["token"] == self.token:
|
||||||
|
return True
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Reject.
|
||||||
|
logger.warning("Dashboard: rejected an invalid token")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check that the token is present, either in query or as cookie.
|
||||||
|
if not valid_token(req):
|
||||||
|
return aiohttp.web.Response(status=401, text="401 Unauthorized")
|
||||||
|
|
||||||
|
resp = await handler(req)
|
||||||
|
resp.cookies["token"] = self.token
|
||||||
|
return resp
|
||||||
|
|
||||||
|
def setup_routes(self):
|
||||||
|
def forbidden() -> aiohttp.web.Response:
|
||||||
|
return aiohttp.web.Response(status=403, text="403 Forbidden")
|
||||||
|
|
||||||
|
def get_forbidden(_) -> aiohttp.web.Response:
|
||||||
|
return forbidden()
|
||||||
|
|
||||||
|
async def get_index(req) -> aiohttp.web.Response:
|
||||||
|
return aiohttp.web.FileResponse(
|
||||||
|
os.path.join(
|
||||||
|
os.path.dirname(os.path.abspath(__file__)), "index.html"))
|
||||||
|
|
||||||
|
async def get_resource(req) -> aiohttp.web.Response:
|
||||||
|
try:
|
||||||
|
path = req.match_info["x"]
|
||||||
|
except KeyError:
|
||||||
|
return forbidden()
|
||||||
|
|
||||||
|
if path not in ["main.css", "main.js"]:
|
||||||
|
return forbidden()
|
||||||
|
|
||||||
|
return aiohttp.web.FileResponse(
|
||||||
|
os.path.join(
|
||||||
|
os.path.dirname(os.path.abspath(__file__)),
|
||||||
|
"res/{}".format(path)))
|
||||||
|
|
||||||
|
async def json_response(result=None, error=None,
|
||||||
|
ts=None) -> aiohttp.web.Response:
|
||||||
|
if ts is None:
|
||||||
|
ts = datetime.datetime.utcnow()
|
||||||
|
|
||||||
|
return aiohttp.web.json_response({
|
||||||
|
"result": result,
|
||||||
|
"timestamp": to_unix_time(ts),
|
||||||
|
"error": error,
|
||||||
|
})
|
||||||
|
|
||||||
|
async def ray_config(_) -> aiohttp.web.Response:
|
||||||
|
try:
|
||||||
|
with open(
|
||||||
|
Path("~/ray_bootstrap_config.yaml").expanduser()) as f:
|
||||||
|
cfg = yaml.load(f)
|
||||||
|
except Exception:
|
||||||
|
return await json_response(error="No config")
|
||||||
|
|
||||||
|
D = {
|
||||||
|
"min_workers": cfg["min_workers"],
|
||||||
|
"max_workers": cfg["max_workers"],
|
||||||
|
"initial_workers": cfg["initial_workers"],
|
||||||
|
"idle_timeout_minutes": cfg["idle_timeout_minutes"],
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
D["head_type"] = cfg["head_node"]["InstanceType"]
|
||||||
|
except KeyError:
|
||||||
|
D["head_type"] = "unknown"
|
||||||
|
|
||||||
|
try:
|
||||||
|
D["worker_type"] = cfg["worker_nodes"]["InstanceType"]
|
||||||
|
except KeyError:
|
||||||
|
D["worker_type"] = "unknown"
|
||||||
|
|
||||||
|
return await json_response(result=D)
|
||||||
|
|
||||||
|
async def node_info(req) -> aiohttp.web.Response:
|
||||||
|
now = datetime.datetime.utcnow()
|
||||||
|
D = self.node_stats.get_node_stats()
|
||||||
|
return await json_response(result=D, ts=now)
|
||||||
|
|
||||||
|
self.app.router.add_get("/", get_index)
|
||||||
|
self.app.router.add_get("/index.html", get_index)
|
||||||
|
self.app.router.add_get("/index.htm", get_index)
|
||||||
|
self.app.router.add_get("/res/{x}", get_resource)
|
||||||
|
|
||||||
|
self.app.router.add_get("/api/node_info", node_info)
|
||||||
|
self.app.router.add_get("/api/super_client_table", node_info)
|
||||||
|
self.app.router.add_get("/api/ray_config", ray_config)
|
||||||
|
|
||||||
|
self.app.router.add_get("/{_}", get_forbidden)
|
||||||
|
|
||||||
|
def log_dashboard_url(self):
|
||||||
|
url = "http://{}:{}?token={}".format(self.ip, self.port, self.token)
|
||||||
|
with open(os.path.join(self.temp_dir, "dashboard_url"), "w") as f:
|
||||||
|
f.write(url)
|
||||||
|
logger.info("Dashboard running on {}".format(url))
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
self.log_dashboard_url()
|
||||||
|
self.node_stats.start()
|
||||||
|
aiohttp.web.run_app(self.app, host=self.ip, port=self.port)
|
||||||
|
|
||||||
|
|
||||||
|
class NodeStats(threading.Thread):
|
||||||
|
def __init__(self, redis_address, redis_password=None):
|
||||||
|
self.redis_key = "{}.*".format(ray.gcs_utils.REPORTER_CHANNEL)
|
||||||
|
self.redis_client = ray.services.create_redis_client(
|
||||||
|
redis_address, password=redis_password)
|
||||||
|
|
||||||
|
self._node_stats = {}
|
||||||
|
self._node_stats_lock = threading.Lock()
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def calculate_totals(self) -> Dict:
|
||||||
|
total_boot_time = 0
|
||||||
|
total_cpus = 0
|
||||||
|
total_workers = 0
|
||||||
|
total_load = [0.0, 0.0, 0.0]
|
||||||
|
total_storage_avail = 0
|
||||||
|
total_storage_total = 0
|
||||||
|
total_ram_avail = 0
|
||||||
|
total_ram_total = 0
|
||||||
|
total_sent = 0
|
||||||
|
total_recv = 0
|
||||||
|
|
||||||
|
for v in self._node_stats.values():
|
||||||
|
total_boot_time += v["boot_time"]
|
||||||
|
total_cpus += v["cpus"][0]
|
||||||
|
total_workers += len(v["workers"])
|
||||||
|
total_load[0] += v["load_avg"][0][0]
|
||||||
|
total_load[1] += v["load_avg"][0][1]
|
||||||
|
total_load[2] += v["load_avg"][0][2]
|
||||||
|
total_storage_avail += v["disk"]["/"]["free"]
|
||||||
|
total_storage_total += v["disk"]["/"]["total"]
|
||||||
|
total_ram_avail += v["mem"][1]
|
||||||
|
total_ram_total += v["mem"][0]
|
||||||
|
total_sent += v["net"][0]
|
||||||
|
total_recv += v["net"][1]
|
||||||
|
|
||||||
|
return {
|
||||||
|
"boot_time": total_boot_time,
|
||||||
|
"n_workers": total_workers,
|
||||||
|
"n_cores": total_cpus,
|
||||||
|
"m_avail": total_ram_avail,
|
||||||
|
"m_total": total_ram_total,
|
||||||
|
"d_avail": total_storage_avail,
|
||||||
|
"d_total": total_storage_total,
|
||||||
|
"load": total_load,
|
||||||
|
"n_sent": total_sent,
|
||||||
|
"n_recv": total_recv,
|
||||||
|
}
|
||||||
|
|
||||||
|
def calculate_tasks(self) -> Counter:
|
||||||
|
return Counter(
|
||||||
|
(x["name"]
|
||||||
|
for y in (v["workers"] for v in self._node_stats.values())
|
||||||
|
for x in y))
|
||||||
|
|
||||||
|
def purge_outdated_stats(self):
|
||||||
|
def current(then, now):
|
||||||
|
if (now - then) > 5:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
now = to_unix_time(datetime.datetime.utcnow())
|
||||||
|
self._node_stats = {
|
||||||
|
k: v
|
||||||
|
for k, v in self._node_stats.items() if current(v["now"], now)
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_node_stats(self) -> Dict:
|
||||||
|
with self._node_stats_lock:
|
||||||
|
self.purge_outdated_stats()
|
||||||
|
node_stats = sorted(
|
||||||
|
(v for v in self._node_stats.values()),
|
||||||
|
key=itemgetter("boot_time"))
|
||||||
|
return {
|
||||||
|
"totals": self.calculate_totals(),
|
||||||
|
"tasks": self.calculate_tasks(),
|
||||||
|
"clients": node_stats,
|
||||||
|
}
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
p = self.redis_client.pubsub(ignore_subscribe_messages=True)
|
||||||
|
p.psubscribe(self.redis_key)
|
||||||
|
logger.info("NodeStats: subscribed to {}".format(self.redis_key))
|
||||||
|
|
||||||
|
for x in p.listen():
|
||||||
|
try:
|
||||||
|
D = json.loads(x["data"])
|
||||||
|
with self._node_stats_lock:
|
||||||
|
self._node_stats[D["hostname"]] = D
|
||||||
|
except Exception:
|
||||||
|
logger.exception(traceback.format_exc())
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description=("Parse Redis server for the "
|
||||||
|
"dashboard to connect to."))
|
||||||
|
parser.add_argument(
|
||||||
|
"--http-port",
|
||||||
|
required=True,
|
||||||
|
type=int,
|
||||||
|
help="The port to use for the HTTP server.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--token",
|
||||||
|
required=True,
|
||||||
|
type=str,
|
||||||
|
help="The token to use for the HTTP server.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--redis-address",
|
||||||
|
required=True,
|
||||||
|
type=str,
|
||||||
|
help="The address to use for Redis.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--redis-password",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="the password to use for Redis")
|
||||||
|
parser.add_argument(
|
||||||
|
"--logging-level",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default=ray_constants.LOGGER_LEVEL,
|
||||||
|
choices=ray_constants.LOGGER_LEVEL_CHOICES,
|
||||||
|
help=ray_constants.LOGGER_LEVEL_HELP)
|
||||||
|
parser.add_argument(
|
||||||
|
"--logging-format",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default=ray_constants.LOGGER_FORMAT,
|
||||||
|
help=ray_constants.LOGGER_FORMAT_HELP)
|
||||||
|
parser.add_argument(
|
||||||
|
"--temp-dir",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Specify the path of the temporary directory use by Ray process.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
ray.utils.setup_logger(args.logging_level, args.logging_format)
|
||||||
|
|
||||||
|
dashboard = Dashboard(
|
||||||
|
args.redis_address,
|
||||||
|
args.http_port,
|
||||||
|
args.token,
|
||||||
|
args.temp_dir,
|
||||||
|
redis_password=args.redis_password,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
dashboard.run()
|
||||||
|
except Exception as e:
|
||||||
|
# Something went wrong, so push an error to all drivers.
|
||||||
|
redis_client = ray.services.create_redis_client(
|
||||||
|
args.redis_address, password=args.redis_password)
|
||||||
|
traceback_str = ray.utils.format_error_message(traceback.format_exc())
|
||||||
|
message = ("The dashboard on node {} failed with the following "
|
||||||
|
"error:\n{}".format(os.uname()[1], traceback_str))
|
||||||
|
ray.utils.push_error_to_driver_through_redis(
|
||||||
|
redis_client, ray_constants.DASHBOARD_DIED_ERROR, message)
|
||||||
|
raise e
|
99
python/ray/dashboard/index.html
Normal file
99
python/ray/dashboard/index.html
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
<!doctype html>
|
||||||
|
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<title>ray dashboard</title>
|
||||||
|
<meta name="description" content="ray dashboard"</meta>
|
||||||
|
<link rel="stylesheet" href="res/main.css">
|
||||||
|
|
||||||
|
<meta name="referrer" content="same-origin">
|
||||||
|
<!--
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/vue/2.6.4/vue.min.js"
|
||||||
|
integrity="sha384-rldcjlIPDkF0mEihgyEOIFhd2NW5YL717okjKC5YF2LrqoiBeMk4tpcgbRrlDHj5"
|
||||||
|
crossorigin="anonymous"></script>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/vue/2.6.4/vue.js"
|
||||||
|
integrity="sha384-94H2I+MU5hfBDUinQG+/Y9JbHALTPlQmHO26R3Jv60MT6WWkOD5hlYNyT9ciiLsR"
|
||||||
|
crossorigin="anonymous"></script>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<div id="dashboard">
|
||||||
|
<table v-if="clients && !error" class="ray_node_grid">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th class="hostname">Hostname</th>
|
||||||
|
<th class="uptime">Uptime</th>
|
||||||
|
<th class="workers">Workers</th>
|
||||||
|
<th class="mem">RAM</th>
|
||||||
|
<th class="storage">Disk</th>
|
||||||
|
<th class="load">Load (1m, 5m, 15m)</th>
|
||||||
|
<th class="netsent">Sent (M/s)</th>
|
||||||
|
<th class="netrecv">Recv (M/s)</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody is="node"
|
||||||
|
v-for="v in clients"
|
||||||
|
:key="v.hostname"
|
||||||
|
:now="now"
|
||||||
|
:hostname="v.hostname"
|
||||||
|
:boot_time="v.boot_time"
|
||||||
|
:n_workers="v.workers.length"
|
||||||
|
:n_cores="v.cpus[0]"
|
||||||
|
:m_avail="v.mem[1]"
|
||||||
|
:m_total="v.mem[0]"
|
||||||
|
:d_avail="v.disk['/'].free"
|
||||||
|
:d_total="v.disk['/'].total"
|
||||||
|
:load="v.load_avg[0]"
|
||||||
|
:n_sent="v.net[0]"
|
||||||
|
:n_recv="v.net[1]"
|
||||||
|
:workers="v.workers"
|
||||||
|
></tbody>
|
||||||
|
<tbody is="node"
|
||||||
|
class="totals"
|
||||||
|
v-if="totals"
|
||||||
|
:now="now"
|
||||||
|
:hostname="Object.keys(clients).length"
|
||||||
|
:boot_time="totals.boot_time"
|
||||||
|
:n_workers="totals.n_workers"
|
||||||
|
:n_cores="totals.n_cores"
|
||||||
|
:m_avail="totals.m_avail"
|
||||||
|
:m_total="totals.m_total"
|
||||||
|
:d_avail="totals.d_avail"
|
||||||
|
:d_total="totals.d_total"
|
||||||
|
:load="totals.load"
|
||||||
|
:n_sent="totals.n_sent"
|
||||||
|
:n_recv="totals.n_recv"
|
||||||
|
:workers="[]"
|
||||||
|
></tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<template v-if="error">
|
||||||
|
<h2>{{error}}</h2>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<h2 v-if="last_update" :class="outdated_cls">Last updated {{age}} ago</h2>
|
||||||
|
|
||||||
|
<div class="cols">
|
||||||
|
<div class="tasks">
|
||||||
|
<template v-if="tasks && !error">
|
||||||
|
<h2>tasks</h2>
|
||||||
|
<ul>
|
||||||
|
<li v-for="v, k, _ in tasks">{{k}}: {{v}}</li>
|
||||||
|
</ul>
|
||||||
|
</template>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="ray_config">
|
||||||
|
<template v-if="ray_config">
|
||||||
|
<h2>ray config</h2>
|
||||||
|
<pre>{{ray_config}}</pre>
|
||||||
|
</template>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
|
||||||
|
<script src="res/main.js"></script>
|
58
python/ray/dashboard/res/main.css
Normal file
58
python/ray/dashboard/res/main.css
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
* { font-family: monospace; margin: 0; padding: 0; }
|
||||||
|
h1, h2 { text-align: center; margin: 1rem 0; }
|
||||||
|
h1 { font-size: 3rem; margin: 0.5rem auto; text-align: center; }
|
||||||
|
h2 { font-size: 2rem; margin: 1rem auto; text-align: center; }
|
||||||
|
div#dashboard {
|
||||||
|
width: 116rem; margin: 1rem auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
table, tbody, thead {
|
||||||
|
width: 100%;
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
tr {
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
tr.workerlist td, tr.workerlist th {
|
||||||
|
font-size: 0.8rem;
|
||||||
|
}
|
||||||
|
tr:hover {
|
||||||
|
filter: brightness(0.85);
|
||||||
|
}
|
||||||
|
td, th {
|
||||||
|
padding: 0.3rem;
|
||||||
|
font-size: 1.4rem;
|
||||||
|
font-family: monospace;
|
||||||
|
text-align: center;
|
||||||
|
background-color: white;
|
||||||
|
}
|
||||||
|
th {
|
||||||
|
background-color: #eeeeee;
|
||||||
|
border: 1px solid black;
|
||||||
|
}
|
||||||
|
|
||||||
|
tbody.totals {
|
||||||
|
font-weight: bold;
|
||||||
|
font-size: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
ul { list-style-position: inside; }
|
||||||
|
|
||||||
|
.critical { background-color: magenta; }
|
||||||
|
.bad { background-color: red; }
|
||||||
|
.high { background-color: orange; }
|
||||||
|
.average { background-color: limegreen; }
|
||||||
|
.low { background-color: aquamarine; }
|
||||||
|
|
||||||
|
div.cols {
|
||||||
|
width: 100%;
|
||||||
|
margin: 1rem 0;
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr 1fr;
|
||||||
|
}
|
||||||
|
div.cols div {
|
||||||
|
padding: 1rem 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.outdated { background-color: red; }
|
265
python/ray/dashboard/res/main.js
Normal file
265
python/ray/dashboard/res/main.js
Normal file
|
@ -0,0 +1,265 @@
|
||||||
|
let dashboard = new Vue({
|
||||||
|
el: "#dashboard",
|
||||||
|
data: {
|
||||||
|
now: (new Date()).getTime() / 1000,
|
||||||
|
shown: {},
|
||||||
|
error: "loading...",
|
||||||
|
last_update: undefined,
|
||||||
|
clients: undefined,
|
||||||
|
totals: undefined,
|
||||||
|
tasks: undefined,
|
||||||
|
ray_config: undefined,
|
||||||
|
},
|
||||||
|
methods: {
|
||||||
|
updateNodeInfo: function() {
|
||||||
|
var self = this;
|
||||||
|
fetch("/api/node_info").then(function (resp) {
|
||||||
|
return resp.json();
|
||||||
|
}).then(function(data) {
|
||||||
|
self.error = data.error;
|
||||||
|
if (data.error) {
|
||||||
|
self.clients = undefined;
|
||||||
|
self.tasks = undefined;
|
||||||
|
self.totals = undefined;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
self.last_update = data.timestamp;
|
||||||
|
self.clients = data.result.clients;
|
||||||
|
self.tasks = data.result.tasks;
|
||||||
|
self.totals = data.result.totals;
|
||||||
|
}).catch(function() {
|
||||||
|
self.error = "request error"
|
||||||
|
self.clients = undefined;
|
||||||
|
self.tasks = undefined;
|
||||||
|
self.totals = undefined;
|
||||||
|
}).finally(function() {
|
||||||
|
setTimeout(self.updateNodeInfo, 500);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
updateRayConfig: function() {
|
||||||
|
var self = this;
|
||||||
|
fetch("/api/ray_config").then(function (resp) {
|
||||||
|
return resp.json();
|
||||||
|
}).then(function(data) {
|
||||||
|
if (data.error) {
|
||||||
|
self.ray_config = undefined;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
self.ray_config = data.result;
|
||||||
|
}).catch(function() {
|
||||||
|
self.error = "request error"
|
||||||
|
self.ray_config = undefined;
|
||||||
|
}).finally(function() {
|
||||||
|
setTimeout(self.updateRayConfig, 10000);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
updateAll: function() {
|
||||||
|
this.updateNodeInfo();
|
||||||
|
this.updateRayConfig();
|
||||||
|
},
|
||||||
|
tickClock: function() {
|
||||||
|
this.now = (new Date()).getTime() / 1000;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
computed: {
|
||||||
|
outdated_cls: function(ts) {
|
||||||
|
if ((this.now - this.last_update) > 5) {
|
||||||
|
return "outdated";
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
},
|
||||||
|
age: function(ts) {
|
||||||
|
return (this.now - this.last_update | 0) + "s";
|
||||||
|
},
|
||||||
|
},
|
||||||
|
filters: {
|
||||||
|
si: function(x) {
|
||||||
|
let prefixes = ["B", "K", "M", "G", "T"]
|
||||||
|
let i = 0;
|
||||||
|
while (x > 1024) {
|
||||||
|
x /= 1024;
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
return `${x.toFixed(1)}${prefixes[i]}`;
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
Vue.component("worker-usage", {
|
||||||
|
props: ['cores', 'workers'],
|
||||||
|
computed: {
|
||||||
|
frac: function() {
|
||||||
|
return this.workers / this.cores;
|
||||||
|
},
|
||||||
|
cls: function() {
|
||||||
|
if (this.frac > 3) { return "critical"; }
|
||||||
|
if (this.frac > 2) { return "bad"; }
|
||||||
|
if (this.frac > 1.5) { return "high"; }
|
||||||
|
if (this.frac > 1) { return "average"; }
|
||||||
|
return "low";
|
||||||
|
},
|
||||||
|
},
|
||||||
|
template: `
|
||||||
|
<td class="workers" :class="cls">
|
||||||
|
{{workers}}/{{cores}} {{(frac*100).toFixed(0)}}%
|
||||||
|
</td>
|
||||||
|
`,
|
||||||
|
});
|
||||||
|
|
||||||
|
Vue.component("node", {
|
||||||
|
props: [
|
||||||
|
"now",
|
||||||
|
"hostname",
|
||||||
|
"boot_time",
|
||||||
|
"n_workers",
|
||||||
|
"n_cores",
|
||||||
|
"m_avail",
|
||||||
|
"m_total",
|
||||||
|
"d_avail",
|
||||||
|
"d_total",
|
||||||
|
"load",
|
||||||
|
"n_sent",
|
||||||
|
"n_recv",
|
||||||
|
"workers",
|
||||||
|
],
|
||||||
|
data: function() {
|
||||||
|
return {
|
||||||
|
hidden: true,
|
||||||
|
};
|
||||||
|
},
|
||||||
|
computed: {
|
||||||
|
age: function() {
|
||||||
|
if (this.boot_time) {
|
||||||
|
let n = this.now;
|
||||||
|
if (this.boot_time > 2840140800) {
|
||||||
|
// Hack. It's a sum of multiple nodes.
|
||||||
|
n *= this.hostname;
|
||||||
|
}
|
||||||
|
let rs = n - this.boot_time | 0;
|
||||||
|
let s = rs % 60;
|
||||||
|
let m = ((rs / 60) % 60) | 0;
|
||||||
|
let h = (rs / 3600) | 0;
|
||||||
|
if (h) {
|
||||||
|
return `${h}h ${m}m ${s}s`;
|
||||||
|
}
|
||||||
|
if (m) {
|
||||||
|
return `${m}m ${s}s`;
|
||||||
|
}
|
||||||
|
return `${s}s`;
|
||||||
|
}
|
||||||
|
return "?"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
methods: {
|
||||||
|
toggleHide: function() {
|
||||||
|
this.hidden = !this.hidden;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
filters: {
|
||||||
|
mib(x) {
|
||||||
|
return `${(x/(1024**2)).toFixed(3)}M`;
|
||||||
|
},
|
||||||
|
hostnamefilter(x) {
|
||||||
|
if (isNaN(x)) {
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
return `Totals: ${x} nodes`;
|
||||||
|
},
|
||||||
|
},
|
||||||
|
template: `
|
||||||
|
<tbody v-on:click="toggleHide()">
|
||||||
|
<tr class="ray_node">
|
||||||
|
<td class="hostname">{{hostname | hostnamefilter}}</td>
|
||||||
|
<td class="uptime">{{age}}</td>
|
||||||
|
<worker-usage
|
||||||
|
:workers="n_workers"
|
||||||
|
:cores="n_cores"
|
||||||
|
></worker-usage>
|
||||||
|
<usagebar
|
||||||
|
:avail="m_avail" :total="m_total"
|
||||||
|
stat="mem"
|
||||||
|
></usagebar>
|
||||||
|
<usagebar
|
||||||
|
:avail="d_avail" :total="d_total"
|
||||||
|
stat="storage"
|
||||||
|
></usagebar>
|
||||||
|
<loadbar
|
||||||
|
:cores="n_cores"
|
||||||
|
:onem="load[0]"
|
||||||
|
:fivem="load[1]"
|
||||||
|
:fifteenm="load[2]"
|
||||||
|
>
|
||||||
|
</loadbar>
|
||||||
|
<td class="netsent">{{n_sent | mib}}/s</td>
|
||||||
|
<td class="netrecv">{{n_recv | mib}}/s</td>
|
||||||
|
</tr>
|
||||||
|
<template v-if="!hidden && workers">
|
||||||
|
<tr class="workerlist">
|
||||||
|
<th>time</th>
|
||||||
|
<th>name</th>
|
||||||
|
<th>pid</th>
|
||||||
|
<th>uss</th>
|
||||||
|
</tr>
|
||||||
|
<tr class="workerlist" v-for="x in workers">
|
||||||
|
<td>user: {{x.cpu_times.user}}s</td>
|
||||||
|
<td>{{x.name}}</td>
|
||||||
|
<td>{{x.pid}}</td>
|
||||||
|
<td>{{(x.memory_full_info.uss/1048576).toFixed(0)}}MiB</td>
|
||||||
|
</tr>
|
||||||
|
</template>
|
||||||
|
</tbody>
|
||||||
|
`,
|
||||||
|
});
|
||||||
|
|
||||||
|
Vue.component("usagebar", {
|
||||||
|
props: ['stat', 'avail', 'total'], // e.g. free -m avail
|
||||||
|
computed: {
|
||||||
|
used: function() { return this.total - this.avail; },
|
||||||
|
frac: function() { return (this.total - this.avail)/this.total; },
|
||||||
|
cls: function() {
|
||||||
|
if (this.frac > 0.95) { return "critical"; }
|
||||||
|
if (this.frac > 0.9) { return "bad"; }
|
||||||
|
if (this.frac > 0.8) { return "high"; }
|
||||||
|
if (this.frac > 0.5) { return "average"; }
|
||||||
|
return "low";
|
||||||
|
},
|
||||||
|
tcls: function() {
|
||||||
|
return `${this.stat} ${this.cls}`;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
filters: {
|
||||||
|
gib(x) {
|
||||||
|
return `${(x/(1024**3)).toFixed(1)}G`;
|
||||||
|
},
|
||||||
|
pct(x) {
|
||||||
|
return `${(x*100).toFixed(0)}%`;
|
||||||
|
},
|
||||||
|
},
|
||||||
|
template: `
|
||||||
|
<td class="usagebar" :class="tcls">
|
||||||
|
{{used | gib}}/{{total | gib}} {{ frac | pct }}
|
||||||
|
</td>
|
||||||
|
`,
|
||||||
|
});
|
||||||
|
|
||||||
|
Vue.component("loadbar", {
|
||||||
|
props: ['cores', 'onem', 'fivem', 'fifteenm'],
|
||||||
|
computed: {
|
||||||
|
frac: function() { return this.onem/this.cores; },
|
||||||
|
cls: function() {
|
||||||
|
if (this.frac > 3) { return "critical"; }
|
||||||
|
if (this.frac > 2.5) { return "bad"; }
|
||||||
|
if (this.frac > 2) { return "high"; }
|
||||||
|
if (this.frac > 1.5) { return "average"; }
|
||||||
|
return "low";
|
||||||
|
},
|
||||||
|
},
|
||||||
|
template: `
|
||||||
|
<td class="load loadbar" :class="cls">
|
||||||
|
{{onem.toFixed(2)}}, {{fivem.toFixed(2)}}, {{fifteenm.toFixed(2)}}
|
||||||
|
</td>
|
||||||
|
`,
|
||||||
|
});
|
||||||
|
|
||||||
|
setInterval(dashboard.tickClock, 1000);
|
||||||
|
dashboard.updateAll();
|
|
@ -39,6 +39,7 @@ __all__ = [
|
||||||
|
|
||||||
FUNCTION_PREFIX = "RemoteFunction:"
|
FUNCTION_PREFIX = "RemoteFunction:"
|
||||||
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
|
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
|
||||||
|
REPORTER_CHANNEL = "RAY_REPORTER"
|
||||||
|
|
||||||
# xray heartbeats
|
# xray heartbeats
|
||||||
XRAY_HEARTBEAT_CHANNEL = str(TablePubsub.HEARTBEAT).encode("ascii")
|
XRAY_HEARTBEAT_CHANNEL = str(TablePubsub.HEARTBEAT).encode("ascii")
|
||||||
|
|
|
@ -10,6 +10,7 @@ import json
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import signal
|
import signal
|
||||||
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
@ -24,6 +25,8 @@ from ray.utils import try_to_create_directory
|
||||||
# using logging.basicConfig in its entry/init points.
|
# using logging.basicConfig in its entry/init points.
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
PY3 = sys.version_info.major >= 3
|
||||||
|
|
||||||
|
|
||||||
class Node(object):
|
class Node(object):
|
||||||
"""An encapsulation of the Ray processes on a single node.
|
"""An encapsulation of the Ray processes on a single node.
|
||||||
|
@ -81,6 +84,7 @@ class Node(object):
|
||||||
self._plasma_store_socket_name = None
|
self._plasma_store_socket_name = None
|
||||||
self._raylet_socket_name = None
|
self._raylet_socket_name = None
|
||||||
self._webui_url = None
|
self._webui_url = None
|
||||||
|
self._dashboard_url = None
|
||||||
else:
|
else:
|
||||||
self._plasma_store_socket_name = (
|
self._plasma_store_socket_name = (
|
||||||
ray_params.plasma_store_socket_name)
|
ray_params.plasma_store_socket_name)
|
||||||
|
@ -284,6 +288,35 @@ class Node(object):
|
||||||
process_info
|
process_info
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def start_reporter(self):
|
||||||
|
"""Start the reporter."""
|
||||||
|
stdout_file, stderr_file = self.new_log_files("reporter", True)
|
||||||
|
process_info = ray.services.start_reporter(
|
||||||
|
self.redis_address,
|
||||||
|
stdout_file=stdout_file,
|
||||||
|
stderr_file=stderr_file,
|
||||||
|
redis_password=self._ray_params.redis_password)
|
||||||
|
assert ray_constants.PROCESS_TYPE_REPORTER not in self.all_processes
|
||||||
|
if process_info is not None:
|
||||||
|
self.all_processes[ray_constants.PROCESS_TYPE_REPORTER] = [
|
||||||
|
process_info
|
||||||
|
]
|
||||||
|
|
||||||
|
def start_dashboard(self):
|
||||||
|
"""Start the dashboard."""
|
||||||
|
stdout_file, stderr_file = self.new_log_files("dashboard", True)
|
||||||
|
self._dashboard_url, process_info = ray.services.start_dashboard(
|
||||||
|
self.redis_address,
|
||||||
|
self._temp_dir,
|
||||||
|
stdout_file=stdout_file,
|
||||||
|
stderr_file=stderr_file,
|
||||||
|
redis_password=self._ray_params.redis_password)
|
||||||
|
assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes
|
||||||
|
if process_info is not None:
|
||||||
|
self.all_processes[ray_constants.PROCESS_TYPE_DASHBOARD] = [
|
||||||
|
process_info
|
||||||
|
]
|
||||||
|
|
||||||
def start_ui(self):
|
def start_ui(self):
|
||||||
"""Start the web UI."""
|
"""Start the web UI."""
|
||||||
stdout_file, stderr_file = self.new_log_files("webui")
|
stdout_file, stderr_file = self.new_log_files("webui")
|
||||||
|
@ -408,14 +441,16 @@ class Node(object):
|
||||||
self.start_redis()
|
self.start_redis()
|
||||||
self.start_monitor()
|
self.start_monitor()
|
||||||
self.start_raylet_monitor()
|
self.start_raylet_monitor()
|
||||||
|
if PY3 and self._ray_params.include_webui:
|
||||||
|
self.start_dashboard()
|
||||||
|
|
||||||
self.start_plasma_store()
|
self.start_plasma_store()
|
||||||
self.start_raylet()
|
self.start_raylet()
|
||||||
|
if PY3 and self._ray_params.include_webui:
|
||||||
|
self.start_reporter()
|
||||||
|
|
||||||
if self._ray_params.include_log_monitor:
|
if self._ray_params.include_log_monitor:
|
||||||
self.start_log_monitor()
|
self.start_log_monitor()
|
||||||
if self._ray_params.include_webui:
|
|
||||||
self.start_ui()
|
|
||||||
|
|
||||||
def _kill_process_type(self,
|
def _kill_process_type(self,
|
||||||
process_type,
|
process_type,
|
||||||
|
@ -545,6 +580,26 @@ class Node(object):
|
||||||
self._kill_process_type(
|
self._kill_process_type(
|
||||||
ray_constants.PROCESS_TYPE_LOG_MONITOR, check_alive=check_alive)
|
ray_constants.PROCESS_TYPE_LOG_MONITOR, check_alive=check_alive)
|
||||||
|
|
||||||
|
def kill_reporter(self, check_alive=True):
|
||||||
|
"""Kill the reporter.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
check_alive (bool): Raise an exception if the process was already
|
||||||
|
dead.
|
||||||
|
"""
|
||||||
|
self._kill_process_type(
|
||||||
|
ray_constants.PROCESS_TYPE_REPORTER, check_alive=check_alive)
|
||||||
|
|
||||||
|
def kill_dashboard(self, check_alive=True):
|
||||||
|
"""Kill the dashboard.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
check_alive (bool): Raise an exception if the process was already
|
||||||
|
dead.
|
||||||
|
"""
|
||||||
|
self._kill_process_type(
|
||||||
|
ray_constants.PROCESS_TYPE_DASHBOARD, check_alive=check_alive)
|
||||||
|
|
||||||
def kill_monitor(self, check_alive=True):
|
def kill_monitor(self, check_alive=True):
|
||||||
"""Kill the monitor.
|
"""Kill the monitor.
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,8 @@ INFEASIBLE_TASK_ERROR = "infeasible_task"
|
||||||
REMOVED_NODE_ERROR = "node_removed"
|
REMOVED_NODE_ERROR = "node_removed"
|
||||||
MONITOR_DIED_ERROR = "monitor_died"
|
MONITOR_DIED_ERROR = "monitor_died"
|
||||||
LOG_MONITOR_DIED_ERROR = "log_monitor_died"
|
LOG_MONITOR_DIED_ERROR = "log_monitor_died"
|
||||||
|
REPORTER_DIED_ERROR = "reporter_died"
|
||||||
|
DASHBOARD_DIED_ERROR = "dashboard_died"
|
||||||
|
|
||||||
# Abort autoscaling if more than this number of errors are encountered. This
|
# Abort autoscaling if more than this number of errors are encountered. This
|
||||||
# is a safety feature to prevent e.g. runaway node launches.
|
# is a safety feature to prevent e.g. runaway node launches.
|
||||||
|
@ -76,6 +78,9 @@ AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)
|
||||||
AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S",
|
AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S",
|
||||||
30)
|
30)
|
||||||
|
|
||||||
|
# The reporter will report its' statistics this often (milliseconds).
|
||||||
|
REPORTER_UPDATE_INTERVAL_MS = env_integer("REPORTER_UPDATE_INTERVAL_MS", 500)
|
||||||
|
|
||||||
# Max number of retries to AWS (default is 5, time increases exponentially)
|
# Max number of retries to AWS (default is 5, time increases exponentially)
|
||||||
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
|
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
|
||||||
|
|
||||||
|
@ -96,6 +101,8 @@ INFINITE_RECONSTRUCTION = 2**30
|
||||||
PROCESS_TYPE_MONITOR = "monitor"
|
PROCESS_TYPE_MONITOR = "monitor"
|
||||||
PROCESS_TYPE_RAYLET_MONITOR = "raylet_monitor"
|
PROCESS_TYPE_RAYLET_MONITOR = "raylet_monitor"
|
||||||
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
|
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
|
||||||
|
PROCESS_TYPE_REPORTER = "reporter"
|
||||||
|
PROCESS_TYPE_DASHBOARD = "dashboard"
|
||||||
PROCESS_TYPE_WORKER = "worker"
|
PROCESS_TYPE_WORKER = "worker"
|
||||||
PROCESS_TYPE_RAYLET = "raylet"
|
PROCESS_TYPE_RAYLET = "raylet"
|
||||||
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
|
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
|
||||||
|
|
228
python/ray/reporter.py
Normal file
228
python/ray/reporter.py
Normal file
|
@ -0,0 +1,228 @@
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
from socket import AddressFamily
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
print("The reporter requires psutil to run.")
|
||||||
|
import sys
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
import ray.ray_constants as ray_constants
|
||||||
|
import ray.utils
|
||||||
|
|
||||||
|
# Logger for this module. It should be configured at the entry point
|
||||||
|
# into the program using Ray. Ray provides a default configuration at
|
||||||
|
# entry/init points.
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def recursive_asdict(o):
|
||||||
|
if isinstance(o, tuple) and hasattr(o, "_asdict"):
|
||||||
|
return recursive_asdict(o._asdict())
|
||||||
|
|
||||||
|
if isinstance(o, (tuple, list)):
|
||||||
|
L = []
|
||||||
|
for k in o:
|
||||||
|
L.append(recursive_asdict(k))
|
||||||
|
return L
|
||||||
|
|
||||||
|
if isinstance(o, dict):
|
||||||
|
D = {k: recursive_asdict(v) for k, v in o.items()}
|
||||||
|
return D
|
||||||
|
|
||||||
|
return o
|
||||||
|
|
||||||
|
|
||||||
|
def jsonify_asdict(o):
|
||||||
|
return json.dumps(recursive_asdict(o))
|
||||||
|
|
||||||
|
|
||||||
|
def running_worker(s):
|
||||||
|
if "ray_worker" not in s:
|
||||||
|
return False
|
||||||
|
|
||||||
|
if s == "ray_worker":
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def determine_ip_address():
|
||||||
|
"""Return the first IP address for an ethernet interface on the system."""
|
||||||
|
addrs = [
|
||||||
|
x.address for k, v in psutil.net_if_addrs().items() if k[0] == "e"
|
||||||
|
for x in v if x.family == AddressFamily.AF_INET
|
||||||
|
]
|
||||||
|
return addrs[0]
|
||||||
|
|
||||||
|
|
||||||
|
def to_posix_time(dt):
|
||||||
|
return (dt - datetime.datetime(1970, 1, 1)).total_seconds()
|
||||||
|
|
||||||
|
|
||||||
|
class Reporter(object):
|
||||||
|
"""A monitor process for monitoring Ray nodes.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
host (str): The hostname of this machine. Used to improve the log
|
||||||
|
messages published to Redis.
|
||||||
|
redis_client: A client used to communicate with the Redis server.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, redis_address, redis_password=None):
|
||||||
|
"""Initialize the reporter object."""
|
||||||
|
self.cpu_counts = (psutil.cpu_count(), psutil.cpu_count(logical=False))
|
||||||
|
self.ip_addr = determine_ip_address()
|
||||||
|
self.hostname = os.uname().nodename
|
||||||
|
|
||||||
|
_ = psutil.cpu_percent() # For initialization
|
||||||
|
|
||||||
|
self.redis_key = "{}.{}".format(ray.gcs_utils.REPORTER_CHANNEL,
|
||||||
|
self.hostname)
|
||||||
|
self.redis_client = ray.services.create_redis_client(
|
||||||
|
redis_address, password=redis_password)
|
||||||
|
|
||||||
|
self.network_stats_hist = [(0, (0.0, 0.0))] # time, (sent, recv)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_cpu_percent():
|
||||||
|
return psutil.cpu_percent()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_boot_time():
|
||||||
|
return psutil.boot_time()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_network_stats():
|
||||||
|
ifaces = [
|
||||||
|
v for k, v in psutil.net_io_counters(pernic=True).items()
|
||||||
|
if k[0] == "e"
|
||||||
|
]
|
||||||
|
|
||||||
|
sent = sum((iface.bytes_sent for iface in ifaces))
|
||||||
|
recv = sum((iface.bytes_recv for iface in ifaces))
|
||||||
|
return sent, recv
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_mem_usage():
|
||||||
|
vm = psutil.virtual_memory()
|
||||||
|
return vm.total, vm.available, vm.percent
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_disk_usage():
|
||||||
|
return {x: psutil.disk_usage(x) for x in ["/", "/tmp"]}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_workers():
|
||||||
|
return [
|
||||||
|
x.as_dict(attrs=[
|
||||||
|
"pid", "create_time", "cpu_times", "name", "memory_full_info"
|
||||||
|
]) for x in psutil.process_iter() if running_worker(x.name())
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_load_avg(self):
|
||||||
|
load = os.getloadavg()
|
||||||
|
per_cpu_load = tuple((round(x / self.cpu_counts[0], 2) for x in load))
|
||||||
|
return load, per_cpu_load
|
||||||
|
|
||||||
|
def get_all_stats(self):
|
||||||
|
now = to_posix_time(datetime.datetime.utcnow())
|
||||||
|
network_stats = self.get_network_stats()
|
||||||
|
|
||||||
|
self.network_stats_hist.append((now, network_stats))
|
||||||
|
self.network_stats_hist = self.network_stats_hist[-7:]
|
||||||
|
then, prev_network_stats = self.network_stats_hist[0]
|
||||||
|
netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then),
|
||||||
|
(network_stats[1] - prev_network_stats[1]) / (now - then))
|
||||||
|
|
||||||
|
return {
|
||||||
|
"now": now,
|
||||||
|
"hostname": self.hostname,
|
||||||
|
"ip": self.ip_addr,
|
||||||
|
"cpu": self.get_cpu_percent(),
|
||||||
|
"cpus": self.cpu_counts,
|
||||||
|
"mem": self.get_mem_usage(),
|
||||||
|
"workers": self.get_workers(),
|
||||||
|
"boot_time": self.get_boot_time(),
|
||||||
|
"load_avg": self.get_load_avg(),
|
||||||
|
"disk": self.get_disk_usage(),
|
||||||
|
"net": netstats,
|
||||||
|
}
|
||||||
|
|
||||||
|
def perform_iteration(self):
|
||||||
|
"""Get any changes to the log files and push updates to Redis."""
|
||||||
|
stats = self.get_all_stats()
|
||||||
|
|
||||||
|
self.redis_client.publish(
|
||||||
|
self.redis_key,
|
||||||
|
jsonify_asdict(stats),
|
||||||
|
)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
"""Run the reporter."""
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
self.perform_iteration()
|
||||||
|
except Exception:
|
||||||
|
traceback.print_exc()
|
||||||
|
pass
|
||||||
|
|
||||||
|
time.sleep(ray_constants.REPORTER_UPDATE_INTERVAL_MS / 1000)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description=("Parse Redis server for the "
|
||||||
|
"reporter to connect to."))
|
||||||
|
parser.add_argument(
|
||||||
|
"--redis-address",
|
||||||
|
required=True,
|
||||||
|
type=str,
|
||||||
|
help="The address to use for Redis.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--redis-password",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="the password to use for Redis")
|
||||||
|
parser.add_argument(
|
||||||
|
"--logging-level",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default=ray_constants.LOGGER_LEVEL,
|
||||||
|
choices=ray_constants.LOGGER_LEVEL_CHOICES,
|
||||||
|
help=ray_constants.LOGGER_LEVEL_HELP)
|
||||||
|
parser.add_argument(
|
||||||
|
"--logging-format",
|
||||||
|
required=False,
|
||||||
|
type=str,
|
||||||
|
default=ray_constants.LOGGER_FORMAT,
|
||||||
|
help=ray_constants.LOGGER_FORMAT_HELP)
|
||||||
|
args = parser.parse_args()
|
||||||
|
ray.utils.setup_logger(args.logging_level, args.logging_format)
|
||||||
|
|
||||||
|
reporter = Reporter(args.redis_address, redis_password=args.redis_password)
|
||||||
|
|
||||||
|
try:
|
||||||
|
reporter.run()
|
||||||
|
except Exception as e:
|
||||||
|
# Something went wrong, so push an error to all drivers.
|
||||||
|
redis_client = ray.services.create_redis_client(
|
||||||
|
args.redis_address, password=args.redis_password)
|
||||||
|
traceback_str = ray.utils.format_error_message(traceback.format_exc())
|
||||||
|
message = ("The reporter on node {} failed with the following "
|
||||||
|
"error:\n{}".format(os.uname()[1], traceback_str))
|
||||||
|
ray.utils.push_error_to_driver_through_redis(
|
||||||
|
redis_client, ray_constants.REPORTER_DIED_ERROR, message)
|
||||||
|
raise e
|
|
@ -827,6 +827,117 @@ def start_log_monitor(redis_address,
|
||||||
return process_info
|
return process_info
|
||||||
|
|
||||||
|
|
||||||
|
def start_reporter(redis_address,
|
||||||
|
stdout_file=None,
|
||||||
|
stderr_file=None,
|
||||||
|
redis_password=None):
|
||||||
|
"""Start a reporter process.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
redis_address (str): The address of the Redis instance.
|
||||||
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
||||||
|
no redirection should happen, then this should be None.
|
||||||
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
||||||
|
no redirection should happen, then this should be None.
|
||||||
|
redis_password (str): The password of the redis server.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ProcessInfo for the process that was started.
|
||||||
|
"""
|
||||||
|
reporter_filepath = os.path.join(
|
||||||
|
os.path.dirname(os.path.abspath(__file__)), "reporter.py")
|
||||||
|
command = [
|
||||||
|
sys.executable, "-u", reporter_filepath,
|
||||||
|
"--redis-address={}".format(redis_address)
|
||||||
|
]
|
||||||
|
if redis_password:
|
||||||
|
command += ["--redis-password", redis_password]
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil # noqa: F401
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("Failed to start the reporter. The reporter requires "
|
||||||
|
"'pip install psutil'.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
process_info = start_ray_process(
|
||||||
|
command,
|
||||||
|
ray_constants.PROCESS_TYPE_REPORTER,
|
||||||
|
stdout_file=stdout_file,
|
||||||
|
stderr_file=stderr_file)
|
||||||
|
return process_info
|
||||||
|
|
||||||
|
|
||||||
|
def start_dashboard(redis_address,
|
||||||
|
temp_dir,
|
||||||
|
stdout_file=None,
|
||||||
|
stderr_file=None,
|
||||||
|
redis_password=None):
|
||||||
|
"""Start a dashboard process.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
redis_address (str): The address of the Redis instance.
|
||||||
|
temp_dir (str): The temporary directory used for log files and
|
||||||
|
information for this Ray session.
|
||||||
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
||||||
|
no redirection should happen, then this should be None.
|
||||||
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
||||||
|
no redirection should happen, then this should be None.
|
||||||
|
redis_password (str): The password of the redis server.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ProcessInfo for the process that was started.
|
||||||
|
"""
|
||||||
|
port = 8080
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
port_test_socket = socket.socket()
|
||||||
|
port_test_socket.bind(("127.0.0.1", port))
|
||||||
|
port_test_socket.close()
|
||||||
|
break
|
||||||
|
except socket.error:
|
||||||
|
port += 1
|
||||||
|
|
||||||
|
token = ray.utils.decode(binascii.hexlify(os.urandom(24)))
|
||||||
|
|
||||||
|
dashboard_filepath = os.path.join(
|
||||||
|
os.path.dirname(os.path.abspath(__file__)), "dashboard/dashboard.py")
|
||||||
|
command = [
|
||||||
|
sys.executable,
|
||||||
|
"-u",
|
||||||
|
dashboard_filepath,
|
||||||
|
"--redis-address={}".format(redis_address),
|
||||||
|
"--http-port={}".format(port),
|
||||||
|
"--token={}".format(token),
|
||||||
|
"--temp-dir={}".format(temp_dir),
|
||||||
|
]
|
||||||
|
if redis_password:
|
||||||
|
command += ["--redis-password", redis_password]
|
||||||
|
|
||||||
|
if sys.version_info <= (3, 0):
|
||||||
|
return None, None
|
||||||
|
try:
|
||||||
|
import aiohttp # noqa: F401
|
||||||
|
import psutil # noqa: F401
|
||||||
|
except ImportError:
|
||||||
|
logger.warning(
|
||||||
|
"Failed to start the dashboard. The dashboard requires Python 3 "
|
||||||
|
"as well as 'pip install aiohttp psutil'.")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
process_info = start_ray_process(
|
||||||
|
command,
|
||||||
|
ray_constants.PROCESS_TYPE_DASHBOARD,
|
||||||
|
stdout_file=stdout_file,
|
||||||
|
stderr_file=stderr_file)
|
||||||
|
dashboard_url = "http://{}:{}/?token={}".format(
|
||||||
|
ray.services.get_node_ip_address(), port, token)
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("View the dashboard at {}".format(dashboard_url))
|
||||||
|
print("=" * 70 + "\n")
|
||||||
|
return dashboard_url, process_info
|
||||||
|
|
||||||
|
|
||||||
def start_ui(redis_address, notebook_name, stdout_file=None, stderr_file=None):
|
def start_ui(redis_address, notebook_name, stdout_file=None, stderr_file=None):
|
||||||
"""Start a UI process.
|
"""Start a UI process.
|
||||||
|
|
||||||
|
|
|
@ -66,6 +66,7 @@ optional_ray_files += ray_autoscaler_files
|
||||||
extras = {
|
extras = {
|
||||||
"rllib": ["pyyaml", "gym[atari]", "opencv-python", "lz4", "scipy"],
|
"rllib": ["pyyaml", "gym[atari]", "opencv-python", "lz4", "scipy"],
|
||||||
"debug": ["psutil", "setproctitle", "py-spy"],
|
"debug": ["psutil", "setproctitle", "py-spy"],
|
||||||
|
"dashboard": ["psutil", "aiohttp"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import time
|
import time
|
||||||
|
@ -67,15 +71,14 @@ def test_raylet_tempfiles():
|
||||||
ray.init(num_cpus=0)
|
ray.init(num_cpus=0)
|
||||||
node = ray.worker._global_node
|
node = ray.worker._global_node
|
||||||
top_levels = set(os.listdir(node.get_temp_dir_path()))
|
top_levels = set(os.listdir(node.get_temp_dir_path()))
|
||||||
assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
|
assert top_levels.issuperset({"sockets", "logs"})
|
||||||
log_files = set(os.listdir(node.get_logs_dir_path()))
|
log_files = set(os.listdir(node.get_logs_dir_path()))
|
||||||
assert log_files == {
|
assert log_files.issuperset({
|
||||||
"log_monitor.out", "log_monitor.err", "plasma_store.out",
|
"log_monitor.out", "log_monitor.err", "plasma_store.out",
|
||||||
"plasma_store.err", "webui.out", "webui.err", "monitor.out",
|
"plasma_store.err", "monitor.out", "monitor.err", "raylet_monitor.out",
|
||||||
"monitor.err", "raylet_monitor.out", "raylet_monitor.err",
|
"raylet_monitor.err", "redis-shard_0.out", "redis-shard_0.err",
|
||||||
"redis-shard_0.out", "redis-shard_0.err", "redis.out", "redis.err",
|
"redis.out", "redis.err", "raylet.out", "raylet.err"
|
||||||
"raylet.out", "raylet.err"
|
}) # with raylet logs
|
||||||
} # with raylet logs
|
|
||||||
socket_files = set(os.listdir(node.get_sockets_dir_path()))
|
socket_files = set(os.listdir(node.get_sockets_dir_path()))
|
||||||
assert socket_files == {"plasma_store", "raylet"}
|
assert socket_files == {"plasma_store", "raylet"}
|
||||||
ray.shutdown()
|
ray.shutdown()
|
||||||
|
@ -83,15 +86,14 @@ def test_raylet_tempfiles():
|
||||||
ray.init(num_cpus=2)
|
ray.init(num_cpus=2)
|
||||||
node = ray.worker._global_node
|
node = ray.worker._global_node
|
||||||
top_levels = set(os.listdir(node.get_temp_dir_path()))
|
top_levels = set(os.listdir(node.get_temp_dir_path()))
|
||||||
assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
|
assert top_levels.issuperset({"sockets", "logs"})
|
||||||
time.sleep(3) # wait workers to start
|
time.sleep(3) # wait workers to start
|
||||||
log_files = set(os.listdir(node.get_logs_dir_path()))
|
log_files = set(os.listdir(node.get_logs_dir_path()))
|
||||||
assert log_files.issuperset({
|
assert log_files.issuperset({
|
||||||
"log_monitor.out", "log_monitor.err", "plasma_store.out",
|
"log_monitor.out", "log_monitor.err", "plasma_store.out",
|
||||||
"plasma_store.err", "webui.out", "webui.err", "monitor.out",
|
"plasma_store.err", "monitor.out", "monitor.err", "raylet_monitor.out",
|
||||||
"monitor.err", "raylet_monitor.out", "raylet_monitor.err",
|
"raylet_monitor.err", "redis-shard_0.out", "redis-shard_0.err",
|
||||||
"redis-shard_0.out", "redis-shard_0.err", "redis.out", "redis.err",
|
"redis.out", "redis.err", "raylet.out", "raylet.err"
|
||||||
"raylet.out", "raylet.err"
|
|
||||||
}) # with raylet logs
|
}) # with raylet logs
|
||||||
|
|
||||||
# Check numbers of worker log file.
|
# Check numbers of worker log file.
|
||||||
|
|
Loading…
Add table
Reference in a new issue