Expose raylet info to dashboard (#6045)

This commit is contained in:
Philipp Moritz 2019-10-31 17:36:59 -07:00 committed by GitHub
parent c86f945520
commit f7455839bf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 61 additions and 3 deletions

View file

@ -64,7 +64,7 @@ if [[ "$platform" == "linux" ]]; then
$PYTHON_EXE -m pytest -v "$INSTALLED_RAY_DIRECTORY/$TEST_SCRIPT" $PYTHON_EXE -m pytest -v "$INSTALLED_RAY_DIRECTORY/$TEST_SCRIPT"
# Run the UI test to make sure that the packaged UI works. # Run the UI test to make sure that the packaged UI works.
$PIP_CMD install -q aiohttp psutil requests setproctitle $PIP_CMD install -q aiohttp google grpcio psutil requests setproctitle
$PYTHON_EXE -m pytest -v "$INSTALLED_RAY_DIRECTORY/$UI_TEST_SCRIPT" $PYTHON_EXE -m pytest -v "$INSTALLED_RAY_DIRECTORY/$UI_TEST_SCRIPT"
# Check that the other wheels are present. # Check that the other wheels are present.
@ -106,7 +106,7 @@ elif [[ "$platform" == "macosx" ]]; then
if (( $(echo "$PY_MM >= 3.0" | bc) )); then if (( $(echo "$PY_MM >= 3.0" | bc) )); then
# Run the UI test to make sure that the packaged UI works. # Run the UI test to make sure that the packaged UI works.
$PIP_CMD install -q aiohttp psutil requests setproctitle $PIP_CMD install -q aiohttp google grpcio psutil requests setproctitle
$PYTHON_EXE -m pytest -v "$INSTALLED_RAY_DIRECTORY/$UI_TEST_SCRIPT" $PYTHON_EXE -m pytest -v "$INSTALLED_RAY_DIRECTORY/$UI_TEST_SCRIPT"
fi fi
done done

View file

@ -10,12 +10,14 @@ except ImportError:
sys.exit(1) sys.exit(1)
import argparse import argparse
import copy
import datetime import datetime
import json import json
import logging import logging
import os import os
import re import re
import threading import threading
import time
import traceback import traceback
import yaml import yaml
@ -25,7 +27,11 @@ from collections import defaultdict
from operator import itemgetter from operator import itemgetter
from typing import Dict from typing import Dict
import grpc
from google.protobuf.json_format import MessageToDict
import ray import ray
from ray.core.generated import node_manager_pb2
from ray.core.generated import node_manager_pb2_grpc
import ray.ray_constants as ray_constants import ray.ray_constants as ray_constants
import ray.utils import ray.utils
@ -64,6 +70,7 @@ class Dashboard(object):
self.temp_dir = temp_dir self.temp_dir = temp_dir
self.node_stats = NodeStats(redis_address, redis_password) self.node_stats = NodeStats(redis_address, redis_password)
self.raylet_stats = RayletStats(redis_address, redis_password)
# Setting the environment variable RAY_DASHBOARD_DEV=1 disables some # Setting the environment variable RAY_DASHBOARD_DEV=1 disables some
# security checks in the dashboard server to ease development while # security checks in the dashboard server to ease development while
@ -137,6 +144,10 @@ class Dashboard(object):
D = self.node_stats.get_node_stats() D = self.node_stats.get_node_stats()
return await json_response(result=D, ts=now) return await json_response(result=D, ts=now)
async def raylet_info(req) -> aiohttp.web.Response:
    """Handle GET /api/raylet_info.

    Fetches a copy of the stats cached by the raylet stats collector and
    hands it to ``json_response`` as the ``result`` payload.
    """
    raylet_stats = self.raylet_stats.get_raylet_stats()
    response = await json_response(result=raylet_stats)
    return response
async def logs(req) -> aiohttp.web.Response: async def logs(req) -> aiohttp.web.Response:
hostname = req.query.get("hostname") hostname = req.query.get("hostname")
pid = req.query.get("pid") pid = req.query.get("pid")
@ -162,6 +173,7 @@ class Dashboard(object):
self.app.router.add_get("/api/ray_config", ray_config) self.app.router.add_get("/api/ray_config", ray_config)
self.app.router.add_get("/api/node_info", node_info) self.app.router.add_get("/api/node_info", node_info)
self.app.router.add_get("/api/raylet_info", raylet_info)
self.app.router.add_get("/api/logs", logs) self.app.router.add_get("/api/logs", logs)
self.app.router.add_get("/api/errors", errors) self.app.router.add_get("/api/errors", errors)
@ -176,6 +188,7 @@ class Dashboard(object):
def run(self): def run(self):
self.log_dashboard_url() self.log_dashboard_url()
self.node_stats.start() self.node_stats.start()
self.raylet_stats.start()
aiohttp.web.run_app(self.app, host=self.host, port=self.port) aiohttp.web.run_app(self.app, host=self.host, port=self.port)
@ -349,6 +362,51 @@ class NodeStats(threading.Thread):
continue continue
class RayletStats(threading.Thread):
    """Background thread that polls every raylet's node manager for stats.

    Once started, the thread wakes up every second, sends a ``GetNodeStats``
    gRPC request to each known node manager and caches the reply (converted
    to a dict) keyed by the node's ``NodeManagerAddress``. The node list is
    re-read from ``ray.nodes()`` every ``_NODE_REFRESH_TICKS`` polls so that
    nodes joining the cluster are picked up.

    Args:
        redis_address: Accepted so the constructor takes the same arguments
            the dashboard passes to its other stats collector; not used by
            this class.
        redis_password: Same as ``redis_address``; not used by this class.
    """

    # Number of polling iterations between two refreshes of the node list.
    _NODE_REFRESH_TICKS = 10

    def __init__(self, redis_address, redis_password=None):
        super().__init__()
        # Guards self.nodes, self.stubs and self._channels.
        self.nodes_lock = threading.Lock()
        self.nodes = []
        self.stubs = []
        # Open gRPC channels keyed by "address:port", kept so they can be
        # reused across refreshes and closed once a node is gone.
        self._channels = {}
        # Guards self._raylet_stats.
        self._raylet_stats_lock = threading.Lock()
        self._raylet_stats = {}
        self.update_nodes()

    @classmethod
    def _is_refresh_tick(cls, counter):
        """Return True when the node list is due for a refresh."""
        return counter % cls._NODE_REFRESH_TICKS == 0

    def update_nodes(self):
        """Re-read the cluster's nodes and rebuild one gRPC stub per node.

        Channels to node managers that are still listed are reused instead
        of being re-created; channels to node managers that are no longer
        listed are closed so they do not accumulate.

        NOTE(review): closing a channel cancels RPCs in flight on it. Within
        this class the method is only called from ``__init__`` and from the
        polling thread between sweeps; confirm no other caller invokes it
        concurrently with ``run``.
        """
        nodes = ray.nodes()
        with self.nodes_lock:
            # Work on a copy so a failure below leaves self._channels intact.
            stale_channels = dict(self._channels)
            channels = {}
            stubs = []
            for node in nodes:
                target = "{}:{}".format(node["NodeManagerAddress"],
                                        node["NodeManagerPort"])
                channel = channels.get(target)
                if channel is None:
                    channel = stale_channels.pop(target, None)
                if channel is None:
                    channel = grpc.insecure_channel(target)
                channels[target] = channel
                stubs.append(
                    node_manager_pb2_grpc.NodeManagerServiceStub(channel))
            self.nodes = nodes
            self.stubs = stubs
            self._channels = channels
        # Whatever is left was not claimed by any current node.
        for channel in stale_channels.values():
            channel.close()

    def get_raylet_stats(self) -> Dict:
        """Return a deep copy of the latest stats, keyed by node address."""
        with self._raylet_stats_lock:
            return copy.deepcopy(self._raylet_stats)

    def run(self):
        """Poll all raylets once per second until the process exits.

        A raylet that cannot be reached is skipped for that sweep (its last
        known stats are kept) instead of terminating the polling thread.
        """
        logger = logging.getLogger(__name__)
        # Addresses already reported as unreachable, so that a node that
        # stays down is logged once rather than on every sweep.
        unreachable = set()
        counter = 0
        while True:
            time.sleep(1.0)
            # Snapshot under the lock so nodes and stubs stay paired even if
            # update_nodes() swaps the lists.
            with self.nodes_lock:
                targets = list(zip(self.nodes, self.stubs))
            for node, stub in targets:
                # NOTE(review): stats are keyed by address only, so several
                # raylets sharing one address overwrite each other.
                address = node["NodeManagerAddress"]
                try:
                    # The RPC runs outside _raylet_stats_lock so readers of
                    # get_raylet_stats() never wait on the network.
                    reply = stub.GetNodeStats(
                        node_manager_pb2.NodeStatsRequest())
                except grpc.RpcError as err:
                    if address not in unreachable:
                        unreachable.add(address)
                        logger.warning(
                            "Could not fetch stats from raylet at %s: %s",
                            address, err)
                    continue
                unreachable.discard(address)
                stats = MessageToDict(reply)
                with self._raylet_stats_lock:
                    self._raylet_stats[address] = stats
            counter += 1
            # From time to time, check if new nodes have joined the cluster
            # and update self.nodes
            if self._is_refresh_tick(counter):
                self.update_nodes()
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description=("Parse Redis server for the " description=("Parse Redis server for the "

View file

@ -77,7 +77,7 @@ extras = {
"tabulate" "tabulate"
], ],
"debug": ["psutil", "setproctitle", "py-spy >= 0.2.0"], "debug": ["psutil", "setproctitle", "py-spy >= 0.2.0"],
"dashboard": ["aiohttp", "psutil", "setproctitle"], "dashboard": ["aiohttp", "google", "grpcio", "psutil", "setproctitle"],
"serve": ["uvicorn", "pygments", "werkzeug", "flask", "pandas"], "serve": ["uvicorn", "pygments", "werkzeug", "flask", "pandas"],
"tune": ["tabulate"], "tune": ["tabulate"],
} }