2021-02-24 08:27:48 +08:00
|
|
|
import sys
|
|
|
|
|
2020-07-27 11:34:47 +08:00
|
|
|
import argparse
|
|
|
|
import asyncio
|
|
|
|
import errno
|
|
|
|
import logging
|
|
|
|
import logging.handlers
|
|
|
|
import os
|
2020-10-23 16:52:14 -04:00
|
|
|
import platform
|
2020-07-27 11:34:47 +08:00
|
|
|
import traceback
|
|
|
|
|
2021-09-15 11:17:15 -05:00
|
|
|
import ray.dashboard.consts as dashboard_consts
|
|
|
|
import ray.dashboard.head as dashboard_head
|
|
|
|
import ray.dashboard.utils as dashboard_utils
|
2020-07-27 11:34:47 +08:00
|
|
|
import ray.ray_constants as ray_constants
|
2021-11-11 14:59:57 -08:00
|
|
|
import ray._private.gcs_utils as gcs_utils
|
2020-09-24 22:46:35 -07:00
|
|
|
import ray._private.services
|
2021-03-10 23:47:28 -07:00
|
|
|
import ray._private.utils
|
2021-11-11 14:59:57 -08:00
|
|
|
from ray._private.gcs_pubsub import gcs_pubsub_enabled, GcsPublisher
|
2021-03-10 23:47:28 -07:00
|
|
|
from ray._private.ray_logging import setup_component_logger
|
|
|
|
from ray._private.metrics_agent import PrometheusServiceDiscoveryWriter
|
2020-11-30 19:03:55 -08:00
|
|
|
|
2021-08-21 03:04:21 -05:00
|
|
|
# All third-party dependencies that are not included in the minimal Ray
|
|
|
|
# installation must be included in this file. This allows us to determine if
|
|
|
|
# the agent has the necessary dependencies to be started.
|
2021-09-15 11:17:15 -05:00
|
|
|
from ray.dashboard.optional_deps import aiohttp
|
2021-08-21 03:04:21 -05:00
|
|
|
|
2020-07-27 11:34:47 +08:00
|
|
|
# Module-level logger. Its handlers and level are expected to be configured
# at the entry point of the program using Ray; Ray provides a default
# configuration at its entry/init points.
logger = logging.getLogger(__name__)
# Short alias for the route table on which this module registers its HTTP
# handlers and static directory.
routes = dashboard_utils.ClassMethodRouteTable
|
|
|
|
|
|
|
|
|
2021-02-24 08:27:48 +08:00
|
|
|
class FrontendNotFoundError(OSError):
    """Raised when the dashboard frontend build directory cannot be found."""
|
|
|
|
|
|
|
|
|
2020-08-25 04:24:23 +08:00
|
|
|
def setup_static_dir():
    """Register the frontend's static assets on the route table.

    Looks for the ``client/build`` directory next to this module and, when it
    exists, serves its ``static`` subdirectory under the ``/static`` prefix.

    Returns:
        The path of the frontend build directory.

    Raises:
        FrontendNotFoundError: If the build directory does not exist. The
            error carries ``errno.ENOENT``, build instructions as its
            message, and the missing path as its filename.
    """
    # Resolve the module directory once from the absolute path, so that the
    # module name used in the hint below is still correct when ``__file__``
    # is a bare relative filename (its dirname would otherwise be "").
    module_dir = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(module_dir, "client", "build")
    if not os.path.isdir(build_dir):
        module_name = os.path.basename(module_dir)
        raise FrontendNotFoundError(
            errno.ENOENT,
            "Dashboard build directory not found. If installing "
            "from source, please follow the additional steps "
            # Trailing space keeps "dashboard" and "(cd ..." separated.
            "required to build the dashboard "
            f"(cd python/ray/{module_name}/client "
            "&& npm install "
            "&& npm ci "
            "&& npm run build)",
            build_dir)

    static_dir = os.path.join(build_dir, "static")
    routes.static("/static", static_dir, follow_symlinks=True)
    return build_dir
|
|
|
|
|
|
|
|
|
|
|
|
class Dashboard:
    """Top-level object of the Ray dashboard process.

    Construction creates a ``dashboard_head.DashboardHead`` from the given
    options, registers the frontend's static directory, and binds this
    instance to the class-method route table. The instance itself serves the
    frontend's ``index.html`` and ``favicon.ico``.

    Args:
        host(str): Host address of dashboard aiohttp server.
        port(int): Port number of dashboard aiohttp server.
        port_retries(int): The retry times to select a valid port.
        redis_address(str): Redis address, forwarded to the dashboard head.
        redis_password(str): Redis password, forwarded to the dashboard head.
        log_dir(str): Log directory of dashboard.
    """

    def __init__(self,
                 host,
                 port,
                 port_retries,
                 redis_address,
                 redis_password=None,
                 log_dir=None):
        head_options = dict(
            http_host=host,
            http_port=port,
            http_port_retries=port_retries,
            redis_address=redis_address,
            redis_password=redis_password,
            log_dir=log_dir,
        )
        self.dashboard_head = dashboard_head.DashboardHead(**head_options)

        # Setup Dashboard Routes
        try:
            build_dir = setup_static_dir()
            logger.info("Setup static dir for dashboard: %s", build_dir)
        except FrontendNotFoundError as ex:
            # A missing frontend is tolerated on Windows only, because of
            # NPM incompatibilities there; everywhere else it is fatal.
            # Please refer to ci.sh::build_dashboard_front_end()
            if sys.platform not in ("win32", "cygwin"):
                raise ex
            logger.warning(ex)
        dashboard_utils.ClassMethodRouteTable.bind(self)

    @routes.get("/")
    async def get_index(self, req) -> aiohttp.web.FileResponse:
        """Serve the frontend's ``index.html`` from the build directory."""
        module_dir = os.path.dirname(os.path.abspath(__file__))
        index_path = os.path.join(module_dir, "client/build/index.html")
        return aiohttp.web.FileResponse(index_path)

    @routes.get("/favicon.ico")
    async def get_favicon(self, req) -> aiohttp.web.FileResponse:
        """Serve the frontend's ``favicon.ico`` from the build directory."""
        module_dir = os.path.dirname(os.path.abspath(__file__))
        favicon_path = os.path.join(module_dir, "client/build/favicon.ico")
        return aiohttp.web.FileResponse(favicon_path)

    async def run(self):
        """Await the dashboard head's ``run()`` coroutine."""
        await self.dashboard_head.run()
|
2020-07-27 11:34:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: parse the command line, configure logging, start the
# Prometheus service-discovery writer, and run the dashboard on the asyncio
# event loop. Any failure is logged; a FrontendNotFoundError is additionally
# published to the drivers, every other exception is re-raised.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Ray dashboard.")
    parser.add_argument(
        "--host",
        required=True,
        type=str,
        help="The host to use for the HTTP server.")
    parser.add_argument(
        "--port",
        required=True,
        type=int,
        help="The port to use for the HTTP server.")
    parser.add_argument(
        "--port-retries",
        required=False,
        type=int,
        default=0,
        help="The retry times to select a valid port.")
    parser.add_argument(
        "--gcs-address",
        required=False,
        type=str,
        help="The address (ip:port) of GCS.")
    parser.add_argument(
        "--redis-address",
        required=True,
        type=str,
        help="The address to use for Redis.")
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="The password to use for Redis")
    parser.add_argument(
        "--logging-level",
        required=False,
        # Accept level names case-insensitively by upper-casing them before
        # handing them to logging.getLevelName().
        type=lambda s: logging.getLevelName(s.upper()),
        default=ray_constants.LOGGER_LEVEL,
        choices=ray_constants.LOGGER_LEVEL_CHOICES,
        help=ray_constants.LOGGER_LEVEL_HELP)
    parser.add_argument(
        "--logging-format",
        required=False,
        type=str,
        default=ray_constants.LOGGER_FORMAT,
        help=ray_constants.LOGGER_FORMAT_HELP)
    parser.add_argument(
        "--logging-filename",
        required=False,
        type=str,
        default=dashboard_consts.DASHBOARD_LOG_FILENAME,
        help="Specify the name of log file, "
        "log to stdout if set empty, default is \"{}\"".format(
            dashboard_consts.DASHBOARD_LOG_FILENAME))
    parser.add_argument(
        "--logging-rotate-bytes",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BYTES,
        help="Specify the max bytes for rotating "
        "log file, default is {} bytes.".format(
            ray_constants.LOGGING_ROTATE_BYTES))
    parser.add_argument(
        "--logging-rotate-backup-count",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
        help="Specify the backup count of rotated log file, default is {}.".
        format(ray_constants.LOGGING_ROTATE_BACKUP_COUNT))
    parser.add_argument(
        "--log-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of log directory.")
    parser.add_argument(
        "--temp-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of the temporary directory use by Ray process.")

    args = parser.parse_args()
    try:
        setup_component_logger(
            logging_level=args.logging_level,
            logging_format=args.logging_format,
            log_dir=args.log_dir,
            filename=args.logging_filename,
            max_bytes=args.logging_rotate_bytes,
            backup_count=args.logging_rotate_backup_count)

        dashboard = Dashboard(
            args.host,
            args.port,
            args.port_retries,
            args.redis_address,
            redis_password=args.redis_password,
            log_dir=args.log_dir)
        # TODO(fyrestone): Avoid using ray.state in dashboard, it's not
        # asynchronous and will lead to low performance. ray disconnect()
        # will be hang when the ray.state is connected and the GCS is exit.
        # Please refer to: https://github.com/ray-project/ray/issues/16328
        service_discovery = PrometheusServiceDiscoveryWriter(
            args.redis_address, args.redis_password, args.temp_dir)
        # Need daemon True to avoid dashboard hangs at exit.
        service_discovery.daemon = True
        service_discovery.start()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(dashboard.run())
    except Exception as e:
        traceback_str = ray._private.utils.format_error_message(
            traceback.format_exc())
        message = (f"The dashboard on node {platform.uname()[1]} "
                   f"failed with the following "
                   f"error:\n{traceback_str}")
        if isinstance(e, FrontendNotFoundError):
            # A missing frontend is only a warning; fall through so the
            # problem is still reported to the drivers below.
            logger.warning(message)
        else:
            logger.error(message)
            # Bare raise re-raises the active exception with its original
            # traceback.
            raise

        # Something went wrong, so push an error to all drivers.
        redis_client = ray._private.services.create_redis_client(
            args.redis_address, password=args.redis_password)
        gcs_publisher = None
        if args.gcs_address:
            gcs_publisher = GcsPublisher(address=args.gcs_address)
        elif gcs_pubsub_enabled():
            gcs_publisher = GcsPublisher(
                address=gcs_utils.get_gcs_address_from_redis(redis_client))
        # The Redis client is passed by keyword only. It used to be passed
        # positionally as well, which shifted the error type and message
        # into the wrong parameters.
        # NOTE(review): assumes the signature starts with
        # (error_type, message, ...) - confirm against
        # ray._private.utils.publish_error_to_driver.
        ray._private.utils.publish_error_to_driver(
            ray_constants.DASHBOARD_DIED_ERROR,
            message,
            redis_client=redis_client,
            gcs_publisher=gcs_publisher)
|