2020-07-27 11:34:47 +08:00
|
|
|
import argparse
|
|
|
|
import asyncio
|
|
|
|
import logging
|
|
|
|
import logging.handlers
|
2020-10-23 16:52:14 -04:00
|
|
|
import platform
|
2020-07-27 11:34:47 +08:00
|
|
|
import traceback
|
2022-06-27 08:14:59 -07:00
|
|
|
import signal
|
|
|
|
import os
|
|
|
|
import sys
|
2020-07-27 11:34:47 +08:00
|
|
|
|
2022-06-21 15:13:29 -07:00
|
|
|
import ray._private.ray_constants as ray_constants
|
|
|
|
import ray._private.services
|
|
|
|
import ray._private.utils
|
2021-09-15 11:17:15 -05:00
|
|
|
import ray.dashboard.consts as dashboard_consts
|
|
|
|
import ray.dashboard.head as dashboard_head
|
2022-02-01 15:34:40 +09:00
|
|
|
import ray.dashboard.utils as dashboard_utils
|
2022-03-04 12:32:17 -08:00
|
|
|
from ray._private.gcs_pubsub import GcsPublisher
|
2021-03-10 23:47:28 -07:00
|
|
|
from ray._private.ray_logging import setup_component_logger
|
2022-07-29 15:01:49 +09:00
|
|
|
from typing import Optional, Set
|
2021-08-21 03:04:21 -05:00
|
|
|
|
2020-07-27 11:34:47 +08:00
|
|
|
# Module-level logger, named after this module via ``__name__``.
#
# It should be configured at the entry point into the program using Ray;
# Ray provides a default configuration at its entry/init points. When this
# file is run as a script, the ``__main__`` section calls
# ``setup_component_logger`` with the command-line logging options before
# the dashboard is created.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
class Dashboard:
    """A dashboard process for monitoring Ray nodes.

    This dashboard is made up of a REST API which collates data published by
    Reporter processes on nodes into a json structure, and a webserver
    which polls said API for display purposes.

    The class is a thin wrapper: every constructor argument is forwarded
    unchanged to ``dashboard_head.DashboardHead`` and ``run`` simply awaits
    the head's own ``run`` coroutine.

    Args:
        host: Host address of dashboard aiohttp server.
        port: Port number of dashboard aiohttp server.
        port_retries: The retry times to select a valid port.
        gcs_address: GCS address of the cluster.
        log_dir: Log directory of dashboard.
        temp_dir: Path of the temporary directory used by the Ray process.
        session_dir: Path of the session directory of the cluster.
        minimal: Whether to run the minimal dashboard, which only contains
            the subset of features that need no additional dependencies.
        modules_to_load: Names of the dashboard modules to load. ``None``
            selects the default behavior (all modules are loaded).
    """

    def __init__(
        self,
        host: str,
        port: int,
        port_retries: int,
        gcs_address: str,
        log_dir: Optional[str] = None,
        temp_dir: Optional[str] = None,
        session_dir: Optional[str] = None,
        minimal: bool = False,
        modules_to_load: Optional[Set[str]] = None,
    ):
        # All real work lives in DashboardHead; note the host/port arguments
        # are renamed to their ``http_*`` counterparts on the way through.
        self.dashboard_head = dashboard_head.DashboardHead(
            http_host=host,
            http_port=port,
            http_port_retries=port_retries,
            gcs_address=gcs_address,
            log_dir=log_dir,
            temp_dir=temp_dir,
            session_dir=session_dir,
            minimal=minimal,
            modules_to_load=modules_to_load,
        )

    async def run(self):
        """Run the dashboard head; completes when the head's ``run`` does."""
        await self.dashboard_head.run()
|
2020-07-27 11:34:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the command line, configure logging, build a
    # Dashboard and run it on the asyncio event loop until it exits or fails.
    parser = argparse.ArgumentParser(description="Ray dashboard.")
    parser.add_argument(
        "--host", required=True, type=str, help="The host to use for the HTTP server."
    )
    parser.add_argument(
        "--port", required=True, type=int, help="The port to use for the HTTP server."
    )
    parser.add_argument(
        "--port-retries",
        required=False,
        type=int,
        default=0,
        help="The retry times to select a valid port.",
    )
    parser.add_argument(
        "--gcs-address", required=True, type=str, help="The address (ip:port) of GCS."
    )
    parser.add_argument(
        "--logging-level",
        required=False,
        # Accept the level name case-insensitively and convert it through
        # logging.getLevelName before argparse checks it against `choices`.
        type=lambda s: logging.getLevelName(s.upper()),
        default=ray_constants.LOGGER_LEVEL,
        choices=ray_constants.LOGGER_LEVEL_CHOICES,
        help=ray_constants.LOGGER_LEVEL_HELP,
    )
    parser.add_argument(
        "--logging-format",
        required=False,
        type=str,
        default=ray_constants.LOGGER_FORMAT,
        help=ray_constants.LOGGER_FORMAT_HELP,
    )
    parser.add_argument(
        "--logging-filename",
        required=False,
        type=str,
        default=dashboard_consts.DASHBOARD_LOG_FILENAME,
        help=(
            "Specify the name of log file, "
            "log to stdout if set empty, default is "
            f'"{dashboard_consts.DASHBOARD_LOG_FILENAME}"'
        ),
    )
    parser.add_argument(
        "--logging-rotate-bytes",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BYTES,
        help=(
            "Specify the max bytes for rotating "
            f"log file, default is {ray_constants.LOGGING_ROTATE_BYTES} bytes."
        ),
    )
    parser.add_argument(
        "--logging-rotate-backup-count",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
        help=(
            "Specify the backup count of rotated log file, default is "
            f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}."
        ),
    )
    parser.add_argument(
        "--log-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of log directory.",
    )
    parser.add_argument(
        "--temp-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of the temporary directory use by Ray process.",
    )
    parser.add_argument(
        "--session-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of the session directory of the cluster.",
    )
    parser.add_argument(
        "--minimal",
        action="store_true",
        help=(
            "Minimal dashboard only contains a subset of features that don't "
            "require additional dependencies installed when ray is installed "
            "by `pip install ray[default]`."
        ),
    )
    parser.add_argument(
        "--modules-to-load",
        required=False,
        default=None,
        help=(
            # Trailing space after "format." keeps the sentences from running
            # together in the rendered help text.
            "Specify the list of module names in [module_1],[module_2] format. "
            "E.g., JobHead,StateHead... "
            "If nothing is specified, all modules are loaded."
        ),
    )

    args = parser.parse_args()

    try:
        setup_component_logger(
            logging_level=args.logging_level,
            logging_format=args.logging_format,
            log_dir=args.log_dir,
            filename=args.logging_filename,
            max_bytes=args.logging_rotate_bytes,
            backup_count=args.logging_rotate_backup_count,
        )

        if args.modules_to_load:
            # Split on commas, trimming whitespace around each name and
            # dropping empty entries, so "JobHead, StateHead," is accepted.
            modules_to_load = {
                name.strip()
                for name in args.modules_to_load.split(",")
                if name.strip()
            }
        else:
            # None == default.
            modules_to_load = None

        dashboard = Dashboard(
            args.host,
            args.port,
            args.port_retries,
            args.gcs_address,
            log_dir=args.log_dir,
            temp_dir=args.temp_dir,
            session_dir=args.session_dir,
            minimal=args.minimal,
            modules_to_load=modules_to_load,
        )
        loop = asyncio.get_event_loop()

        def sigterm_handler():
            """Log and terminate the process immediately on SIGTERM."""
            # Logger.warn is a deprecated alias (removed in Python 3.13);
            # use Logger.warning.
            logger.warning("Exiting with SIGTERM immediately...")
            # os._exit ends the process right away, using the signal number
            # as the exit status.
            os._exit(signal.SIGTERM)

        if sys.platform != "win32":
            # TODO(rickyyx): we currently do not have any logic for actual
            # graceful termination in the dashboard. Most of the underlying
            # async tasks run by the dashboard head doesn't handle CancelledError.
            # So a truly graceful shutdown is not trivial w/o much refactoring.
            # Re-open the issue: https://github.com/ray-project/ray/issues/25518
            # if a truly graceful shutdown is required.
            loop.add_signal_handler(signal.SIGTERM, sigterm_handler)

        loop.run_until_complete(dashboard.run())
    except Exception as e:
        traceback_str = ray._private.utils.format_error_message(traceback.format_exc())
        message = (
            f"The dashboard on node {platform.uname()[1]} "
            f"failed with the following "
            f"error:\n{traceback_str}"
        )
        if isinstance(e, dashboard_utils.FrontendNotFoundError):
            # A missing frontend is reported as a warning and not re-raised.
            logger.warning(message)
        else:
            logger.error(message)
            raise

        # Something went wrong, so push an error to all drivers.
        # NOTE(review): every other exception is re-raised above, so this
        # publish only runs for FrontendNotFoundError - confirm that this is
        # the intended behavior.
        gcs_publisher = GcsPublisher(address=args.gcs_address)
        ray._private.utils.publish_error_to_driver(
            ray_constants.DASHBOARD_DIED_ERROR,
            message,
            gcs_publisher=gcs_publisher,
        )
|