ray/dashboard/dashboard.py
SangBin Cho 20ab9188c6
[Ray Usage Stats] Record cluster metadata + Refactoring. (#22170)
This is the first PR to implement usage stats on Ray. Please refer to the file `usage_lib.py` for more details.

The full specification is here https://docs.google.com/document/d/1ZT-l9YbGHh-iWRUC91jS-ssQ5Qe2UQ43Lsoc1edCalc/edit#heading=h.17dss3b9evbj.

You can see the full PR for phase 1 from here; https://github.com/rkooo567/ray/pull/108/files.

The PR is doing some basic refactoring + adding cluster metadata to GCS instead of the version numbers. 

After this PR, we will add code to enable usage report "off by default".
2022-02-08 22:12:36 -08:00

242 lines
7.6 KiB
Python

import argparse
import asyncio
import logging
import logging.handlers
import platform
import traceback
import ray.dashboard.consts as dashboard_consts
import ray.dashboard.head as dashboard_head
import ray.dashboard.utils as dashboard_utils
import ray.ray_constants as ray_constants
import ray._private.gcs_utils as gcs_utils
import ray._private.services
import ray._private.utils
from ray._private.gcs_pubsub import gcs_pubsub_enabled, GcsPublisher
from ray._private.ray_logging import setup_component_logger
# Module-level logger, named after this module. It should be configured at
# the entry point into the program using Ray. Ray provides a default
# configuration at entry/init points.
logger = logging.getLogger(__name__)
class Dashboard:
    """Dashboard process used to monitor the nodes of a Ray cluster.

    The dashboard consists of a REST API, which collates the data published
    by the Reporter processes running on the nodes into a json structure,
    and of a webserver which polls that API for display purposes.

    All constructor arguments are forwarded to
    ``dashboard_head.DashboardHead``; this class only owns that head object
    and exposes its ``run`` coroutine.

    Args:
        host(str): Host address of dashboard aiohttp server.
        port(int): Port number of dashboard aiohttp server.
        port_retries(int): The retry times to select a valid port.
        gcs_address(str): GCS address of the cluster
        redis_address(str): Redis address of a Ray cluster
        redis_password(str): Redis password to access GCS
        log_dir(str): Log directory of dashboard.
        temp_dir(str): Temporary directory used by the Ray process.
        session_dir(str): Session directory of the cluster.
        minimal(bool): Whether to run the minimal dashboard, i.e. the subset
            of features that does not need the additional dependencies.
    """

    def __init__(
        self,
        host,
        port,
        port_retries,
        gcs_address,
        redis_address,
        redis_password=None,
        log_dir=None,
        temp_dir=None,
        session_dir=None,
        minimal=False,
    ):
        # Map the public constructor arguments onto the keyword names
        # expected by DashboardHead (the HTTP ones carry an ``http_`` prefix).
        head_options = {
            "http_host": host,
            "http_port": port,
            "http_port_retries": port_retries,
            "gcs_address": gcs_address,
            "redis_address": redis_address,
            "redis_password": redis_password,
            "log_dir": log_dir,
            "temp_dir": temp_dir,
            "session_dir": session_dir,
            "minimal": minimal,
        }
        self.dashboard_head = dashboard_head.DashboardHead(**head_options)

    async def run(self):
        """Run the underlying dashboard head until its ``run`` returns."""
        head = self.dashboard_head
        await head.run()
if __name__ == "__main__":
    # Entry point of the dashboard process: parse the command line, set up
    # logging, run the dashboard head on the asyncio event loop and, on
    # failure, log the error (and possibly publish it to the drivers).
    parser = argparse.ArgumentParser(description="Ray dashboard.")

    # --- HTTP server options -------------------------------------------
    parser.add_argument(
        "--host", required=True, type=str, help="The host to use for the HTTP server."
    )
    parser.add_argument(
        "--port", required=True, type=int, help="The port to use for the HTTP server."
    )
    parser.add_argument(
        "--port-retries",
        required=False,
        type=int,
        default=0,
        help="The retry times to select a valid port.",
    )

    # --- Cluster bootstrap addresses -----------------------------------
    parser.add_argument(
        "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
    )
    parser.add_argument(
        "--redis-address", required=True, type=str, help="The address to use for Redis."
    )
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="The password to use for Redis",
    )

    # --- Logging options -----------------------------------------------
    parser.add_argument(
        "--logging-level",
        required=False,
        # Level names are accepted case-insensitively and converted with
        # logging.getLevelName before being checked against ``choices``.
        type=lambda s: logging.getLevelName(s.upper()),
        default=ray_constants.LOGGER_LEVEL,
        choices=ray_constants.LOGGER_LEVEL_CHOICES,
        help=ray_constants.LOGGER_LEVEL_HELP,
    )
    parser.add_argument(
        "--logging-format",
        required=False,
        type=str,
        default=ray_constants.LOGGER_FORMAT,
        help=ray_constants.LOGGER_FORMAT_HELP,
    )
    parser.add_argument(
        "--logging-filename",
        required=False,
        type=str,
        default=dashboard_consts.DASHBOARD_LOG_FILENAME,
        help="Specify the name of log file, "
        'log to stdout if set empty, default is "{}"'.format(
            dashboard_consts.DASHBOARD_LOG_FILENAME
        ),
    )
    parser.add_argument(
        "--logging-rotate-bytes",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BYTES,
        help="Specify the max bytes for rotating "
        "log file, default is {} bytes.".format(ray_constants.LOGGING_ROTATE_BYTES),
    )
    parser.add_argument(
        "--logging-rotate-backup-count",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
        help="Specify the backup count of rotated log file, default is {}.".format(
            ray_constants.LOGGING_ROTATE_BACKUP_COUNT
        ),
    )

    # --- Directories ---------------------------------------------------
    parser.add_argument(
        "--log-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of log directory.",
    )
    parser.add_argument(
        "--temp-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of the temporary directory use by Ray process.",
    )
    parser.add_argument(
        "--session-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of the session directory of the cluster.",
    )

    # --- Feature switches ----------------------------------------------
    parser.add_argument(
        "--minimal",
        action="store_true",
        help=(
            "Minimal dashboard only contains a subset of features that don't "
            "require additional dependencies installed when ray is installed "
            "by `pip install ray[default]`."
        ),
    )

    args = parser.parse_args()

    # Keep only the address family that is actually used for bootstrapping:
    # with GCS bootstrap the Redis settings are discarded, otherwise the GCS
    # address is discarded.
    if gcs_utils.use_gcs_for_bootstrap():
        args.redis_address = None
        args.redis_password = None
    else:
        args.gcs_address = None

    try:
        setup_component_logger(
            logging_level=args.logging_level,
            logging_format=args.logging_format,
            log_dir=args.log_dir,
            filename=args.logging_filename,
            max_bytes=args.logging_rotate_bytes,
            backup_count=args.logging_rotate_backup_count,
        )

        dashboard = Dashboard(
            args.host,
            args.port,
            args.port_retries,
            args.gcs_address,
            args.redis_address,
            redis_password=args.redis_password,
            log_dir=args.log_dir,
            temp_dir=args.temp_dir,
            session_dir=args.session_dir,
            minimal=args.minimal,
        )
        # Block until the dashboard head's run() coroutine finishes.
        loop = asyncio.get_event_loop()
        loop.run_until_complete(dashboard.run())
    except Exception as e:
        traceback_str = ray._private.utils.format_error_message(traceback.format_exc())
        message = (
            f"The dashboard on node {platform.uname()[1]} "
            f"failed with the following "
            f"error:\n{traceback_str}"
        )
        if isinstance(e, dashboard_utils.FrontendNotFoundError):
            # A missing frontend is only reported as a warning; execution
            # continues below so the message is also published to drivers.
            logger.warning(message)
        else:
            # Any other failure is logged and propagated unchanged, so the
            # process exits with the original traceback.
            logger.error(message)
            raise

        # Something went wrong, so push an error to all drivers.
        redis_client = None
        gcs_publisher = None
        if gcs_pubsub_enabled():
            if gcs_utils.use_gcs_for_bootstrap():
                gcs_publisher = GcsPublisher(args.gcs_address)
            else:
                # Redis is only needed here to look up the GCS address; the
                # error itself goes through the GCS publisher, so the client
                # is dropped again afterwards.
                redis_client = ray._private.services.create_redis_client(
                    args.redis_address, password=args.redis_password
                )
                gcs_publisher = GcsPublisher(
                    address=gcs_utils.get_gcs_address_from_redis(redis_client)
                )
                redis_client = None
        else:
            redis_client = ray._private.services.create_redis_client(
                args.redis_address, password=args.redis_password
            )

        # The transport objects are passed by keyword only. Previously
        # ``redis_client`` was also passed as the first positional argument,
        # which shifted the error type and message into the wrong parameters.
        # NOTE(review): assumes the signature is
        # publish_error_to_driver(error_type, message, job_id=None,
        # redis_client=None, gcs_publisher=None) - confirm against
        # ray._private.utils.
        ray._private.utils.publish_error_to_driver(
            ray_constants.DASHBOARD_DIED_ERROR,
            message,
            redis_client=redis_client,
            gcs_publisher=gcs_publisher,
        )