2020-07-27 11:34:47 +08:00
|
|
|
import argparse
|
|
|
|
import asyncio
|
|
|
|
import logging
|
|
|
|
import logging.handlers
|
|
|
|
import os
|
2020-10-23 16:52:14 -04:00
|
|
|
import platform
|
2020-07-27 11:34:47 +08:00
|
|
|
import sys
|
2020-08-25 04:24:23 +08:00
|
|
|
import socket
|
|
|
|
import json
|
2020-07-27 11:34:47 +08:00
|
|
|
import traceback
|
|
|
|
|
|
|
|
import aiohttp
|
2020-08-25 04:24:23 +08:00
|
|
|
import aiohttp.web
|
|
|
|
import aiohttp_cors
|
2020-11-30 19:03:55 -08:00
|
|
|
import psutil
|
2020-08-25 04:24:23 +08:00
|
|
|
from aiohttp import hdrs
|
2020-07-27 11:34:47 +08:00
|
|
|
from grpc.experimental import aio as aiogrpc
|
|
|
|
|
|
|
|
import ray
|
|
|
|
import ray.new_dashboard.consts as dashboard_consts
|
|
|
|
import ray.new_dashboard.utils as dashboard_utils
|
|
|
|
import ray.ray_constants as ray_constants
|
2020-09-24 22:46:35 -07:00
|
|
|
import ray._private.services
|
2020-07-27 11:34:47 +08:00
|
|
|
import ray.utils
|
2020-08-25 04:24:23 +08:00
|
|
|
from ray.core.generated import agent_manager_pb2
|
|
|
|
from ray.core.generated import agent_manager_pb2_grpc
|
2020-11-30 19:03:55 -08:00
|
|
|
from ray.ray_logging import setup_component_logger
|
2020-07-27 11:34:47 +08:00
|
|
|
|
2020-08-30 14:09:34 +08:00
|
|
|
# Pick the task factory once at import time: use asyncio.create_task when the
# running Python provides it, otherwise fall back to asyncio.ensure_future.
create_task = getattr(asyncio, "create_task", asyncio.ensure_future)
|
|
|
|
|
2020-07-27 11:34:47 +08:00
|
|
|
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

# Short alias for the shared route table; DashboardAgent.run() reads the
# bound http routes from it once all agent modules are loaded.
routes = dashboard_utils.ClassMethodRouteTable

# Runs at import time.
# NOTE(review): presumably this has to happen before any aiogrpc server or
# channel is created (DashboardAgent.__init__ creates both) -- confirm
# against the grpc documentation.
aiogrpc.init_grpc_aio()
|
|
|
|
|
|
|
|
|
|
|
|
class DashboardAgent(object):
    """Dashboard agent that serves grpc and http endpoints for its modules.

    The agent loads every ``dashboard_utils.DashboardAgentModule`` subclass,
    exposes their routes through an aiohttp server, publishes its http and
    grpc ports to redis, registers itself with the raylet's agent manager and
    exits once its parent process is gone.
    """

    def __init__(self,
                 node_ip_address,
                 redis_address,
                 dashboard_agent_port,
                 redis_password=None,
                 temp_dir=None,
                 log_dir=None,
                 metrics_export_port=None,
                 node_manager_port=None,
                 object_store_name=None,
                 raylet_name=None):
        """Initialize the DashboardAgent object.

        Args:
            node_ip_address: IP address of this node; the http server binds
                to it and the raylet channel connects to it.
            redis_address: Redis address, converted with
                ``dashboard_utils.address_tuple``.
            dashboard_agent_port: Port the grpc server is asked to bind to.
            redis_password: Password used when connecting to redis.
            temp_dir: Stored as-is for agent modules.
            log_dir: Stored as-is for agent modules.
            metrics_export_port: Stored as-is for agent modules.
            node_manager_port: Port used to open the grpc channel to the
                raylet on ``node_ip_address``.
            object_store_name: Stored as-is for agent modules.
            raylet_name: Stored as-is for agent modules.

        Raises:
            KeyError: If the ``RAY_NODE_ID`` environment variable is unset.
            AssertionError: If ``RAY_NODE_ID`` is set but empty.
        """
        # Public attributes are accessible for all agent modules.
        self.ip = node_ip_address
        self.redis_address = dashboard_utils.address_tuple(redis_address)
        self.redis_password = redis_password
        self.temp_dir = temp_dir
        self.log_dir = log_dir
        self.dashboard_agent_port = dashboard_agent_port
        self.metrics_export_port = metrics_export_port
        self.node_manager_port = node_manager_port
        self.object_store_name = object_store_name
        self.raylet_name = raylet_name
        self.node_id = os.environ["RAY_NODE_ID"]
        assert self.node_id, "Empty node id (RAY_NODE_ID)."
        # NOTE(review): "grpc.so_reuseport" is set to 0, presumably so that
        # binding a port that is already in use fails instead of being
        # shared -- confirm against the grpc channel-argument docs.
        self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
        # The port actually bound is whatever add_insecure_port returns.
        self.grpc_port = self.server.add_insecure_port(
            f"[::]:{self.dashboard_agent_port}")
        logger.info("Dashboard agent grpc address: %s:%s", self.ip,
                    self.grpc_port)
        # Created lazily in run(), once an event loop is running.
        self.aioredis_client = None
        self.aiogrpc_raylet_channel = aiogrpc.insecure_channel(
            f"{self.ip}:{self.node_manager_port}")
        # Created in run() and closed again when run() finishes.
        self.http_session = None

    @staticmethod
    def _host_and_port(sockname):
        """Return ``(host, port)`` from a ``socket.getsockname()`` result.

        IPv4 sockets report ``(host, port)`` while IPv6 sockets report
        ``(host, port, flowinfo, scope_id)``; only the first two items are
        kept so both address families are handled.
        """
        host, port, *_ = sockname
        return host, port

    def _load_modules(self):
        """Load dashboard agent modules.

        Returns:
            A list with one instance of every class returned by
            ``dashboard_utils.get_all_modules`` for
            ``DashboardAgentModule``, each already bound to the route table.
        """
        modules = []
        agent_cls_list = dashboard_utils.get_all_modules(
            dashboard_utils.DashboardAgentModule)
        for cls in agent_cls_list:
            logger.info("Loading %s: %s",
                        dashboard_utils.DashboardAgentModule.__name__, cls)
            c = cls(self)
            dashboard_utils.ClassMethodRouteTable.bind(c)
            modules.append(c)
        logger.info("Loaded %d modules.", len(modules))
        return modules

    def _create_http_app(self):
        """Build the aiohttp application with all bound routes and CORS.

        Http server should be initialized after all modules loaded, so this
        must only be called once ``_load_modules`` has run.
        """
        app = aiohttp.web.Application()
        app.add_routes(routes=routes.bound_routes())

        # Enable CORS on all routes.
        cors = aiohttp_cors.setup(
            app,
            defaults={
                "*": aiohttp_cors.ResourceOptions(
                    allow_credentials=True,
                    expose_headers="*",
                    allow_methods="*",
                    allow_headers=("Content-Type", "X-Header"),
                )
            })
        # Iterate over a snapshot of the routes.
        # NOTE(review): presumably cors.add() may register extra routes on
        # the router while we loop -- confirm against aiohttp_cors.
        for route in list(app.router.routes()):
            cors.add(route)
        return app

    async def run(self):
        """Start the servers and modules and run until the agent terminates.

        Connects to redis, starts the grpc server, loads the modules, starts
        the http server on an ephemeral port, writes both ports to redis,
        registers with the raylet's agent manager and then waits for the
        modules and the grpc server to finish. The http runner and the shared
        http session are released even if one of these steps fails.

        Raises:
            SystemExit: If redis cannot be reached, or (from the background
                parent check) when the parent process is gone.
        """

        async def _check_parent():
            """Check if raylet is dead."""
            curr_proc = psutil.Process()
            while True:
                parent = curr_proc.parent()
                # A missing parent, or pid 1 as parent, is treated as the
                # raylet having died.
                if parent is None or parent.pid == 1:
                    logger.error("raylet is dead, agent will die because "
                                 "it fate-shares with raylet.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.
                    DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)

        check_parent_task = create_task(_check_parent())

        # Create an aioredis client for all modules.
        try:
            self.aioredis_client = await dashboard_utils.get_aioredis_client(
                self.redis_address, self.redis_password,
                dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
        except (socket.gaierror, ConnectionRefusedError):
            logger.error(
                "Dashboard agent exiting: "
                "Failed to connect to redis at %s", self.redis_address)
            sys.exit(-1)

        # Create a http session for all modules.
        self.http_session = aiohttp.ClientSession(
            loop=asyncio.get_event_loop())

        runner = None
        try:
            # Start a grpc asyncio server.
            await self.server.start()

            modules = self._load_modules()

            # Http server should be initialized after all modules loaded.
            app = self._create_http_app()

            runner = aiohttp.web.AppRunner(app)
            await runner.setup()
            # Port 0 lets the OS pick a free port; read the real one back.
            site = aiohttp.web.TCPSite(runner, self.ip, 0)
            await site.start()
            http_host, http_port = self._host_and_port(
                site._server.sockets[0].getsockname())
            logger.info("Dashboard agent http address: %s:%s", http_host,
                        http_port)

            # Dump registered http routes.
            dump_routes = [
                r for r in app.router.routes() if r.method != hdrs.METH_HEAD
            ]
            for r in dump_routes:
                logger.info(r)
            logger.info("Registered %s routes.", len(dump_routes))

            # Write the dashboard agent port to redis.
            await self.aioredis_client.set(
                f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}"
                f"{self.node_id}", json.dumps([http_port, self.grpc_port]))

            # Register agent to agent manager.
            raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
                self.aiogrpc_raylet_channel)

            await raylet_stub.RegisterAgent(
                agent_manager_pb2.RegisterAgentRequest(
                    agent_pid=os.getpid(),
                    agent_port=self.grpc_port,
                    agent_ip_address=self.ip))

            await asyncio.gather(check_parent_task,
                                 *(m.run(self.server) for m in modules))
            # Wait for finish signal.
            await self.server.wait_for_termination()
        finally:
            # Release the http server and the shared client session no
            # matter how the steps above ended.
            try:
                if runner is not None:
                    await runner.cleanup()
            finally:
                await self.http_session.close()
|
2020-07-27 11:34:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, configure logging, run
    # the agent until it terminates and report any failure to the drivers.
    parser = argparse.ArgumentParser(description="Dashboard agent.")
    parser.add_argument(
        "--node-ip-address",
        required=True,
        type=str,
        help="the IP address of this node.")
    parser.add_argument(
        "--redis-address",
        required=True,
        type=str,
        help="The address to use for Redis.")
    parser.add_argument(
        "--metrics-export-port",
        required=True,
        type=int,
        help="The port to expose metrics through Prometheus.")
    parser.add_argument(
        "--dashboard-agent-port",
        required=True,
        type=int,
        help="The port on which the dashboard agent will receive GRPCs.")
    parser.add_argument(
        "--node-manager-port",
        required=True,
        type=int,
        help="The port to use for starting the node manager")
    parser.add_argument(
        "--object-store-name",
        required=True,
        type=str,
        default=None,
        help="The socket name of the plasma store")
    parser.add_argument(
        "--raylet-name",
        required=True,
        type=str,
        default=None,
        help="The socket path of the raylet process")
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="The password to use for Redis")
    parser.add_argument(
        "--logging-level",
        required=False,
        # The level name is upper-cased and mapped through
        # logging.getLevelName before being checked against the choices.
        type=lambda s: logging.getLevelName(s.upper()),
        default=ray_constants.LOGGER_LEVEL,
        choices=ray_constants.LOGGER_LEVEL_CHOICES,
        help=ray_constants.LOGGER_LEVEL_HELP)
    parser.add_argument(
        "--logging-format",
        required=False,
        type=str,
        default=ray_constants.LOGGER_FORMAT,
        help=ray_constants.LOGGER_FORMAT_HELP)
    parser.add_argument(
        "--logging-filename",
        required=False,
        type=str,
        default=dashboard_consts.DASHBOARD_AGENT_LOG_FILENAME,
        help="Specify the name of log file, "
        "log to stdout if set empty, default is \"{}\".".format(
            dashboard_consts.DASHBOARD_AGENT_LOG_FILENAME))
    parser.add_argument(
        "--logging-rotate-bytes",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BYTES,
        help="Specify the max bytes for rotating "
        "log file, default is {} bytes.".format(
            ray_constants.LOGGING_ROTATE_BYTES))
    parser.add_argument(
        "--logging-rotate-backup-count",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
        help="Specify the backup count of rotated log file, default is {}.".
        format(ray_constants.LOGGING_ROTATE_BACKUP_COUNT))
    parser.add_argument(
        "--log-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of log directory.")
    parser.add_argument(
        "--temp-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of the temporary directory use by Ray process.")

    args = parser.parse_args()
    try:
        setup_component_logger(
            logging_level=args.logging_level,
            logging_format=args.logging_format,
            log_dir=args.log_dir,
            filename=args.logging_filename,
            max_bytes=args.logging_rotate_bytes,
            backup_count=args.logging_rotate_backup_count)

        agent = DashboardAgent(
            args.node_ip_address,
            args.redis_address,
            args.dashboard_agent_port,
            redis_password=args.redis_password,
            temp_dir=args.temp_dir,
            log_dir=args.log_dir,
            metrics_export_port=args.metrics_export_port,
            node_manager_port=args.node_manager_port,
            object_store_name=args.object_store_name,
            raylet_name=args.raylet_name)

        loop = asyncio.get_event_loop()
        loop.run_until_complete(agent.run())
    except Exception:
        # Something went wrong, so push an error to all drivers.
        # Capture the traceback first, while the agent's own exception is
        # still the one being handled.
        traceback_str = ray.utils.format_error_message(traceback.format_exc())
        message = ("The agent on node {} failed with the following "
                   "error:\n{}".format(platform.uname()[1], traceback_str))
        try:
            redis_client = ray._private.services.create_redis_client(
                args.redis_address, password=args.redis_password)
            ray.utils.push_error_to_driver_through_redis(
                redis_client, ray_constants.DASHBOARD_AGENT_DIED_ERROR,
                message)
        except Exception:
            # Reporting is best effort: a failure to reach redis must not
            # replace the agent's real error, which is re-raised below.
            logger.exception(
                "Failed to push the dashboard agent error to the drivers.")
        # Bare raise re-raises the original exception with its traceback.
        raise
|