2020-07-27 11:34:47 +08:00
|
|
|
import argparse
|
|
|
|
import asyncio
|
2022-06-09 05:54:34 -07:00
|
|
|
import io
|
2022-08-08 22:38:19 +09:00
|
|
|
import json
|
2020-07-27 11:34:47 +08:00
|
|
|
import logging
|
|
|
|
import logging.handlers
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
|
|
|
|
import ray
|
2022-06-21 15:13:29 -07:00
|
|
|
import ray._private.ray_constants as ray_constants
|
2022-06-14 17:44:41 -07:00
|
|
|
import ray._private.services
|
|
|
|
import ray._private.utils
|
2021-09-15 11:17:15 -05:00
|
|
|
import ray.dashboard.consts as dashboard_consts
|
|
|
|
import ray.dashboard.utils as dashboard_utils
|
2022-08-08 22:38:19 +09:00
|
|
|
import ray.experimental.internal_kv as internal_kv
|
2022-06-14 17:44:41 -07:00
|
|
|
from ray._private.gcs_pubsub import GcsAioPublisher, GcsPublisher
|
2022-06-21 15:02:36 -07:00
|
|
|
from ray._private.gcs_utils import GcsAioClient, GcsClient
|
2021-03-10 23:47:28 -07:00
|
|
|
from ray._private.ray_logging import setup_component_logger
|
2022-08-08 22:38:19 +09:00
|
|
|
from ray.core.generated import agent_manager_pb2, agent_manager_pb2_grpc
|
2022-06-21 15:02:36 -07:00
|
|
|
from ray.experimental.internal_kv import (
|
|
|
|
_initialize_internal_kv,
|
|
|
|
_internal_kv_initialized,
|
|
|
|
)
|
2020-07-27 11:34:47 +08:00
|
|
|
|
2021-07-19 11:14:10 -07:00
|
|
|
# Import psutil after ray so the packaged version is used.
|
|
|
|
import psutil
|
|
|
|
|
2022-06-14 17:44:41 -07:00
|
|
|
try:
|
|
|
|
from grpc import aio as aiogrpc
|
|
|
|
except ImportError:
|
|
|
|
from grpc.experimental import aio as aiogrpc
|
|
|
|
|
|
|
|
|
2022-06-09 05:54:34 -07:00
|
|
|
# Upper bound on how many trailing Raylet log lines are published when the
# Raylet dies unexpectedly.
_RAYLET_LOG_MAX_PUBLISH_LINES = 20

# Upper bound (1 MiB) on how much of the Raylet log tail is read, both for
# publishing and for checking whether the Raylet was terminated gracefully.
_RAYLET_LOG_MAX_TAIL_SIZE = 1024 * 1024
|
|
|
|
|
2020-08-30 14:09:34 +08:00
|
|
|
# Compatibility alias: use asyncio.create_task when the running Python
# provides it, otherwise fall back to asyncio.ensure_future.
create_task = getattr(asyncio, "create_task", asyncio.ensure_future)

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
# Module-level side effect: this call runs as soon as the module is imported.
# NOTE(review): presumably it has to happen before any aio channel or server
# is created below -- confirm against the grpc version in use.
aiogrpc.init_grpc_aio()
|
|
|
|
|
|
|
|
|
2022-06-09 05:54:34 -07:00
|
|
|
class DashboardAgent:
    """Dashboard agent that runs next to a raylet.

    Construction opens an asynchronous gRPC channel to the raylet, tries to
    bind the agent's own gRPC server and creates the GCS clients. ``run``
    then loads the agent modules, optionally starts the HTTP server, writes
    the agent ports to the internal KV, registers the agent with the raylet
    and runs the modules.
    """

    def __init__(
        self,
        node_ip_address,
        dashboard_agent_port,
        gcs_address,
        minimal,
        metrics_export_port=None,
        node_manager_port=None,
        listen_port=ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT,
        disable_metrics_collection: bool = False,
        *,  # the following are required kwargs
        object_store_name: str,
        raylet_name: str,
        log_dir: str,
        temp_dir: str,
        session_dir: str,
        runtime_env_dir: str,
        logging_params: dict,
        agent_id: int,
    ):
        """Initialize the DashboardAgent object.

        Args:
            node_ip_address: IP of this node; stored as ``self.ip`` and used
                to build the raylet channel address and to pick the gRPC
                bind address.
            dashboard_agent_port: Port the agent's gRPC server tries to bind.
            gcs_address: GCS address; must not be None.
            minimal: When true, ``run`` does not start the HTTP server.
            metrics_export_port: Stored for use by agent modules.
            node_manager_port: Port used for the raylet gRPC channel.
            listen_port: Port passed to the HTTP server.
            disable_metrics_collection: Stored as
                ``self.metrics_collection_disabled``.
            object_store_name: Stored for use by agent modules.
            raylet_name: Stored for use by agent modules.
            log_dir: Log directory; ``raylet.out`` is read from it when the
                raylet goes away.
            temp_dir: Stored for use by agent modules.
            session_dir: Stored for use by agent modules.
            runtime_env_dir: Stored for use by agent modules.
            logging_params: Stored for use by agent modules.
            agent_id: ID sent to the raylet in the RegisterAgent request.

        Raises:
            AssertionError: If ``gcs_address`` is None, if the raylet pid
                read from the environment is not positive, or if the
                internal KV is not initialized.
            KeyError: If ``RAY_NODE_ID`` (or, off Windows,
                ``RAY_RAYLET_PID``) is missing from the environment.
        """
        # Public attributes are accessible for all agent modules.
        self.ip = node_ip_address
        self.minimal = minimal

        assert gcs_address is not None
        self.gcs_address = gcs_address

        self.temp_dir = temp_dir
        self.session_dir = session_dir
        self.runtime_env_dir = runtime_env_dir
        self.log_dir = log_dir
        self.dashboard_agent_port = dashboard_agent_port
        self.metrics_export_port = metrics_export_port
        self.node_manager_port = node_manager_port
        self.listen_port = listen_port
        self.object_store_name = object_store_name
        self.raylet_name = raylet_name
        self.logging_params = logging_params
        self.node_id = os.environ["RAY_NODE_ID"]
        self.metrics_collection_disabled = disable_metrics_collection
        self.agent_id = agent_id
        # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is
        # only used for fate-sharing with the raylet and we need a different
        # fate-sharing mechanism for Windows anyways.
        if sys.platform not in ["win32", "cygwin"]:
            self.ppid = int(os.environ["RAY_RAYLET_PID"])
            assert self.ppid > 0
            logger.info("Parent pid is %s", self.ppid)

        # Setup raylet channel
        options = ray_constants.GLOBAL_GRPC_OPTIONS
        self.aiogrpc_raylet_channel = ray._private.utils.init_grpc_channel(
            f"{self.ip}:{self.node_manager_port}", options, asynchronous=True
        )

        # Setup grpc server
        self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0),))
        grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
        try:
            self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
                self.server, f"{grpc_ip}:{self.dashboard_agent_port}"
            )
        except Exception:
            # TODO(SongGuyang): Catch the exception here because there is
            # port conflict issue which brought from static port. We should
            # remove this after we find better port resolution.
            logger.exception(
                "Failed to add port to grpc server. Agent will stay alive but "
                "disable the grpc service."
            )
            # From here on ``self.server is None`` means "gRPC disabled";
            # ``run`` checks for it before touching the server.
            self.server = None
            self.grpc_port = None
        else:
            logger.info("Dashboard agent grpc address: %s:%s", grpc_ip, self.grpc_port)

        # If the agent is started as non-minimal version, http server should
        # be configured to communicate with the dashboard in a head node.
        self.http_server = None

        # Used by the agent and sub-modules.
        # TODO(architkulkarni): Remove gcs_client once the agent exclusively uses
        # gcs_aio_client and not gcs_client.
        self.gcs_client = GcsClient(address=self.gcs_address)
        _initialize_internal_kv(self.gcs_client)
        assert _internal_kv_initialized()
        self.gcs_aio_client = GcsAioClient(address=self.gcs_address)
        self.publisher = GcsAioPublisher(address=self.gcs_address)

    async def _configure_http_server(self, modules):
        """Create and start the agent HTTP server for ``modules``.

        The import is local so that a minimal install, which never calls
        this method, does not need the HTTP server's dependencies.

        Returns:
            The started ``HttpServerAgent``.
        """
        from ray.dashboard.http_server_agent import HttpServerAgent

        http_server = HttpServerAgent(self.ip, self.listen_port)
        await http_server.start(modules)
        return http_server

    def _load_modules(self):
        """Load dashboard agent modules.

        Returns:
            A list with one instance of every discovered
            ``DashboardAgentModule`` class, each constructed with this agent.
        """
        modules = []
        agent_cls_list = dashboard_utils.get_all_modules(
            dashboard_utils.DashboardAgentModule
        )
        for cls in agent_cls_list:
            logger.info(
                "Loading %s: %s", dashboard_utils.DashboardAgentModule.__name__, cls
            )
            c = cls(self)
            modules.append(c)
        logger.info("Loaded %d modules.", len(modules))
        return modules

    @property
    def http_session(self):
        """HTTP session of the agent HTTP server.

        Raises:
            AssertionError: If the HTTP server has not been started.
        """
        assert self.http_server, "Accessing unsupported API in a minimal ray."
        return self.http_server.http_session

    def _describe_raylet_termination(self):
        """Build the "raylet is terminated" message from the raylet log tail.

        Returns:
            A ``(msg, error)`` tuple. ``error`` is False only when the tail
            of ``raylet.out`` contains the SIGTERM marker; it is True when
            the marker is missing or the log could not be read.
        """
        log_path = os.path.join(self.log_dir, "raylet.out")
        msg = f"Raylet is terminated: ip={self.ip}, id={self.node_id}. "
        try:
            # errors="replace": the seek below may land in the middle of a
            # multi-byte character, which must not make the whole read fail.
            with open(log_path, "r", encoding="utf-8", errors="replace") as f:
                # Seek to _RAYLET_LOG_MAX_TAIL_SIZE from the end if the
                # file is larger than that.
                f.seek(0, io.SEEK_END)
                pos = max(0, f.tell() - _RAYLET_LOG_MAX_TAIL_SIZE)
                f.seek(pos, io.SEEK_SET)
                # Read remaining logs by lines.
                raylet_logs = f.readlines()
        except Exception as e:
            msg += f"Failed to read Raylet logs at {log_path}: {e}!"
            # Logger.exception() requires a message; calling it without one
            # raises TypeError and would abort the error report.
            logger.exception("Failed to read Raylet logs at %s", log_path)
            return msg, True

        # Assume the SIGTERM message must exist within the last
        # _RAYLET_LOG_MAX_TAIL_SIZE of the log file.
        if any("Raylet received SIGTERM" in line for line in raylet_logs):
            return msg + "Termination is graceful.", False

        msg += (
            "Termination is unexpected. Possible reasons "
            "include: (1) SIGKILL by the user or system "
            "OOM killer, (2) Invalid memory access from "
            "Raylet causing SIGSEGV or SIGBUS, "
            "(3) Other termination signals. "
            f"Last {_RAYLET_LOG_MAX_PUBLISH_LINES} lines "
            "of the Raylet logs:\n"
        )
        msg += " " + " ".join(raylet_logs[-_RAYLET_LOG_MAX_PUBLISH_LINES:])
        return msg, True

    async def _check_parent(self):
        """Check if raylet is dead and fate-share if it is.

        Polls the parent process. Once the parent is gone, is pid 1, or is
        no longer the recorded raylet pid, the termination is reported and
        the process exits with status 0. Any unexpected failure of the check
        itself exits with status 1.
        """
        try:
            curr_proc = psutil.Process()
            while True:
                parent = curr_proc.parent()
                if parent is None or parent.pid == 1 or self.ppid != parent.pid:
                    msg, error = self._describe_raylet_termination()
                    if error:
                        logger.error(msg)
                        # TODO: switch to async if necessary.
                        ray._private.utils.publish_error_to_driver(
                            ray_constants.RAYLET_DIED_ERROR,
                            msg,
                            gcs_publisher=GcsPublisher(address=self.gcs_address),
                        )
                    else:
                        logger.info(msg)
                    # SystemExit is not an Exception, so it is not swallowed
                    # by the handler below.
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS
                )
        except Exception:
            # Keep the traceback so the cause of the failed check is visible.
            logger.exception("Failed to check parent PID, exiting.")
            sys.exit(1)

    async def run(self):
        """Start the agent services and run all agent modules.

        Starts the parent check (not on Windows), the gRPC server if it was
        bound, and the HTTP server unless the agent is minimal. It then
        writes ``[http_port, grpc_port]`` to the internal KV, registers with
        the raylet and awaits every module's ``run``.
        """
        if sys.platform not in ["win32", "cygwin"]:
            check_parent_task = create_task(self._check_parent())

        # Start a grpc asyncio server.
        if self.server:
            await self.server.start()

        modules = self._load_modules()

        # Setup http server if necessary.
        if not self.minimal:
            # If the agent is not minimal it should start the http server
            # to communicate with the dashboard in a head node.
            # Http server is not started in the minimal version because
            # it requires additional dependencies that are not
            # included in the minimal ray package.
            try:
                self.http_server = await self._configure_http_server(modules)
            except Exception:
                # TODO(SongGuyang): Catch the exception here because there is
                # port conflict issue which brought from static port. We should
                # remove this after we find better port resolution.
                logger.exception(
                    "Failed to start http server. Agent will stay alive but "
                    "disable the http service."
                )

        # Write the dashboard agent port to kv.
        # TODO: Use async version if performance is an issue
        # -1 should indicate that http server is not started.
        http_port = -1 if not self.http_server else self.http_server.http_port
        internal_kv._internal_kv_put(
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
            json.dumps([http_port, self.grpc_port]),
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Register agent to agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel
        )

        await raylet_stub.RegisterAgent(
            agent_manager_pb2.RegisterAgentRequest(
                agent_id=self.agent_id,
                agent_port=self.grpc_port,
                agent_ip_address=self.ip,
            )
        )

        tasks = [m.run(self.server) for m in modules]
        if sys.platform not in ["win32", "cygwin"]:
            tasks.append(check_parent_task)
        await asyncio.gather(*tasks)

        if self.server:
            await self.server.wait_for_termination()
        else:
            # The gRPC server was disabled in __init__ (port bind failed).
            # Stay alive, as the log message there promises, instead of
            # failing with AttributeError on ``None``.
            while True:
                await asyncio.sleep(3600)

        if self.http_server:
            await self.http_server.cleanup()
|
2020-07-27 11:34:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: parse the command line, configure logging, build the
# DashboardAgent and run it on the event loop until it finishes or fails.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Dashboard agent.")
    parser.add_argument(
        "--node-ip-address",
        required=True,
        type=str,
        help="the IP address of this node.",
    )
    parser.add_argument(
        "--gcs-address", required=True, type=str, help="The address (ip:port) of GCS."
    )
    parser.add_argument(
        "--metrics-export-port",
        required=True,
        type=int,
        help="The port to expose metrics through Prometheus.",
    )
    parser.add_argument(
        "--dashboard-agent-port",
        required=True,
        type=int,
        help="The port on which the dashboard agent will receive GRPCs.",
    )
    parser.add_argument(
        "--node-manager-port",
        required=True,
        type=int,
        help="The port to use for starting the node manager",
    )
    parser.add_argument(
        "--object-store-name",
        required=True,
        type=str,
        default=None,
        help="The socket name of the plasma store",
    )
    parser.add_argument(
        "--listen-port",
        required=False,
        type=int,
        default=ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT,
        help="Port for HTTP server to listen on",
    )
    parser.add_argument(
        "--raylet-name",
        required=True,
        type=str,
        default=None,
        help="The socket path of the raylet process",
    )
    parser.add_argument(
        "--logging-level",
        required=False,
        type=lambda s: logging.getLevelName(s.upper()),
        default=ray_constants.LOGGER_LEVEL,
        choices=ray_constants.LOGGER_LEVEL_CHOICES,
        help=ray_constants.LOGGER_LEVEL_HELP,
    )
    parser.add_argument(
        "--logging-format",
        required=False,
        type=str,
        default=ray_constants.LOGGER_FORMAT,
        help=ray_constants.LOGGER_FORMAT_HELP,
    )
    parser.add_argument(
        "--logging-filename",
        required=False,
        type=str,
        default=dashboard_consts.DASHBOARD_AGENT_LOG_FILENAME,
        help="Specify the name of log file, "
        'log to stdout if set empty, default is "{}".'.format(
            dashboard_consts.DASHBOARD_AGENT_LOG_FILENAME
        ),
    )
    parser.add_argument(
        "--logging-rotate-bytes",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BYTES,
        help="Specify the max bytes for rotating "
        "log file, default is {} bytes.".format(ray_constants.LOGGING_ROTATE_BYTES),
    )
    parser.add_argument(
        "--logging-rotate-backup-count",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
        help="Specify the backup count of rotated log file, default is {}.".format(
            ray_constants.LOGGING_ROTATE_BACKUP_COUNT
        ),
    )
    parser.add_argument(
        "--log-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of log directory.",
    )
    parser.add_argument(
        "--temp-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of the temporary directory use by Ray process.",
    )
    parser.add_argument(
        "--session-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of this session.",
    )
    parser.add_argument(
        "--runtime-env-dir",
        required=True,
        type=str,
        default=None,
        help="Specify the path of the resource directory used by runtime_env.",
    )
    parser.add_argument(
        "--minimal",
        action="store_true",
        help=(
            "Minimal agent only contains a subset of features that don't "
            "require additional dependencies installed when ray is installed "
            "by `pip install ray[default]`."
        ),
    )
    parser.add_argument(
        "--disable-metrics-collection",
        action="store_true",
        help=("If this arg is set, metrics report won't be enabled from the agent."),
    )
    # The option is required, so argparse never falls back to the default.
    parser.add_argument(
        "--agent-id",
        required=True,
        type=int,
        help="ID to report when registering with raylet",
        default=os.getpid(),
    )

    args = parser.parse_args()
    try:
        # The same parameters configure this process's logger and are handed
        # to the agent as ``logging_params``.
        logging_params = dict(
            logging_level=args.logging_level,
            logging_format=args.logging_format,
            log_dir=args.log_dir,
            filename=args.logging_filename,
            max_bytes=args.logging_rotate_bytes,
            backup_count=args.logging_rotate_backup_count,
        )
        setup_component_logger(**logging_params)

        agent = DashboardAgent(
            args.node_ip_address,
            args.dashboard_agent_port,
            args.gcs_address,
            args.minimal,
            temp_dir=args.temp_dir,
            session_dir=args.session_dir,
            runtime_env_dir=args.runtime_env_dir,
            log_dir=args.log_dir,
            metrics_export_port=args.metrics_export_port,
            node_manager_port=args.node_manager_port,
            listen_port=args.listen_port,
            object_store_name=args.object_store_name,
            raylet_name=args.raylet_name,
            logging_params=logging_params,
            disable_metrics_collection=args.disable_metrics_collection,
            agent_id=args.agent_id,
        )

        loop = asyncio.get_event_loop()
        loop.run_until_complete(agent.run())
    except Exception:
        logger.exception("Agent is working abnormally. It will exit immediately.")
        # sys.exit rather than the builtin ``exit``: the latter is injected by
        # the ``site`` module and is missing when Python runs without it.
        sys.exit(1)
|