import json
import logging
import os

import aiohttp.web
import yaml

import ray
import ray.dashboard.utils as dashboard_utils
import ray.dashboard.optional_utils as dashboard_optional_utils
import ray.experimental.internal_kv as internal_kv
import ray._private.services
import ray._private.utils
from ray.ray_constants import (
    GLOBAL_GRPC_OPTIONS,
    DEBUG_AUTOSCALING_STATUS,
    DEBUG_AUTOSCALING_STATUS_LEGACY,
    DEBUG_AUTOSCALING_ERROR,
)
from ray.core.generated import reporter_pb2
from ray.core.generated import reporter_pb2_grpc
from ray._private.gcs_pubsub import GcsAioResourceUsageSubscriber
from ray._private.metrics_agent import PrometheusServiceDiscoveryWriter
from ray.dashboard.datacenter import DataSource

logger = logging.getLogger(__name__)

routes = dashboard_optional_utils.ClassMethodRouteTable


class ReportHead(dashboard_utils.DashboardHeadModule):
    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
        self._stubs = {}
        self._ray_config = None
        DataSource.agents.signal.append(self._update_stubs)
        # TODO(fyrestone): Avoid using ray.state in the dashboard; it is not
        # asynchronous and leads to poor performance, and ray disconnect()
        # can hang while ray.state is connected and the GCS has exited.
        # Please refer to: https://github.com/ray-project/ray/issues/16328
        assert dashboard_head.gcs_address or dashboard_head.redis_address
        gcs_address = dashboard_head.gcs_address
        temp_dir = dashboard_head.temp_dir
        self.service_discovery = PrometheusServiceDiscoveryWriter(
            gcs_address, temp_dir
        )

    async def _update_stubs(self, change):
        if change.old:
            node_id, port = change.old
            ip = DataSource.node_id_to_ip[node_id]
            self._stubs.pop(ip)
        if change.new:
            node_id, ports = change.new
            ip = DataSource.node_id_to_ip[node_id]
            options = GLOBAL_GRPC_OPTIONS
            channel = ray._private.utils.init_grpc_channel(
                f"{ip}:{ports[1]}", options=options, asynchronous=True
            )
            stub = reporter_pb2_grpc.ReporterServiceStub(channel)
            self._stubs[ip] = stub
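    # Shape of the `change` payload, as assumed from the unpacking above (not
    # verified against the DataSource.agents signal source): `change.old` and
    # `change.new` are (node_id, ports) tuples, and ports[1] is taken here to
    # be the reporter agent's gRPC port that the channel is opened against.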

    @routes.get("/api/launch_profiling")
    async def launch_profiling(self, req) -> aiohttp.web.Response:
        ip = req.query["ip"]
        pid = int(req.query["pid"])
        duration = int(req.query["duration"])
        reporter_stub = self._stubs[ip]
        reply = await reporter_stub.GetProfilingStats(
            reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration)
        )
        profiling_info = (
            json.loads(reply.profiling_stats)
            if reply.profiling_stats
            else reply.std_out
        )
        return dashboard_optional_utils.rest_response(
            success=True, message="Profiling success.", profiling_info=profiling_info
        )
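    # Illustrative request for the handler above (parameter values are made up;
    # `duration` is forwarded unchanged to the reporter agent):
    #   GET /api/launch_profiling?ip=10.0.0.3&pid=12345&duration=5
    # The reply is either the agent's parsed profiling stats or its raw stdout.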

    @routes.get("/api/ray_config")
    async def get_ray_config(self, req) -> aiohttp.web.Response:
        if self._ray_config is None:
            try:
                config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
                with open(config_path) as f:
                    cfg = yaml.safe_load(f)
            except FileNotFoundError:
                return dashboard_optional_utils.rest_response(
                    success=False,
                    message=f"No config found at {config_path}.",
                )
            except yaml.YAMLError:
                return dashboard_optional_utils.rest_response(
                    success=False, message="Invalid config, could not load YAML."
                )

            payload = {
                "min_workers": cfg.get("min_workers", "unspecified"),
                "max_workers": cfg.get("max_workers", "unspecified"),
            }

            try:
                payload["head_type"] = cfg["head_node"]["InstanceType"]
            except KeyError:
                payload["head_type"] = "unknown"

            try:
                payload["worker_type"] = cfg["worker_nodes"]["InstanceType"]
            except KeyError:
                payload["worker_type"] = "unknown"

            self._ray_config = payload

        return dashboard_optional_utils.rest_response(
            success=True,
            message="Fetched ray config.",
            **self._ray_config,
        )
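    # The payload cached above and merged into the response holds four keys read
    # from ~/ray_bootstrap_config.yaml; an illustrative value (instance types are
    # made-up examples):
    #   {"min_workers": 0, "max_workers": 10,
    #    "head_type": "m5.xlarge", "worker_type": "m5.xlarge"}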

    @routes.get("/api/cluster_status")
    async def get_cluster_status(self, req):
        """Returns status information about the cluster.

        Currently contains three fields:
            autoscaling_status (str): a status message from the autoscaler.
            autoscaling_error (str): an error message from the autoscaler if
                anything has gone wrong during autoscaling.
            cluster_status (dict): the parsed, structured autoscaler status.

        These fields are all read from the GCS; it is expected that the
        autoscaler writes them there.
        """
        assert ray.experimental.internal_kv._internal_kv_initialized()
        legacy_status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS_LEGACY)
        formatted_status_string = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
        formatted_status = (
            json.loads(formatted_status_string.decode())
            if formatted_status_string
            else {}
        )
        error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Got cluster status.",
            autoscaling_status=legacy_status.decode() if legacy_status else None,
            autoscaling_error=error.decode() if error else None,
            cluster_status=formatted_status if formatted_status else None,
        )

    async def run(self, server):
        # Need daemon True to avoid dashboard hangs at exit.
        self.service_discovery.daemon = True
        self.service_discovery.start()
        gcs_addr = self._dashboard_head.gcs_address
        subscriber = GcsAioResourceUsageSubscriber(gcs_addr)
        await subscriber.subscribe()

        while True:
            try:
                # The key is b'RAY_REPORTER:{node id hex}',
                # e.g. b'RAY_REPORTER:2b4fbd...'
                key, data = await subscriber.poll()
                if key is None:
                    continue
                data = json.loads(data)
                node_id = key.split(":")[-1]
                DataSource.node_physical_stats[node_id] = data
            except Exception:
                logger.exception(
                    "Error receiving node physical stats from reporter agent."
                )

    @staticmethod
    def is_minimal_module():
        return False