ray/dashboard/modules/reporter/reporter_head.py

import json
import logging
import yaml
import os
import aiohttp.web
from aioredis.pubsub import Receiver
from grpc.experimental import aio as aiogrpc
import ray
import ray.gcs_utils
import ray.new_dashboard.modules.reporter.reporter_consts as reporter_consts
import ray.new_dashboard.utils as dashboard_utils
import ray._private.services
import ray._private.utils
from ray.autoscaler._private.util import (DEBUG_AUTOSCALING_STATUS,
                                          DEBUG_AUTOSCALING_STATUS_LEGACY,
                                          DEBUG_AUTOSCALING_ERROR)
from ray.core.generated import reporter_pb2
from ray.core.generated import reporter_pb2_grpc
from ray.new_dashboard.datacenter import DataSource

logger = logging.getLogger(__name__)
routes = dashboard_utils.ClassMethodRouteTable


class ReportHead(dashboard_utils.DashboardHeadModule):
    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
        self._stubs = {}
        self._ray_config = None
        DataSource.agents.signal.append(self._update_stubs)
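
    # Called whenever DataSource.agents changes: `change.old`/`change.new`
    # are assumed here to be (node_id, ports) tuples, and the gRPC stub
    # table below is keyed by the node's IP address.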
    async def _update_stubs(self, change):
        if change.old:
            node_id, ports = change.old
            ip = DataSource.node_id_to_ip[node_id]
            self._stubs.pop(ip)
        if change.new:
            node_id, ports = change.new
            ip = DataSource.node_id_to_ip[node_id]
            options = (("grpc.enable_http_proxy", 0), )
            channel = aiogrpc.insecure_channel(
                f"{ip}:{ports[1]}", options=options)
            stub = reporter_pb2_grpc.ReporterServiceStub(channel)
            self._stubs[ip] = stub
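
    # Illustrative request (the ip/pid/duration values are hypothetical):
    #   GET /api/launch_profiling?ip=10.0.0.3&pid=1234&duration=5
    # Asks the reporter agent on that node to profile the given worker
    # process for `duration` seconds and returns the stats as JSON.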
@routes.get("/api/launch_profiling")
async def launch_profiling(self, req) -> aiohttp.web.Response:
ip = req.query["ip"]
pid = int(req.query["pid"])
duration = int(req.query["duration"])
reporter_stub = self._stubs[ip]
reply = await reporter_stub.GetProfilingStats(
reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration))
profiling_info = (json.loads(reply.profiling_stats)
if reply.profiling_stats else reply.std_out)
return dashboard_utils.rest_response(
success=True,
message="Profiling success.",
profiling_info=profiling_info)
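
    # Illustrative request:
    #   GET /api/ray_config
    # Parses ~/ray_bootstrap_config.yaml once, caches a small summary
    # (min/max workers and head/worker instance types), and serves the
    # cached copy on subsequent calls.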
@routes.get("/api/ray_config")
async def get_ray_config(self, req) -> aiohttp.web.Response:
if self._ray_config is None:
try:
config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
with open(config_path) as f:
cfg = yaml.safe_load(f)
except yaml.YAMLError:
return dashboard_utils.rest_response(
success=False,
message=f"No config found at {config_path}.",
)
except FileNotFoundError:
return dashboard_utils.rest_response(
success=False,
message="Invalid config, could not load YAML.")
payload = {
"min_workers": cfg["min_workers"],
"max_workers": cfg["max_workers"]
}
try:
payload["head_type"] = cfg["head_node"]["InstanceType"]
except KeyError:
payload["head_type"] = "unknown"
try:
payload["worker_type"] = cfg["worker_nodes"]["InstanceType"]
except KeyError:
payload["worker_type"] = "unknown"
self._ray_config = payload
return dashboard_utils.rest_response(
success=True,
message="Fetched ray config.",
**self._ray_config,
)
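
    # Illustrative request:
    #   GET /api/cluster_status
    # Note: the DEBUG_AUTOSCALING_* keys are hash fields that the
    # autoscaler is expected to write (see the docstring below).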
@routes.get("/api/cluster_status")
async def get_cluster_status(self, req):
"""Returns status information about the cluster.
Currently contains two fields:
autoscaling_status (str): a status message from the autoscaler.
autoscaling_error (str): an error message from the autoscaler if
anything has gone wrong during autoscaling.
These fields are both read from the GCS, it's expected that the
autoscaler writes them there.
"""
aioredis_client = self._dashboard_head.aioredis_client
legacy_status = await aioredis_client.hget(
DEBUG_AUTOSCALING_STATUS_LEGACY, "value")
formatted_status_string = await aioredis_client.hget(
DEBUG_AUTOSCALING_STATUS, "value")
formatted_status = json.loads(formatted_status_string.decode()
) if formatted_status_string else {}
error = await aioredis_client.hget(DEBUG_AUTOSCALING_ERROR, "value")
return dashboard_utils.rest_response(
success=True,
message="Got cluster status.",
autoscaling_status=legacy_status.decode()
if legacy_status else None,
autoscaling_error=error.decode() if error else None,
cluster_status=formatted_status if formatted_status else None,
)
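
    # Module entry point: subscribes to the reporter agents' pubsub
    # channels ("RAY_REPORTER:*") and mirrors each node's physical stats
    # into DataSource.node_physical_stats, keyed by node id hex.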
    async def run(self, server):
        aioredis_client = self._dashboard_head.aioredis_client
        receiver = Receiver()
        reporter_key = "{}*".format(reporter_consts.REPORTER_PREFIX)
        await aioredis_client.psubscribe(receiver.pattern(reporter_key))
        logger.info(f"Subscribed to {reporter_key}")

        async for sender, msg in receiver.iter():
            try:
                # The key is b'RAY_REPORTER:{node id hex}',
                # e.g. b'RAY_REPORTER:2b4fbd406898cc86fb88fb0acfd5456b0afd87cf'
                key, data = msg
                data = json.loads(ray._private.utils.decode(data))
                key = key.decode("utf-8")
                node_id = key.split(":")[-1]
                DataSource.node_physical_stats[node_id] = data
            except Exception:
                logger.exception(
                    "Error receiving node physical stats from reporter agent.")