ray/dashboard/modules/reporter/reporter_head.py
Ameer Haj Ali b7dd7ddb52
deprecate useless fields in the cluster yaml. (#13637)
* prepare for head node

* move command runner interface outside _private

* remove space

* Eric

* flake

* min_workers in multi node type

* fixing edge cases

* eric not idle

* fix target_workers to consider min_workers of node types

* idle timeout

* minor

* minor fix

* test

* lint

* eric v2

* eric 3

* min_workers constraint before bin packing

* Update resource_demand_scheduler.py

* Revert "Update resource_demand_scheduler.py"

This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5.

* reducing diff

* make get_nodes_to_launch return a dict

* merge

* weird merge fix

* auto fill instance types for AWS

* Alex/Eric

* Update doc/source/cluster/autoscaling.rst

* merge autofill and input from user

* logger.exception

* make the yaml use the default autofill

* docs Eric

* remove test_autoscaler_yaml from windows tests

* lets try changing the test a bit

* return test

* lets see

* edward

* Limit max launch concurrency

* commenting frac TODO

* move to resource demand scheduler

* use STATUS UP TO DATE

* Eric

* make logger of gc freed refs debug instead of info

* add cluster name to docker mount prefix directory

* grrR

* fix tests

* moving docker directory to sdk

* move the import to prevent circular dependency

* small fix

* ian

* fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running

* small fix

* deflake test_joblib

* lint

* placement groups bypass

* remove space

* Eric

* first commit

* lint

* example

* documentation

* hmm

* file path fix

* fix test

* some format issue in docs

* modified docs

* joblib strikes again on windows

* add ability to not start autoscaler/monitor

* a

* remove worker_default

* Remove default pod type from operator

* Remove worker_default_node_type from rewrite_legacy_yaml_to_availble_node_types

* deprecate useless fields

Co-authored-by: Ameer Haj Ali <ameerhajali@ameers-mbp.lan>
Co-authored-by: Alex Wu <alex@anyscale.io>
Co-authored-by: Alex Wu <itswu.alex@gmail.com>
Co-authored-by: Eric Liang <ekhliang@gmail.com>
Co-authored-by: Ameer Haj Ali <ameerhajali@Ameers-MacBook-Pro.local>
Co-authored-by: root <root@ip-172-31-56-188.us-west-2.compute.internal>
Co-authored-by: Dmitri Gekhtman <dmitri.m.gekhtman@gmail.com>
2021-01-23 12:06:51 -08:00

151 lines
5.9 KiB
Python

import json
import logging
import yaml
import os
import aiohttp.web
from aioredis.pubsub import Receiver
from grpc.experimental import aio as aiogrpc
import ray
import ray.gcs_utils
import ray.new_dashboard.modules.reporter.reporter_consts as reporter_consts
import ray.new_dashboard.utils as dashboard_utils
import ray._private.services
import ray.utils
from ray.autoscaler._private.util import (DEBUG_AUTOSCALING_STATUS,
DEBUG_AUTOSCALING_STATUS_LEGACY,
DEBUG_AUTOSCALING_ERROR)
from ray.core.generated import reporter_pb2
from ray.core.generated import reporter_pb2_grpc
from ray.new_dashboard.datacenter import DataSource
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Route table used below as ``@routes.get(...)`` to declare the HTTP GET
# endpoints served by ``ReportHead``.
routes = dashboard_utils.ClassMethodRouteTable
class ReportHead(dashboard_utils.DashboardHeadModule):
    """Dashboard head module backing the reporter endpoints.

    Keeps one gRPC ``ReporterServiceStub`` per agent IP, serves the
    ``/api/launch_profiling``, ``/api/ray_config`` and
    ``/api/cluster_status`` routes, and copies the stats received on the
    reporter pub/sub channels into ``DataSource.node_physical_stats``.
    """

    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
        # Reporter gRPC stubs, keyed by agent IP address.
        self._stubs = {}
        # Cached payload for /api/ray_config; stays None until the first
        # successful read of the bootstrap config.
        self._ray_config = None
        DataSource.agents.signal.append(self._update_stubs)

    async def _update_stubs(self, change):
        """Keep ``self._stubs`` in sync with a change in ``DataSource.agents``.

        Args:
            change: Object with ``old`` and ``new`` attributes; each is
                either falsy or a ``(node_id, ports)`` pair.
        """
        if change.old:
            node_id, _ = change.old
            ip = DataSource.node_id_to_ip[node_id]
            # Stubs are keyed by IP, so the entry may already be gone;
            # tolerate that instead of raising KeyError.
            self._stubs.pop(ip, None)
        if change.new:
            node_id, ports = change.new
            ip = DataSource.node_id_to_ip[node_id]
            options = (("grpc.enable_http_proxy", 0), )
            # ports[1] is the port the reporter stub connects to.
            channel = aiogrpc.insecure_channel(
                f"{ip}:{ports[1]}", options=options)
            stub = reporter_pb2_grpc.ReporterServiceStub(channel)
            self._stubs[ip] = stub

    @routes.get("/api/launch_profiling")
    async def launch_profiling(self, req) -> aiohttp.web.Response:
        """Ask the reporter agent at ``ip`` to profile process ``pid``.

        Reads the ``ip``, ``pid`` and ``duration`` query parameters and
        returns the parsed ``profiling_stats`` JSON when the reply has it,
        otherwise the reply's ``std_out``.

        Raises:
            KeyError: If a query parameter is missing or no stub is
                registered for ``ip``.
            ValueError: If ``pid`` or ``duration`` is not an integer.
        """
        ip = req.query["ip"]
        pid = int(req.query["pid"])
        duration = int(req.query["duration"])
        reporter_stub = self._stubs[ip]
        reply = await reporter_stub.GetProfilingStats(
            reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration))
        profiling_info = (json.loads(reply.profiling_stats)
                          if reply.profiling_stats else reply.std_out)
        return dashboard_utils.rest_response(
            success=True,
            message="Profiling success.",
            profiling_info=profiling_info)

    @routes.get("/api/ray_config")
    async def get_ray_config(self, req) -> aiohttp.web.Response:
        """Return selected fields of ``~/ray_bootstrap_config.yaml``.

        The payload holds ``min_workers``, ``max_workers``, ``head_type``
        and ``worker_type``; the two type fields fall back to ``"unknown"``
        when the instance type cannot be read. A successfully built payload
        is cached on the instance and reused by later requests.

        Raises:
            KeyError: If the config lacks ``min_workers`` or
                ``max_workers``.
        """
        if self._ray_config is None:
            config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
            invalid_message = "Invalid config, could not load YAML."
            try:
                with open(config_path) as f:
                    cfg = yaml.safe_load(f)
            except FileNotFoundError:
                # Missing file -> "not found" (the two messages used to be
                # attached to the wrong exception types).
                return dashboard_utils.rest_response(
                    success=False,
                    message=f"No config found at {config_path}.",
                )
            except yaml.YAMLError:
                return dashboard_utils.rest_response(
                    success=False, message=invalid_message)
            # safe_load yields None for an empty document and may yield a
            # scalar or list; only a mapping can be queried below.
            if not isinstance(cfg, dict):
                return dashboard_utils.rest_response(
                    success=False, message=invalid_message)

            payload = {
                "min_workers": cfg["min_workers"],
                "max_workers": cfg["max_workers"]
            }
            sections = (("head_type", "head_node"),
                        ("worker_type", "worker_nodes"))
            for field, section in sections:
                try:
                    payload[field] = cfg[section]["InstanceType"]
                except (KeyError, TypeError):
                    # Section or key absent, or the section is null.
                    payload[field] = "unknown"
            self._ray_config = payload

        return dashboard_utils.rest_response(
            success=True,
            message="Fetched ray config.",
            **self._ray_config,
        )

    @routes.get("/api/cluster_status")
    async def get_cluster_status(self, req):
        """Returns status information about the cluster.

        Currently contains three fields:
            autoscaling_status (str): a status message from the autoscaler
                (legacy format).
            autoscaling_error (str): an error message from the autoscaler if
                anything has gone wrong during autoscaling.
            cluster_status (dict): the JSON-decoded formatted status, or
                None when it is missing or empty.

        These fields are read from the GCS, it's expected that the
        autoscaler writes them there.
        """
        aioredis_client = self._dashboard_head.aioredis_client
        legacy_status = await aioredis_client.hget(
            DEBUG_AUTOSCALING_STATUS_LEGACY, "value")
        formatted_status_string = await aioredis_client.hget(
            DEBUG_AUTOSCALING_STATUS, "value")
        formatted_status = (json.loads(formatted_status_string.decode())
                            if formatted_status_string else {})
        error = await aioredis_client.hget(DEBUG_AUTOSCALING_ERROR, "value")
        return dashboard_utils.rest_response(
            success=True,
            message="Got cluster status.",
            autoscaling_status=legacy_status.decode()
            if legacy_status else None,
            autoscaling_error=error.decode() if error else None,
            cluster_status=formatted_status if formatted_status else None,
        )

    async def run(self, server):
        """Subscribe to the reporter channels and store incoming stats.

        Every message received on a channel matching
        ``REPORTER_PREFIX*`` is JSON-decoded and stored in
        ``DataSource.node_physical_stats`` under the node id taken from the
        channel key. A failure on one message is logged and does not stop
        the loop.
        """
        aioredis_client = self._dashboard_head.aioredis_client
        receiver = Receiver()

        reporter_key = f"{reporter_consts.REPORTER_PREFIX}*"
        await aioredis_client.psubscribe(receiver.pattern(reporter_key))
        logger.info(f"Subscribed to {reporter_key}")

        async for sender, msg in receiver.iter():
            try:
                # The key is b'RAY_REPORTER:{node id hex}',
                # e.g. b'RAY_REPORTER:2b4fbd406898cc86fb88fb0acfd5456b0afd87cf'
                key, data = msg
                data = json.loads(ray.utils.decode(data))
                key = key.decode("utf-8")
                node_id = key.split(":")[-1]
                DataSource.node_physical_stats[node_id] = data
            except Exception:
                logger.exception(
                    "Error receiving node physical stats from reporter agent.")