import json
import logging
import os

import aiohttp.web
import yaml
from aioredis.pubsub import Receiver
from grpc.experimental import aio as aiogrpc

import ray
import ray.gcs_utils
import ray.new_dashboard.modules.reporter.reporter_consts as reporter_consts
import ray.new_dashboard.utils as dashboard_utils
import ray._private.services
import ray.utils
from ray.autoscaler._private.util import (DEBUG_AUTOSCALING_STATUS,
                                          DEBUG_AUTOSCALING_STATUS_LEGACY,
                                          DEBUG_AUTOSCALING_ERROR)
from ray.core.generated import reporter_pb2
from ray.core.generated import reporter_pb2_grpc
from ray.new_dashboard.datacenter import DataSource

logger = logging.getLogger(__name__)
routes = dashboard_utils.ClassMethodRouteTable


class ReportHead(dashboard_utils.DashboardHeadModule):
    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
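        # Maps a node's IP address to the ReporterService gRPC stub used to
        # talk to the reporter agent on that node; kept in sync with
        # DataSource.agents via _update_stubs below.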
        self._stubs = {}
        self._ray_config = None
        DataSource.agents.signal.append(self._update_stubs)

    async def _update_stubs(self, change):
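        # `change.old` / `change.new` carry (node_id, ports) pairs describing
        # an agent that went away or came up, respectively.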
        if change.old:
            node_id, port = change.old
            ip = DataSource.node_id_to_ip[node_id]
            self._stubs.pop(ip)
        if change.new:
            node_id, ports = change.new
            ip = DataSource.node_id_to_ip[node_id]
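            # Disable gRPC's HTTP proxy support so the channel dials the
            # agent directly, even when a proxy is configured in the env.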
            options = (("grpc.enable_http_proxy", 0), )
            channel = aiogrpc.insecure_channel(
                f"{ip}:{ports[1]}", options=options)
            stub = reporter_pb2_grpc.ReporterServiceStub(channel)
            self._stubs[ip] = stub
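
    # A hypothetical example request (values are illustrative only):
    #   GET /api/launch_profiling?ip=10.0.0.3&pid=1234&duration=5
    # The reply carries profiling stats as JSON when available, and the
    # profiler's raw stdout otherwise.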
    @routes.get("/api/launch_profiling")
    async def launch_profiling(self, req) -> aiohttp.web.Response:
        ip = req.query["ip"]
        pid = int(req.query["pid"])
        duration = int(req.query["duration"])
        reporter_stub = self._stubs[ip]
        reply = await reporter_stub.GetProfilingStats(
            reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration))
        profiling_info = (json.loads(reply.profiling_stats)
                          if reply.profiling_stats else reply.std_out)
        return dashboard_utils.rest_response(
            success=True,
            message="Profiling success.",
            profiling_info=profiling_info)
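
    # The bootstrap config read below is the YAML that `ray up` places on the
    # head node. A response might look like (hypothetical values):
    #   {"min_workers": 0, "max_workers": 10,
    #    "head_type": "m4.xlarge", "worker_type": "m4.xlarge"}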
    @routes.get("/api/ray_config")
    async def get_ray_config(self, req) -> aiohttp.web.Response:
        if self._ray_config is None:
            config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
            try:
                with open(config_path) as f:
                    cfg = yaml.safe_load(f)
            except FileNotFoundError:
                return dashboard_utils.rest_response(
                    success=False,
                    message=f"No config found at {config_path}.",
                )
            except yaml.YAMLError:
                return dashboard_utils.rest_response(
                    success=False,
                    message="Invalid config, could not load YAML.")

            payload = {
                "min_workers": cfg["min_workers"],
                "max_workers": cfg["max_workers"]
            }
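
            # "InstanceType" is an AWS-specific node-config field; configs
            # for other providers fall back to "unknown" here.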
            try:
                payload["head_type"] = cfg["head_node"]["InstanceType"]
            except KeyError:
                payload["head_type"] = "unknown"

            try:
                payload["worker_type"] = cfg["worker_nodes"]["InstanceType"]
            except KeyError:
                payload["worker_type"] = "unknown"

            self._ray_config = payload

        return dashboard_utils.rest_response(
            success=True,
            message="Fetched ray config.",
            **self._ray_config,
        )
@routes.get("/api/cluster_status")
|
|
async def get_cluster_status(self, req):
|
|
"""Returns status information about the cluster.
|
|
|
|
Currently contains two fields:
|
|
autoscaling_status (str): a status message from the autoscaler.
|
|
autoscaling_error (str): an error message from the autoscaler if
|
|
anything has gone wrong during autoscaling.
|
|
|
|
These fields are both read from the GCS, it's expected that the
|
|
autoscaler writes them there.
|
|
"""
        aioredis_client = self._dashboard_head.aioredis_client
        legacy_status = await aioredis_client.hget(
            DEBUG_AUTOSCALING_STATUS_LEGACY, "value")
        formatted_status_string = await aioredis_client.hget(
            DEBUG_AUTOSCALING_STATUS, "value")
        formatted_status = (json.loads(formatted_status_string.decode())
                            if formatted_status_string else {})
        error = await aioredis_client.hget(DEBUG_AUTOSCALING_ERROR, "value")
        return dashboard_utils.rest_response(
            success=True,
            message="Got cluster status.",
            autoscaling_status=legacy_status.decode()
            if legacy_status else None,
            autoscaling_error=error.decode() if error else None,
            cluster_status=formatted_status if formatted_status else None,
        )

    async def run(self, server):
        aioredis_client = self._dashboard_head.aioredis_client
        receiver = Receiver()

        reporter_key = "{}*".format(reporter_consts.REPORTER_PREFIX)
        await aioredis_client.psubscribe(receiver.pattern(reporter_key))
        logger.info(f"Subscribed to {reporter_key}")
        async for sender, msg in receiver.iter():
            try:
                # The key is b'RAY_REPORTER:{node id hex}',
                # e.g. b'RAY_REPORTER:2b4fbd406898cc86fb88fb0acfd5456b0afd87cf'
                key, data = msg
                data = json.loads(ray.utils.decode(data))
                key = key.decode("utf-8")
                node_id = key.split(":")[-1]
                DataSource.node_physical_stats[node_id] = data
            except Exception:
                logger.exception(
                    "Error receiving node physical stats from reporter agent.")