Mirror of https://github.com/vale981/ray, synced 2025-03-10 13:26:39 -04:00

Signed-off-by: Alan Guo <aguo@anyscale.com>

## Why are these changes needed?

Reduces the memory footprint of the dashboard. Also cleans up the errors data, and cleans up the actor cache by removing dead actors from it. The dashboard UI no longer lets you view logs for all workers on a node at once; you must click into each worker's logs individually.

<img width="1739" alt="Screen Shot 2022-07-20 at 9 13 00 PM" src="https://user-images.githubusercontent.com/711935/180128633-1633c187-39c9-493e-b694-009fbb27f73b.png">

## Related issue number

fixes #23680
fixes #22027
fixes #24272
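As an illustrative note (not part of the diff itself): the new cache bound is read from the `RAY_DASHBOARD_MAX_ACTORS_TO_CACHE` environment variable (default 1000, see `MAX_ACTORS_TO_CACHE` in the module below). A minimal sketch of raising it, assuming the variable is visible to the dashboard process when Ray starts:

```python
import os

# Assumed usage: set the variable before Ray starts so the dashboard process
# inherits it; the module below reads it once at import time.
os.environ["RAY_DASHBOARD_MAX_ACTORS_TO_CACHE"] = "5000"

import ray

ray.init()
```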
276 lines · 11 KiB · Python
import asyncio
import logging
import os
from collections import deque

import aiohttp.web

import ray._private.ray_constants as ray_constants
import ray._private.utils
import ray.dashboard.optional_utils as dashboard_optional_utils
import ray.dashboard.utils as dashboard_utils
from ray._private.gcs_pubsub import GcsAioActorSubscriber
from ray.core.generated import (
    core_worker_pb2,
    core_worker_pb2_grpc,
    gcs_service_pb2,
    gcs_service_pb2_grpc,
    node_manager_pb2_grpc,
)
from ray.dashboard.datacenter import DataOrganizer, DataSource
from ray.dashboard.modules.actor import actor_consts, actor_utils
from ray.dashboard.modules.actor.actor_utils import (
    actor_classname_from_func_descriptor,
)
from ray.dashboard.optional_utils import rest_response

try:
    from grpc import aio as aiogrpc
except ImportError:
    from grpc.experimental import aio as aiogrpc

logger = logging.getLogger(__name__)
routes = dashboard_optional_utils.ClassMethodRouteTable

MAX_ACTORS_TO_CACHE = int(os.environ.get("RAY_DASHBOARD_MAX_ACTORS_TO_CACHE", 1000))
ACTOR_CLEANUP_FREQUENCY = 10  # seconds


def actor_table_data_to_dict(message):
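    """Convert an ActorTableData protobuf message into a trimmed-down dict.

    Only the fields the dashboard actually needs are kept, which bounds the
    per-actor memory cost of the cache.
    """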
    orig_message = dashboard_utils.message_to_dict(
        message,
        {
            "actorId",
            "parentId",
            "jobId",
            "workerId",
            "rayletId",
            "actorCreationDummyObjectId",
            "callerId",
            "taskId",
            "parentTaskId",
            "sourceActorId",
            "placementGroupId",
        },
        including_default_value_fields=True,
    )
    # The complete schema for the actor table lives in
    # src/ray/protobuf/gcs.proto. It is very large, and the dashboard does
    # not need most of it, so only the necessary fields are preserved here
    # to keep memory usage low.
    fields = {
        "actorId",
        "jobId",
        "pid",
        "address",
        "state",
        "name",
        "numRestarts",
        "functionDescriptor",
        "timestamp",
        "numExecutedTasks",
    }
    light_message = {k: v for (k, v) in orig_message.items() if k in fields}
    if "functionDescriptor" in light_message:
        actor_class = actor_classname_from_func_descriptor(
            light_message["functionDescriptor"]
        )
        light_message["actorClass"] = actor_class
    return light_message


class ActorHead(dashboard_utils.DashboardHeadModule):
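    """Dashboard head module that keeps an in-memory view of actor state
    reported by the GCS and serves it over the dashboard's REST routes.
    """
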
    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
        self._stubs = {}
        # ActorInfoGcsService
        self._gcs_actor_info_stub = None
        # A queue of dead actors in order of when they died
        self.dead_actors_queue = deque()
        DataSource.nodes.signal.append(self._update_stubs)

    async def _update_stubs(self, change):
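        """Add or remove the per-node NodeManagerService stub when a node
        joins or leaves the cluster.
        """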
        if change.old:
            node_id, node_info = change.old
            self._stubs.pop(node_id)
        if change.new:
            # TODO(fyrestone): Handle exceptions.
            node_id, node_info = change.new
            address = "{}:{}".format(
                node_info["nodeManagerAddress"], int(node_info["nodeManagerPort"])
            )
            options = ray_constants.GLOBAL_GRPC_OPTIONS
            channel = ray._private.utils.init_grpc_channel(
                address, options, asynchronous=True
            )
            stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
            self._stubs[node_id] = stub

    async def _update_actors(self):
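        """Fetch all actor info from the GCS once, then keep the local cache
        up to date from the GCS actor pub/sub channel.
        """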
        # Get all actor info.
        while True:
            try:
                logger.info("Getting all actor info from GCS.")
                request = gcs_service_pb2.GetAllActorInfoRequest()
                reply = await self._gcs_actor_info_stub.GetAllActorInfo(
                    request, timeout=5
                )
                if reply.status.code == 0:
                    actors = {}
                    for message in reply.actor_table_data:
                        actor_table_data = actor_table_data_to_dict(message)
                        actors[actor_table_data["actorId"]] = actor_table_data
                    # Update actors.
                    DataSource.actors.reset(actors)
                    # Update node actors and job actors.
                    job_actors = {}
                    node_actors = {}
                    for actor_id, actor_table_data in actors.items():
                        job_id = actor_table_data["jobId"]
                        node_id = actor_table_data["address"]["rayletId"]
                        job_actors.setdefault(job_id, {})[actor_id] = actor_table_data
                        # Update only when node_id is not Nil.
                        if node_id != actor_consts.NIL_NODE_ID:
                            node_actors.setdefault(node_id, {})[
                                actor_id
                            ] = actor_table_data
                    DataSource.job_actors.reset(job_actors)
                    DataSource.node_actors.reset(node_actors)
                    logger.info("Received %d actor info from GCS.", len(actors))
                    break
                else:
                    raise Exception(
                        f"Failed to GetAllActorInfo: {reply.status.message}"
                    )
            except Exception:
                logger.exception("Error Getting all actor info from GCS.")
                await asyncio.sleep(
                    actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS
                )

        state_keys = ("state", "address", "numRestarts", "timestamp", "pid")

        def process_actor_data_from_pubsub(actor_id, actor_table_data):
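            """Merge a single pub/sub update into the cached actor tables."""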
            actor_table_data = actor_table_data_to_dict(actor_table_data)
            # If the actor is not newly registered but updated, only update
            # the state-related fields.
            if actor_table_data["state"] != "DEPENDENCIES_UNREADY":
                actor_table_data_copy = dict(DataSource.actors[actor_id])
                for k in state_keys:
                    actor_table_data_copy[k] = actor_table_data[k]
                actor_table_data = actor_table_data_copy
            actor_id = actor_table_data["actorId"]
            job_id = actor_table_data["jobId"]
            node_id = actor_table_data["address"]["rayletId"]
            if actor_table_data["state"] == "DEAD":
                self.dead_actors_queue.append(actor_id)
            # Update actors.
            DataSource.actors[actor_id] = actor_table_data
            # Update node actors (only when node_id is not Nil).
            if node_id != actor_consts.NIL_NODE_ID:
                node_actors = dict(DataSource.node_actors.get(node_id, {}))
                node_actors[actor_id] = actor_table_data
                DataSource.node_actors[node_id] = node_actors
            # Update job actors.
            job_actors = dict(DataSource.job_actors.get(job_id, {}))
            job_actors[actor_id] = actor_table_data
            DataSource.job_actors[job_id] = job_actors

        # Receive actors from channel.
        gcs_addr = self._dashboard_head.gcs_address
        subscriber = GcsAioActorSubscriber(address=gcs_addr)
        await subscriber.subscribe()

        while True:
            try:
                actor_id, actor_table_data = await subscriber.poll()
                if actor_id is not None:
                    # Convert to lower case hex ID.
                    actor_id = actor_id.hex()
                    process_actor_data_from_pubsub(actor_id, actor_table_data)
            except Exception:
                logger.exception("Error processing actor info from GCS.")

    async def _cleanup_actors(self):
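        """Periodically evict dead actors from the cache once it grows past
        MAX_ACTORS_TO_CACHE entries.
        """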
        while True:
            try:
                if len(DataSource.actors) > MAX_ACTORS_TO_CACHE:
                    logger.debug("Cleaning up dead actors from GCS")
                    while len(DataSource.actors) > MAX_ACTORS_TO_CACHE:
                        if not self.dead_actors_queue:
                            logger.warning(
                                f"More than {MAX_ACTORS_TO_CACHE} "
                                "live actors are cached"
                            )
                            break
                        actor_id = self.dead_actors_queue.popleft()
                        if actor_id in DataSource.actors:
                            actor = DataSource.actors.pop(actor_id)
                            job_id = actor["jobId"]
                            del DataSource.job_actors[job_id][actor_id]
                            node_id = actor["address"].get("rayletId")
                            if node_id:
                                del DataSource.node_actors[node_id][actor_id]
                await asyncio.sleep(ACTOR_CLEANUP_FREQUENCY)
            except Exception:
                logger.exception("Error cleaning up actor info from GCS.")

@routes.get("/logical/actor_groups")
|
|
async def get_actor_groups(self, req) -> aiohttp.web.Response:
|
|
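        """Return actors and pending actor-creation tasks grouped for the
        logical view.
        """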
        actors = await DataOrganizer.get_all_actors()
        actor_creation_tasks = await DataOrganizer.get_actor_creation_tasks()
        # actor_creation_tasks share a common interface with actors and are
        # processed and shown in tandem in the logical view, so merge them
        # before constructing the actor groups.
        actors.update(actor_creation_tasks)
        actor_groups = actor_utils.construct_actor_groups(actors)
        return rest_response(
            success=True, message="Fetched actor groups.", actor_groups=actor_groups
        )

@routes.get("/logical/actors")
|
|
@dashboard_optional_utils.aiohttp_cache
|
|
async def get_all_actors(self, req) -> aiohttp.web.Response:
|
|
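        """Return the raw cached actor table, keyed by actor ID."""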
        return rest_response(
            success=True, message="All actors fetched.", actors=DataSource.actors
        )

@routes.get("/logical/kill_actor")
|
|
async def kill_actor(self, req) -> aiohttp.web.Response:
|
|
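        """Kill an actor by sending a KillActor RPC directly to its core
        worker process.
        """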
        try:
            actor_id = req.query["actorId"]
            ip_address = req.query["ipAddress"]
            port = req.query["port"]
        except KeyError:
            return rest_response(success=False, message="Bad Request")
        try:
            options = ray_constants.GLOBAL_GRPC_OPTIONS
            channel = ray._private.utils.init_grpc_channel(
                f"{ip_address}:{port}", options=options, asynchronous=True
            )
            stub = core_worker_pb2_grpc.CoreWorkerServiceStub(channel)

            await stub.KillActor(
                core_worker_pb2.KillActorRequest(
                    intended_actor_id=ray._private.utils.hex_to_binary(actor_id)
                )
            )

        except aiogrpc.AioRpcError:
            # This always raises because the worker is killed and the channel
            # is closed on the worker side before the reply reaches this
            # handler; the actor is still deleted correctly.
            pass

        return rest_response(success=True, message=f"Killed actor with id {actor_id}")

    async def run(self, server):
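        """Start the periodic cleanup task and the actor update loop."""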
        gcs_channel = self._dashboard_head.aiogrpc_gcs_channel
        self._gcs_actor_info_stub = gcs_service_pb2_grpc.ActorInfoGcsServiceStub(
            gcs_channel
        )

        asyncio.get_event_loop().create_task(self._cleanup_actors())
        await asyncio.gather(self._update_actors())

    @staticmethod
    def is_minimal_module():
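        # Returning False marks this as a non-minimal module: it relies on the
        # full dashboard dependencies rather than the minimal install.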
        return False