ray/dashboard/modules/actor/actor_head.py
Alan Guo 50b20809b8
[Dashboard] Stop caching logs in memory. Use state observability api to fetch on demand. (#26818)
Signed-off-by: Alan Guo <aguo@anyscale.com>

## Why are these changes needed?
Reduces the memory footprint of the dashboard by no longer caching logs in memory; logs are now fetched on demand through the state observability API.
Also adds some cleanup to the errors data.

Also cleans up the actor cache by evicting dead actors once the cache grows past a configurable limit (`RAY_DASHBOARD_MAX_ACTORS_TO_CACHE`).

The dashboard UI no longer lets you see logs for all workers on a node at once; you must click into each worker's logs individually. A sketch of the on-demand log fetching follows the screenshot.
<img width="1739" alt="Screen Shot 2022-07-20 at 9 13 00 PM" src="https://user-images.githubusercontent.com/711935/180128633-1633c187-39c9-493e-b694-009fbb27f73b.png">
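
As a rough illustration of the new on-demand path, here is a minimal sketch that uses the state observability API's `list_logs` / `get_log` helpers. Treat the import path as an assumption: these helpers have moved between modules across Ray releases (recent versions expose them under `ray.util.state`).

```python
# Minimal sketch: fetch logs on demand through the state observability API
# instead of reading them from a dashboard-side cache.
# Assumption: `list_logs` / `get_log` are importable from `ray.util.state`.
import ray
from ray.util.state import get_log, list_logs

ray.init()
node_id = ray.get_runtime_context().get_node_id()

# List the log files the state API knows about on this node,
# grouped by component (raylet, gcs_server, worker outputs, ...).
for component, files in list_logs(node_id=node_id).items():
    print(component, files)

# Stream the last 20 lines of one file; nothing is cached between requests.
for line in get_log(node_id=node_id, filename="raylet.out", tail=20):
    print(line)
```

This mirrors what the dashboard now does server-side: logs are read at request time rather than held in memory.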


## Related issue number
fixes #23680 
fixes #22027
fixes #24272
2022-07-26 03:10:57 -07:00

import asyncio
from collections import deque
import logging
import os

import aiohttp.web

import ray._private.ray_constants as ray_constants
import ray._private.utils
import ray.dashboard.optional_utils as dashboard_optional_utils
import ray.dashboard.utils as dashboard_utils
from ray._private.gcs_pubsub import GcsAioActorSubscriber
from ray.core.generated import (
    core_worker_pb2,
    core_worker_pb2_grpc,
    gcs_service_pb2,
    gcs_service_pb2_grpc,
    node_manager_pb2_grpc,
)
from ray.dashboard.datacenter import DataOrganizer, DataSource
from ray.dashboard.modules.actor import actor_consts, actor_utils
from ray.dashboard.modules.actor.actor_utils import actor_classname_from_func_descriptor
from ray.dashboard.optional_utils import rest_response

try:
    from grpc import aio as aiogrpc
except ImportError:
    from grpc.experimental import aio as aiogrpc

logger = logging.getLogger(__name__)
routes = dashboard_optional_utils.ClassMethodRouteTable

MAX_ACTORS_TO_CACHE = int(os.environ.get("RAY_DASHBOARD_MAX_ACTORS_TO_CACHE", 1000))
ACTOR_CLEANUP_FREQUENCY = 10 # seconds
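
# Eviction policy for the in-memory actor cache: once DataSource.actors grows
# past MAX_ACTORS_TO_CACHE entries, ActorHead._cleanup_actors (defined below)
# evicts dead actors, oldest deaths first, checking every
# ACTOR_CLEANUP_FREQUENCY seconds.
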
def actor_table_data_to_dict(message):
    orig_message = dashboard_utils.message_to_dict(
        message,
        {
            "actorId",
            "parentId",
            "jobId",
            "workerId",
            "rayletId",
            "actorCreationDummyObjectId",
            "callerId",
            "taskId",
            "parentTaskId",
            "sourceActorId",
            "placementGroupId",
        },
        including_default_value_fields=True,
    )
    # The complete schema for the actor table is in src/ray/protobuf/gcs.proto.
    # It is very large, and the dashboard does not need most of it. Only the
    # necessary fields are preserved here to keep memory usage down.
    fields = {
        "actorId",
        "jobId",
        "pid",
        "address",
        "state",
        "name",
        "numRestarts",
        "functionDescriptor",
        "timestamp",
        "numExecutedTasks",
    }
    light_message = {k: v for (k, v) in orig_message.items() if k in fields}
    if "functionDescriptor" in light_message:
        actor_class = actor_classname_from_func_descriptor(
            light_message["functionDescriptor"]
        )
        light_message["actorClass"] = actor_class
    return light_message
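
# Illustrative shape of the trimmed record returned above (hypothetical values):
#   {
#       "actorId": "31405554844820381c2f0f8501000000",
#       "jobId": "01000000",
#       "pid": 43521,
#       "address": {"rayletId": "...", "ipAddress": "...", "port": 10003},
#       "state": "ALIVE",
#       "name": "",
#       "numRestarts": "0",
#       "functionDescriptor": {...},
#       "timestamp": 1658895180000.0,
#       "numExecutedTasks": "1",
#       "actorClass": "Counter",
#   }
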
class ActorHead(dashboard_utils.DashboardHeadModule):
    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
        self._stubs = {}
        # ActorInfoGcsService
        self._gcs_actor_info_stub = None
        # A queue of dead actors in order of when they died
        self.dead_actors_queue = deque()
        DataSource.nodes.signal.append(self._update_stubs)

    async def _update_stubs(self, change):
        if change.old:
            node_id, node_info = change.old
            self._stubs.pop(node_id)
        if change.new:
            # TODO(fyrestone): Handle exceptions.
            node_id, node_info = change.new
            address = "{}:{}".format(
                node_info["nodeManagerAddress"], int(node_info["nodeManagerPort"])
            )
            options = ray_constants.GLOBAL_GRPC_OPTIONS
            channel = ray._private.utils.init_grpc_channel(
                address, options, asynchronous=True
            )
            stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
            self._stubs[node_id] = stub
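
    # Two long-running coroutines keep the actor cache in sync:
    #   1. _update_actors: bulk-loads the actor table from the GCS, then
    #      applies incremental updates from the GCS actor pubsub channel.
    #   2. _cleanup_actors: periodically evicts dead actors once the cache
    #      exceeds MAX_ACTORS_TO_CACHE.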

    async def _update_actors(self):
        # First, bulk-fetch all actor info from the GCS, retrying until it
        # succeeds.
        while True:
            try:
                logger.info("Getting all actor info from GCS.")
                request = gcs_service_pb2.GetAllActorInfoRequest()
                reply = await self._gcs_actor_info_stub.GetAllActorInfo(
                    request, timeout=5
                )
                if reply.status.code == 0:
                    actors = {}
                    for message in reply.actor_table_data:
                        actor_table_data = actor_table_data_to_dict(message)
                        actors[actor_table_data["actorId"]] = actor_table_data
                    # Update actors.
                    DataSource.actors.reset(actors)
                    # Update node actors and job actors.
                    job_actors = {}
                    node_actors = {}
                    for actor_id, actor_table_data in actors.items():
                        job_id = actor_table_data["jobId"]
                        node_id = actor_table_data["address"]["rayletId"]
                        job_actors.setdefault(job_id, {})[actor_id] = actor_table_data
                        # Update only when node_id is not Nil.
                        if node_id != actor_consts.NIL_NODE_ID:
                            node_actors.setdefault(node_id, {})[
                                actor_id
                            ] = actor_table_data
                    DataSource.job_actors.reset(job_actors)
                    DataSource.node_actors.reset(node_actors)
                    logger.info("Received %d actor info from GCS.", len(actors))
                    break
                else:
                    raise Exception(
                        f"Failed to GetAllActorInfo: {reply.status.message}"
                    )
            except Exception:
                logger.exception("Error getting all actor info from GCS.")
                await asyncio.sleep(
                    actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS
                )

        state_keys = ("state", "address", "numRestarts", "timestamp", "pid")

        def process_actor_data_from_pubsub(actor_id, actor_table_data):
            actor_table_data = actor_table_data_to_dict(actor_table_data)
            # If the actor is an update rather than a new registration, only
            # overwrite the state-related fields.
            if actor_table_data["state"] != "DEPENDENCIES_UNREADY":
                actor_table_data_copy = dict(DataSource.actors[actor_id])
                for k in state_keys:
                    actor_table_data_copy[k] = actor_table_data[k]
                actor_table_data = actor_table_data_copy
            actor_id = actor_table_data["actorId"]
            job_id = actor_table_data["jobId"]
            node_id = actor_table_data["address"]["rayletId"]
            if actor_table_data["state"] == "DEAD":
                self.dead_actors_queue.append(actor_id)
            # Update actors.
            DataSource.actors[actor_id] = actor_table_data
            # Update node actors (only when node_id is not Nil).
            if node_id != actor_consts.NIL_NODE_ID:
                node_actors = dict(DataSource.node_actors.get(node_id, {}))
                node_actors[actor_id] = actor_table_data
                DataSource.node_actors[node_id] = node_actors
            # Update job actors.
            job_actors = dict(DataSource.job_actors.get(job_id, {}))
            job_actors[actor_id] = actor_table_data
            DataSource.job_actors[job_id] = job_actors

        # Receive actor updates from the GCS pubsub channel.
        gcs_addr = self._dashboard_head.gcs_address
        subscriber = GcsAioActorSubscriber(address=gcs_addr)
        await subscriber.subscribe()

        while True:
            try:
                actor_id, actor_table_data = await subscriber.poll()
                if actor_id is not None:
                    # Convert to a lower-case hex ID.
                    actor_id = actor_id.hex()
                    process_actor_data_from_pubsub(actor_id, actor_table_data)
            except Exception:
                logger.exception("Error processing actor info from GCS.")

    async def _cleanup_actors(self):
        while True:
            try:
                if len(DataSource.actors) > MAX_ACTORS_TO_CACHE:
                    logger.debug("Cleaning up dead actors from GCS")
                    while len(DataSource.actors) > MAX_ACTORS_TO_CACHE:
                        if not self.dead_actors_queue:
                            logger.warning(
                                f"More than {MAX_ACTORS_TO_CACHE} "
                                "live actors are cached"
                            )
                            break
                        actor_id = self.dead_actors_queue.popleft()
                        if actor_id in DataSource.actors:
                            actor = DataSource.actors.pop(actor_id)
                            job_id = actor["jobId"]
                            del DataSource.job_actors[job_id][actor_id]
                            node_id = actor["address"].get("rayletId")
                            if node_id:
                                del DataSource.node_actors[node_id][actor_id]
                await asyncio.sleep(ACTOR_CLEANUP_FREQUENCY)
            except Exception:
                logger.exception("Error cleaning up actor info from GCS.")
@routes.get("/logical/actor_groups")
async def get_actor_groups(self, req) -> aiohttp.web.Response:
actors = await DataOrganizer.get_all_actors()
actor_creation_tasks = await DataOrganizer.get_actor_creation_tasks()
# actor_creation_tasks have some common interface with actors,
# and they get processed and shown in tandem in the logical view
# hence we merge them together before constructing actor groups.
actors.update(actor_creation_tasks)
actor_groups = actor_utils.construct_actor_groups(actors)
return rest_response(
success=True, message="Fetched actor groups.", actor_groups=actor_groups
)
@routes.get("/logical/actors")
@dashboard_optional_utils.aiohttp_cache
async def get_all_actors(self, req) -> aiohttp.web.Response:
return rest_response(
success=True, message="All actors fetched.", actors=DataSource.actors
)
@routes.get("/logical/kill_actor")
async def kill_actor(self, req) -> aiohttp.web.Response:
try:
actor_id = req.query["actorId"]
ip_address = req.query["ipAddress"]
port = req.query["port"]
except KeyError:
return rest_response(success=False, message="Bad Request")
try:
options = ray_constants.GLOBAL_GRPC_OPTIONS
channel = ray._private.utils.init_grpc_channel(
f"{ip_address}:{port}", options=options, asynchronous=True
)
stub = core_worker_pb2_grpc.CoreWorkerServiceStub(channel)
await stub.KillActor(
core_worker_pb2.KillActorRequest(
intended_actor_id=ray._private.utils.hex_to_binary(actor_id)
)
)
except aiogrpc.AioRpcError:
# This always throws an exception because the worker
# is killed and the channel is closed on the worker side
# before this handler, however it deletes the actor correctly.
pass
return rest_response(success=True, message=f"Killed actor with id {actor_id}")

    async def run(self, server):
        gcs_channel = self._dashboard_head.aiogrpc_gcs_channel
        self._gcs_actor_info_stub = gcs_service_pb2_grpc.ActorInfoGcsServiceStub(
            gcs_channel
        )

        asyncio.get_event_loop().create_task(self._cleanup_actors())
        await asyncio.gather(self._update_actors())

    @staticmethod
    def is_minimal_module():
        return False