ray/dashboard/modules/job/job_head.py

import logging

import asyncio
import aiohttp.web
from aioredis.pubsub import Receiver

import ray
import ray.gcs_utils
import ray.new_dashboard.modules.job.job_consts as job_consts
import ray.new_dashboard.utils as dashboard_utils
from ray.core.generated import gcs_service_pb2
from ray.core.generated import gcs_service_pb2_grpc
from ray.new_dashboard.datacenter import (
    DataSource,
    GlobalSignals,
)

logger = logging.getLogger(__name__)
routes = dashboard_utils.ClassMethodRouteTable

def job_table_data_to_dict(message):
    decode_keys = {"jobId", "rayletId"}
    return dashboard_utils.message_to_dict(
        message, decode_keys, including_default_value_fields=True)

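# Dashboard head module that serves the job REST endpoints and keeps
# DataSource.jobs in sync with the GCS job table.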
class JobHead(dashboard_utils.DashboardHeadModule):
    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
        # JobInfoGcsServiceStub
        self._gcs_job_info_stub = None

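    # GET /jobs?view=summary returns a summary list of all known jobs.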
@routes.get("/jobs")
@dashboard_utils.aiohttp_cache
async def get_all_jobs(self, req) -> aiohttp.web.Response:
view = req.query.get("view")
if view == "summary":
return dashboard_utils.rest_response(
success=True,
message="All job summary fetched.",
summary=list(DataSource.jobs.values()))
else:
return dashboard_utils.rest_response(
success=False, message="Unknown view {}".format(view))
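    # GET /jobs/{job_id} returns the job info together with the job's
    # actors and workers.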
@routes.get("/jobs/{job_id}")
@dashboard_utils.aiohttp_cache
async def get_job(self, req) -> aiohttp.web.Response:
job_id = req.match_info.get("job_id")
view = req.query.get("view")
if view is None:
job_detail = {
"jobInfo": DataSource.jobs.get(job_id, {}),
"jobActors": DataSource.job_actors.get(job_id, {}),
"jobWorkers": DataSource.job_workers.get(job_id, []),
}
await GlobalSignals.job_info_fetched.send(job_detail)
return dashboard_utils.rest_response(
success=True, message="Job detail fetched.", detail=job_detail)
else:
return dashboard_utils.rest_response(
success=False, message="Unknown view {}".format(view))
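    # Fetch the full job table from GCS once, then keep DataSource.jobs
    # updated from the Redis pub/sub job channel.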
    async def _update_jobs(self):
        # Subscribe job channel.
        aioredis_client = self._dashboard_head.aioredis_client
        receiver = Receiver()
        key = f"{job_consts.JOB_CHANNEL}:*"
        pattern = receiver.pattern(key)
        await aioredis_client.psubscribe(pattern)
        logger.info("Subscribed to %s", key)

        # Get all job info.
        while True:
            try:
                logger.info("Getting all job info from GCS.")
                request = gcs_service_pb2.GetAllJobInfoRequest()
                reply = await self._gcs_job_info_stub.GetAllJobInfo(
                    request, timeout=5)
                if reply.status.code == 0:
                    jobs = {}
                    for job_table_data in reply.job_info_list:
                        data = job_table_data_to_dict(job_table_data)
                        jobs[data["jobId"]] = data
                    # Update jobs.
                    DataSource.jobs.reset(jobs)
                    logger.info("Received %d job info from GCS.", len(jobs))
                    break
                else:
                    raise Exception(
                        f"Failed to GetAllJobInfo: {reply.status.message}")
            except Exception:
                logger.exception("Error getting all job info from GCS.")
                await asyncio.sleep(
                    job_consts.RETRY_GET_ALL_JOB_INFO_INTERVAL_SECONDS)

        # Receive jobs from channel.
        async for sender, msg in receiver.iter():
            try:
                _, data = msg
                pubsub_message = ray.gcs_utils.PubSubMessage.FromString(data)
                message = ray.gcs_utils.JobTableData.FromString(
                    pubsub_message.data)
                job_table_data = job_table_data_to_dict(message)
                job_id = job_table_data["jobId"]
                # Update jobs.
                DataSource.jobs[job_id] = job_table_data
            except Exception:
                logger.exception("Error receiving job info.")

    async def run(self, server):
        self._gcs_job_info_stub = gcs_service_pb2_grpc.JobInfoGcsServiceStub(
            self._dashboard_head.aiogrpc_gcs_channel)
        await asyncio.gather(self._update_jobs())
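
# A minimal client sketch (not part of this module) for exercising the
# endpoints above, assuming the dashboard is reachable at the default
# http://localhost:8265 address:
#
#     import asyncio
#     import aiohttp
#
#     async def fetch_job_summary():
#         async with aiohttp.ClientSession() as session:
#             async with session.get(
#                     "http://localhost:8265/jobs?view=summary") as resp:
#                 return await resp.json()
#
#     print(asyncio.run(fetch_job_summary()))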