diff --git a/python/ray/dashboard/client/src/api.ts b/python/ray/dashboard/client/src/api.ts index 7ed6b5ad6..fe9304496 100644 --- a/python/ray/dashboard/client/src/api.ts +++ b/python/ray/dashboard/client/src/api.ts @@ -133,9 +133,18 @@ export type NodeInfoResponse = { export const getNodeInfo = () => get("/api/node_info", {}); +export type ResourceSlot = { + slot: number; + allocation: number; +}; + +export type ResourceAllocations = { + resourceSlots: ResourceSlot[]; +}; + export type RayletCoreWorkerStats = { usedResources: { - [key: string]: number; + [key: string]: ResourceAllocations; }; }; @@ -168,7 +177,7 @@ export type RayletActorInfo = taskQueueLength: number; timestamp: number; usedObjectStoreMemory: number; - usedResources: { [key: string]: number }; + usedResources: { [key: string]: ResourceAllocations }; currentTaskDesc?: string; numPendingTasks?: number; webuiDisplay?: Record; diff --git a/python/ray/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx b/python/ray/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx index 3d21d4e5f..12e83d321 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/logical-view/Actor.tsx @@ -16,6 +16,7 @@ import { launchProfiling, RayletActorInfo, } from "../../../api"; +import { sum } from "../../../common/util"; import ActorDetailsPane from "./ActorDetailsPane"; import Actors from "./Actors"; @@ -137,7 +138,12 @@ class Actor extends React.Component, State> { Object.entries(actor.usedResources).length > 0 && Object.entries(actor.usedResources) .sort((a, b) => a[0].localeCompare(b[0])) - .map(([key, value]) => `${value.toLocaleString()} ${key}`) + .map( + ([key, value]) => + `${sum( + value.resourceSlots.map((slot) => slot.allocation), + )} ${key}`, + ) .join(", "), }, { diff --git a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GPU.tsx b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GPU.tsx index 2b8fd862c..9e62e2cee 100644 --- a/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GPU.tsx +++ b/python/ray/dashboard/client/src/pages/dashboard/node-info/features/GPU.tsx @@ -63,13 +63,22 @@ export const NodeGPU: NodeFeatureComponent = ({ node }) => { export const WorkerGPU: WorkerFeatureComponent = ({ rayletWorker }) => { const workerRes = rayletWorker?.coreWorkerStats.usedResources; - const workerUsedGPUResources = workerRes?.["GPU"] || NaN; - const message = isNaN(workerUsedGPUResources) ? ( - - N/A - - ) : ( - `${workerUsedGPUResources} GPUs in use` - ); + const workerUsedGPUResources = workerRes?.["GPU"]; + let message; + if (workerUsedGPUResources === undefined) { + message = ( + + N/A + + ); + } else { + const aggregateAllocation = sum( + workerUsedGPUResources.resourceSlots.map( + (resourceSlot) => resourceSlot.allocation, + ), + ); + const plural = aggregateAllocation === 1 ? "" : "s"; + message = {`${aggregateAllocation} GPU${plural} in use`}; + } return
{message}
; }; diff --git a/python/ray/tests/test_metrics.py b/python/ray/tests/test_metrics.py index 8b27373d4..d1bf8b31a 100644 --- a/python/ray/tests/test_metrics.py +++ b/python/ray/tests/test_metrics.py @@ -223,7 +223,13 @@ def test_raylet_info_endpoint(shutdown_only): raise Exception( "Timed out while waiting for dashboard to start.") - assert parent_actor_info["usedResources"]["CPU"] == 2 + def cpu_resources(actor_info): + cpu_resources = 0 + for slot in actor_info["usedResources"]["CPU"]["resourceSlots"]: + cpu_resources += slot["allocation"] + return cpu_resources + + assert cpu_resources(parent_actor_info) == 2 assert parent_actor_info["numExecutedTasks"] == 4 for _, child_actor_info in children.items(): if child_actor_info["state"] == -1: @@ -231,7 +237,7 @@ def test_raylet_info_endpoint(shutdown_only): else: assert child_actor_info["state"] == 1 assert len(child_actor_info["children"]) == 0 - assert child_actor_info["usedResources"]["CPU"] == 1 + assert cpu_resources(child_actor_info) == 1 profiling_id = requests.get( webui_url + "/api/launch_profiling", diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index b24a99340..e39aed379 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1602,6 +1602,9 @@ Status CoreWorker::ExecuteTask(const TaskSpecification &task_spec, { absl::MutexLock lock(&mutex_); current_task_ = TaskSpecification(); + if (task_spec.IsNormalTask()) { + resource_ids_.reset(new ResourceMappingType()); + } } RAY_LOG(DEBUG) << "Finished executing task " << task_spec.TaskId(); @@ -1962,11 +1965,13 @@ void CoreWorker::HandleGetCoreWorkerStats(const rpc::GetCoreWorkerStatsRequest & stats->set_actor_id(actor_id_.Binary()); auto used_resources_map = stats->mutable_used_resources(); for (auto const &it : *resource_ids_) { - double quantity = 0; + rpc::ResourceAllocations allocations; for (auto const &pair : it.second) { - quantity += pair.second; + auto resource_slot = allocations.add_resource_slots(); + resource_slot->set_slot(pair.first); + resource_slot->set_allocation(pair.second); } - (*used_resources_map)[it.first] = quantity; + (*used_resources_map)[it.first] = allocations; } stats->set_actor_title(actor_title_); google::protobuf::Map webui_map(webui_display_.begin(), diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index cef9c087c..d27def1b0 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -254,6 +254,17 @@ message ObjectRefInfo { bool pinned_in_memory = 7; } +// Details about the allocation of a given resource. Some resources +// (e.g., GPUs) have individually allocatable units that are represented +// as "slots" here. +message ResourceAllocations { + message ResourceSlot { + int64 slot = 1; + double allocation = 2; + } + repeated ResourceSlot resource_slots = 1; +} + // Debug info returned from the core worker. message CoreWorkerStats { // Debug string of the currently executing task. @@ -270,8 +281,8 @@ message CoreWorkerStats { int64 port = 7; // Actor ID. bytes actor_id = 8; - // A map from the resource name (e.g. "CPU") to the amount of resource used. - map used_resources = 9; + // A map from the resource name (e.g. "CPU") to its allocation. + map used_resources = 9; // A string displayed on Dashboard. map webui_display = 10; // Number of objects that are IN_PLASMA_ERROR in the local memory store.