[dashboard] Pipe resource assignments to dashboard (#8998)

This commit is contained in:
Edward Oakes 2020-06-18 11:14:59 -05:00 committed by GitHub
parent 0de2efd330
commit 8a99fd205e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 64 additions and 18 deletions

View file

@ -133,9 +133,18 @@ export type NodeInfoResponse = {
export const getNodeInfo = () => get<NodeInfoResponse>("/api/node_info", {});
/**
 * The amount of a resource allocated from one individual slot.
 */
export type ResourceSlot = {
// Slot number (presumably the index of an individually allocatable unit,
// such as one GPU — confirm against the backend that produces it).
slot: number;
// Amount allocated from this slot. Dashboard consumers add this value up
// across all slots of a resource to obtain the total in use.
allocation: number;
};
/**
 * Per-slot allocations for a single named resource. The total amount in
 * use is the sum of `allocation` over `resourceSlots`.
 */
export type ResourceAllocations = {
// One entry per slot that carries an allocation for this resource.
resourceSlots: ResourceSlot[];
};
export type RayletCoreWorkerStats = {
usedResources: {
[key: string]: number;
[key: string]: ResourceAllocations;
};
};
@ -168,7 +177,7 @@ export type RayletActorInfo =
taskQueueLength: number;
timestamp: number;
usedObjectStoreMemory: number;
usedResources: { [key: string]: number };
usedResources: { [key: string]: ResourceAllocations };
currentTaskDesc?: string;
numPendingTasks?: number;
webuiDisplay?: Record<string, string>;

View file

@ -16,6 +16,7 @@ import {
launchProfiling,
RayletActorInfo,
} from "../../../api";
import { sum } from "../../../common/util";
import ActorDetailsPane from "./ActorDetailsPane";
import Actors from "./Actors";
@ -137,7 +138,12 @@ class Actor extends React.Component<Props & WithStyles<typeof styles>, State> {
Object.entries(actor.usedResources).length > 0 &&
Object.entries(actor.usedResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(([key, value]) => `${value.toLocaleString()} ${key}`)
.map(
([key, value]) =>
`${sum(
value.resourceSlots.map((slot) => slot.allocation),
)} ${key}`,
)
.join(", "),
},
{

View file

@ -63,13 +63,22 @@ export const NodeGPU: NodeFeatureComponent = ({ node }) => {
export const WorkerGPU: WorkerFeatureComponent = ({ rayletWorker }) => {
const workerRes = rayletWorker?.coreWorkerStats.usedResources;
const workerUsedGPUResources = workerRes?.["GPU"] || NaN;
const message = isNaN(workerUsedGPUResources) ? (
<Typography color="textSecondary" component="span" variant="inherit">
N/A
</Typography>
) : (
<b>`${workerUsedGPUResources} GPUs in use`</b>
);
const workerUsedGPUResources = workerRes?.["GPU"];
let message;
if (workerUsedGPUResources === undefined) {
message = (
<Typography color="textSecondary" component="span" variant="inherit">
N/A
</Typography>
);
} else {
const aggregateAllocation = sum(
workerUsedGPUResources.resourceSlots.map(
(resourceSlot) => resourceSlot.allocation,
),
);
const plural = aggregateAllocation === 1 ? "" : "s";
message = <b>{`${aggregateAllocation} GPU${plural} in use`}</b>;
}
return <div style={{ minWidth: 60 }}>{message}</div>;
};

View file

@ -223,7 +223,13 @@ def test_raylet_info_endpoint(shutdown_only):
raise Exception(
"Timed out while waiting for dashboard to start.")
assert parent_actor_info["usedResources"]["CPU"] == 2
def cpu_resources(actor_info):
    """Return the total CPU allocation reported for an actor.

    Adds up the "allocation" of every entry in
    actor_info["usedResources"]["CPU"]["resourceSlots"]. Returns 0 when the
    slot list is empty; a missing key raises KeyError, as before.
    """
    # A distinct local name avoids shadowing the function itself.
    slots = actor_info["usedResources"]["CPU"]["resourceSlots"]
    return sum(slot["allocation"] for slot in slots)
assert cpu_resources(parent_actor_info) == 2
assert parent_actor_info["numExecutedTasks"] == 4
for _, child_actor_info in children.items():
if child_actor_info["state"] == -1:
@ -231,7 +237,7 @@ def test_raylet_info_endpoint(shutdown_only):
else:
assert child_actor_info["state"] == 1
assert len(child_actor_info["children"]) == 0
assert child_actor_info["usedResources"]["CPU"] == 1
assert cpu_resources(child_actor_info) == 1
profiling_id = requests.get(
webui_url + "/api/launch_profiling",

View file

@ -1602,6 +1602,9 @@ Status CoreWorker::ExecuteTask(const TaskSpecification &task_spec,
{
absl::MutexLock lock(&mutex_);
current_task_ = TaskSpecification();
if (task_spec.IsNormalTask()) {
resource_ids_.reset(new ResourceMappingType());
}
}
RAY_LOG(DEBUG) << "Finished executing task " << task_spec.TaskId();
@ -1962,11 +1965,13 @@ void CoreWorker::HandleGetCoreWorkerStats(const rpc::GetCoreWorkerStatsRequest &
stats->set_actor_id(actor_id_.Binary());
auto used_resources_map = stats->mutable_used_resources();
for (auto const &it : *resource_ids_) {
double quantity = 0;
rpc::ResourceAllocations allocations;
for (auto const &pair : it.second) {
quantity += pair.second;
auto resource_slot = allocations.add_resource_slots();
resource_slot->set_slot(pair.first);
resource_slot->set_allocation(pair.second);
}
(*used_resources_map)[it.first] = quantity;
(*used_resources_map)[it.first] = allocations;
}
stats->set_actor_title(actor_title_);
google::protobuf::Map<std::string, std::string> webui_map(webui_display_.begin(),

View file

@ -254,6 +254,17 @@ message ObjectRefInfo {
bool pinned_in_memory = 7;
}
// Details about the allocation of a given resource. Some resources
// (e.g., GPUs) have individually allocatable units that are represented
// as "slots" here.
message ResourceAllocations {
// Allocation recorded for a single slot of the resource.
message ResourceSlot {
// Slot number identifying the allocatable unit.
int64 slot = 1;
// Amount of the resource allocated from this slot.
double allocation = 2;
}
// One entry per slot; summing `allocation` across the entries gives the
// total amount of the resource in use.
repeated ResourceSlot resource_slots = 1;
}
// Debug info returned from the core worker.
message CoreWorkerStats {
// Debug string of the currently executing task.
@ -270,8 +281,8 @@ message CoreWorkerStats {
int64 port = 7;
// Actor ID.
bytes actor_id = 8;
// A map from the resource name (e.g. "CPU") to the amount of resource used.
map<string, double> used_resources = 9;
// A map from the resource name (e.g. "CPU") to its allocation.
map<string, ResourceAllocations> used_resources = 9;
// A string displayed on Dashboard.
map<string, string> webui_display = 10;
// Number of objects that are IN_PLASMA_ERROR in the local memory store.