mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[dashboard] Pipe resource assignments to dashboard (#8998)
This commit is contained in:
parent
0de2efd330
commit
8a99fd205e
6 changed files with 64 additions and 18 deletions
|
@ -133,9 +133,18 @@ export type NodeInfoResponse = {
|
|||
|
||||
/** Requests the "/api/node_info" endpoint through `get`, with the response typed as `NodeInfoResponse`. */
export const getNodeInfo = () => get<NodeInfoResponse>("/api/node_info", {});
|
||||
|
||||
/**
 * One individually allocatable unit ("slot") of a resource, together with the
 * amount allocated from it. Carries the same two fields as the
 * `ResourceAllocations.ResourceSlot` protobuf message (`slot`, `allocation`).
 */
export type ResourceSlot = {
|
||||
slot: number;
|
||||
allocation: number;
|
||||
};
|
||||
|
||||
/**
 * Allocation of a single resource, broken down per slot. Dashboard code that
 * needs a total sums `allocation` over every entry of `resourceSlots`.
 */
export type ResourceAllocations = {
|
||||
resourceSlots: ResourceSlot[];
|
||||
};
|
||||
|
||||
export type RayletCoreWorkerStats = {
|
||||
usedResources: {
|
||||
[key: string]: number;
|
||||
[key: string]: ResourceAllocations;
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -168,7 +177,7 @@ export type RayletActorInfo =
|
|||
taskQueueLength: number;
|
||||
timestamp: number;
|
||||
usedObjectStoreMemory: number;
|
||||
usedResources: { [key: string]: number };
|
||||
usedResources: { [key: string]: ResourceAllocations };
|
||||
currentTaskDesc?: string;
|
||||
numPendingTasks?: number;
|
||||
webuiDisplay?: Record<string, string>;
|
||||
|
|
|
@ -16,6 +16,7 @@ import {
|
|||
launchProfiling,
|
||||
RayletActorInfo,
|
||||
} from "../../../api";
|
||||
import { sum } from "../../../common/util";
|
||||
import ActorDetailsPane from "./ActorDetailsPane";
|
||||
import Actors from "./Actors";
|
||||
|
||||
|
@ -137,7 +138,12 @@ class Actor extends React.Component<Props & WithStyles<typeof styles>, State> {
|
|||
Object.entries(actor.usedResources).length > 0 &&
|
||||
Object.entries(actor.usedResources)
|
||||
.sort((a, b) => a[0].localeCompare(b[0]))
|
||||
.map(([key, value]) => `${value.toLocaleString()} ${key}`)
|
||||
.map(
|
||||
([key, value]) =>
|
||||
`${sum(
|
||||
value.resourceSlots.map((slot) => slot.allocation),
|
||||
)} ${key}`,
|
||||
)
|
||||
.join(", "),
|
||||
},
|
||||
{
|
||||
|
|
|
@ -63,13 +63,22 @@ export const NodeGPU: NodeFeatureComponent = ({ node }) => {
|
|||
|
||||
export const WorkerGPU: WorkerFeatureComponent = ({ rayletWorker }) => {
|
||||
const workerRes = rayletWorker?.coreWorkerStats.usedResources;
|
||||
const workerUsedGPUResources = workerRes?.["GPU"] || NaN;
|
||||
const message = isNaN(workerUsedGPUResources) ? (
|
||||
<Typography color="textSecondary" component="span" variant="inherit">
|
||||
N/A
|
||||
</Typography>
|
||||
) : (
|
||||
<b>`${workerUsedGPUResources} GPUs in use`</b>
|
||||
);
|
||||
const workerUsedGPUResources = workerRes?.["GPU"];
|
||||
let message;
|
||||
if (workerUsedGPUResources === undefined) {
|
||||
message = (
|
||||
<Typography color="textSecondary" component="span" variant="inherit">
|
||||
N/A
|
||||
</Typography>
|
||||
);
|
||||
} else {
|
||||
const aggregateAllocation = sum(
|
||||
workerUsedGPUResources.resourceSlots.map(
|
||||
(resourceSlot) => resourceSlot.allocation,
|
||||
),
|
||||
);
|
||||
const plural = aggregateAllocation === 1 ? "" : "s";
|
||||
message = <b>{`${aggregateAllocation} GPU${plural} in use`}</b>;
|
||||
}
|
||||
return <div style={{ minWidth: 60 }}>{message}</div>;
|
||||
};
|
||||
|
|
|
@ -223,7 +223,13 @@ def test_raylet_info_endpoint(shutdown_only):
|
|||
raise Exception(
|
||||
"Timed out while waiting for dashboard to start.")
|
||||
|
||||
assert parent_actor_info["usedResources"]["CPU"] == 2
|
||||
# Return the total CPU allocation of an actor by summing the "allocation" of
# every entry in actor_info["usedResources"]["CPU"]["resourceSlots"].
# Note: the local accumulator below shadows the function's own name.
def cpu_resources(actor_info):
|
||||
cpu_resources = 0
|
||||
for slot in actor_info["usedResources"]["CPU"]["resourceSlots"]:
|
||||
cpu_resources += slot["allocation"]
|
||||
return cpu_resources
|
||||
|
||||
assert cpu_resources(parent_actor_info) == 2
|
||||
assert parent_actor_info["numExecutedTasks"] == 4
|
||||
for _, child_actor_info in children.items():
|
||||
if child_actor_info["state"] == -1:
|
||||
|
@ -231,7 +237,7 @@ def test_raylet_info_endpoint(shutdown_only):
|
|||
else:
|
||||
assert child_actor_info["state"] == 1
|
||||
assert len(child_actor_info["children"]) == 0
|
||||
assert child_actor_info["usedResources"]["CPU"] == 1
|
||||
assert cpu_resources(child_actor_info) == 1
|
||||
|
||||
profiling_id = requests.get(
|
||||
webui_url + "/api/launch_profiling",
|
||||
|
|
|
@ -1602,6 +1602,9 @@ Status CoreWorker::ExecuteTask(const TaskSpecification &task_spec,
|
|||
{
|
||||
absl::MutexLock lock(&mutex_);
|
||||
current_task_ = TaskSpecification();
|
||||
if (task_spec.IsNormalTask()) {
|
||||
resource_ids_.reset(new ResourceMappingType());
|
||||
}
|
||||
}
|
||||
RAY_LOG(DEBUG) << "Finished executing task " << task_spec.TaskId();
|
||||
|
||||
|
@ -1962,11 +1965,13 @@ void CoreWorker::HandleGetCoreWorkerStats(const rpc::GetCoreWorkerStatsRequest &
|
|||
stats->set_actor_id(actor_id_.Binary());
|
||||
auto used_resources_map = stats->mutable_used_resources();
|
||||
for (auto const &it : *resource_ids_) {
|
||||
double quantity = 0;
|
||||
rpc::ResourceAllocations allocations;
|
||||
for (auto const &pair : it.second) {
|
||||
quantity += pair.second;
|
||||
auto resource_slot = allocations.add_resource_slots();
|
||||
resource_slot->set_slot(pair.first);
|
||||
resource_slot->set_allocation(pair.second);
|
||||
}
|
||||
(*used_resources_map)[it.first] = quantity;
|
||||
(*used_resources_map)[it.first] = allocations;
|
||||
}
|
||||
stats->set_actor_title(actor_title_);
|
||||
google::protobuf::Map<std::string, std::string> webui_map(webui_display_.begin(),
|
||||
|
|
|
@ -254,6 +254,17 @@ message ObjectRefInfo {
|
|||
bool pinned_in_memory = 7;
|
||||
}
|
||||
|
||||
// Details about the allocation of a given resource. Some resources
|
||||
// (e.g., GPUs) have individually allocatable units that are represented
|
||||
// as "slots" here.
|
||||
message ResourceAllocations {
|
||||
message ResourceSlot {
|
||||
// Identifier of the slot within the resource.
// NOTE(review): presumably the unit index (e.g. GPU index) — confirm against
// the keys of CoreWorker's resource_ids_ entries.
int64 slot = 1;
|
||||
// Amount of the resource allocated from this slot.
double allocation = 2;
|
||||
}
|
||||
// One entry per slot that has an allocation for this resource.
repeated ResourceSlot resource_slots = 1;
|
||||
}
|
||||
|
||||
// Debug info returned from the core worker.
|
||||
message CoreWorkerStats {
|
||||
// Debug string of the currently executing task.
|
||||
|
@ -270,8 +281,8 @@ message CoreWorkerStats {
|
|||
int64 port = 7;
|
||||
// Actor ID.
|
||||
bytes actor_id = 8;
|
||||
// A map from the resource name (e.g. "CPU") to the amount of resource used.
|
||||
map<string, double> used_resources = 9;
|
||||
// A map from the resource name (e.g. "CPU") to its allocation.
|
||||
map<string, ResourceAllocations> used_resources = 9;
|
||||
// A string displayed on Dashboard.
|
||||
map<string, string> webui_display = 10;
|
||||
// Number of objects that are IN_PLASMA_ERROR in the local memory store.
|
||||
|
|
Loading…
Add table
Reference in a new issue