Display GPU Utilization in the Dashboard (#8564)

This commit is contained in:
Max Fitton 2020-06-15 13:27:44 -07:00 committed by GitHub
parent 6c49c01837
commit ddb9368f2c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 313 additions and 48 deletions

View file

@ -33,6 +33,7 @@
"test": "react-scripts test",
"eject": "react-scripts eject",
"lint": "npm run eslint && npm run prettier",
"lint-fix": "npm run prettier -- --write && npm run eslint -- --fix",
"prettier": "./node_modules/.bin/prettier -c src/",
"eslint": "./node_modules/.bin/eslint \"src/**\""
},

View file

@ -74,6 +74,29 @@ export type NodeInfoResponseWorker = {
};
};
/**
 * Sub-stat of the per-GPU stats: the GPU usage of a single process
 * on a single GPU.
 */
export type GPUProcessStats = {
  // Owner of the process (presumably the OS user name; confirm against the reporter).
  username: string;
  // Command associated with the process.
  command: string;
  // GPU memory held by the process; the dashboard labels sums of this value as MiB.
  gpu_memory_usage: number;
  // Process id; the dashboard matches it against a worker's pid.
  pid: number;
};
/**
 * Stats fetched from a node about a single GPU.
 */
export type GPUStats = {
  uuid: string;
  name: string;
  // NOTE(review): units of the temperature, fan-speed and power fields are
  // not visible from the dashboard code; confirm against the reporter.
  temperature_gpu: number;
  fan_speed: number;
  // Averaged per node and rendered by the dashboard as a percentage.
  utilization_gpu: number;
  power_draw: number;
  enforced_power_limit: number;
  // Memory figures; the dashboard displays them with a MiB label.
  memory_used: number;
  memory_total: number;
  // Per-process usage on this GPU.
  processes: Array<GPUProcessStats>;
};
export type NodeInfoResponse = {
clients: Array<{
now: number;
@ -82,6 +105,7 @@ export type NodeInfoResponse = {
boot_time: number; // System boot time expressed in seconds since epoch
cpu: number; // System-wide CPU utilization expressed as a percentage
cpus: [number, number]; // Number of logical CPUs and physical CPUs
gpus: Array<GPUStats>; // GPU stats fetched from node, 1 entry per GPU
mem: [number, number, number]; // Total, available, and used percentage of memory
disk: {
[path: string]: {
@ -109,6 +133,18 @@ export type NodeInfoResponse = {
export const getNodeInfo = () => get<NodeInfoResponse>("/api/node_info", {});
/**
 * Core-worker stats reported by the raylet for a single worker.
 */
export type RayletCoreWorkerStats = {
  // Amount of each resource in use, keyed by resource name
  // (the dashboard reads the "GPU" key).
  usedResources: {
    [key: string]: number;
  };
};
/**
 * Per-worker entry of a raylet's `workersStats` list.
 */
export type RayletWorkerStats = {
  // Process id; used to pair this entry with a worker from the node info.
  pid: number;
  // Optional flag; presumably set when the process is a driver (confirm).
  isDriver?: boolean;
  coreWorkerStats: RayletCoreWorkerStats;
};
export type RayletActorInfo =
| {
actorId: string;
@ -145,10 +181,7 @@ export type RayletInfoResponse = {
nodes: {
[ip: string]: {
extraInfo?: string;
workersStats: {
pid: number;
isDriver?: boolean;
}[];
workersStats: Array<RayletWorkerStats>;
};
};
actors: {

View file

@ -17,6 +17,10 @@ export const formatUsage = (
return `${usedFormatted} / ${totalFormatted} (${percent.toFixed(0)}%)`;
};
/**
 * Formats a used/total pair of MiB quantities together with the share
 * used, e.g. 400 and 6000 as "400 MiB / 6000 MiB (6.7%)".
 *
 * When the ratio is not a finite number (for instance `total` is 0), the
 * percentage is reported as 0.0% instead of "NaN%" or "Infinity%".
 *
 * @param used - Amount in use, in MiB.
 * @param total - Total amount available, in MiB.
 * @returns The formatted string.
 */
export const MiBRatio = (used: number, total: number) => {
  const ratio = used / total;
  const percent = Number.isFinite(ratio) ? 100 * ratio : 0;
  return `${used} MiB / ${total} MiB (${percent.toFixed(1)}%)`;
};
export const formatDuration = (durationInSeconds: number) => {
const durationSeconds = Math.floor(durationInSeconds) % 60;
const durationMinutes = Math.floor(durationInSeconds / 60) % 60;

View file

@ -0,0 +1,20 @@
/**
 * Computes the weighted average of a list of values.
 *
 * @param input - Entries pairing a `value` with the `weight` it carries.
 * @returns The sum of `weight * value` divided by the sum of the weights,
 *   or 0 when there is nothing to average (empty input, or weights that
 *   add up to zero).
 */
export const getWeightedAverage = (
  input: {
    weight: number;
    value: number;
  }[],
) => {
  let totalWeightTimesValue = 0;
  let totalWeight = 0;
  for (const { weight, value } of input) {
    totalWeightTimesValue += weight * value;
    totalWeight += weight;
  }
  // Covers both the empty input and the all-zero-weights case; dividing
  // here would otherwise produce NaN.
  if (totalWeight === 0) {
    return 0;
  }
  return totalWeightTimesValue / totalWeight;
};
/**
 * Adds up a list of numbers, left to right.
 *
 * @param vals - Numbers to add.
 * @returns Their total; 0 for an empty list.
 */
export const sum = (vals: number[]) => {
  let total = 0;
  vals.forEach((val) => {
    total += val;
  });
  return total;
};

View file

@ -13,6 +13,7 @@ import {
import React from "react";
import { connect } from "react-redux";
import { RayletInfoResponse } from "../../../api";
import { sum } from "../../../common/util";
import { StoreState } from "../../../store";
import Errors from "./dialogs/errors/Errors";
import Logs from "./dialogs/logs/Logs";
@ -117,9 +118,11 @@ class NodeInfo extends React.Component<
// the node info can contain data from more than one cluster
// if more than one cluster is running on a machine.
const clusterWorkerPidsByIp = clusterWorkerPids(rayletInfo);
const clusterTotalWorkers = Array.from(
clusterWorkerPidsByIp.values(),
).reduce((acc, workerSet) => acc + workerSet.size, 0);
const clusterTotalWorkers = sum(
Array.from(clusterWorkerPidsByIp.values()).map(
(workerSet) => workerSet.size,
),
);
// Initialize inner structure of the count objects
for (const client of nodeInfo.clients) {
const clusterWorkerPids = clusterWorkerPidsByIp.get(client.ip);
@ -129,9 +132,8 @@ class NodeInfo extends React.Component<
const filteredLogEntries = Object.entries(
nodeInfo.log_counts[client.ip] || {},
).filter(([pid, _]) => clusterWorkerPids.has(pid));
const totalLogEntries = filteredLogEntries.reduce(
(acc, [_, count]) => acc + count,
0,
const totalLogEntries = sum(
filteredLogEntries.map(([_, count]) => count),
);
logCounts[client.ip] = {
perWorker: Object.fromEntries(filteredLogEntries),
@ -141,9 +143,8 @@ class NodeInfo extends React.Component<
const filteredErrEntries = Object.entries(
nodeInfo.error_counts[client.ip] || {},
).filter(([pid, _]) => clusterWorkerPids.has(pid));
const totalErrEntries = filteredErrEntries.reduce(
(acc, [_, count]) => acc + count,
0,
const totalErrEntries = sum(
filteredErrEntries.map(([_, count]) => count),
);
errorCounts[client.ip] = {
perWorker: Object.fromEntries(filteredErrEntries),
@ -162,6 +163,8 @@ class NodeInfo extends React.Component<
<TableCell className={classes.cell}>Uptime</TableCell>
<TableCell className={classes.cell}>CPU</TableCell>
<TableCell className={classes.cell}>RAM</TableCell>
<TableCell className={classes.cell}>GPU</TableCell>
<TableCell className={classes.cell}>GRAM</TableCell>
<TableCell className={classes.cell}>Disk</TableCell>
<TableCell className={classes.cell}>Sent</TableCell>
<TableCell className={classes.cell}>Received</TableCell>

View file

@ -18,6 +18,8 @@ import {
import { NodeCPU, WorkerCPU } from "./features/CPU";
import { NodeDisk, WorkerDisk } from "./features/Disk";
import { makeNodeErrors, makeWorkerErrors } from "./features/Errors";
import { NodeGPU, WorkerGPU } from "./features/GPU";
import { NodeGRAM, WorkerGRAM } from "./features/GRAM";
import { NodeHost, WorkerHost } from "./features/Host";
import { makeNodeLogs, makeWorkerLogs } from "./features/Logs";
import { NodeRAM, WorkerRAM } from "./features/RAM";
@ -108,6 +110,8 @@ class NodeRowGroup extends React.Component<
{ NodeFeature: NodeUptime, WorkerFeature: WorkerUptime },
{ NodeFeature: NodeCPU, WorkerFeature: WorkerCPU },
{ NodeFeature: NodeRAM, WorkerFeature: WorkerRAM },
{ NodeFeature: NodeGPU, WorkerFeature: WorkerGPU },
{ NodeFeature: NodeGRAM, WorkerFeature: WorkerGRAM },
{ NodeFeature: NodeDisk, WorkerFeature: WorkerDisk },
{ NodeFeature: NodeSent, WorkerFeature: WorkerSent },
{ NodeFeature: NodeReceived, WorkerFeature: WorkerReceived },
@ -153,16 +157,27 @@ class NodeRowGroup extends React.Component<
</TableCell>
</TableRow>
)}
{clusterWorkers.map((worker, index: number) => (
<TableRow hover key={index}>
<TableCell className={classes.cell} />
{features.map(({ WorkerFeature }, index) => (
<TableCell className={classes.cell} key={index}>
<WorkerFeature node={node} worker={worker} />
</TableCell>
))}
</TableRow>
))}
{clusterWorkers.map((worker, index: number) => {
const rayletWorker =
raylet?.workersStats.find(
(rayletWorker) => worker.pid === rayletWorker.pid,
) || null;
return (
<TableRow hover key={index}>
<TableCell className={classes.cell} />
{features.map(({ WorkerFeature }, index) => (
<TableCell className={classes.cell} key={index}>
<WorkerFeature
node={node}
worker={worker}
rayletWorker={rayletWorker}
/>
</TableCell>
))}
</TableRow>
);
})}
</React.Fragment>
)}
</React.Fragment>

View file

@ -12,6 +12,8 @@ import { NodeInfoResponse } from "../../../api";
import { ClusterCPU } from "./features/CPU";
import { ClusterDisk } from "./features/Disk";
import { makeClusterErrors } from "./features/Errors";
import { ClusterGPU } from "./features/GPU";
import { ClusterGRAM } from "./features/GRAM";
import { ClusterHost } from "./features/Host";
import { makeClusterLogs } from "./features/Logs";
import { ClusterRAM } from "./features/RAM";
@ -72,6 +74,8 @@ class TotalRow extends React.Component<Props & WithStyles<typeof styles>> {
{ ClusterFeature: ClusterUptime },
{ ClusterFeature: ClusterCPU },
{ ClusterFeature: ClusterRAM },
{ ClusterFeature: ClusterGPU },
{ ClusterFeature: ClusterGRAM },
{ ClusterFeature: ClusterDisk },
{ ClusterFeature: ClusterSent },
{ ClusterFeature: ClusterReceived },

View file

@ -1,30 +1,12 @@
import React from "react";
import UsageBar from "../../../../common/UsageBar";
import { getWeightedAverage } from "../../../../common/util";
import {
ClusterFeatureComponent,
NodeFeatureComponent,
WorkerFeatureComponent,
} from "./types";
/**
 * Computes the weighted average of a list of values: the sum of
 * `weight * value` divided by the sum of the weights.
 *
 * Returns 0 for an empty input. Weights that add up to zero are not
 * special-cased, so the division yields NaN in that case.
 */
const getWeightedAverage = (
  input: {
    weight: number;
    value: number;
  }[],
) => {
  if (input.length === 0) {
    return 0;
  }
  let totalWeightTimesValue = 0;
  let totalWeight = 0;
  for (const { weight, value } of input) {
    totalWeightTimesValue += weight * value;
    totalWeight += weight;
  }
  return totalWeightTimesValue / totalWeight;
};
export const ClusterCPU: ClusterFeatureComponent = ({ nodes }) => {
const cpuWeightedAverage = getWeightedAverage(
nodes.map((node) => ({ weight: node.cpus[0], value: node.cpu })),

View file

@ -0,0 +1,75 @@
import { Typography } from "@material-ui/core";
import React from "react";
import UsageBar from "../../../../common/UsageBar";
import { getWeightedAverage, sum } from "../../../../common/util";
import {
ClusterFeatureComponent,
Node,
NodeFeatureComponent,
WorkerFeatureComponent,
} from "./types";
/**
 * Average GPU utilization across the cluster, weighting each node's
 * average by the number of GPUs it reports.
 *
 * Nodes for which `nodeUtilization` yields NaN (no GPUs, or no `gpus`
 * list at all) are left out of the average.
 *
 * @param nodes - Nodes of the cluster.
 * @returns The weighted average utilization, or NaN when no node
 *   contributes a value.
 */
const clusterUtilization = (nodes: Array<Node>): number => {
  const utils = nodes
    .map((node) => ({
      // `gpus` can be missing at runtime (nodeUtilization guards against
      // it too); such a node weighs nothing and is filtered out below.
      weight: node.gpus?.length ?? 0,
      value: nodeUtilization(node),
    }))
    .filter((util) => !isNaN(util.value));
  if (utils.length === 0) {
    return NaN;
  }
  return getWeightedAverage(utils);
};
/**
 * Mean of `utilization_gpu` over all GPUs of a node.
 *
 * @param node - Node whose GPUs are averaged.
 * @returns The mean utilization, or NaN when the node has no `gpus`
 *   list or the list is empty.
 */
const nodeUtilization = (node: Node): number => {
  const hasGpus = Boolean(node.gpus) && node.gpus.length > 0;
  if (!hasGpus) {
    return NaN;
  }
  const perGpuUtilization = node.gpus.map((gpu) => gpu.utilization_gpu);
  return sum(perGpuUtilization) / node.gpus.length;
};
/**
 * Cluster-level GPU cell: a usage bar with the cluster-wide average GPU
 * utilization, or "N/A" when that average is NaN.
 */
export const ClusterGPU: ClusterFeatureComponent = ({ nodes }) => {
  const utilization = clusterUtilization(nodes);
  if (isNaN(utilization)) {
    return (
      <div style={{ minWidth: 60 }}>
        <Typography color="textSecondary" component="span" variant="inherit">
          N/A
        </Typography>
      </div>
    );
  }
  const label = `${utilization.toFixed(1)}%`;
  return (
    <div style={{ minWidth: 60 }}>
      <UsageBar percent={utilization} text={label} />
    </div>
  );
};
/**
 * Node-level GPU cell: a usage bar with the node's average GPU
 * utilization, or "N/A" when that average is NaN.
 */
export const NodeGPU: NodeFeatureComponent = ({ node }) => {
  const utilization = nodeUtilization(node);
  const content = isNaN(utilization) ? (
    <Typography color="textSecondary" component="span" variant="inherit">
      N/A
    </Typography>
  ) : (
    <UsageBar percent={utilization} text={`${utilization.toFixed(1)}%`} />
  );
  return <div style={{ minWidth: 60 }}>{content}</div>;
};
/**
 * Worker-level GPU cell: shows how many GPU resources the raylet reports
 * as in use by this worker, or "N/A" when no such figure is available.
 */
export const WorkerGPU: WorkerFeatureComponent = ({ rayletWorker }) => {
  const workerRes = rayletWorker?.coreWorkerStats.usedResources;
  // A missing raylet entry, a missing "GPU" key and a value of 0 all
  // collapse to NaN, which is rendered as "N/A" below.
  const workerUsedGPUResources = workerRes?.["GPU"] || NaN;
  const message = isNaN(workerUsedGPUResources) ? (
    <Typography color="textSecondary" component="span" variant="inherit">
      N/A
    </Typography>
  ) : (
    // The template literal must sit inside a JSX expression; written as
    // bare JSX text it would render its backticks and "$" verbatim.
    <b>{`${workerUsedGPUResources} GPUs in use`}</b>
  );
  return <div style={{ minWidth: 60 }}>{message}</div>;
};

View file

@ -0,0 +1,96 @@
import { Typography } from "@material-ui/core";
import React from "react";
import { GPUStats } from "../../../../api";
import { MiBRatio } from "../../../../common/formatUtils";
import UsageBar from "../../../../common/UsageBar";
import { getWeightedAverage, sum } from "../../../../common/util";
import {
ClusterFeatureComponent,
Node,
NodeFeatureComponent,
WorkerFeatureComponent,
} from "./types";
/**
 * Mean GPU memory utilization of a node, as a percentage.
 *
 * Each GPU contributes `memory_used / memory_total`; the per-GPU ratios
 * are averaged and scaled to a percent.
 *
 * @param node - Node whose GPUs are averaged.
 * @returns The mean utilization in percent, or NaN when the node has no
 *   `gpus` list or the list is empty.
 */
const nodeGRAMUtilization = (node: Node) => {
  // `gpus` can be missing at runtime, not just empty; treat both the same
  // way instead of throwing on `.length`.
  if (!node.gpus || node.gpus.length === 0) {
    return NaN;
  }
  const utilization = (gpu: GPUStats) => gpu.memory_used / gpu.memory_total;
  const utilizationSum = sum(node.gpus.map((gpu) => utilization(gpu)));
  const avgUtilization = utilizationSum / node.gpus.length;
  // Convert to a percent before returning
  return avgUtilization * 100;
};
/**
 * Average GPU memory utilization across the cluster, in percent,
 * weighting each node's average by the number of GPUs it reports.
 *
 * @param nodes - Nodes of the cluster.
 * @returns The weighted average, or NaN when no node contributes a value.
 */
const clusterGRAMUtilization = (nodes: Array<Node>) => {
  const utils = nodes
    // Drop nodes without GPU data up front: `gpus` can be missing at
    // runtime, and a node with no GPUs has nothing to contribute anyway.
    .filter((node) => node.gpus && node.gpus.length > 0)
    .map((node) => ({
      weight: node.gpus.length,
      value: nodeGRAMUtilization(node),
    }))
    // A node can still yield NaN (e.g. a 0 / 0 memory ratio).
    .filter((util) => !isNaN(util.value));
  if (utils.length === 0) {
    return NaN;
  }
  return getWeightedAverage(utils);
};
/**
 * Cluster-level GPU memory cell: a usage bar with the cluster-wide
 * average GPU memory utilization, or "N/A" when that average is NaN.
 */
export const ClusterGRAM: ClusterFeatureComponent = ({ nodes }) => {
  const utilization = clusterGRAMUtilization(nodes);
  if (isNaN(utilization)) {
    return (
      <div style={{ minWidth: 60 }}>
        <Typography color="textSecondary" component="span" variant="inherit">
          N/A
        </Typography>
      </div>
    );
  }
  const label = `${utilization.toFixed(1)}%`;
  return (
    <div style={{ minWidth: 60 }}>
      <UsageBar percent={utilization} text={label} />
    </div>
  );
};
/**
 * Node-level GPU memory cell: a usage bar with the node's average GPU
 * memory utilization, or "N/A" when that average is NaN.
 */
export const NodeGRAM: NodeFeatureComponent = ({ node }) => {
  const utilization = nodeGRAMUtilization(node);
  const content = isNaN(utilization) ? (
    <Typography color="textSecondary" component="span" variant="inherit">
      N/A
    </Typography>
  ) : (
    <UsageBar percent={utilization} text={`${utilization.toFixed(1)}%`} />
  );
  return <div style={{ minWidth: 60 }}>{content}</div>;
};
/**
 * Worker-level GPU memory cell: the GPU memory held by the worker's
 * process, summed over all GPUs of its node, relative to the node's total
 * GPU memory.
 *
 * Renders "N/A" when the node reports no GPUs or no GPU memory, since a
 * ratio cannot be computed in that case.
 */
export const WorkerGRAM: WorkerFeatureComponent = ({ worker, node }) => {
  // `gpus` can be missing at runtime; treat that like an empty list.
  const gpus = node.gpus ?? [];
  const workerUtilPerGPU = gpus.map((gpu) => {
    // NOTE(review): the process list is guarded defensively in case the
    // backend could not enumerate processes for a GPU; confirm.
    const proc = gpu.processes?.find((process) => process.pid === worker.pid);
    // GPUs on which the worker has no process contribute nothing.
    return proc?.gpu_memory_usage || 0;
  });
  const totalNodeGRAM = sum(gpus.map((gpu) => gpu.memory_total));
  const usedGRAM = sum(workerUtilPerGPU);
  // A zero total would turn the percentage into NaN or Infinity.
  const canShowUsage = gpus.length > 0 && totalNodeGRAM > 0;
  return (
    <div style={{ minWidth: 60 }}>
      {canShowUsage ? (
        <UsageBar
          percent={100 * (usedGRAM / totalNodeGRAM)}
          text={MiBRatio(usedGRAM, totalNodeGRAM)}
        />
      ) : (
        <Typography color="textSecondary" component="span" variant="inherit">
          N/A
        </Typography>
      )}
    </div>
  );
};

View file

@ -1,13 +1,17 @@
import React from "react";
import { NodeInfoResponse } from "../../../../api";
import { NodeInfoResponse, RayletWorkerStats } from "../../../../api";
type ArrayType<T> = T extends Array<infer U> ? U : never;
type Node = ArrayType<NodeInfoResponse["clients"]>;
type Worker = ArrayType<Node["workers"]>;
export type Node = ArrayType<NodeInfoResponse["clients"]>;
export type Worker = ArrayType<Node["workers"]>;
type ClusterFeatureData = { nodes: Node[] };
type NodeFeatureData = { node: Node };
type WorkerFeatureData = { node: Node; worker: Worker };
/**
 * Props handed to a worker-level feature component.
 */
type WorkerFeatureData = {
  // Node the worker runs on.
  node: Node;
  // Worker entry taken from the node info.
  worker: Worker;
  // Raylet-side stats for the same worker, or null when there are none.
  rayletWorker: RayletWorkerStats | null;
};
export type ClusterFeatureComponent = (
data: ClusterFeatureData,

View file

@ -10,7 +10,6 @@ import platform
import subprocess
import sys
from concurrent import futures
import ray
import psutil
import ray.ray_constants as ray_constants
@ -24,6 +23,13 @@ from ray.core.generated import reporter_pb2_grpc
# entry/init points.
logger = logging.getLogger(__name__)
# gpustat is treated as optional: when it cannot be imported, the module
# keeps going with `gpustat = None` and logs a hint on how to install it.
try:
import gpustat.core as gpustat
except ImportError:
# Sentinel checked before any GPU query is attempted.
gpustat = None
logger.warning(
"Install gpustat with 'pip install gpustat' to enable GPU monitoring.")
class ReporterServer(reporter_pb2_grpc.ReporterServiceServicer):
def __init__(self):
@ -107,6 +113,27 @@ class Reporter:
def get_cpu_percent():
return psutil.cpu_percent()
@staticmethod
def get_gpu_usage():
    """Collect per-GPU stats through gpustat.

    Returns:
        A list with one dict per GPU, built from each GPU's ``entry``
        mapping with every "." in a key replaced by "_". The list is
        empty when gpustat is not installed or the query fails.
    """
    if gpustat is None:
        return []
    try:
        queried_gpus = gpustat.new_query().gpus
    except Exception as e:
        # Best effort: a failed query is logged and reported as "no GPUs".
        logger.debug(
            "gpustat failed to retrieve GPU information: {}".format(e))
        queried_gpus = []
    # The entry keys contain periods, which throw off the JavaScript
    # side, so each "." is turned into "_".
    return [{
        key.replace(".", "_"): val
        for key, val in gpu.entry.items()
    } for gpu in queried_gpus]
# Returns the system boot time as reported by psutil.boot_time().
@staticmethod
def get_boot_time():
return psutil.boot_time()
@ -179,6 +206,7 @@ class Reporter:
"boot_time": self.get_boot_time(),
"load_avg": self.get_load_avg(),
"disk": self.get_disk_usage(),
"gpus": self.get_gpu_usage(),
"net": netstats,
}

View file

@ -82,7 +82,7 @@ if "RAY_USE_NEW_GCS" in os.environ and os.environ["RAY_USE_NEW_GCS"] == "on":
extras = {
"debug": [],
"dashboard": ["requests"],
"dashboard": ["requests", "gpustat"],
"serve": ["uvicorn", "flask", "blist"],
"tune": ["tabulate", "tensorboardX", "pandas"]
}