mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
[dashboard][k8s] Better CPU reporting when running on K8s (#14593)
This commit is contained in:
parent
ee4b6e7e3b
commit
a90cffe26c
5 changed files with 248 additions and 11 deletions
100
dashboard/k8s_utils.py
Normal file
100
dashboard/k8s_utils.py
Normal file
|
@ -0,0 +1,100 @@
|
|||
import logging
|
||||
|
||||
import ray._private.utils
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# cgroup file holding the container's CPU shares.
# NOTE(review): not read directly in this module's visible code; the share
# file is read via ray._private.utils.get_k8s_cpus — confirm before removing.
CPU_SHARES_PATH = "/sys/fs/cgroup/cpu/cpu.shares"
# cgroup file with the container's cumulative CPU usage, in nanoseconds.
CPU_USAGE_PATH = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
# Host-wide CPU statistics (visible from inside the pod).
PROC_STAT_PATH = "/proc/stat"

# Lazily-populated caches of CPU counts.
# NOTE(review): container_num_cpus appears unused in this module's visible
# code; host_num_cpus caches the result of _host_num_cpus().
container_num_cpus = None
host_num_cpus = None

# Previous usage samples; cpu_percent() computes deltas against these.
last_cpu_usage = None
last_system_usage = None
|
||||
|
||||
|
||||
def cpu_percent():
    """Estimate CPU usage percent for Ray pod managed by Kubernetes
    Operator.

    Computed by the following steps
    (1) Replicate the logic used by 'docker stats' cli command.
        See https://github.com/docker/cli/blob/c0a6b1c7b30203fbc28cd619acb901a95a80e30e/cli/command/container/stats_helpers.go#L166.
    (2) Divide by the number of CPUs available to the container, so that
        e.g. full capacity use of 2 CPUs will read as 100%,
        rather than 200%.

    Step (1) above works by
    dividing delta in cgroup's cpuacct.usage by
    delta in total host cpu usage, averaged over host's cpus.

    Since deltas are not initially available, return 0.0 on first call.

    Returns:
        float: Estimated CPU usage percentage, clamped to at most 100.0.
            Returns 0.0 on the first call or on any error.
    """  # noqa
    global last_system_usage
    global last_cpu_usage
    try:
        cpu_usage = _cpu_usage()
        system_usage = _system_usage()
        # Return 0.0 on first call: no previous sample to delta against.
        if last_system_usage is None:
            percent = 0.0
        else:
            cpu_delta = cpu_usage - last_cpu_usage
            # "System time passed." (Typically close to clock time.)
            system_delta = (
                (system_usage - last_system_usage) / _host_num_cpus())

            quotient = cpu_delta / system_delta
            percent = round(
                quotient * 100 / ray._private.utils.get_k8s_cpus(), 1)
        last_system_usage = system_usage
        last_cpu_usage = cpu_usage
        # Computed percentage might be slightly above 100%.
        return min(percent, 100.0)
    except Exception:
        # BUGFIX: previously the caught exception was passed as a positional
        # argument to logger.exception, which the logging module treats as a
        # %-format arg; with no placeholder in the message this raised a
        # formatting error inside logging. logger.exception already attaches
        # the active exception's traceback.
        logger.exception("Error computing CPU usage of Ray Kubernetes pod.")
        return 0.0
|
||||
|
||||
|
||||
def _cpu_usage():
|
||||
"""Compute total cpu usage of the container in nanoseconds
|
||||
by reading from cgroup/cpuacct."""
|
||||
return int(open(CPU_USAGE_PATH).read())
|
||||
|
||||
|
||||
def _system_usage():
|
||||
"""
|
||||
Computes total CPU usage of the host in nanoseconds.
|
||||
|
||||
Logic taken from here:
|
||||
https://github.com/moby/moby/blob/b42ac8d370a8ef8ec720dff0ca9dfb3530ac0a6a/daemon/stats/collector_unix.go#L31
|
||||
|
||||
See also the /proc/stat entry here:
|
||||
https://man7.org/linux/man-pages/man5/proc.5.html
|
||||
""" # noqa
|
||||
cpu_summary_str = open(PROC_STAT_PATH).read().split("\n")[0]
|
||||
parts = cpu_summary_str.split()
|
||||
assert parts[0] == "cpu"
|
||||
usage_data = parts[1:8]
|
||||
total_clock_ticks = sum(int(entry) for entry in usage_data)
|
||||
# 100 clock ticks per second, 10^9 ns per second
|
||||
usage_ns = total_clock_ticks * 10**7
|
||||
return usage_ns
|
||||
|
||||
|
||||
def _host_num_cpus(proc_stat_path=None):
    """Number of physical CPUs, obtained by parsing /proc/stat.

    NOTE(review): this counts the per-CPU "cpuN" lines of /proc/stat, which
    enumerate logical CPUs; "physical" in the original description may be
    imprecise — confirm intended semantics.

    The result is cached in the module-level ``host_num_cpus`` global after
    the first successful read; subsequent calls skip the file read.

    Args:
        proc_stat_path: Optional override of the /proc/stat path. Defaults
            to the module-level PROC_STAT_PATH (which tests may also
            monkeypatch); the zero-argument call is unchanged.

    Returns:
        int: Count of "cpuN" lines in /proc/stat.
    """
    global host_num_cpus
    if host_num_cpus is None:
        if proc_stat_path is None:
            proc_stat_path = PROC_STAT_PATH
        # Context manager ensures the file handle is closed promptly.
        with open(proc_stat_path) as proc_stat_file:
            proc_stat_lines = proc_stat_file.read().split("\n")
        split_proc_stat_lines = [line.split() for line in proc_stat_lines]
        cpu_lines = [
            split_line for split_line in split_proc_stat_lines
            if len(split_line) > 0 and "cpu" in split_line[0]
        ]
        # Number of lines starting with a word including 'cpu', subtracting
        # 1 for the first summary line.
        host_num_cpus = len(cpu_lines) - 1
    return host_num_cpus
|
|
@ -13,6 +13,7 @@ import aioredis
|
|||
import ray
|
||||
import ray.gcs_utils
|
||||
import ray.new_dashboard.modules.reporter.reporter_consts as reporter_consts
|
||||
from ray.new_dashboard import k8s_utils
|
||||
import ray.new_dashboard.utils as dashboard_utils
|
||||
import ray._private.services
|
||||
import ray._private.utils
|
||||
|
@ -23,6 +24,9 @@ import psutil
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Are we in a K8s pod?
|
||||
IN_KUBERNETES_POD = "KUBERNETES_SERVICE_HOST" in os.environ
|
||||
|
||||
try:
|
||||
import gpustat.core as gpustat
|
||||
except ImportError:
|
||||
|
@ -113,8 +117,15 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
|
|||
    def __init__(self, dashboard_agent):
        """Initialize the reporter object.

        Args:
            dashboard_agent: The dashboard agent module this reporter
                belongs to (forwarded to the base class).
        """
        super().__init__(dashboard_agent)
        if IN_KUBERNETES_POD:
            # psutil does not compute this correctly when in a K8s pod.
            # Use ray._private.utils instead.
            cpu_count = ray._private.utils.get_num_cpus()
            # (logical, physical) pair; inside a pod both entries carry the
            # same cgroup-derived count.
            self._cpu_counts = (cpu_count, cpu_count)
        else:
            # Outside K8s, report (logical, physical) CPU counts via psutil.
            self._cpu_counts = (psutil.cpu_count(),
                                psutil.cpu_count(logical=False))

        # Node identity used when publishing reports.
        self._ip = ray._private.services.get_node_ip_address()
        self._hostname = socket.gethostname()
        # Set of worker processes tracked by this reporter.
        self._workers = set()
|
||||
|
@ -156,7 +167,10 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
|
|||
|
||||
    @staticmethod
    def _get_cpu_percent():
        # In a K8s pod psutil does not compute CPU usage correctly, so use
        # the cgroup-based estimate from k8s_utils instead.
        if IN_KUBERNETES_POD:
            return k8s_utils.cpu_percent()
        else:
            return psutil.cpu_percent()
|
||||
|
||||
@staticmethod
|
||||
def _get_gpu_usage():
|
||||
|
|
|
@ -2,6 +2,7 @@ import binascii
|
|||
import errno
|
||||
import hashlib
|
||||
import logging
|
||||
import math
|
||||
import multiprocessing
|
||||
import os
|
||||
import signal
|
||||
|
@ -10,6 +11,7 @@ import sys
|
|||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
import uuid
|
||||
|
||||
from inspect import signature
|
||||
|
@ -412,8 +414,9 @@ def get_system_memory():
|
|||
|
||||
def _get_docker_cpus(
|
||||
cpu_quota_file_name="/sys/fs/cgroup/cpu/cpu.cfs_quota_us",
|
||||
cpu_share_file_name="/sys/fs/cgroup/cpu/cpu.cfs_period_us",
|
||||
cpuset_file_name="/sys/fs/cgroup/cpuset/cpuset.cpus"):
|
||||
cpu_period_file_name="/sys/fs/cgroup/cpu/cpu.cfs_period_us",
|
||||
cpuset_file_name="/sys/fs/cgroup/cpuset/cpuset.cpus"
|
||||
) -> Optional[float]:
|
||||
# TODO (Alex): Don't implement this logic oursleves.
|
||||
# Docker has 2 underyling ways of implementing CPU limits:
|
||||
# https://docs.docker.com/config/containers/resource_constraints/#configure-the-default-cfs-scheduler
|
||||
|
@ -428,13 +431,13 @@ def _get_docker_cpus(
|
|||
cpu_quota_file_name):
|
||||
try:
|
||||
with open(cpu_quota_file_name, "r") as quota_file, open(
|
||||
cpu_share_file_name, "r") as period_file:
|
||||
cpu_period_file_name, "r") as period_file:
|
||||
cpu_quota = float(quota_file.read()) / float(
|
||||
period_file.read())
|
||||
except Exception as e:
|
||||
logger.exception("Unexpected error calculating docker cpu quota.",
|
||||
e)
|
||||
if cpu_quota < 0:
|
||||
if (cpu_quota is not None) and (cpu_quota < 0):
|
||||
cpu_quota = None
|
||||
|
||||
cpuset_num = None
|
||||
|
@ -461,7 +464,30 @@ def _get_docker_cpus(
|
|||
return cpu_quota or cpuset_num
|
||||
|
||||
|
||||
def get_num_cpus():
|
||||
def get_k8s_cpus(cpu_share_file_name="/sys/fs/cgroup/cpu/cpu.shares") -> float:
    """Get number of CPUs available for use by this container, in terms of
    cgroup cpu shares.

    This is the number of CPUs K8s has assigned to the container based
    on pod spec requests and limits.

    Note: using cpu_quota as in _get_docker_cpus() works
    only if the user set CPU limit in their pod spec (in addition to CPU
    request). Otherwise, the quota is unset.

    Args:
        cpu_share_file_name: Path of the cgroup cpu.shares file.

    Returns:
        float: cpu shares divided by 1024 (one requested CPU corresponds to
        1024 cgroup shares). Falls back to 1.0 if the file cannot be read.
    """
    try:
        # Context manager ensures the file handle is closed promptly.
        with open(cpu_share_file_name) as cpu_share_file:
            cpu_shares = int(cpu_share_file.read())
        return cpu_shares / 1024
    except Exception:
        # BUGFIX: previously the caught exception was passed as a positional
        # argument to logger.exception; with no %-placeholder in the message
        # that raised a formatting error inside the logging module.
        # logger.exception already records the active traceback.
        logger.exception("Error computing CPU limit of Ray Kubernetes pod.")
        return 1.0
|
||||
|
||||
|
||||
def get_num_cpus() -> int:
|
||||
if "KUBERNETES_SERVICE_HOST" in os.environ:
|
||||
# If in a K8S pod, use cgroup cpu shares and round up.
|
||||
return int(math.ceil(get_k8s_cpus()))
|
||||
cpu_count = multiprocessing.cpu_count()
|
||||
if os.environ.get("RAY_USE_MULTIPROCESSING_CPU_COUNT"):
|
||||
logger.info(
|
||||
|
|
|
@ -74,6 +74,7 @@ if __name__ == "__main__":
|
|||
do_link("experimental", force=args.yes)
|
||||
do_link("util", force=args.yes)
|
||||
do_link("serve", force=args.yes)
|
||||
do_link("_private", force=args.yes)
|
||||
# Link package's `new_dashboard` directly to local (repo's) dashboard.
|
||||
# The repo's `new_dashboard` is a file, soft-linking to which will not work
|
||||
# on Mac.
|
||||
|
|
|
@ -6,14 +6,17 @@ import sys
|
|||
import socket
|
||||
import tempfile
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
import numpy as np
|
||||
import pickle
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
from ray.new_dashboard import k8s_utils
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray.util.accelerators
|
||||
import ray._private.utils
|
||||
import ray._private.cluster_utils
|
||||
import ray.test_utils
|
||||
from ray import resource_spec
|
||||
|
@ -601,7 +604,7 @@ def test_detect_docker_cpus():
|
|||
cpuset_file.flush()
|
||||
assert ray._private.utils._get_docker_cpus(
|
||||
cpu_quota_file_name=quota_file.name,
|
||||
cpu_share_file_name=period_file.name,
|
||||
cpu_period_file_name=period_file.name,
|
||||
cpuset_file_name=cpuset_file.name) == 64
|
||||
|
||||
# No cpuset used
|
||||
|
@ -617,7 +620,7 @@ def test_detect_docker_cpus():
|
|||
cpuset_file.flush()
|
||||
assert ray._private.utils._get_docker_cpus(
|
||||
cpu_quota_file_name=quota_file.name,
|
||||
cpu_share_file_name=period_file.name,
|
||||
cpu_period_file_name=period_file.name,
|
||||
cpuset_file_name=cpuset_file.name) == 26
|
||||
|
||||
# Quota set
|
||||
|
@ -633,10 +636,103 @@ def test_detect_docker_cpus():
|
|||
cpuset_file.flush()
|
||||
assert ray._private.utils._get_docker_cpus(
|
||||
cpu_quota_file_name=quota_file.name,
|
||||
cpu_share_file_name=period_file.name,
|
||||
cpu_period_file_name=period_file.name,
|
||||
cpuset_file_name=cpuset_file.name) == 0.42
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    sys.platform.startswith("win"), reason="No need to test on Windows.")
def test_k8s_cpu():
    """Test all the functions in dashboard/k8s_utils.py.
    Also test ray._private.utils.get_num_cpus when running in a K8s pod.
    Files were obtained from within a K8s pod with 2 CPU request, CPU limit
    unset, with 1 CPU of stress applied.
    """

    # Some experimentally-obtained K8S CPU usage files for use in test_k8s_cpu.
    # Two /proc/stat snapshots taken ~1 second apart on an 8-CPU host.
    PROCSTAT1 = \
        """cpu 2945022 98 3329420 148744854 39522 0 118587 0 0 0
cpu0 370299 14 413841 18589778 5304 0 15288 0 0 0
cpu1 378637 10 414414 18589275 5283 0 14731 0 0 0
cpu2 367328 8 420914 18590974 4844 0 14416 0 0 0
cpu3 368378 11 423720 18572899 4948 0 14394 0 0 0
cpu4 369051 13 414615 18607285 4736 0 14383 0 0 0
cpu5 362958 10 415984 18576655 4590 0 16614 0 0 0
cpu6 362536 13 414430 18605197 4785 0 14353 0 0 0
cpu7 365833 15 411499 18612787 5028 0 14405 0 0 0
intr 1000694027 125 0 0 39 154 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1028 0 2160913 0 2779605 8 0 3981333 3665198 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ctxt 1574979439
btime 1615208601
processes 857411
procs_running 6
procs_blocked 0
softirq 524311775 0 230142964 27143 63542182 0 0 171 74042767 0 156556548
"""  # noqa

    PROCSTAT2 = \
        """cpu 2945152 98 3329436 148745483 39522 0 118587 0 0 0
cpu0 370399 14 413841 18589778 5304 0 15288 0 0 0
cpu1 378647 10 414415 18589362 5283 0 14731 0 0 0
cpu2 367329 8 420916 18591067 4844 0 14416 0 0 0
cpu3 368381 11 423724 18572989 4948 0 14395 0 0 0
cpu4 369052 13 414618 18607374 4736 0 14383 0 0 0
cpu5 362968 10 415986 18576741 4590 0 16614 0 0 0
cpu6 362537 13 414432 18605290 4785 0 14353 0 0 0
cpu7 365836 15 411502 18612878 5028 0 14405 0 0 0
intr 1000700905 125 0 0 39 154 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1028 0 2160923 0 2779605 8 0 3981353 3665218 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ctxt 1574988760
btime 1615208601
processes 857411
procs_running 4
procs_blocked 0
softirq 524317451 0 230145523 27143 63542930 0 0 171 74043232 0 156558452
"""  # noqa

    # cpuacct.usage snapshots (nanoseconds), taken at the same two instants.
    CPUACCTUSAGE1 = "2268980984108"

    CPUACCTUSAGE2 = "2270120061999"

    # 2048 cgroup cpu shares == 2 requested CPUs.
    CPUSHARES = "2048"

    shares_file, cpu_file, proc_stat_file = [
        tempfile.NamedTemporaryFile("w+") for _ in range(3)
    ]
    shares_file.write(CPUSHARES)
    cpu_file.write(CPUACCTUSAGE1)
    proc_stat_file.write(PROCSTAT1)
    for file in shares_file, cpu_file, proc_stat_file:
        file.flush()
    # Patch in the fixture files. Note: the os.environ replacement is a set,
    # which is sufficient for the "KUBERNETES_SERVICE_HOST" in os.environ
    # membership checks performed by the code under test. The __defaults__
    # patch redirects get_k8s_cpus's default share-file argument.
    with mock.patch("ray._private.utils.os.environ",
                    {"KUBERNETES_SERVICE_HOST"}),\
            mock.patch("ray.new_dashboard.k8s_utils.CPU_USAGE_PATH",
                       cpu_file.name),\
            mock.patch("ray.new_dashboard.k8s_utils.PROC_STAT_PATH",
                       proc_stat_file.name),\
            mock.patch("ray._private.utils.get_k8s_cpus.__defaults__",
                       (shares_file.name,)):

        # Test helpers
        assert ray._private.utils.get_num_cpus() == 2
        assert k8s_utils._cpu_usage() == 2268980984108
        assert k8s_utils._system_usage() == 1551775030000000
        assert k8s_utils._host_num_cpus() == 8

        # No delta for first computation, return 0.
        assert k8s_utils.cpu_percent() == 0.0

        # Write new usage info obtained after 1 sec wait.
        for file in cpu_file, proc_stat_file:
            file.truncate(0)
            file.seek(0)
        cpu_file.write(CPUACCTUSAGE2)
        proc_stat_file.write(PROCSTAT2)
        for file in cpu_file, proc_stat_file:
            file.flush()

        # Files were extracted under 1 CPU of load on a 2 CPU pod
        assert 50 < k8s_utils.cpu_percent() < 60
|
||||
|
||||
|
||||
def test_override_environment_variables_task(ray_start_regular):
|
||||
@ray.remote
|
||||
def get_env(key):
|
||||
|
|
Loading…
Add table
Reference in a new issue