Mirror of https://github.com/vale981/ray, synced 2025-03-10 05:16:49 -04:00

https://github.com/ray-project/ray/pull/14676 disabled the disk usage/total display for Ray nodes on K8s, because Ray nodes on K8s are run as pods, which in general do not use up the entire machine. However, in some situations, it is useful to run one Ray pod per K8s node and report the disk usage. This PR adds a flag to enable displaying disk usage in those situations.
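The test at the bottom of this file (test_enable_k8s_disk_usage) exercises that flag by patching the module constants IN_KUBERNETES_POD and ENABLE_K8S_DISK_USAGE in ray.dashboard.modules.reporter.reporter_agent. The following is a minimal, hypothetical sketch of the gating logic those patch targets and assertions imply; it is not the actual reporter_agent implementation, and the dummy values (total=1, free=1) are taken from the test's assertions.

# Hypothetical sketch of the disk-usage gate exercised by test_enable_k8s_disk_usage.
# Names IN_KUBERNETES_POD and ENABLE_K8S_DISK_USAGE mirror the patch targets below;
# the real reporter_agent may differ.
import collections
import psutil

IN_KUBERNETES_POD = True        # normally detected from the pod environment
ENABLE_K8S_DISK_USAGE = False   # the opt-in flag added by this PR

Usage = collections.namedtuple("Usage", ["total", "used", "free", "percent"])

def get_disk_usage():
    if IN_KUBERNETES_POD and not ENABLE_K8S_DISK_USAGE:
        # In a pod, per-node disk stats are misleading by default, so return
        # dummy values (total=1, free=1) that the dashboard treats as "disabled".
        return {"/": Usage(total=1, used=0, free=1, percent=0.0)}
    # With the flag enabled (one Ray pod per K8s node), report real usage.
    return {"/": psutil.disk_usage("/")}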
271 lines · 9.5 KiB · Python
import os
import sys
import logging
from mock import patch
import requests
import time

import pytest
import ray
from ray import ray_constants
from ray.dashboard.tests.conftest import *  # noqa
from ray.dashboard.utils import Bunch
from ray.dashboard.modules.reporter.reporter_agent import ReporterAgent
from ray._private.test_utils import (
    format_web_url,
    RayTestTimeoutException,
    wait_until_server_available,
    wait_for_condition,
    fetch_prometheus,
)

try:
    import prometheus_client
except ImportError:
    prometheus_client = None

logger = logging.getLogger(__name__)


def test_profiling(shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=2)
    class Actor:
        def getpid(self):
            return os.getpid()

    c = Actor.remote()
    actor_pid = ray.get(c.getpid.remote())

    webui_url = addresses["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    start_time = time.time()
    launch_profiling = None
    while True:
        # Sometimes some startup time is required
        if time.time() - start_time > 15:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats, "
                f"launch_profiling: {launch_profiling}"
            )
        launch_profiling = requests.get(
            webui_url + "/api/launch_profiling",
            params={
                "ip": ray.nodes()[0]["NodeManagerAddress"],
                "pid": actor_pid,
                "duration": 5,
            },
        ).json()
        if launch_profiling["result"]:
            profiling_info = launch_profiling["data"]["profilingInfo"]
            break
        time.sleep(1)
    logger.info(profiling_info)


def test_node_physical_stats(enable_test_module, shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=1)
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote() for _ in range(6)]
    actor_pids = ray.get([actor.getpid.remote() for actor in actors])
    actor_pids = set(actor_pids)

    webui_url = addresses["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    def _check_workers():
        try:
            resp = requests.get(webui_url + "/test/dump?key=node_physical_stats")
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True
            node_physical_stats = result["data"]["nodePhysicalStats"]
            assert len(node_physical_stats) == 1
            current_stats = node_physical_stats[addresses["node_id"]]
            # Check Actor workers
            current_actor_pids = set()
            for worker in current_stats["workers"]:
                if "ray::Actor" in worker["cmdline"][0]:
                    current_actor_pids.add(worker["pid"])
            assert current_actor_pids == actor_pids
            # Check raylet cmdline
            assert "raylet" in current_stats["cmdline"][0]
            return True
        except Exception as ex:
            logger.info(ex)
            return False

    wait_for_condition(_check_workers, timeout=10)


@pytest.mark.skipif(prometheus_client is None, reason="prometheus_client not installed")
def test_prometheus_physical_stats_record(enable_test_module, shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=1)
    metrics_export_port = addresses["metrics_export_port"]
    addr = addresses["raylet_ip_address"]
    prom_addresses = [f"{addr}:{metrics_export_port}"]

    def test_case_stats_exist():
        components_dict, metric_names, metric_samples = fetch_prometheus(prom_addresses)
        return all(
            [
                "ray_node_cpu_utilization" in metric_names,
                "ray_node_cpu_count" in metric_names,
                "ray_node_mem_used" in metric_names,
                "ray_node_mem_available" in metric_names,
                "ray_node_mem_total" in metric_names,
                "ray_raylet_cpu" in metric_names,
                "ray_raylet_mem" in metric_names,
                "ray_node_disk_io_read" in metric_names,
                "ray_node_disk_io_write" in metric_names,
                "ray_node_disk_io_read_count" in metric_names,
                "ray_node_disk_io_write_count" in metric_names,
                "ray_node_disk_io_read_speed" in metric_names,
                "ray_node_disk_io_write_speed" in metric_names,
                "ray_node_disk_read_iops" in metric_names,
                "ray_node_disk_write_iops" in metric_names,
                "ray_node_disk_usage" in metric_names,
                "ray_node_disk_free" in metric_names,
                "ray_node_disk_utilization_percentage" in metric_names,
                "ray_node_network_sent" in metric_names,
                "ray_node_network_received" in metric_names,
                "ray_node_network_send_speed" in metric_names,
                "ray_node_network_receive_speed" in metric_names,
            ]
        )

    def test_case_ip_correct():
        components_dict, metric_names, metric_samples = fetch_prometheus(prom_addresses)
        raylet_proc = ray.worker._global_node.all_processes[
            ray_constants.PROCESS_TYPE_RAYLET
        ][0]
        raylet_pid = None
        # Find the raylet pid recorded in the tag.
        for sample in metric_samples:
            if sample.name == "ray_raylet_cpu":
                raylet_pid = sample.labels["pid"]
                break
        return str(raylet_proc.process.pid) == str(raylet_pid)

    wait_for_condition(test_case_stats_exist, retry_interval_ms=1000)
    wait_for_condition(test_case_ip_correct, retry_interval_ms=1000)


def test_report_stats():
    class ReporterAgentDummy(object):
        pass

    obj = ReporterAgentDummy()
    obj._is_head_node = True

    test_stats = {
        "now": 1614826393.975763,
        "hostname": "fake_hostname.local",
        "ip": "127.0.0.1",
        "cpu": 57.4,
        "cpus": (8, 4),
        "mem": (17179869184, 5723353088, 66.7, 9234341888),
        "workers": [
            {
                "memory_info": Bunch(
                    rss=55934976, vms=7026937856, pfaults=15354, pageins=0
                ),
                "cpu_percent": 0.0,
                "cmdline": ["ray::IDLE", "", "", "", "", "", "", "", "", "", "", ""],
                "create_time": 1614826391.338613,
                "pid": 7174,
                "cpu_times": Bunch(
                    user=0.607899328,
                    system=0.274044032,
                    children_user=0.0,
                    children_system=0.0,
                ),
            }
        ],
        "raylet": {
            "memory_info": Bunch(rss=18354176, vms=6921486336, pfaults=6206, pageins=3),
            "cpu_percent": 0.0,
            "cmdline": ["fake raylet cmdline"],
            "create_time": 1614826390.274854,
            "pid": 7153,
            "cpu_times": Bunch(
                user=0.03683138,
                system=0.035913716,
                children_user=0.0,
                children_system=0.0,
            ),
        },
        "bootTime": 1612934656.0,
        "loadAvg": ((4.4521484375, 3.61083984375, 3.5400390625), (0.56, 0.45, 0.44)),
        "disk_io": (100, 100, 100, 100),
        "disk_io_speed": (100, 100, 100, 100),
        "disk": {
            "/": Bunch(
                total=250790436864, used=11316781056, free=22748921856, percent=33.2
            ),
            "/tmp": Bunch(
                total=250790436864, used=209532035072, free=22748921856, percent=90.2
            ),
        },
        "gpus": [],
        "network": (13621160960, 11914936320),
        "network_speed": (8.435062128545095, 7.378462703142336),
    }

    cluster_stats = {
        "autoscaler_report": {
            "active_nodes": {"head_node": 1, "worker-node-0": 2},
            "failed_nodes": [],
            "pending_launches": {},
            "pending_nodes": [],
        }
    }

    records = ReporterAgent._record_stats(obj, test_stats, cluster_stats)
    assert len(records) == 24
    # Test stats without raylets
    test_stats["raylet"] = {}
    records = ReporterAgent._record_stats(obj, test_stats, cluster_stats)
    assert len(records) == 22
    # Test stats with gpus
    test_stats["gpus"] = [
        {"utilization_gpu": 1, "memory_used": 100, "memory_total": 1000}
    ]
    records = ReporterAgent._record_stats(obj, test_stats, cluster_stats)
    assert len(records) == 26
    # Test stats without autoscaler report
    cluster_stats = {}
    records = ReporterAgent._record_stats(obj, test_stats, cluster_stats)
    assert len(records) == 24


@pytest.mark.parametrize("enable_k8s_disk_usage", [True, False])
def test_enable_k8s_disk_usage(enable_k8s_disk_usage: bool):
    """Test enabling display of K8s node disk usage when in a K8s pod."""
    with patch.multiple(
        "ray.dashboard.modules.reporter.reporter_agent",
        IN_KUBERNETES_POD=True,
        ENABLE_K8S_DISK_USAGE=enable_k8s_disk_usage,
    ):
        root_usage = ReporterAgent._get_disk_usage()["/"]
        if enable_k8s_disk_usage:
            # Since K8s disk usage is enabled, we should get non-dummy values.
            assert root_usage.total != 1
            assert root_usage.free != 1
        else:
            # K8s disk usage display is not enabled, so we should get dummy values.
            assert root_usage.total == 1
            assert root_usage.free == 1


if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))