ray/dashboard/modules/log/tests/test_log.py
jon-chuang ddcc252b51
[Core] Ray logs API (1/n) (#23435)
Expose HTTP endpoint to retrieve logs from ray cluster
2022-04-20 23:11:02 -07:00

247 lines
7.9 KiB
Python

import sys
import logging
import requests
import time
import traceback
import html.parser
import urllib.parse
import json
import os
from ray.dashboard.tests.conftest import * # noqa
import pytest
import ray
from ray._private.test_utils import (
format_web_url,
wait_until_server_available,
)
from ray.dashboard.modules.log.log_agent import tail as tail_file
logger = logging.getLogger(__name__)
class LogUrlParser(html.parser.HTMLParser):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._urls = []
def handle_starttag(self, tag, attrs):
if tag == "a":
self._urls.append(dict(attrs)["href"])
def error(self, message):
logger.error(message)
def get_urls(self):
return self._urls
def test_log(disable_aiohttp_cache, ray_start_with_dashboard):
@ray.remote
def write_log(s):
print(s)
test_log_text = "test_log_text"
ray.get(write_log.remote(test_log_text))
test_file = "test.log"
with open(
f"{ray.worker.global_worker.node.get_logs_dir_path()}/{test_file}", "w"
) as f:
f.write(test_log_text)
assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
webui_url = ray_start_with_dashboard["webui_url"]
webui_url = format_web_url(webui_url)
node_id = ray_start_with_dashboard["node_id"]
timeout_seconds = 10
start_time = time.time()
last_ex = None
while True:
time.sleep(1)
try:
response = requests.get(webui_url + "/log_index")
response.raise_for_status()
parser = LogUrlParser()
parser.feed(response.text)
all_nodes_log_urls = parser.get_urls()
assert len(all_nodes_log_urls) == 1
response = requests.get(all_nodes_log_urls[0])
response.raise_for_status()
parser = LogUrlParser()
parser.feed(response.text)
# Search test_log_text from all worker logs.
parsed_url = urllib.parse.urlparse(all_nodes_log_urls[0])
paths = parser.get_urls()
urls = []
for p in paths:
if "worker" in p:
urls.append(parsed_url._replace(path=p).geturl())
for u in urls:
response = requests.get(u)
response.raise_for_status()
if test_log_text in response.text:
break
else:
raise Exception(f"Can't find {test_log_text} from {urls}")
# Test range request.
response = requests.get(
webui_url + f"/logs/{test_file}", headers={"Range": "bytes=2-5"}
)
response.raise_for_status()
assert response.text == test_log_text[2:6]
# Test logUrl in node info.
response = requests.get(webui_url + f"/nodes/{node_id}")
response.raise_for_status()
node_info = response.json()
assert node_info["result"] is True
node_info = node_info["data"]["detail"]
assert "logUrl" in node_info
assert node_info["logUrl"] in all_nodes_log_urls
break
except Exception as ex:
last_ex = ex
finally:
if time.time() > start_time + timeout_seconds:
ex_stack = (
traceback.format_exception(
type(last_ex), last_ex, last_ex.__traceback__
)
if last_ex
else []
)
ex_stack = "".join(ex_stack)
raise Exception(f"Timed out while testing, {ex_stack}")
def test_log_proxy(ray_start_with_dashboard):
assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
webui_url = ray_start_with_dashboard["webui_url"]
webui_url = format_web_url(webui_url)
timeout_seconds = 5
start_time = time.time()
last_ex = None
test_log_text = "test_log_text"
test_file = "test.log"
with open(
f"{ray.worker.global_worker.node.get_logs_dir_path()}/{test_file}", "w"
) as f:
f.write(test_log_text)
while True:
time.sleep(1)
try:
# Test range request.
response = requests.get(
f"{webui_url}/log_proxy?url={webui_url}/logs/{test_file}",
headers={"Range": "bytes=2-5"},
)
response.raise_for_status()
assert response.text == test_log_text[2:6]
# Test 404.
response = requests.get(
f"{webui_url}/log_proxy?" f"url={webui_url}/logs/not_exist_file.log"
)
assert response.status_code == 404
break
except Exception as ex:
last_ex = ex
finally:
if time.time() > start_time + timeout_seconds:
ex_stack = (
traceback.format_exception(
type(last_ex), last_ex, last_ex.__traceback__
)
if last_ex
else []
)
ex_stack = "".join(ex_stack)
raise Exception(f"Timed out while testing, {ex_stack}")
def test_logs_experimental_list(ray_start_with_dashboard):
assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
webui_url = ray_start_with_dashboard["webui_url"]
webui_url = format_web_url(webui_url)
# test that list logs is comprehensive
response = requests.get(webui_url + "/api/experimental/logs/list")
response.raise_for_status()
logs = json.loads(response.text)
assert len(logs) == 1
node_id = next(iter(logs))
# test worker logs
outs = logs[node_id]["worker_outs"]
errs = logs[node_id]["worker_outs"]
core_worker_logs = logs[node_id]["python_core_worker_logs"]
assert len(outs) == len(errs) == len(core_worker_logs)
assert len(outs) > 0
for file in ["debug_state_gcs.txt", "gcs_server.out", "gcs_server.err"]:
assert file in logs[node_id]["gcs_server"]
for file in ["raylet.out", "raylet.err"]:
assert file in logs[node_id]["raylet"]
for file in ["dashboard_agent.log", "dashboard.log"]:
assert file in logs[node_id]["dashboard"]
return True
# Test that logs/list can be filtered
response = requests.get(webui_url + "/api/experimental/logs/list?filters=gcs")
response.raise_for_status()
logs = json.loads(response.text)
assert len(logs) == 1
node_id = next(iter(logs))
assert "gcs_server" in logs[node_id]
for category in logs[node_id]:
if category != "gcs_server":
assert len(logs[node_id][category]) == 0
response = requests.get(webui_url + "/api/experimental/logs/list?filters=worker")
response.raise_for_status()
logs = json.loads(response.text)
assert len(logs) == 1
node_id = next(iter(logs))
worker_log_categories = [
"python_core_worker_logs",
"worker_outs",
"worker_errors",
]
assert all([cat in logs[node_id] for cat in worker_log_categories])
for category in logs[node_id]:
if category not in worker_log_categories:
assert len(logs[node_id][category]) == 0
def test_logs_tail():
"""
Unit test for tail
"""
TOTAL_LINES = 1000
FILE_NAME = "test_file.txt"
try:
with open(FILE_NAME, "w") as f:
for i in range(TOTAL_LINES):
f.write(f"Message {i:4}\n")
file = open(FILE_NAME, "rb")
text, byte_pos = tail_file(file, 100)
assert byte_pos == TOTAL_LINES * len(b"Message 1000\n")
lines = text.decode("utf-8").split("\n")
assert len(lines) == 100
assert lines[0] == "Message 900"
assert lines[99] == "Message 999"
except Exception as e:
raise e
finally:
if os.path.exists(FILE_NAME):
os.remove(FILE_NAME)
if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))