[CI] Create zip of ray session_latest/logs dir on test failure and upload to buildkite via /artifact-mount (#23783)

Creates a zip of the Ray `session_latest/logs` dir, named with the test name and a timestamp, upon Python test failure. Writes to the dir specified by the env var `RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR`. No-op if the env var is not set.

Downstream consumer (e.g. CI) can upload all created artifacts in this dir. Thereby, PR submitters can more easily debug their CI failures, especially if they can't repro locally.

Limitations:
- A conftest.py file importing the `pytest_runtest_makereport` hook from the main Ray conftest.py (`ray.tests.conftest`) needs to be present in the same dir as the test. This presents a challenge for, e.g., dashboard tests, which are highly scattered.
This commit is contained in:
jon-chuang 2022-04-22 04:48:53 -04:00 committed by GitHub
parent 1807cff9b6
commit e6a458a31e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 69 additions and 3 deletions

View file

@ -43,7 +43,7 @@ The following practices can avoid such pitfalls while maintaining intuitive cont
(The sheer length of the script is a secondary concern and can be mitigated by keeping functions modular.)
- Avoid adding new scripts if possible. If it's necessary that you do so, call them instead of sourcing them.
Note that thies implies new scripts should not modify the environment, or the caller will not see such changes!
Note that this implies new scripts should not modify the environment, or the caller will not see such changes!
- Always add code inside a function, not at global scope. Use `local` for variables where it makes sense.
However, be careful and know the shell rules: for example, `local x=$(false)` succeeds even under `set -e`.

View file

@ -191,7 +191,8 @@ test_python() {
# For running large Python tests on Linux and MacOS.
test_large() {
bazel test --config=ci "$(./ci/run/bazel_export_options)" --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
# shellcheck disable=SC2046
bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
--test_env=CONDA_SHLVL --test_env=CONDA_PREFIX --test_env=CONDA_DEFAULT_ENV --test_env=CONDA_PROMPT_MODIFIER \
--test_env=CI --test_tag_filters="large_size_python_tests_shard_${BUILDKITE_PARALLEL_JOB}" \
-- python/ray/tests/...

View file

@ -1,4 +1,7 @@
#!/usr/bin/env bash
mkdir -p /tmp/bazel_event_logs
echo "--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)"
event_json_flag=--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)
logs_archive_flag=--test_env=RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR=/artifact-mount/.failed_test_logs
echo "${event_json_flag} ${logs_archive_flag}"

View file

@ -9,6 +9,9 @@ from ray.data.block import BlockAccessor
from ray.data.tests.mock_server import * # noqa
from ray.data.datasource.file_based_datasource import BlockWritePathProvider
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa
@pytest.fixture(scope="function")
def aws_credentials():

View file

@ -0,0 +1,2 @@
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa

View file

@ -0,0 +1,2 @@
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa

View file

@ -8,6 +8,8 @@ import random
import ray
from ray import serve
from ray.tests.conftest import pytest_runtest_makereport # noqa
# https://tools.ietf.org/html/rfc6335#section-6
MIN_DYNAMIC_PORT = 49152
MAX_DYNAMIC_PORT = 65535

View file

@ -11,6 +11,9 @@ import json
import time
from pathlib import Path
from unittest import mock
import shutil
import platform
from tempfile import gettempdir
import ray
import ray.ray_constants as ray_constants
@ -679,3 +682,40 @@ def set_bad_runtime_env_cache_ttl_seconds(request):
os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"] = ttl
yield ttl
del os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"]
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Zip the latest Ray session logs into the archive dir when a test fails.

    Registered as a pytest hook wrapper: it yields so the other
    ``pytest_runtest_makereport`` hooks run first, then inspects the
    resulting report. An archive is written only when all of these hold:

    * the platform is Linux,
    * the report is for the "call" phase and is marked failed,
    * the env var ``RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR`` is set and non-empty,
    * ``<tempdir>/ray/session_latest/logs`` exists.

    The archive is named ``<test node id>_<unix timestamp>.zip`` and is
    placed in the directory named by the env var, which is created if
    missing.

    Args:
        item: The test item passed by pytest (unused here).
        call: The call info passed by pytest (unused here).
    """
    # Execute all other hooks to obtain the report object.
    outcome = yield
    rep = outcome.get_result()

    # We temporarily restrict to Linux until we have artifact dirs
    # for Windows and Mac.
    if platform.system() != "Linux":
        return

    # Only archive failed tests after the "call" phase of the test.
    if rep.when != "call" or not rep.failed:
        return

    # Get dir to write zipped logs to; unset or empty means archiving is off.
    archive_dir = os.environ.get("RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR")
    if not archive_dir:
        return
    # exist_ok=True instead of a separate exists() check: another process
    # may create the directory between the check and the makedirs call,
    # which would otherwise raise FileExistsError from inside this hook.
    os.makedirs(archive_dir, exist_ok=True)

    # Get logs dir from the latest ray session.
    tmp_dir = gettempdir()
    logs_dir = os.path.join(tmp_dir, "ray", "session_latest", "logs")
    if not os.path.exists(logs_dir):
        return

    # Write zipped logs to logs archive dir. Path separators in the node id
    # are replaced so the archive name stays a single path component.
    test_name = rep.nodeid.replace(os.sep, "::")
    output_file = os.path.join(archive_dir, f"{test_name}_{time.time():.4f}")
    shutil.make_archive(output_file, "zip", logs_dir)

View file

@ -0,0 +1,2 @@
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa

View file

@ -23,6 +23,9 @@ from ray.train.constants import (
from ray.train.worker_group import WorkerGroup
from ray.util.placement_group import get_current_placement_group
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa
@pytest.fixture
def ray_start_2_cpus():

View file

@ -0,0 +1,2 @@
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa

View file

@ -7,6 +7,9 @@ from mock_server import start_service, stop_process
import tempfile
from ray.tests.conftest import get_default_fixture_ray_kwargs
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa
import os
import uuid
from ray.workflow.tests import utils

View file

@ -1 +1,4 @@
from ray.tests.conftest import ray_start_regular_shared # noqa: F401
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa