[CI] Create zip of ray session_latest/logs dir on test failure and upload to buildkite via /artifact-mount (#23783)

Creates a zip of the latest Ray session's logs dir (session_latest/logs), named with the test name and a timestamp, upon Python test failure. Writes to the dir specified by the env var `RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR`. No-op if the env var is not set. A downstream consumer (e.g. CI) can upload all artifacts created in this dir. Thereby, PR submitters can more easily debug their CI failures, especially if they can't repro locally. Limitations: a conftest.py file importing the hook from the main ray conftest.py needs to be present in the same dir as the test. This presents a challenge for e.g. dashboard tests, which are highly scattered.
2025-03-04 17:41:43 -05:00 · 2022-04-22 04:48:53 -04:00 · 2022-04-22 04:48:53 -04:00 · e6a458a31e
commit e6a458a31e
parent 1807cff9b6
13 changed files with 69 additions and 3 deletions
--- a/ci/README.md
+++ b/ci/README.md
@ -43,7 +43,7 @@ The following practices can avoid such pitfalls while maintaining intuitive cont
  (The sheer length of the script is a secondary concern and can be mitigated by keeping functions modular.)

 - Avoid adding new scripts if possible. If it's necessary that you do so, call them instead of sourcing them.
-  Note that thies implies new scripts should not modify the environment, or the caller will not see such changes!
+  Note that this implies new scripts should not modify the environment, or the caller will not see such changes!

 - Always add code inside a function, not at global scope. Use `local` for variables where it makes sense.
  However, be careful and know the shell rules: for example, e.g. `local x=$(false)` succeeds even under `set -e`.
--- a/ci/ci.sh
+++ b/ci/ci.sh
@ -191,7 +191,8 @@ test_python() {

 # For running large Python tests on Linux and MacOS.
 test_large() {
-  bazel test --config=ci "$(./ci/run/bazel_export_options)" --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
+  # shellcheck disable=SC2046
+  bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
      --test_env=CONDA_SHLVL --test_env=CONDA_PREFIX --test_env=CONDA_DEFAULT_ENV --test_env=CONDA_PROMPT_MODIFIER \
      --test_env=CI --test_tag_filters="large_size_python_tests_shard_${BUILDKITE_PARALLEL_JOB}" \
      -- python/ray/tests/...
--- a/ci/run/bazel_export_options
+++ b/ci/run/bazel_export_options
@ -1,4 +1,7 @@
 #!/usr/bin/env bash

 mkdir -p /tmp/bazel_event_logs
-echo "--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)"
+event_json_flag=--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)
+logs_archive_flag=--test_env=RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR=/artifact-mount/.failed_test_logs
+
+echo "${event_json_flag} ${logs_archive_flag}"
--- a/python/ray/data/tests/conftest.py
+++ b/python/ray/data/tests/conftest.py
@ -9,6 +9,9 @@ from ray.data.block import BlockAccessor
 from ray.data.tests.mock_server import *  # noqa
 from ray.data.datasource.file_based_datasource import BlockWritePathProvider

+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
+

@pytest.fixture(scope="function")
 def aws_credentials():
--- a/python/ray/experimental/dag/tests/conftest.py
+++ b/python/ray/experimental/dag/tests/conftest.py
@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
--- a/python/ray/ml/tests/conftest.py
+++ b/python/ray/ml/tests/conftest.py
@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
--- a/python/ray/serve/tests/conftest.py
+++ b/python/ray/serve/tests/conftest.py
@ -8,6 +8,8 @@ import random
 import ray
 from ray import serve

+from ray.tests.conftest import pytest_runtest_makereport  # noqa
+
 # https://tools.ietf.org/html/rfc6335#section-6
 MIN_DYNAMIC_PORT = 49152
 MAX_DYNAMIC_PORT = 65535
--- a/python/ray/tests/conftest.py
+++ b/python/ray/tests/conftest.py
@ -11,6 +11,9 @@ import json
 import time
 from pathlib import Path
 from unittest import mock
+import shutil
+import platform
+from tempfile import gettempdir

 import ray
 import ray.ray_constants as ray_constants
@ -679,3 +682,40 @@ def set_bad_runtime_env_cache_ttl_seconds(request):
    os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"] = ttl
    yield ttl
    del os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"]
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    """Zip the latest Ray session logs to the archive dir when a test fails."""
+    # Let all other hooks run first so the finished report is available
+    outcome = yield
+    rep = outcome.get_result()
+
+    # We temporarily restrict to Linux until we have artifact dirs
+    # for Windows and Mac
+    if platform.system() != "Linux":
+        return
+
+    # Only archive failed tests after the "call" phase of the test
+    if rep.when != "call" or not rep.failed:
+        return
+
+    # Get dir to write zipped logs to; no-op when the env var is unset or empty
+    archive_dir = os.environ.get("RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR")
+
+    if not archive_dir:
+        return
+
+    # exist_ok avoids FileExistsError if another process creates the dir first
+    os.makedirs(archive_dir, exist_ok=True)
+
+    # Get logs dir from the latest ray session
+    logs_dir = os.path.join(gettempdir(), "ray", "session_latest", "logs")
+
+    if not os.path.exists(logs_dir):
+        return
+
+    # Write zipped logs to logs archive dir
+    test_name = rep.nodeid.replace(os.sep, "::")
+    output_file = os.path.join(archive_dir, f"{test_name}_{time.time():.4f}")
+    shutil.make_archive(output_file, "zip", logs_dir)
--- a/python/ray/train/tests/conftest.py
+++ b/python/ray/train/tests/conftest.py
@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
--- a/python/ray/train/tests/test_backend.py
+++ b/python/ray/train/tests/test_backend.py
@ -23,6 +23,9 @@ from ray.train.constants import (
 from ray.train.worker_group import WorkerGroup
 from ray.util.placement_group import get_current_placement_group

+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
+

@pytest.fixture
 def ray_start_2_cpus():
--- a/python/ray/tune/tests/conftest.py
+++ b/python/ray/tune/tests/conftest.py
@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
--- a/python/ray/workflow/tests/conftest.py
+++ b/python/ray/workflow/tests/conftest.py
@ -7,6 +7,9 @@ from mock_server import start_service, stop_process

 import tempfile
 from ray.tests.conftest import get_default_fixture_ray_kwargs
+
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
 import os
 import uuid
 from ray.workflow.tests import utils
--- a/rllib/tests/conftest.py
+++ b/rllib/tests/conftest.py
@ -1 +1,4 @@
 from ray.tests.conftest import ray_start_regular_shared  # noqa: F401
+
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa