ray/release/run_release_test.sh

#!/bin/bash

# Abort on the first unhandled command failure.
set -e

# Trace every executed command when DEBUG is set to a non-empty value.
if [ -n "$DEBUG" ]; then
  set -x
fi

# Run from the directory that contains this script so that the relative paths
# used below resolve. `dirname` is used instead of "${0%/*}" because that
# expansion leaves $0 untouched when it contains no slash (e.g. when invoked
# as `bash run_release_test.sh`), which made `cd` target the script file
# itself and fail.
cd -- "$(dirname -- "$0")" || exit 1

reason() {
  # Print a human-readable label for a release test exit code.
  #
  # Arguments: $1 - integer exit code
  # Outputs:   the label on stdout; "unknown error" for codes outside the
  #            ranges handled below
  #
  # The label lives in a local variable with an explicit default. Previously
  # the global REASON was used without a fallback branch, so an unmapped code
  # (e.g. 25 or 137) echoed whatever reason an earlier call had left behind.
  #
  # Keep in sync with e2e.py ExitCode enum
  local code="$1"
  local label="unknown error"

  if [ "$code" -eq 0 ]; then
    label="success"
  elif [ "$code" -ge 1 ] && [ "$code" -lt 10 ]; then
    label="runtime error"
  elif [ "$code" -ge 10 ] && [ "$code" -lt 20 ]; then
    label="infra error"
  elif [ "$code" -ge 30 ] && [ "$code" -lt 40 ]; then
    label="infra timeout"
  elif [ "$code" -eq 42 ]; then
    # Must be tested before the generic 40-49 range below.
    label="command timeout"
  elif [ "$code" -ge 40 ] && [ "$code" -lt 50 ]; then
    label="command error"
  fi

  echo "${label}"
}

# Configuration defaults. Each default is applied only when the variable is
# completely unset; a value supplied by the environment (even an empty one)
# is kept as-is.
: "${RAY_TEST_SCRIPT=ray_release/scripts/run_release_test.py}"
: "${RAY_TEST_REPO=https://github.com/ray-project/ray.git}"
: "${RAY_TEST_BRANCH=master}"
: "${RELEASE_RESULTS_DIR=/tmp/artifacts}"

# Installing matching Ray wheels locally follows the BUILDKITE variable and is
# off when that variable is unset. Doing so is not a great idea if your OS is
# different to the one used in the product clusters, but it is needed in CI
# because reloading Ray within the python process does not work for protobuf
# changes.
INSTALL_MATCHING_RAY="${BUILDKITE-false}"

export RAY_TEST_REPO
export RAY_TEST_BRANCH
export RELEASE_RESULTS_DIR

# Install the Python dependencies of the release tooling unless NO_INSTALL is
# set to a non-empty value.
if [ -z "${NO_INSTALL}" ]; then
  pip install -q -r requirements.txt
  pip install -q -U boto3 botocore

  if [ "${INSTALL_MATCHING_RAY-false}" == "true" ]; then
    # Find ray-wheels parameter and install locally.
    # Start from an empty value so that a PARSED_RAY_WHEELS inherited from the
    # environment is never installed by accident.
    PARSED_RAY_WHEELS=""
    arg_index=0
    for arg in "$@"; do
      arg_index=$((arg_index + 1))
      case "$arg" in
        --ray-wheels)
          # Separate form: the value is the next positional parameter
          # (empty when the flag is the last argument).
          next_index=$((arg_index + 1))
          PARSED_RAY_WHEELS="${!next_index}"
          ;;
        --ray-wheels=*)
          # Joined form: the value follows the equals sign.
          PARSED_RAY_WHEELS="${arg#--ray-wheels=}"
          ;;
      esac
    done

    if [ -n "${PARSED_RAY_WHEELS}" ]; then
      echo "Installing Ray wheels locally: ${PARSED_RAY_WHEELS}"
      pip install -U --force-reinstall "${PARSED_RAY_WHEELS}"
    else
      echo "Warning: No Ray wheels found to install locally"
    fi
  fi
fi

# Unless NO_CLONE is set, take a shallow clone of the test repository branch
# into a fresh temporary directory and try to switch into its release/
# sub-directory. A failing pushd is tolerated and leaves the working
# directory unchanged.
if [[ -z "${NO_CLONE}" ]]; then
  TMPDIR="$(mktemp -d -t release-XXXXXXXXXX)"
  git clone \
    --depth 1 \
    --branch "${RAY_TEST_BRANCH}" \
    "${RAY_TEST_REPO}" \
    "${TMPDIR}"
  pushd "${TMPDIR}/release" || true
fi

# Editable install of whatever package lives in the current directory,
# unless NO_INSTALL is set.
if [[ -z "${NO_INSTALL}" ]]; then
  pip install -e .
fi

# Run the release test script, retrying on infra timeouts (exit codes 30-33)
# until MAX_RETRIES attempts have been made. Every attempt's exit code is
# recorded in ALL_EXIT_CODES; EXIT_CODE holds the code of the last attempt.
RETRY_NUM=0
MAX_RETRIES=${MAX_RETRIES-1}

if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
  echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
  MAX_RETRIES=1
fi

# A MAX_RETRIES that is not a positive integer would skip the loop entirely,
# leaving EXIT_CODE unset and letting the script finish without running any
# test. Fall back to a single attempt instead.
if ! [[ "${MAX_RETRIES}" =~ ^[0-9]+$ ]] || [ "${MAX_RETRIES}" -lt 1 ]; then
  echo "Warning: invalid MAX_RETRIES '${MAX_RETRIES}', falling back to 1." >&2
  MAX_RETRIES=1
fi

ALL_EXIT_CODES=()
while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
  RETRY_NUM=$((RETRY_NUM + 1))

  if [ "$RETRY_NUM" -gt 1 ]; then
    # Sleep for a random 1800-7199 seconds (30 to just under 120 minutes)
    # before a retry, unless OVERRIDE_SLEEP_TIME provides a fixed duration.
    SLEEP_TIME=$((1800 + RANDOM % 5400))

    if [ -n "${OVERRIDE_SLEEP_TIME}" ]; then
      SLEEP_TIME=${OVERRIDE_SLEEP_TIME}
    fi

    echo "----------------------------------------"
    echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
    echo "----------------------------------------"
    sleep "${SLEEP_TIME}"
  fi

  if [ -z "${NO_ARTIFACTS}" ]; then
    # Clear results of a previous attempt. The :? guard aborts the script
    # when RELEASE_RESULTS_DIR is empty or unset; without it the glob would
    # expand to /* and be removed with root privileges.
    sudo rm -rf "${RELEASE_RESULTS_DIR:?RELEASE_RESULTS_DIR must not be empty}"/* || true
  fi

  # Capture the exit status without letting `set -e` terminate the script.
  EXIT_CODE=0
  python "${RAY_TEST_SCRIPT}" "$@" || EXIT_CODE=$?
  REASON=$(reason "${EXIT_CODE}")
  ALL_EXIT_CODES+=("${EXIT_CODE}")

  case ${EXIT_CODE} in
    0)
    echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
    break
    ;;
    30 | 31 | 32 | 33)
    # Retryable: fall through to the next loop iteration (if any is left).
    echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
    ;;
    *)
    echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
    break
    ;;
  esac

done

# Publish the results of the last attempt to /tmp/ray_release_test_artifacts
# unless NO_ARTIFACTS is set. Both steps are best-effort.
if [ -z "${NO_ARTIFACTS}" ]; then
  sudo rm -rf /tmp/ray_release_test_artifacts/* || true
  if [ -n "${RELEASE_RESULTS_DIR}" ]; then
    sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
  else
    # With an empty RELEASE_RESULTS_DIR the glob would expand to /* and copy
    # the whole root file system. Skip the copy (rather than abort) so the
    # test's exit code is still reported below.
    echo "Warning: RELEASE_RESULTS_DIR is empty, skipping artifact copy." >&2
  fi
fi

# Print a summary of all attempts, a hint for the release manager, then clean
# up the temporary clone and exit with the exit code of the last attempt.
separator="----------------------------------------"

echo "${separator}"
echo "Release test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
echo "Run results:"

# One line per attempt, numbered from 1.
for run_index in "${!ALL_EXIT_CODES[@]}"; do
  run_code="${ALL_EXIT_CODES[${run_index}]}"
  REASON=$(reason "${run_code}")
  echo "  Run $((run_index + 1)): Exit code = ${run_code} (${REASON})"
done

echo "${separator}"

REASON=$(reason "${EXIT_CODE}")
echo "Final release test exit code is ${EXIT_CODE} (${REASON})"

# Pick the release manager hint: success, infra range (30-39), or anything
# else.
if [ "$EXIT_CODE" -eq 0 ]; then
  manager_hint="This test seems to have passed."
elif [ "$EXIT_CODE" -ge 30 ] && [ "$EXIT_CODE" -lt 40 ]; then
  manager_hint="This is likely an infra error that can be solved by RESTARTING this test."
else
  manager_hint="This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
fi
echo "RELEASE MANAGER: ${manager_hint}"

# Leave and remove the temporary clone (best-effort) when one was created.
if [[ -z "${NO_CLONE}" ]]; then
  popd || true
  rm -rf "${TMPDIR}" || true
fi

# Deliberately unquoted to keep the existing behavior for an empty EXIT_CODE.
# shellcheck disable=SC2086
exit $EXIT_CODE
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`#!/bin/bash`

			`set -e`

			`if [ -n "$DEBUG" ]; then`
			`set -x`
			`fi`

			`cd "${0%/*}" \|\| exit 1`

			`reason() {`
			`# Keep in sync with e2e.py ExitCode enum`
			`if [ "$1" -eq 0 ]; then`
			`REASON="success"`
[ci/release] Fix result output in Buildkite pipeline run (#22946) The new buildkite pipeline prints out faulty results due to a confusion of -ge/-gt and -le/-lt in the retry script. This is a cosmetic error (so behavior was still correct) that is resolved with this PR. 2022-03-09 17:29:31 +00:00			`elif [ "$1" -ge 1 ] && [ "$1" -lt 10 ]; then`
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`REASON="runtime error"`
[ci/release] Fix result output in Buildkite pipeline run (#22946) The new buildkite pipeline prints out faulty results due to a confusion of -ge/-gt and -le/-lt in the retry script. This is a cosmetic error (so behavior was still correct) that is resolved with this PR. 2022-03-09 17:29:31 +00:00			`elif [ "$1" -ge 10 ] && [ "$1" -lt 20 ]; then`
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`REASON="infra error"`
[ci/release] Fix result output in Buildkite pipeline run (#22946) The new buildkite pipeline prints out faulty results due to a confusion of -ge/-gt and -le/-lt in the retry script. This is a cosmetic error (so behavior was still correct) that is resolved with this PR. 2022-03-09 17:29:31 +00:00			`elif [ "$1" -ge 30 ] && [ "$1" -lt 40 ]; then`
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`REASON="infra timeout"`
			`elif [ "$1" -eq 42 ]; then`
			`REASON="command timeout"`
[ci/release] Fix result output in Buildkite pipeline run (#22946) The new buildkite pipeline prints out faulty results due to a confusion of -ge/-gt and -le/-lt in the retry script. This is a cosmetic error (so behavior was still correct) that is resolved with this PR. 2022-03-09 17:29:31 +00:00			`elif [ "$1" -ge 40 ] && [ "$1" -lt 50 ]; then`
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`REASON="command error"`
			`fi`
			`echo "${REASON}"`
			`}`

			`RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-ray_release/scripts/run_release_test.py}`
			`RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}`
			`RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}`
			`RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}`

[ci/release] Install local wheels in release test shell script (#25227) We're currently installing matching wheels on the fly in the python script for Ray client tests. However, we can't reload modules with changed protobuf configurations, and thus can't reload ray completely. Since the `anyscale` package depends on Ray, this effectively prevents us from installing matching wheels within the python script. There are a few possible solutions to this. First, we could separate out the local environment preparation from the test running - this will duplicate some logic and is thus a bit more involved, but should be considered in the future. For now, we adjust the `run_release_test.sh` shell script to install any passed `--ray-wheels` wheels locally. We only do this in CI instances per default as these wheels will not be compatible with e.g. MacOS. Link to successful build: https://buildkite.com/ray-project/release-tests-branch/builds/619#_ 2022-06-02 11:32:51 +02:00			`# This is not a great idea if your OS is different to the one`
			`# used in the product clusters. However, we need this in CI as reloading`
			`# Ray within the python process does not work for protobuf changes.`
			`INSTALL_MATCHING_RAY=${BUILDKITE-false}`

[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR`

			`if [ -z "${NO_INSTALL}" ]; then`
			`pip install -q -r requirements.txt`
			`pip install -q -U boto3 botocore`
[ci/release] Install local wheels in release test shell script (#25227) We're currently installing matching wheels on the fly in the python script for Ray client tests. However, we can't reload modules with changed protobuf configurations, and thus can't reload ray completely. Since the `anyscale` package depends on Ray, this effectively prevents us from installing matching wheels within the python script. There are a few possible solutions to this. First, we could separate out the local environment preparation from the test running - this will duplicate some logic and is thus a bit more involved, but should be considered in the future. For now, we adjust the `run_release_test.sh` shell script to install any passed `--ray-wheels` wheels locally. We only do this in CI instances per default as these wheels will not be compatible with e.g. MacOS. Link to successful build: https://buildkite.com/ray-project/release-tests-branch/builds/619#_ 2022-06-02 11:32:51 +02:00
			`if [ "${INSTALL_MATCHING_RAY-false}" == "true" ]; then`
			`# Find ray-wheels parameter and install locally`
			`i=1`
			`for arg in "$@"; do`
			`j=$((i+1))`
			`if [ "$arg" == "--ray-wheels" ]; then`
			`PARSED_RAY_WHEELS="${!j}"`
			`fi`
			`i=$j`
			`done`

			`if [ -n "${PARSED_RAY_WHEELS}" ]; then`
			`echo "Installing Ray wheels locally: ${PARSED_RAY_WHEELS}"`
			`pip install -U --force-reinstall "${PARSED_RAY_WHEELS}"`
			`else`
			`echo "Warning: No Ray wheels found to install locally"`
			`fi`
			`fi`
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`fi`

			`if [ -z "${NO_CLONE}" ]; then`
			`TMPDIR=$(mktemp -d -t release-XXXXXXXXXX)`
			`git clone --depth 1 -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" "${TMPDIR}"`
			`pushd "${TMPDIR}/release" \|\| true`
			`fi`

			`if [ -z "${NO_INSTALL}" ]; then`
			`pip install -e .`
			`fi`

			`RETRY_NUM=0`
[ci/release] Disable infra retries for now (#23132) Infra errors are tackled with concurrency groups. Thus we can disable old mitigation methods like automatic infra retry for now. We keep the script as it does other logic (e.g. checkout local test branch) and infra retry can be enabled via env variable if needed. 2022-03-14 11:51:11 +00:00			`MAX_RETRIES=${MAX_RETRIES-1}`
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00
			`if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then`
			`echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."`
			`MAX_RETRIES=1`
			`fi`

			`ALL_EXIT_CODES=()`
			`while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do`
			`RETRY_NUM=$((RETRY_NUM + 1))`

			`if [ "$RETRY_NUM" -gt 1 ]; then`
			`# Sleep for random time between 30 and 90 minutes`
			`SLEEP_TIME=$((1800 + RANDOM % 5400))`

			`if [ -n "${OVERRIDE_SLEEP_TIME}" ]; then`
			`SLEEP_TIME=${OVERRIDE_SLEEP_TIME}`
			`fi`

			`echo "----------------------------------------"`
			`echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."`
			`echo "----------------------------------------"`
			`sleep "${SLEEP_TIME}"`
			`fi`

			`if [ -z "${NO_ARTIFACTS}" ]; then`
			`sudo rm -rf "${RELEASE_RESULTS_DIR}"/* \|\| true`
			`fi`

			`set +e`
			`python "${RAY_TEST_SCRIPT}" "$@"`
			`EXIT_CODE=$?`
			`set -e`
			`REASON=$(reason "${EXIT_CODE}")`
			`ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE`

			`case ${EXIT_CODE} in`
			`0)`
			`echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"`
			`break`
			`;;`
			`30 \| 31 \| 32 \| 33)`
			`echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."`
			`;;`
			`*)`
			`echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."`
			`break`
			`;;`
			`esac`

			`done`

			`if [ -z "${NO_ARTIFACTS}" ]; then`
			`sudo rm -rf /tmp/ray_release_test_artifacts/* \|\| true`
			`sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ \|\| true`
			`fi`

			`echo "----------------------------------------"`
[ci/release] Disable infra retries for now (#23132) Infra errors are tackled with concurrency groups. Thus we can disable old mitigation methods like automatic infra retry for now. We keep the script as it does other logic (e.g. checkout local test branch) and infra retry can be enabled via env variable if needed. 2022-03-14 11:51:11 +00:00			`echo "Release test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"`
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`echo "Run results:"`

			`COUNTER=1`
			`for EX in "${ALL_EXIT_CODES[@]}"; do`
			`REASON=$(reason "${EX}")`
			`echo " Run $COUNTER: Exit code = ${EX} (${REASON})"`
			`COUNTER=$((COUNTER + 1))`
			`done`

			`echo "----------------------------------------"`

			`REASON=$(reason "${EXIT_CODE}")`
			`echo "Final release test exit code is ${EXIT_CODE} (${REASON})"`

			`if [ "$EXIT_CODE" -eq 0 ]; then`
			`echo "RELEASE MANAGER: This test seems to have passed."`
[ci/release] Fix result output in Buildkite pipeline run (#22946) The new buildkite pipeline prints out faulty results due to a confusion of -ge/-gt and -le/-lt in the retry script. This is a cosmetic error (so behavior was still correct) that is resolved with this PR. 2022-03-09 17:29:31 +00:00			`elif [ "$EXIT_CODE" -ge 30 ] && [ "$EXIT_CODE" -lt 40 ]; then`
[ci/release] Refactor release test e2e into package (#22351) Adds a unit-tested and restructured ray_release package for running release tests. Relevant changes in behavior: Per default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) a wheels URL (which we will also wait for to be available) or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches old default behavior). The main subpackages are: Cluster manager: Creates cluster envs/computes, starts cluster, terminates cluster Command runner: Runs commands, e.g. as client command or sdk command File manager: Uploads/downloads files to/from session Reporter: Reports results (e.g. to database) Much of the code base is unit tested, but there are probably some pieces missing. Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_ Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023 2022-02-16 17:35:02 +00:00			`echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."`
			`else`
			`echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."`
			`fi`

			`if [ -z "${NO_CLONE}" ]; then`
			`popd \|\| true`
			`rm -rf "${TMPDIR}" \|\| true`
			`fi`

			`exit $EXIT_CODE`