ray/release/run_e2e.sh
Kai Fricke 6b683ec8dc
[ci] Retry release tests on infra error (#20478)
This PR introduces proper exit codes for release tests. These are used to restart a certain set of infrastructure related failures automatically.
2021-12-02 10:34:40 -08:00

169 lines
3.5 KiB
Bash
Executable file

#!/bin/bash
# Run from the directory that contains this script, so the relative paths
# used further down (requirements.txt, e2e.py) resolve regardless of the
# caller's working directory.
#
# `dirname` is used rather than "${0%/*}": when $0 contains no slash (for
# example `bash run_e2e.sh`), the suffix strip leaves $0 unchanged, `cd` is
# pointed at the script file itself, and the script aborts with status 1.
cd "$(dirname -- "$0")" || exit 1
#######################################
# Translate an e2e exit code into a short human-readable label.
# Keep in sync with e2e.py ExitCode enum.
# Globals:   REASON (written)
# Arguments: $1 - exit code to describe
# Outputs:   the label on stdout; any code without an entry (including an
#            empty or missing argument) is reported as "untracked error"
#######################################
reason() {
  case "${1}" in
    0)  REASON="success" ;;
    2)  REASON="unspecified" ;;
    3)  REASON="unknown" ;;
    4)  REASON="runtime error" ;;
    5)  REASON="command error" ;;
    6)  REASON="command timeout" ;;
    7)  REASON="prepare timeout" ;;
    8)  REASON="filesync timeout" ;;
    9)  REASON="session timeout" ;;
    10) REASON="prepare error" ;;
    11) REASON="app config build error" ;;
    12) REASON="infra error" ;;
    *)  REASON="untracked error" ;;
  esac
  printf '%s\n' "${REASON}"
}
# Consume this wrapper's own leading options. Parsing stops at the first
# argument that is not one of the flags below; that argument and everything
# after it stay in "$@".
#
# Each flag is handled with two single shifts (one here, one after `esac`)
# rather than `shift 2`: a flag given without a value then simply ends the
# loop with an empty setting instead of leaving the arguments unshifted.
while (( $# > 0 )); do
  key=$1
  case "${key}" in
    --ray-repo)            shift; RAY_REPO=$1 ;;
    --ray-branch)          shift; RAY_BRANCH=$1 ;;
    --ray-version)         shift; RAY_VERSION=$1 ;;
    --ray-wheels)          shift; RAY_WHEELS=$1 ;;
    --ray-test-repo)       shift; RAY_TEST_REPO=$1 ;;
    --ray-test-branch)     shift; RAY_TEST_BRANCH=$1 ;;
    --release-results-dir) shift; RELEASE_RESULTS_DIR=$1 ;;
    *)                     break ;;
  esac
  shift
done
# Default the test repository and branch, but only when the variable is
# unset; a value that was explicitly set to the empty string is kept.
: "${RAY_TEST_REPO=https://github.com/ray-project/ray.git}"
: "${RAY_TEST_BRANCH=master}"

# Make the settings visible to child processes.
export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS
export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
# Install the Python requirements listed next to this script (quietly), then
# bring boto3/botocore up to their latest versions.
pip install --quiet --requirement requirements.txt
pip install --upgrade boto3 botocore

# Check out the requested test branch of the test repository into ~/ray.
git clone --branch "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray
# ---------------------------------------------------------------------------
# Run e2e.py, re-running it after a long random pause when it fails with one
# of the exit codes in the retry set below.
# ---------------------------------------------------------------------------
RETRY_NUM=0
# `:-` so that an empty MAX_RETRIES also falls back to 3 instead of breaking
# the integer comparison in the loop condition.
MAX_RETRIES=${MAX_RETRIES:-3}

# Start from a failing status (2 is the code reason() labels "unspecified").
# If the loop body never runs, e.g. because MAX_RETRIES is zero or not a
# number, the script must not end up reporting success for a test that was
# never executed.
EXIT_CODE=2

if [ "${BUILDKITE_RETRY_COUNT:-0}" -ge 1 ]; then
  echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
  MAX_RETRIES=1
fi

# Bounds of the pause before a retry, in seconds: 30 and 90 minutes.
RETRY_SLEEP_MIN=1800
RETRY_SLEEP_MAX=5400

#######################################
# Pick the pause before the next retry.
# Globals:   RETRY_SLEEP_MIN, RETRY_SLEEP_MAX (read), SLEEP_TIME (written)
# Outputs:   none; SLEEP_TIME is set to a random whole number of seconds in
#            [RETRY_SLEEP_MIN, RETRY_SLEEP_MAX], both ends included
#######################################
pick_retry_sleep() {
  SLEEP_TIME=$((RETRY_SLEEP_MIN + RANDOM % (RETRY_SLEEP_MAX - RETRY_SLEEP_MIN + 1)))
}

# Exit code of every attempt, in order.
ALL_EXIT_CODES=()

while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
  RETRY_NUM=$((RETRY_NUM + 1))

  if [ "$RETRY_NUM" -gt 1 ]; then
    # Sleep for random time between 30 and 90 minutes
    pick_retry_sleep
    echo "----------------------------------------"
    echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
    echo "----------------------------------------"
    sleep "${SLEEP_TIME}"
  fi

  # Whatever arguments are still in "$@" at this point go to e2e.py.
  python e2e.py "$@"
  EXIT_CODE=$?
  REASON=$(reason "${EXIT_CODE}")
  ALL_EXIT_CODES+=("${EXIT_CODE}")

  case "${EXIT_CODE}" in
    0)
      echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
      break
      ;;
    # Retry set: prepare timeout, session timeout, prepare error (labels as
    # given by reason()). No `break`, so the loop goes round again.
    # NOTE(review): code 12 is labelled "infra error" by reason() but is not
    # retried here - confirm against e2e.py whether that is intentional.
    7 | 9 | 10)
      echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
      ;;
    *)
      echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
      break
      ;;
  esac
done
# Copy any collected artifacts to the release-test artifact directory. A
# failed copy is ignored.
sudo cp -rf /tmp/artifacts/* /tmp/ray_release_test_artifacts || true

# Summary: the final status followed by one line per attempt.
divider="----------------------------------------"
echo "${divider}"
echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
echo "Run results:"
for ((idx = 0; idx < ${#ALL_EXIT_CODES[@]}; idx++)); do
  code=${ALL_EXIT_CODES[idx]}
  echo " Run $((idx + 1)): Exit code = ${code} ($(reason "${code}"))"
done
echo "${divider}"

REASON=$(reason "${EXIT_CODE}")
echo "Final e2e exit code is ${EXIT_CODE} (${REASON})"

# Advice for the release manager; nothing extra is printed on success.
if [[ ${EXIT_CODE} == 0 ]]; then
  :
elif [[ ${EXIT_CODE} == 7 || ${EXIT_CODE} == 9 || ${EXIT_CODE} == 10 ]]; then
  echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
else
  echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
fi

# Hand the last attempt's status back to the caller.
exit $EXIT_CODE