mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00

With the new job-based file copy, fetching results takes longer. We thus have to increase the long running update test check times in order not to run into bogus release test failures. Also fixes artifact uploading issues.
176 lines
3.7 KiB
Bash
Executable file
176 lines
3.7 KiB
Bash
Executable file
#!/bin/bash
|
|
|
|
set -ex
|
|
|
|
cd "${0%/*}" || exit 1
|
|
|
|
reason() {
|
|
# Keep in sync with e2e.py ExitCode enum
|
|
case $1 in
|
|
0)
|
|
REASON="success"
|
|
;;
|
|
2)
|
|
REASON="unspecified"
|
|
;;
|
|
3)
|
|
REASON="unknown"
|
|
;;
|
|
4)
|
|
REASON="runtime error"
|
|
;;
|
|
5)
|
|
REASON="command error"
|
|
;;
|
|
6)
|
|
REASON="command timeout"
|
|
;;
|
|
7)
|
|
REASON="prepare timeout"
|
|
;;
|
|
8)
|
|
REASON="filesync timeout"
|
|
;;
|
|
9)
|
|
REASON="session timeout"
|
|
;;
|
|
10)
|
|
REASON="prepare error"
|
|
;;
|
|
11)
|
|
REASON="app config build error"
|
|
;;
|
|
12)
|
|
REASON="infra error"
|
|
;;
|
|
*)
|
|
REASON="untracked error"
|
|
;;
|
|
esac
|
|
echo "${REASON}"
|
|
}
|
|
|
|
while [[ $# -gt 0 ]]
|
|
do
|
|
key="$1"
|
|
case $key in
|
|
--ray-repo)
|
|
shift
|
|
RAY_REPO=$1
|
|
;;
|
|
--ray-branch)
|
|
shift
|
|
RAY_BRANCH=$1
|
|
;;
|
|
--ray-version)
|
|
shift
|
|
RAY_VERSION=$1
|
|
;;
|
|
--ray-wheels)
|
|
shift
|
|
RAY_WHEELS=$1
|
|
;;
|
|
--ray-test-repo)
|
|
shift
|
|
RAY_TEST_REPO=$1
|
|
;;
|
|
--ray-test-branch)
|
|
shift
|
|
RAY_TEST_BRANCH=$1
|
|
;;
|
|
--release-results-dir)
|
|
shift
|
|
RELEASE_RESULTS_DIR=$1
|
|
;;
|
|
*)
|
|
break
|
|
esac
|
|
shift
|
|
done
|
|
|
|
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
|
|
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
|
|
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
|
|
|
|
export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
|
|
|
|
pip uninstall -q -y ray
|
|
pip install -q -r requirements.txt
|
|
pip install -q -U boto3 botocore
|
|
git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray
|
|
|
|
RETRY_NUM=0
|
|
MAX_RETRIES=${MAX_RETRIES-3}
|
|
|
|
if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
|
|
echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
|
|
MAX_RETRIES=1
|
|
fi
|
|
|
|
ALL_EXIT_CODES=()
|
|
while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
|
|
RETRY_NUM=$((RETRY_NUM + 1))
|
|
|
|
if [ "$RETRY_NUM" -gt 1 ]; then
|
|
# Sleep for random time between 30 and 90 minutes
|
|
SLEEP_TIME=$((1800 + RANDOM % 5400))
|
|
echo "----------------------------------------"
|
|
echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
|
|
echo "----------------------------------------"
|
|
sleep ${SLEEP_TIME}
|
|
fi
|
|
|
|
sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true
|
|
|
|
python e2e.py "$@"
|
|
EXIT_CODE=$?
|
|
REASON=$(reason "${EXIT_CODE}")
|
|
ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE
|
|
|
|
case ${EXIT_CODE} in
|
|
0)
|
|
echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
|
|
break
|
|
;;
|
|
7 | 9 | 10)
|
|
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
|
|
;;
|
|
*)
|
|
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
|
|
break
|
|
;;
|
|
esac
|
|
|
|
done
|
|
|
|
sudo rm -rf /tmp/ray_release_test_artifacts/* || true
|
|
sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
|
|
|
|
echo "----------------------------------------"
|
|
echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
|
|
echo "Run results:"
|
|
|
|
COUNTER=1
|
|
for EX in "${ALL_EXIT_CODES[@]}"; do
|
|
REASON=$(reason "${EX}")
|
|
echo " Run $COUNTER: Exit code = ${EX} (${REASON})"
|
|
COUNTER=$((COUNTER + 1))
|
|
done
|
|
|
|
echo "----------------------------------------"
|
|
|
|
REASON=$(reason "${EXIT_CODE}")
|
|
echo "Final e2e exit code is ${EXIT_CODE} (${REASON})"
|
|
|
|
case ${EXIT_CODE} in
|
|
0)
|
|
;;
|
|
7 | 9 | 10)
|
|
echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
|
|
;;
|
|
*)
|
|
echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
|
|
;;
|
|
esac
|
|
|
|
exit $EXIT_CODE
|