mirror of
https://github.com/vale981/ray
synced 2025-03-04 17:41:43 -05:00

We're currently installing matching wheels on the fly in the python script for Ray client tests. However, we can't reload modules with changed protobuf configurations, and thus can't reload ray completely. Since the `anyscale` pacakge depends on Ray, this effectively prevents us from installing matching wheels within the python script. There are a few possible solutions to this. First, we could separate out the local environment preparation from the test running - this will duplicate some logic and is thus a bit more involved, but should be considered in the future. For now, we adjust the `run_release_tests.sh` shell script to install any passed `--ray-wheels` wheels locally. We only do this in CI instances per default as these wheels will not be compatible with e.g. MacOS. Link to successful build: https://buildkite.com/ray-project/release-tests-branch/builds/619#_
162 lines
4.4 KiB
Bash
Executable file
162 lines
4.4 KiB
Bash
Executable file
#!/bin/bash
|
|
|
|
set -e
|
|
|
|
if [ -n "$DEBUG" ]; then
|
|
set -x
|
|
fi
|
|
|
|
cd "${0%/*}" || exit 1
|
|
|
|
reason() {
|
|
# Keep in sync with e2e.py ExitCode enum
|
|
if [ "$1" -eq 0 ]; then
|
|
REASON="success"
|
|
elif [ "$1" -ge 1 ] && [ "$1" -lt 10 ]; then
|
|
REASON="runtime error"
|
|
elif [ "$1" -ge 10 ] && [ "$1" -lt 20 ]; then
|
|
REASON="infra error"
|
|
elif [ "$1" -ge 30 ] && [ "$1" -lt 40 ]; then
|
|
REASON="infra timeout"
|
|
elif [ "$1" -eq 42 ]; then
|
|
REASON="command timeout"
|
|
elif [ "$1" -ge 40 ] && [ "$1" -lt 50 ]; then
|
|
REASON="command error"
|
|
fi
|
|
echo "${REASON}"
|
|
}
|
|
|
|
RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-ray_release/scripts/run_release_test.py}
|
|
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
|
|
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
|
|
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
|
|
|
|
# This is not a great idea if your OS is different to the one
|
|
# used in the product clusters. However, we need this in CI as reloading
|
|
# Ray within the python process does not work for protobuf changes.
|
|
INSTALL_MATCHING_RAY=${BUILDKITE-false}
|
|
|
|
export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
|
|
|
|
if [ -z "${NO_INSTALL}" ]; then
|
|
pip install -q -r requirements.txt
|
|
pip install -q -U boto3 botocore
|
|
|
|
if [ "${INSTALL_MATCHING_RAY-false}" == "true" ]; then
|
|
# Find ray-wheels parameter and install locally
|
|
i=1
|
|
for arg in "$@"; do
|
|
j=$((i+1))
|
|
if [ "$arg" == "--ray-wheels" ]; then
|
|
PARSED_RAY_WHEELS="${!j}"
|
|
fi
|
|
i=$j
|
|
done
|
|
|
|
if [ -n "${PARSED_RAY_WHEELS}" ]; then
|
|
echo "Installing Ray wheels locally: ${PARSED_RAY_WHEELS}"
|
|
pip install -U --force-reinstall "${PARSED_RAY_WHEELS}"
|
|
else
|
|
echo "Warning: No Ray wheels found to install locally"
|
|
fi
|
|
fi
|
|
fi
|
|
|
|
if [ -z "${NO_CLONE}" ]; then
|
|
TMPDIR=$(mktemp -d -t release-XXXXXXXXXX)
|
|
git clone --depth 1 -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" "${TMPDIR}"
|
|
pushd "${TMPDIR}/release" || true
|
|
fi
|
|
|
|
if [ -z "${NO_INSTALL}" ]; then
|
|
pip install -e .
|
|
fi
|
|
|
|
RETRY_NUM=0
|
|
MAX_RETRIES=${MAX_RETRIES-1}
|
|
|
|
if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
|
|
echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
|
|
MAX_RETRIES=1
|
|
fi
|
|
|
|
ALL_EXIT_CODES=()
|
|
while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
|
|
RETRY_NUM=$((RETRY_NUM + 1))
|
|
|
|
if [ "$RETRY_NUM" -gt 1 ]; then
|
|
# Sleep for random time between 30 and 90 minutes
|
|
SLEEP_TIME=$((1800 + RANDOM % 5400))
|
|
|
|
if [ -n "${OVERRIDE_SLEEP_TIME}" ]; then
|
|
SLEEP_TIME=${OVERRIDE_SLEEP_TIME}
|
|
fi
|
|
|
|
echo "----------------------------------------"
|
|
echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
|
|
echo "----------------------------------------"
|
|
sleep "${SLEEP_TIME}"
|
|
fi
|
|
|
|
if [ -z "${NO_ARTIFACTS}" ]; then
|
|
sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true
|
|
fi
|
|
|
|
set +e
|
|
python "${RAY_TEST_SCRIPT}" "$@"
|
|
EXIT_CODE=$?
|
|
set -e
|
|
REASON=$(reason "${EXIT_CODE}")
|
|
ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE
|
|
|
|
case ${EXIT_CODE} in
|
|
0)
|
|
echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
|
|
break
|
|
;;
|
|
30 | 31 | 32 | 33)
|
|
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
|
|
;;
|
|
*)
|
|
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
|
|
break
|
|
;;
|
|
esac
|
|
|
|
done
|
|
|
|
if [ -z "${NO_ARTIFACTS}" ]; then
|
|
sudo rm -rf /tmp/ray_release_test_artifacts/* || true
|
|
sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
|
|
fi
|
|
|
|
echo "----------------------------------------"
|
|
echo "Release test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
|
|
echo "Run results:"
|
|
|
|
COUNTER=1
|
|
for EX in "${ALL_EXIT_CODES[@]}"; do
|
|
REASON=$(reason "${EX}")
|
|
echo " Run $COUNTER: Exit code = ${EX} (${REASON})"
|
|
COUNTER=$((COUNTER + 1))
|
|
done
|
|
|
|
echo "----------------------------------------"
|
|
|
|
REASON=$(reason "${EXIT_CODE}")
|
|
echo "Final release test exit code is ${EXIT_CODE} (${REASON})"
|
|
|
|
if [ "$EXIT_CODE" -eq 0 ]; then
|
|
echo "RELEASE MANAGER: This test seems to have passed."
|
|
elif [ "$EXIT_CODE" -ge 30 ] && [ "$EXIT_CODE" -lt 40 ]; then
|
|
echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
|
|
else
|
|
echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
|
|
fi
|
|
|
|
if [ -z "${NO_CLONE}" ]; then
|
|
popd || true
|
|
rm -rf "${TMPDIR}" || true
|
|
fi
|
|
|
|
exit $EXIT_CODE
|