[RLlib] Add GPU tests to CI (run per-PR). (#17891)

Co-authored-by: simon-mo <simon.mo@hey.com>
This commit is contained in:
Kai Fricke 2021-08-24 09:20:45 +02:00 committed by GitHub
parent f0edbf0d30
commit d058f98546
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 129 additions and 1 deletions

View file

@ -44,7 +44,7 @@ RUN echo "ulimit -c 0" >> /root/.bashrc
# Setup Bazel caches
RUN (echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc); \
(if [ ${BUILDKITE_PULL_REQUEST} != "false" ]; then (echo "build --remote_upload_local_results=false" >> /root/.bazelrc); fi); \
(if [ "${BUILDKITE_PULL_REQUEST}" != "false" ]; then (echo "build --remote_upload_local_results=false" >> /root/.bazelrc); fi); \
cat /root/.bazelrc
RUN mkdir /ray

62
.buildkite/Dockerfile.gpu Normal file
View file

@ -0,0 +1,62 @@
FROM nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04
# GPU counterpart of the regular Buildkite Dockerfile, adjusted for
# Ubuntu 18 (the python-is-python3 package was removed).

# Build-time arguments; no defaults are declared, so unset ones expand to "".
ARG REMOTE_CACHE_URL
ARG BUILDKITE_PULL_REQUEST
ARG BUILDKITE_COMMIT
ARG BUILDKITE_PULL_REQUEST_BASE_BRANCH

# Keep package installation non-interactive and fix the time zone.
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=America/Los_Angeles

# Flags describing the CI environment and the Ray build configuration.
ENV BUILDKITE=true \
    CI=true \
    PYTHON=3.6 \
    RAY_USE_RANDOM_PORTS=1 \
    RAY_DEFAULT_BUILD=1 \
    RAY_INSTALL_JAVA=1

# Persist the Buildkite build arguments as environment variables.
ENV BUILDKITE_PULL_REQUEST=${BUILDKITE_PULL_REQUEST} \
    BUILDKITE_COMMIT=${BUILDKITE_COMMIT} \
    BUILDKITE_PULL_REQUEST_BASE_BRANCH=${BUILDKITE_PULL_REQUEST_BASE_BRANCH}

# For wheel build
# https://github.com/docker-library/docker/blob/master/20.10/docker-entrypoint.sh
ENV DOCKER_TLS_CERTDIR=/certs \
    DOCKER_HOST=tcp://docker:2376 \
    DOCKER_TLS_VERIFY=1 \
    DOCKER_CERT_PATH=/certs/client

# Aliases derived from the build arguments above.
ENV TRAVIS_COMMIT=${BUILDKITE_COMMIT} \
    BUILDKITE_BAZEL_CACHE_URL=${REMOTE_CACHE_URL}
# Install the system packages used by the build and the tests.
# `apt-get update` and `apt-get install` run in the same layer on purpose:
# with a separate `update` layer, Docker may reuse a cached (stale) package
# index when only the install list changes, which makes the install fail or
# pull outdated packages.
RUN apt-get update -qq && apt-get install -y -qq \
    curl git build-essential \
    sudo unzip apt-utils dialog tzdata wget rsync \
    language-pack-en tmux cmake gdb vim htop \
    libgtk2.0-dev zlib1g-dev libgl1-mesa-dev maven \
    openjdk-8-jre openjdk-8-jdk clang-format-7
# Expose the versioned clang-format binary under its unversioned name.
RUN ln -s /usr/bin/clang-format-7 /usr/bin/clang-format
# Install Docker via the convenience script. The script is downloaded to a
# file first (with -f so HTTP errors fail) instead of being piped into `sh`:
# in a pipe, a failed download would feed `sh` empty input and the step would
# succeed without installing anything.
RUN curl -fsSL https://get.docker.com -o /tmp/get-docker.sh \
    && sh /tmp/get-docker.sh \
    && rm /tmp/get-docker.sh
# System conf for tests
# Print the locales available in the image to the build log.
RUN locale -a
ENV LC_ALL=en_US.utf8 \
    LANG=en_US.utf8
# Append a zero core-file size limit to root's bash startup file.
RUN echo "ulimit -c 0" >> /root/.bashrc

# Setup Bazel caches
# Always point Bazel at the remote cache. Unless BUILDKITE_PULL_REQUEST is
# exactly "false", additionally turn off uploading of local results.
# The resulting file is printed to the build log.
RUN echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc; \
    if [ "${BUILDKITE_PULL_REQUEST}" != "false" ]; then \
      echo "build --remote_upload_local_results=false" >> /root/.bazelrc; \
    fi; \
    cat /root/.bazelrc

# Create the checkout directory and make it the working directory.
RUN mkdir /ray
WORKDIR /ray
# Below should be re-run each time
# Copy the build context into the working directory.
COPY . .
# Run the CI script's `init` step, then its `build` step (the latter in a
# login, interactive bash).
RUN ./ci/travis/ci.sh init
RUN bash --login -i ./ci/travis/ci.sh build
# Write the output of determine_tests_to_run.py to affected_set.json as JSON
# and print the file to the build log.
RUN bash --login -i -c "python ./ci/travis/determine_tests_to_run.py --output=json > affected_set.json" \
    && cat affected_set.json

View file

@ -0,0 +1,45 @@
# RLlib GPU examples, groups A and B. Gated on RAY_CI_RLLIB_AFFECTED.
# For now the step only installs dependencies and prints environment info;
# the bazel test invocation below is still commented out.
- label: ":tv: :brain: RLlib: GPU Examples {A/B}"
  conditions:
    - "RAY_CI_RLLIB_AFFECTED"
  commands:
    # On exit, run upload_build_info.sh, but only when
    # BUILDKITE_PULL_REQUEST is "false".
    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
    - RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
    - pip install -Ur ./python/requirements_ml_docker.txt
    - ./ci/travis/env_info.sh
    # Todo: enable once tests pass
    # - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
    #   --test_tag_filters=examples_A,examples_B,-flaky --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 --test_env=RLLIB_NUM_GPUS=1 rllib/...
# Todo: enable once tests pass
#- label: ":tv: :brain: RLlib: GPU Examples {C/D}"
# conditions: ["RAY_CI_RLLIB_AFFECTED"]
# commands:
# - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
# - RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
# - pip install -Ur ./python/requirements_ml_docker.txt
# - ./ci/travis/env_info.sh
# - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
# --test_tag_filters=examples_C,examples_D,-flaky --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 --test_env=RLLIB_NUM_GPUS=1 rllib/...
# Todo: enable once tests pass
#- label: ":tv: :brain: RLlib: GPU Examples {E/P}"
# conditions: ["RAY_CI_RLLIB_AFFECTED"]
# commands:
# - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
# - RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
# - pip install -Ur ./python/requirements_ml_docker.txt
# - ./ci/travis/env_info.sh
# - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
# --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P,-flaky --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 --test_env=RLLIB_NUM_GPUS=1
# rllib/...
# Todo: enable once tests pass
#- label: ":tv: :brain: RLlib: GPU Examples {Q/Z}"
# conditions: ["RAY_CI_RLLIB_AFFECTED"]
# commands:
# - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
# - RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
# - pip install -Ur ./python/requirements_ml_docker.txt
# - ./ci/travis/env_info.sh
# - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
# --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z,-flaky --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 --test_env=RLLIB_NUM_GPUS=1
# rllib/...

21
ci/travis/env_info.sh Executable file
View file

@ -0,0 +1,21 @@
#!/bin/bash
# Print a summary of the test environment: Python and Ray versions, the
# installed pip packages, and GPU details when nvidia-smi is available.
# Missing tools are reported with a fallback message rather than an error.

# Emit the horizontal rule that delimits report sections.
print_rule() {
  echo "----------------------------"
}

echo "Test environment information"
print_rule

python_version="$(python --version 2>/dev/null || echo 'Python not installed')"
ray_version="$(ray --version 2>/dev/null || echo 'Ray not installed')"
echo "Python version: ${python_version}"
echo "Ray version: ${ray_version}"

echo "Installed pip packages:"
python -m pip freeze 2>/dev/null || echo 'Pip not installed'
print_rule

echo "GPU information"
print_rule

gpu_tool="nvidia-smi"
if command -v "${gpu_tool}" > /dev/null 2>&1; then
  # Show the GPU tool's report, then ask torch whether CUDA is usable.
  "${gpu_tool}"
  python -c "import torch; print('Torch cuda available:', torch.cuda.is_available())"
else
  echo "No GPU support found (${gpu_tool} not found)."
fi
print_rule