mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
[RLlib] Add GPU tests to CI (run per-PR). (#17891)
Co-authored-by: simon-mo <simon.mo@hey.com>
This commit is contained in:
parent
f0edbf0d30
commit
d058f98546
4 changed files with 129 additions and 1 deletions
|
@ -44,7 +44,7 @@ RUN echo "ulimit -c 0" >> /root/.bashrc
|
|||
|
||||
# Setup Bazel caches
|
||||
RUN (echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc); \
|
||||
(if [ ${BUILDKITE_PULL_REQUEST} != "false" ]; then (echo "build --remote_upload_local_results=false" >> /root/.bazelrc); fi); \
|
||||
(if [ "${BUILDKITE_PULL_REQUEST}" != "false" ]; then (echo "build --remote_upload_local_results=false" >> /root/.bazelrc); fi); \
|
||||
cat /root/.bazelrc
|
||||
|
||||
RUN mkdir /ray
|
||||
|
|
62
.buildkite/Dockerfile.gpu
Normal file
62
.buildkite/Dockerfile.gpu
Normal file
|
@ -0,0 +1,62 @@
|
|||
# GPU variant of the regular CI Dockerfile, built on a CUDA 11.2 / cuDNN 8
# development image. Adjusted for Ubuntu 18 (python-is-python3 removed).
FROM nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04

# Build arguments passed in by the CI pipeline.
ARG REMOTE_CACHE_URL
ARG BUILDKITE_PULL_REQUEST
ARG BUILDKITE_COMMIT
ARG BUILDKITE_PULL_REQUEST_BASE_BRANCH

# Non-interactive package installation and a fixed timezone.
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=America/Los_Angeles

# CI / Ray build switches.
ENV BUILDKITE=true \
    CI=true \
    PYTHON=3.6 \
    RAY_USE_RANDOM_PORTS=1 \
    RAY_DEFAULT_BUILD=1 \
    RAY_INSTALL_JAVA=1

# Persist the Buildkite build arguments as environment variables in the image.
ENV BUILDKITE_PULL_REQUEST=${BUILDKITE_PULL_REQUEST} \
    BUILDKITE_COMMIT=${BUILDKITE_COMMIT} \
    BUILDKITE_PULL_REQUEST_BASE_BRANCH=${BUILDKITE_PULL_REQUEST_BASE_BRANCH}

# For wheel build
# https://github.com/docker-library/docker/blob/master/20.10/docker-entrypoint.sh
ENV DOCKER_TLS_CERTDIR=/certs \
    DOCKER_HOST=tcp://docker:2376 \
    DOCKER_TLS_VERIFY=1 \
    DOCKER_CERT_PATH=/certs/client

# Aliases derived from the build arguments. Kept in their own instruction so
# the references resolve against the values set above.
ENV TRAVIS_COMMIT=${BUILDKITE_COMMIT} \
    BUILDKITE_BAZEL_CACHE_URL=${REMOTE_CACHE_URL}
|
||||
|
||||
# Refresh the package index and install in the same layer. With a separate
# `RUN apt-get update`, Docker can reuse a stale cached index layer after the
# package list below changes, and the install then fails or pulls old versions.
RUN apt-get update -qq && apt-get install -y -qq \
    curl git build-essential \
    sudo unzip apt-utils dialog tzdata wget rsync \
    language-pack-en tmux cmake gdb vim htop \
    libgtk2.0-dev zlib1g-dev libgl1-mesa-dev maven \
    openjdk-8-jre openjdk-8-jdk clang-format-7
# Make the versioned clang-format binary available under its unversioned name.
RUN ln -s /usr/bin/clang-format-7 /usr/bin/clang-format
|
||||
# Install Docker with the upstream convenience script. The script is downloaded
# to a file first, with -f so an HTTP error fails the build. When piped straight
# into sh, the pipeline's exit status is that of sh, which hides a failed or
# truncated download.
RUN curl -fsSL https://get.docker.com -o /tmp/get-docker.sh \
    && sh /tmp/get-docker.sh \
    && rm /tmp/get-docker.sh
|
||||
|
||||
# System configuration for tests.
# List the available locales in the build log.
RUN locale -a
# Use the en_US UTF-8 locale for all categories.
ENV LC_ALL=en_US.utf8 \
    LANG=en_US.utf8
# Append a core-dump size limit of 0 to root's bashrc.
RUN echo "ulimit -c 0" >> /root/.bashrc
|
||||
|
||||
# Set up Bazel caches: point Bazel at the remote cache, and do not upload
# local results when BUILDKITE_PULL_REQUEST is anything other than "false".
# The resulting /root/.bazelrc is printed to the build log.
RUN echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc; \
    if [ "${BUILDKITE_PULL_REQUEST}" != "false" ]; then \
        echo "build --remote_upload_local_results=false" >> /root/.bazelrc; \
    fi; \
    cat /root/.bazelrc
|
||||
|
||||
# WORKDIR creates /ray when it does not exist, so no separate `RUN mkdir` layer
# is needed.
WORKDIR /ray

# Below should be re-run each time: COPY invalidates the layer cache whenever
# the build context changes.
COPY . .
RUN ./ci/travis/ci.sh init
RUN bash --login -i ./ci/travis/ci.sh build

# Determine which tests to run and write the result to affected_set.json,
# then print it to the build log.
RUN bash --login -i -c "python ./ci/travis/determine_tests_to_run.py --output=json > affected_set.json"
RUN cat affected_set.json
|
45
.buildkite/pipeline.gpu.yml
Normal file
45
.buildkite/pipeline.gpu.yml
Normal file
|
@ -0,0 +1,45 @@
|
|||
# RLlib GPU example tests, groups A and B.
# NOTE(review): `conditions` presumably gates the step on RAY_CI_RLLIB_AFFECTED;
# confirm against the pipeline generator.
- label: ":tv: :brain: RLlib: GPU Examples {A/B}"
  conditions: ["RAY_CI_RLLIB_AFFECTED"]
  commands:
    # When BUILDKITE_PULL_REQUEST is "false", upload build info on shell exit.
    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
    - RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
    - pip install -Ur ./python/requirements_ml_docker.txt
    - ./ci/travis/env_info.sh
    # Todo: enable once tests pass
    # - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
    #   --test_tag_filters=examples_A,examples_B,-flaky --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 --test_env=RLLIB_NUM_GPUS=1 rllib/...
|
||||
|
||||
# Todo: enable once tests pass
|
||||
#- label: ":tv: :brain: RLlib: GPU Examples {C/D}"
|
||||
# conditions: ["RAY_CI_RLLIB_AFFECTED"]
|
||||
# commands:
|
||||
# - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
|
||||
# - RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
|
||||
# - pip install -Ur ./python/requirements_ml_docker.txt
|
||||
# - ./ci/travis/env_info.sh
|
||||
# - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
|
||||
# --test_tag_filters=examples_C,examples_D,-flaky --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 --test_env=RLLIB_NUM_GPUS=1 rllib/...
|
||||
|
||||
# Todo: enable once tests pass
|
||||
#- label: ":tv: :brain: RLlib: GPU Examples {E/P}"
|
||||
# conditions: ["RAY_CI_RLLIB_AFFECTED"]
|
||||
# commands:
|
||||
# - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
|
||||
# - RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
|
||||
# - pip install -Ur ./python/requirements_ml_docker.txt
|
||||
# - ./ci/travis/env_info.sh
|
||||
# - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
|
||||
# --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P,-flaky --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 --test_env=RLLIB_NUM_GPUS=1
|
||||
# rllib/...
|
||||
|
||||
# Todo: enable once tests pass
|
||||
#- label: ":tv: :brain: RLlib: GPU Examples {Q/Z}"
|
||||
# conditions: ["RAY_CI_RLLIB_AFFECTED"]
|
||||
# commands:
|
||||
# - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
|
||||
# - RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
|
||||
# - pip install -Ur ./python/requirements_ml_docker.txt
|
||||
# - ./ci/travis/env_info.sh
|
||||
# - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only
|
||||
# --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z,-flaky --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 --test_env=RLLIB_NUM_GPUS=1
|
||||
# rllib/...
|
21
ci/travis/env_info.sh
Executable file
21
ci/travis/env_info.sh
Executable file
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash
# Prints diagnostic information about the test environment: Python and Ray
# versions, the installed pip packages, and GPU status. Each probe prints a
# fallback message when its tool is unavailable.

echo "Test environment information"
echo "----------------------------"
echo "Python version: $(python --version 2>/dev/null || echo 'Python not installed')"
echo "Ray version: $(ray --version 2>/dev/null || echo 'Ray not installed')"
echo "Installed pip packages:"
python -m pip freeze 2>/dev/null || echo 'Pip not installed'
echo "----------------------------"

echo "GPU information"
echo "----------------------------"
GPUCMD="nvidia-smi"
if ! command -v "${GPUCMD}" &> /dev/null
then
    echo "No GPU support found (${GPUCMD} not found)."
else
    # Invoke the tool directly; eval would only re-parse the string as shell
    # code, which is unnecessary for a plain command name.
    "${GPUCMD}"
    # Print a readable fallback like the other probes in this script. stderr is
    # left visible so a real import error can still be diagnosed.
    python -c "import torch; print('Torch cuda available:', torch.cuda.is_available())" \
        || echo 'Torch not installed or failed to import'
fi
echo "----------------------------"
|
Loading…
Add table
Reference in a new issue