mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00

This PR adds GPU support for pytorch and tensorflow predictor, as well as automatic setting `use_gpu` flag in `BatchPredictor`. Notable changes: - Added `use_gpu` flag in the constructor of `TorchPredictor` and `TensorflowPredictor` (note it's slightly different from our latest design doc that puts this flag at `predict()` call) - Added `use_gpu` flag to `SklearnPredictor` so its interface is compatible with `BatchPredictor` - Code to move both model weights and input tensor to default visible GPU at index 0 if flag is set - parametrized existing predictor tests to use GPU for both CPU & GPU coverage - Changed BUILD CI tests with an added `gpu` tag (I'm not 100% sure if that's a right way tho) Follow ups: https://github.com/ray-project/ray/issues/26249 is created in case our host has multiple GPU devices. It's a bit out of scope for this PR, but for GPU batch inference ideally we should be able to evenly use all GPU devices on host where CPU & DRAM are busy with pre-fetching + data movement to GPU. We might approximately do the same by scheduling same # of Predictor instances on the host, but that's worth verifying once benchmarks are set.
53 lines
3.2 KiB
YAML
53 lines
3.2 KiB
YAML
- label: ":tv: :steam_locomotive: Train GPU tests "
|
|
conditions: ["RAY_CI_TRAIN_AFFECTED"]
|
|
commands:
|
|
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
|
|
- PYTHON=3.7 TRAIN_TESTING=1 TUNE_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh
|
|
# Because Python version changed, we need to re-install Ray here
|
|
- rm -rf ./python/ray/thirdparty_files; rm -rf ./python/ray/pickle5_files; ./ci/ci.sh build
|
|
- pip install -Ur ./python/requirements_ml_docker.txt
|
|
- ./ci/env/env_info.sh
|
|
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,gpu_only,-ray_air python/ray/train/...
|
|
|
|
- label: ":tv: :database: :steam_locomotive: Datasets Train Integration GPU Tests and Examples (Python 3.7)"
|
|
conditions: ["RAY_CI_TRAIN_AFFECTED"]
|
|
commands:
|
|
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
|
|
- PYTHON=3.7 TRAIN_TESTING=1 DATA_PROCESSING_TESTING=1 ./ci/env/install-dependencies.sh
|
|
- pip install -Ur ./python/requirements_ml_docker.txt
|
|
- ./ci/env/env_info.sh
|
|
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=datasets_train doc/...
|
|
|
|
- label: ":tv: :brain: RLlib: Multi-GPU Tests"
|
|
conditions: ["RAY_CI_RLLIB_AFFECTED"]
|
|
commands:
|
|
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
|
|
- PYTHON=3.7 RLLIB_TESTING=1 ./ci/env/install-dependencies.sh
|
|
- pip install -Ur ./python/requirements_ml_docker.txt
|
|
- ./ci/env/env_info.sh
|
|
# --jobs 2 is necessary as we only need to have at least 2 gpus on the machine
|
|
# and running tests in parallel would cause timeouts as the other scripts would
|
|
# wait for the GPU to become available.
|
|
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --jobs 2
|
|
--test_tag_filters=multi_gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/...
|
|
|
|
|
|
- label: ":tv: :airplane: ML GPU tests (ray/air)"
|
|
conditions: ["RAY_CI_ML_AFFECTED"]
|
|
commands:
|
|
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
|
|
- DATA_PROCESSING_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh
|
|
- pip install -Ur ./python/requirements_ml_docker.txt
|
|
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu python/ray/air/...
|
|
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu python/ray/train/...
|
|
|
|
|
|
- label: ":tv: :book: Doc GPU tests and examples"
|
|
conditions:
|
|
["RAY_CI_PYTHON_AFFECTED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_DOC_AFFECTED"]
|
|
commands:
|
|
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
|
|
- DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 PYTHON=3.7 ./ci/env/install-dependencies.sh
|
|
- pip install -Ur ./python/requirements_ml_docker.txt
|
|
- ./ci/env/env_info.sh
|
|
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-py37,-post_wheel_build doc/...
|