From e58fcca40493c2f6c4f29d31a29bbb99428cb62c Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Tue, 26 Oct 2021 12:55:20 -0700 Subject: [PATCH] Revert "[Docker] Support multiple CUDA Versions (#19505)" (#19756) This reverts commit f0053d405ba86d58d76891fd8bbb3c184e3e8f67. --- .buildkite/pipeline.yml | 48 +- ci/travis/build-docker-images.py | 559 +++++++++--------------- doc/source/installation.rst | 8 +- docker/autoscaler/README.md | 2 +- docker/base-deps/Dockerfile | 2 +- docker/base-deps/README.md | 2 +- docker/fix-docker-latest.sh | 2 + docker/ray-deps/Dockerfile | 4 +- docker/ray-deps/README.md | 2 +- docker/ray-ml/Dockerfile | 7 +- docker/ray-ml/README.md | 4 +- docker/ray/Dockerfile | 4 +- docker/ray/README.md | 2 +- docker/retag-lambda/cuda_versions.txt | 5 - docker/retag-lambda/lambda_function.py | 32 +- docker/retag-lambda/python_versions.txt | 4 - 16 files changed, 229 insertions(+), 458 deletions(-) delete mode 100644 docker/retag-lambda/cuda_versions.txt delete mode 100644 docker/retag-lambda/python_versions.txt diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index bfa62af5c..7fafad9d7 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -75,69 +75,37 @@ # # Upload to latest directory. # - if [ "$BUILDKITE_BRANCH" == "master" ]; then python .buildkite/copy_files.py --destination wheels --path ./.whl; fi -- label: ":docker: Build Images: py36 (1/2)" +- label: ":docker: Build Images: py36" conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] commands: - LINUX_WHEELS=1 ./ci/travis/ci.sh build - pip install -q docker aws_requests_auth boto3 - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/travis/build-docker-images.py --py-versions py36 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base + - python ./ci/travis/build-docker-images.py --py-versions PY36 --build-type BUILDKITE --build-base -- label: ":docker: Build Images: py36 (2/2)" +- label: ":docker: Build Images: py37" conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] commands: - LINUX_WHEELS=1 ./ci/travis/ci.sh build - pip install -q docker aws_requests_auth boto3 - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/travis/build-docker-images.py --py-versions py36 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base + - python ./ci/travis/build-docker-images.py --py-versions PY37 --build-type BUILDKITE --build-base -- label: ":docker: Build Images: py37 (1/2)" +- label: ":docker: Build Images: py38" conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] commands: - LINUX_WHEELS=1 ./ci/travis/ci.sh build - pip install -q docker aws_requests_auth boto3 - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/travis/build-docker-images.py --py-versions py37 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base + - python ./ci/travis/build-docker-images.py --py-versions PY38 --build-type BUILDKITE --build-base -- label: ":docker: Build Images: py37 (2/2)" +- label: ":docker: Build Images: py39" conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] commands: - LINUX_WHEELS=1 ./ci/travis/ci.sh build - pip install -q docker aws_requests_auth boto3 - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/travis/build-docker-images.py --py-versions py37 --device-types cu110 cu111 cu112 
--build-type BUILDKITE --build-base - -- label: ":docker: Build Images: py38 (1/2)" - conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] - commands: - - LINUX_WHEELS=1 ./ci/travis/ci.sh build - - pip install -q docker aws_requests_auth boto3 - - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/travis/build-docker-images.py --py-versions py38 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base - -- label: ":docker: Build Images: py38 (2/2)" - conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] - commands: - - LINUX_WHEELS=1 ./ci/travis/ci.sh build - - pip install -q docker aws_requests_auth boto3 - - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/travis/build-docker-images.py --py-versions py38 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base - -- label: ":docker: Build Images: py39 (1/2)" - conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] - commands: - - LINUX_WHEELS=1 ./ci/travis/ci.sh build - - pip install -q docker aws_requests_auth boto3 - - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/travis/build-docker-images.py --py-versions py39 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base - -- label: ":docker: Build Images: py39 (2/2)" - conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] - commands: - - LINUX_WHEELS=1 ./ci/travis/ci.sh build - - pip install -q docker aws_requests_auth boto3 - - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/travis/build-docker-images.py --py-versions py39 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base + - python ./ci/travis/build-docker-images.py --py-versions PY39 --build-type BUILDKITE --build-base - label: ":book: Lint" commands: diff --git a/ci/travis/build-docker-images.py b/ci/travis/build-docker-images.py index e6991b424..30f1d9069 100644 --- a/ci/travis/build-docker-images.py +++ b/ci/travis/build-docker-images.py @@ -3,13 +3,11 @@ import datetime import json import functools import glob -import itertools import os import re import shutil import subprocess import sys -from collections import defaultdict from typing import List, Tuple import docker @@ -26,39 +24,18 @@ DOCKER_HUB_DESCRIPTION = { "https://hub.docker.com/r/rayproject/ray"), "ray": "Official Docker Images for Ray, the distributed computing API.", "ray-ml": "Developer ready Docker Image for Ray.", + "autoscaler": ( + "Deprecated image, please use: " + "https://hub.docker.com/repository/docker/rayproject/ray-ml") } PY_MATRIX = { - "py36": "3.6.12", - "py37": "3.7.7", - "py38": "3.8.5", - "py39": "3.9.5" + "-py36": "3.6.12", + "-py37": "3.7.7", + "-py38": "3.8.5", + "-py39": "3.9.5" } -BASE_IMAGES = { - "cu112": "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04", - "cu111": "nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04", - "cu110": "nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04", - "cu102": "nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04", - "cu101": "nvidia/cuda:10.1-cudnn8-devel-ubuntu18.04", - "cpu": "ubuntu:focal", -} - -CUDA_FULL = { - "cu112": "CUDA 11.2", - "cu111": "CUDA 11.1", - "cu110": "CUDA 11.0", - "cu102": "CUDA 10.2", - "cu101": "CUDA 10.1" -} - -# The CUDA version to use for the ML Docker image. 
-ML_CUDA_VERSION = "cu112" - -DEFAULT_PYTHON_VERSION = "py37" - -IMAGE_NAMES = list(DOCKER_HUB_DESCRIPTION.keys()) - def _get_branch(): branch = (os.environ.get("TRAVIS_BRANCH") @@ -142,117 +119,83 @@ def _check_if_docker_files_modified(): return affected -def _build_docker_image(image_name: str, - py_version: str, - image_type: str, - no_cache=True): - """Builds Docker image with the provided info. +def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]: + built_images = [] + for gpu in ["-cpu", "-gpu"]: + for py_name, py_version in PY_MATRIX.items(): + # TODO(https://github.com/ray-project/ray/issues/16599): + # remove below after supporting ray-ml images with Python 3.9 + if image_name in ["ray-ml", "autoscaler" + ] and py_version.startswith("3.9"): + print(f"{image_name} image is currently unsupported with " + "Python 3.9") + continue - image_name (str): The name of the image to build. Must be one of - IMAGE_NAMES. - py_version (str): The Python version to build the image for. - Must be one of PY_MATRIX.keys() - image_type (str): The image type to build. Must be one of - BASE_IMAGES.keys() - no_cache (bool): If True, don't use caching when building the image. - """ + build_args = {} + build_args["PYTHON_VERSION"] = py_version + # I.e. "-py36"[-1] == 6 + build_args["PYTHON_MINOR_VERSION"] = py_name[-1] - if image_name not in IMAGE_NAMES: - raise ValueError( - f"The provided image name {image_name} is not " - f"recognized. Image names must be one of {IMAGE_NAMES}") + if image_name == "base-deps": + build_args["BASE_IMAGE"] = ( + "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" + if gpu == "-gpu" else "ubuntu:focal") + else: + # NOTE(ilr) This is a bit of an abuse of the name "GPU" + build_args["GPU"] = f"{py_name}{gpu}" - if py_version not in PY_MATRIX.keys(): - raise ValueError(f"The provided python version {py_version} is not " - f"recognized. Python version must be one of" - f" {PY_MATRIX.keys()}") + if image_name in ["ray", "ray-deps", "ray-worker-container"]: + wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"]) + build_args["WHEEL_PATH"] = f".whl/{wheel}" + # Add pip option "--find-links .whl/" to ensure ray-cpp wheel + # can be found. + build_args["FIND_LINKS_PATH"] = ".whl" - if image_type not in BASE_IMAGES.keys(): - raise ValueError(f"The provided CUDA version {image_type} is not " - f"recognized. 
CUDA version must be one of" - f" {image_type.keys()}") + tagged_name = f"rayproject/{image_name}:nightly{py_name}{gpu}" + for i in range(2): + cleanup = DOCKER_CLIENT.containers.prune().get( + "SpaceReclaimed") + if cleanup is not None: + print(f"Cleaned up {cleanup / (2**20)}MB") + output = DOCKER_CLIENT.api.build( + path=os.path.join(_get_root_dir(), "docker", image_name), + tag=tagged_name, + nocache=no_cache, + buildargs=build_args) - # TODO(https://github.com/ray-project/ray/issues/16599): - # remove below after supporting ray-ml images with Python 3.9 - if image_name == "ray-ml" and py_version == "py39": - print(f"{image_name} image is currently unsupported with " - "Python 3.9") - return + cmd_output = [] + try: + start = datetime.datetime.now() + current_iter = start + for line in output: + cmd_output.append(line.decode("utf-8")) + if datetime.datetime.now( + ) - current_iter >= datetime.timedelta(minutes=5): + current_iter = datetime.datetime.now() + elapsed = datetime.datetime.now() - start + print(f"Still building {tagged_name} after " + f"{elapsed.seconds} seconds") + if elapsed >= datetime.timedelta(minutes=15): + print("Additional build output:") + print(*cmd_output, sep="\n") + # Clear cmd_output after printing, so the next + # iteration will not print out the same lines. + cmd_output = [] + except Exception as e: + print(f"FAILURE with error {e}") - build_args = {} - build_args["PYTHON_VERSION"] = PY_MATRIX[py_version] - # I.e. "py36"[-1] == 6 - build_args["PYTHON_MINOR_VERSION"] = py_version[-1] + if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: + print(f"ERROR building: {tagged_name}. Output below:") + print(*cmd_output, sep="\n") + if (i == 1): + raise Exception("FAILED TO BUILD IMAGE") + print("TRYING AGAIN") + else: + break - device_tag = f"{image_type}" - - if image_name == "base-deps": - base_image = BASE_IMAGES[image_type] - else: - base_image = f"-{py_version}-{device_tag}" - - if image_name != "ray-worker-container": - build_args["BASE_IMAGE"] = base_image - - if image_name in ["ray", "ray-deps", "ray-worker-container"]: - wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"]) - build_args["WHEEL_PATH"] = f".whl/{wheel}" - # Add pip option "--find-links .whl/" to ensure ray-cpp wheel - # can be found. - build_args["FIND_LINKS_PATH"] = ".whl" - - tagged_name = f"rayproject/{image_name}:nightly-{py_version}-{device_tag}" - - for i in range(2): - cleanup = DOCKER_CLIENT.containers.prune().get("SpaceReclaimed") - if cleanup is not None: - print(f"Cleaned up {cleanup / (2 ** 20)}MB") - - labels = { - "image-name": image_name, - "python-version": PY_MATRIX[py_version] - } - if image_type in CUDA_FULL: - labels["cuda-version"] = CUDA_FULL[image_type] - - output = DOCKER_CLIENT.api.build( - path=os.path.join(_get_root_dir(), "docker", image_name), - tag=tagged_name, - nocache=no_cache, - labels=labels, - buildargs=build_args) - - cmd_output = [] - try: - start = datetime.datetime.now() - current_iter = start - for line in output: - cmd_output.append(line.decode("utf-8")) - if datetime.datetime.now( - ) - current_iter >= datetime.timedelta(minutes=5): - current_iter = datetime.datetime.now() - elapsed = datetime.datetime.now() - start - print(f"Still building {tagged_name} after " - f"{elapsed.seconds} seconds") - if elapsed >= datetime.timedelta(minutes=15): - print("Additional build output:") - print(*cmd_output, sep="\n") - # Clear cmd_output after printing, so the next - # iteration will not print out the same lines. 
- cmd_output = [] - except Exception as e: - print(f"FAILURE with error {e}") - - if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: - print(f"ERROR building: {tagged_name}. Output below:") - print(*cmd_output, sep="\n") - if i == 1: - raise Exception("FAILED TO BUILD IMAGE") - print("TRYING AGAIN") - else: - break - - print("BUILT: ", tagged_name) + print("BUILT: ", tagged_name) + built_images.append(tagged_name) + return built_images def copy_wheels(human_build): @@ -275,66 +218,41 @@ def copy_wheels(human_build): shutil.copy(source, ray_worker_container_dst) -def check_staleness(repository, tag): - DOCKER_CLIENT.api.pull(repository=repository, tag=tag) +def build_or_pull_base_images(rebuild_base_images: bool = True) -> List[str]: + """Returns images to tag and build""" + DOCKER_CLIENT.api.pull(repository="rayproject/base-deps", tag="nightly") - age = DOCKER_CLIENT.api.inspect_image(f"{repository}:{tag}")["Created"] + age = DOCKER_CLIENT.api.inspect_image("rayproject/base-deps:nightly")[ + "Created"] short_date = datetime.datetime.strptime(age.split("T")[0], "%Y-%m-%d") is_stale = ( datetime.datetime.now() - short_date) > datetime.timedelta(days=14) - return is_stale + print("Pulling images for caching") -def build_for_all_versions(image_name, py_versions, image_types, **kwargs): - """Builds the given Docker image for all Python & CUDA versions""" - for py_version in py_versions: - for image_type in image_types: - _build_docker_image( - image_name, - py_version=py_version, - image_type=image_type, - **kwargs) + DOCKER_CLIENT.api.pull( + repository="rayproject/base-deps", tag="nightly-cpu") + DOCKER_CLIENT.api.pull( + repository="rayproject/base-deps", tag="nightly-gpu") + DOCKER_CLIENT.api.pull(repository="rayproject/ray-deps", tag="nightly-gpu") + DOCKER_CLIENT.api.pull(repository="rayproject/ray-deps", tag="nightly-cpu") -def build_base_images(py_versions, image_types): - build_for_all_versions( - "base-deps", py_versions, image_types, no_cache=False) - build_for_all_versions( - "ray-deps", py_versions, image_types, no_cache=False) - - -def build_or_pull_base_images(py_versions: List[str], - image_types: List[str], - rebuild_base_images: bool = True) -> bool: - """Returns images to tag and build.""" - repositories = ["rayproject/base-deps", "rayproject/ray-deps"] - tags = [ - f"nightly-{py_version}-{image_type}" - for py_version, image_type in itertools.product( - py_versions, image_types) - ] - - try: - is_stale = check_staleness(repositories[0], tags[0]) - - # We still pull even if we have to rebuild the base images to help with - # caching. 
- for repository in repositories: - for tag in tags: - DOCKER_CLIENT.api.pull(repository=repository, tag=tag) - except Exception as e: - print(e) - is_stale = True - - if rebuild_base_images or _release_build() or is_stale: - build_base_images(py_versions, image_types) + # TODO(ilr) See if any caching happens + if (rebuild_base_images or is_stale or _release_build()): + for image in ["base-deps", "ray-deps"]: + _build_cpu_gpu_images(image, no_cache=False) return True else: print("Just pulling images!") return False -def prep_ray_ml(): +def build_ray(): + return _build_cpu_gpu_images("ray") + + +def build_ray_ml(): root_dir = _get_root_dir() requirement_files = glob.glob( f"{_get_root_dir()}/python/**/requirements*.txt", recursive=True) @@ -343,6 +261,11 @@ def prep_ray_ml(): # Install atari roms script shutil.copy(f"{_get_root_dir()}/rllib/utils/install_atari_roms.sh", os.path.join(root_dir, "docker/ray-ml/")) + ray_ml_images = _build_cpu_gpu_images("ray-ml") + for img in ray_ml_images: + tag = img.split(":")[-1] + DOCKER_CLIENT.api.tag( + image=img, repository="rayproject/autoscaler", tag=tag) def _get_docker_creds() -> Tuple[str, str]: @@ -351,52 +274,39 @@ def _get_docker_creds() -> Tuple[str, str]: return DOCKER_USERNAME, docker_password -def _docker_push(image, tag): - print(f"PUSHING: {image}:{tag}, result:") - # This docker API is janky. Without "stream=True" it returns a - # massive string filled with every progress bar update, which can - # cause CI to back up. - # - # With stream=True, it's a line-at-a-time generator of the same - # info. So we can slow it down by printing every couple hundred - # lines - i = 0 - for progress_line in DOCKER_CLIENT.api.push(image, tag=tag, stream=True): - if i % 100 == 0: - print(progress_line) - - -def _tag_and_push(full_image_name, old_tag, new_tag, merge_build=False): - # Do not tag release builds because they are no longer up to - # date after the branch cut. - if "nightly" in new_tag and _release_build(): - return - if old_tag != new_tag: - DOCKER_CLIENT.api.tag( - image=f"{full_image_name}:{old_tag}", - repository=full_image_name, - tag=new_tag) - if not merge_build: - print("This is a PR Build! On a merge build, we would normally push" - f"to: {full_image_name}:{new_tag}") - else: - _docker_push(full_image_name, new_tag) - - -def _create_new_tags(all_tags, old_str, new_str): - new_tags = [] - for full_tag in all_tags: - new_tag = full_tag.replace(old_str, new_str) - new_tags.append(new_tag) - return new_tags +def build_ray_worker_container(): + return _build_cpu_gpu_images("ray-worker-container") # For non-release builds, push "nightly" & "sha" # For release builds, push "nightly" & "latest" & "x.x.x" -def push_and_tag_images(py_versions: List[str], - image_types: List[str], - push_base_images: bool, - merge_build: bool = False): +def push_and_tag_images(push_base_images: bool, merge_build: bool = False): + def docker_push(image, tag): + # Do not tag release builds because they are no longer up to + # date after the branch cut. + if "nightly" in tag and _release_build(): + return + if merge_build: + print(f"PUSHING: {image}:{tag}, result:") + # This docker API is janky. Without "stream=True" it returns a + # massive string filled with every progress bar update, which can + # cause CI to back up. + # + # With stream=True, it's a line-at-a-time generator of the same + # info. 
So we can slow it down by printing every couple hundred + # lines + i = 0 + for progress_line in DOCKER_CLIENT.api.push( + image, tag=tag, stream=True): + if i % 100 == 0: + print(progress_line) + else: + print( + "This is a PR Build! On a merge build, we would normally push " + f"to: {image}:{tag}") + + def get_new_tag(old_tag, new_tag): + return old_tag.replace("nightly", new_tag) date_tag = datetime.datetime.now().strftime("%Y-%m-%d") sha_tag = _get_commit_sha() @@ -406,97 +316,61 @@ def push_and_tag_images(py_versions: List[str], date_tag = release_name sha_tag = release_name - image_list = ["ray", "ray-ml"] + image_list = ["ray", "ray-ml", "autoscaler"] if push_base_images: image_list.extend(["base-deps", "ray-deps"]) - for image_name in image_list: - full_image_name = f"rayproject/{image_name}" + for image in image_list: + for py_name, py_version in PY_MATRIX.items(): + # TODO(https://github.com/ray-project/ray/issues/16599): + # remove below after supporting ray-ml images with Python 3.9 + if image in ["ray-ml", "autoscaler" + ] and py_version.startswith("3.9"): + print( + f"{image} image is currently unsupported with Python 3.9") + continue - # Mapping from old tags to new tags. - # These are the tags we will push. - # The key is the full image name, and the values are all the tags - # for that image. - tag_mapping = defaultdict(list) - for py_name in py_versions: - for image_type in image_types: - if image_name == "ray-ml" and image_type != ML_CUDA_VERSION: - print("ML Docker image is not built for the following " - f"device type: {image_type}") - continue + full_image = f"rayproject/{image}" - # TODO(https://github.com/ray-project/ray/issues/16599): - # remove below after supporting ray-ml images with Python 3.9 - if image_name in ["ray-ml" - ] and PY_MATRIX[py_name].startswith("3.9"): - print(f"{image_name} image is currently " - f"unsupported with Python 3.9") - continue + # Tag "nightly-py3x" from "nightly-py3x-cpu" + DOCKER_CLIENT.api.tag( + image=f"{full_image}:nightly{py_name}-cpu", + repository=full_image, + tag=f"nightly{py_name}") - tag = f"nightly-{py_name}-{image_type}" - tag_mapping[tag].append(tag) + for arch_tag in ["-cpu", "-gpu", ""]: + full_arch_tag = f"nightly{py_name}{arch_tag}" - # If no device is specified, it should map to CPU image. 
- # "-gpu" tag should refer to the ML_CUDA_VERSION - for old_tag in tag_mapping.keys(): - if "cpu" in old_tag: - new_tags = _create_new_tags( - tag_mapping[old_tag], old_str="-cpu", new_str="") - tag_mapping[old_tag].extend(new_tags) - elif ML_CUDA_VERSION in old_tag: - new_tags = _create_new_tags( - tag_mapping[old_tag], - old_str=f"-{ML_CUDA_VERSION}", - new_str="-gpu") - tag_mapping[old_tag].extend(new_tags) + # Tag and push rayproject/:nightly + docker_push(full_image, full_arch_tag) - # No Python version specified should refer to DEFAULT_PYTHON_VERSION - for old_tag in tag_mapping.keys(): - if DEFAULT_PYTHON_VERSION in old_tag: - new_tags = _create_new_tags( - tag_mapping[old_tag], - old_str=f"-{DEFAULT_PYTHON_VERSION}", - new_str="") - tag_mapping[old_tag].extend(new_tags) + # Ex: specific_tag == "1.0.1" or "" or "" + specific_tag = get_new_tag( + full_arch_tag, date_tag if "-deps" in image else sha_tag) - # For all tags, create Date/Sha tags - for old_tag in tag_mapping.keys(): - new_tags = _create_new_tags( - tag_mapping[old_tag], - old_str="nightly", - new_str=date_tag if "-deps" in image_name else sha_tag) - tag_mapping[old_tag].extend(new_tags) + # Tag and push rayproject/: + DOCKER_CLIENT.api.tag( + image=f"{full_image}:{full_arch_tag}", + repository=full_image, + tag=specific_tag) + docker_push(full_image, specific_tag) - # Sanity checking. - for old_tag in tag_mapping.keys(): - if DEFAULT_PYTHON_VERSION in old_tag: - if "-cpu" in old_tag: - assert "nightly-cpu" in tag_mapping[old_tag] - assert "nightly" in tag_mapping[old_tag] - if "-deps" in image_name: - assert f"{date_tag}-cpu" in tag_mapping[old_tag] - assert f"{date_tag}" in tag_mapping[old_tag] - else: - assert f"{sha_tag}-cpu" in tag_mapping[old_tag] - assert f"{sha_tag}" in tag_mapping[old_tag] + if "-py37" in py_name: + non_python_specific_tag = specific_tag.replace("-py37", "") + DOCKER_CLIENT.api.tag( + image=f"{full_image}:{full_arch_tag}", + repository=full_image, + tag=non_python_specific_tag) + # Tag and push rayproject/: + docker_push(full_image, non_python_specific_tag) - elif ML_CUDA_VERSION in old_tag: - assert "nightly-gpu" in tag_mapping[old_tag] - if "-deps" in image_name: - assert f"{date_tag}-gpu" in tag_mapping[old_tag] - else: - assert f"{sha_tag}-gpu" in tag_mapping[old_tag] - - print(f"These tags will be created for {image_name}: ", tag_mapping) - - # Tag and push all images. - for old_tag in tag_mapping.keys(): - for new_tag in tag_mapping[old_tag]: - _tag_and_push( - full_image_name, - old_tag=old_tag, - new_tag=new_tag, - merge_build=merge_build) + non_python_nightly_tag = full_arch_tag.replace("-py37", "") + DOCKER_CLIENT.api.tag( + image=f"{full_image}:{full_arch_tag}", + repository=full_image, + tag=non_python_nightly_tag) + # Tag and push rayproject/:nightly + docker_push(full_image, non_python_nightly_tag) # Push infra here: @@ -535,30 +409,22 @@ def push_readmes(merge_build: bool): # Build base-deps/ray-deps only on file change, 2 weeks, per release -# Build ray, ray-ml every time +# Build ray, ray-ml, autoscaler every time # build-docker-images.py --py-versions PY37 --build-type PR --rebuild-all MERGE = "MERGE" HUMAN = "HUMAN" PR = "PR" BUILDKITE = "BUILDKITE" BUILD_TYPES = [MERGE, HUMAN, PR, BUILDKITE] - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--py-versions", - choices=list(PY_MATRIX.keys()), - default="py37", + choices=["PY36", "PY37", "PY38", "PY39"], + default="PY37", nargs="*", help="Which python versions to build. 
" - "Must be in (py36, py37, py38, py39)") - parser.add_argument( - "--device-types", - choices=list(BASE_IMAGES.keys()), - default=None, - nargs="*", - help="Which device types (CPU/CUDA versions) to build images for. " - "If not specified, images will be built for all device types.") + "Must be in (PY36, PY37, PY38, PY39)") parser.add_argument( "--build-type", choices=BUILD_TYPES, @@ -582,47 +448,26 @@ if __name__ == "__main__": py_versions = args.py_versions py_versions = py_versions if isinstance(py_versions, list) else [py_versions] + for key in set(PY_MATRIX.keys()): + if key[1:].upper() not in py_versions: + PY_MATRIX.pop(key) + assert len(PY_MATRIX) == len( + py_versions + ), f"Length of PY_MATRIX != args {PY_MATRIX} : {args.py_versions}" - image_types = args.device_types if args.device_types else list( - BASE_IMAGES.keys()) - - assert set(list(CUDA_FULL.keys()) + ["cpu"]) == set(BASE_IMAGES.keys()) - - # Make sure the python images and cuda versions we build here are - # consistent with the ones used with fix-latest-docker.sh script. - py_version_file = os.path.join(_get_root_dir(), "docker/retag-lambda", - "python_versions.txt") - with open(py_version_file) as f: - py_file_versions = f.read().splitlines() - assert set(PY_MATRIX.keys()) == set(py_file_versions), \ - (PY_MATRIX.keys(), py_file_versions) - - cuda_version_file = os.path.join(_get_root_dir(), "docker/retag-lambda", - "cuda_versions.txt") - - with open(cuda_version_file) as f: - cuda_file_versions = f.read().splitlines() - assert set(BASE_IMAGES.keys()) == set(cuda_file_versions + ["cpu"]),\ - (BASE_IMAGES.keys(), cuda_file_versions + ["cpu"]) - - print("Building the following python versions: ", - [PY_MATRIX[py_version] for py_version in py_versions]) - print("Building images for the following devices: ", image_types) + print("Building the following python versions: ", PY_MATRIX) print("Building base images: ", args.base) build_type = args.build_type is_buildkite = build_type == BUILDKITE - if build_type == BUILDKITE: if os.environ.get("BUILDKITE_PULL_REQUEST", "") == "false": build_type = MERGE else: build_type = PR - if build_type == HUMAN: - # If manually triggered, request user for branch and SHA value to use. _configure_human_version() - if (build_type in {HUMAN, MERGE, BUILDKITE} + if (build_type in {HUMAN, MERGE} or is_buildkite or _check_if_docker_files_modified()): DOCKER_CLIENT = docker.from_env() is_merge = build_type == MERGE @@ -633,31 +478,25 @@ if __name__ == "__main__": username, password = _get_docker_creds() DOCKER_CLIENT.api.login(username=username, password=password) copy_wheels(build_type == HUMAN) - is_base_images_built = build_or_pull_base_images( - py_versions, image_types, args.base) - + base_images_built = build_or_pull_base_images(args.base) if args.only_build_worker_container: - build_for_all_versions("ray-worker-container", py_versions, - image_types) + build_ray_worker_container() # TODO Currently don't push ray_worker_container else: - # Build Ray Docker images. - build_for_all_versions("ray", py_versions, image_types) - - if ML_CUDA_VERSION in image_types: - # Build Ray ML Docker images only if ML_CUDA_VERSION is - # specified. 
- prep_ray_ml() - # Only build ML Docker for the ML_CUDA_VERSION - build_for_all_versions( - "ray-ml", py_versions, image_types=[ML_CUDA_VERSION]) + build_ray() + build_ray_ml() + if build_type in {MERGE, PR}: + valid_branch = _valid_branch() + if (not valid_branch) and is_merge: + print(f"Invalid Branch found: {_get_branch()}") + push_and_tag_images(base_images_built, valid_branch + and is_merge) if build_type in {MERGE, PR}: valid_branch = _valid_branch() if (not valid_branch) and is_merge: print(f"Invalid Branch found: {_get_branch()}") - push_and_tag_images(py_versions, image_types, - is_base_images_built, valid_branch + push_and_tag_images(base_images_built, valid_branch and is_merge) # TODO(ilr) Re-Enable Push READMEs by using a normal password diff --git a/doc/source/installation.rst b/doc/source/installation.rst index 44d30d8df..f501277bc 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -290,7 +290,7 @@ Image releases are `tagged` using the following format: - A specific nightly build (uses a SHA from the Github ``master``). -Some tags also have `variants` that add or change functionality: +Each tag has `variants` that add or change functionality: .. list-table:: :widths: 16 40 @@ -298,12 +298,10 @@ Some tags also have `variants` that add or change functionality: * - Variant - Description + * - -gpu + - These are based off of an NVIDIA CUDA image. They require the Nvidia Docker Runtime. * - -cpu - These are based off of an Ubuntu image. - * - -cuXX - - These are based off of an NVIDIA CUDA image with the specified CUDA version. They require the Nvidia Docker Runtime. - * - -gpu - - Aliases to a specific ``-cuXX`` tagged image. * - - Aliases to ``-cpu`` tagged images diff --git a/docker/autoscaler/README.md b/docker/autoscaler/README.md index 3cb68571e..807c37fff 100644 --- a/docker/autoscaler/README.md +++ b/docker/autoscaler/README.md @@ -1,7 +1,7 @@ # DEPRECATED -- Please use [`rayproject/ray-ml`](https://hub.docker.com/repository/docker/rayproject/ray-ml) ## About This image used to be the base image for the Ray autoscaler, but it has been replaced by [`rayproject/ray-ml`](https://hub.docker.com/repository/docker/rayproject/ray-ml). -Please use that instead, *this image is deprecated*. +Please use that instead, *this image will be removed in the near future*. ## Tags diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index a3f5a0b56..6f7350e9b 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -1,6 +1,6 @@ # The base-deps Docker image installs main libraries needed to run Ray -# The GPU options are NVIDIA CUDA developer images. +# The GPU option is nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04 ARG BASE_IMAGE="ubuntu:focal" FROM ${BASE_IMAGE} # FROM directive resets ARG diff --git a/docker/base-deps/README.md b/docker/base-deps/README.md index e8fcd9d9a..9a75db782 100644 --- a/docker/base-deps/README.md +++ b/docker/base-deps/README.md @@ -14,7 +14,7 @@ This image has the system-level dependencies for `Ray` and the `Ray Autoscaler` * `:DATE` - A specific build. ### Suffixes -* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs. +* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs. 
* `-cpu`- These are based off of an `Ubuntu` image. * Tags without a suffix refer to `-cpu` images diff --git a/docker/fix-docker-latest.sh b/docker/fix-docker-latest.sh index 8208abb7f..d09873aa2 100755 --- a/docker/fix-docker-latest.sh +++ b/docker/fix-docker-latest.sh @@ -29,6 +29,8 @@ AWS_ACCESS_KEY_ID=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.AccessK AWS_SECRET_ACCESS_KEY=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.SecretAccessKey) AWS_SESSION_TOKEN=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.SessionToken) + + echo -e "Invoking this lambda!\nView logs at https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups" AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN AWS_SECURITY_TOKEN='' aws \ lambda invoke --function-name DockerTagLatest \ diff --git a/docker/ray-deps/Dockerfile b/docker/ray-deps/Dockerfile index a0e8e481d..92f40013f 100644 --- a/docker/ray-deps/Dockerfile +++ b/docker/ray-deps/Dockerfile @@ -1,5 +1,5 @@ -ARG BASE_IMAGE="" -FROM rayproject/base-deps:nightly"$BASE_IMAGE" +ARG GPU="" +FROM rayproject/base-deps:nightly"$GPU" # If this arg is not "autoscaler" then no autoscaler requirements will be included ARG AUTOSCALER="autoscaler" ARG WHEEL_PATH diff --git a/docker/ray-deps/README.md b/docker/ray-deps/README.md index 80b969240..a393bd752 100644 --- a/docker/ray-deps/README.md +++ b/docker/ray-deps/README.md @@ -13,7 +13,7 @@ This has the python-level dependencies for `Ray` and the `Ray Autoscaler`. The ` * `:DATE` - A specific build. ### Suffixes -* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs. +* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs. * `-cpu`- These are based off of an `Ubuntu` image. 
* Tags without a suffix refer to `-cpu` images diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 53a13738e..b4e0fd17e 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -1,5 +1,5 @@ -ARG BASE_IMAGE -FROM rayproject/ray:nightly"$BASE_IMAGE" +ARG GPU +FROM rayproject/ray:nightly"$GPU" ARG PYTHON_MINOR_VERSION=7 # We have to uninstall wrapt this way for Tensorflow compatibility @@ -8,7 +8,6 @@ COPY requirements_dl.txt ./ COPY requirements_ml_docker.txt ./ COPY requirements_rllib.txt ./ COPY requirements_tune.txt ./requirements_tune.txt -COPY requirements_upstream.txt ./ COPY install_atari_roms.sh ./install_atari_roms.sh RUN sudo apt-get update \ @@ -24,13 +23,11 @@ RUN sudo apt-get update \ && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_rllib.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_tune.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \ - && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_upstream.txt \ # Remove dataclasses & typing because they are included in Python > 3.6 && if [ $(python -c 'import sys; print(sys.version_info.minor)') != "6" ]; then \ $HOME/anaconda3/bin/pip uninstall dataclasses typing -y; fi \ && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \ && sudo rm requirements_tune.txt && sudo rm requirements_rllib.txt \ - && sudo rm requirements_upstream.txt \ && sudo apt-get clean # Make sure tfp is installed correctly and matches tf version. diff --git a/docker/ray-ml/README.md b/docker/ray-ml/README.md index d96623f35..0262e1ddd 100644 --- a/docker/ray-ml/README.md +++ b/docker/ray-ml/README.md @@ -11,7 +11,9 @@ This image is an extension of the [`rayproject/ray`](https://hub.docker.com/repo * `:SHA` - A specific nightly build. ### Suffixes -* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs. +* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs. +* `-cpu`- These are based off of an `Ubuntu` image. +* Tags without a suffix refer to `-cpu` images ## Other Images * [`rayproject/ray`](https://hub.docker.com/repository/docker/rayproject/ray) - Ray and all of its dependencies. diff --git a/docker/ray/Dockerfile b/docker/ray/Dockerfile index ad6253b0d..16acac888 100644 --- a/docker/ray/Dockerfile +++ b/docker/ray/Dockerfile @@ -1,5 +1,5 @@ -ARG BASE_IMAGE -FROM rayproject/ray-deps:nightly"$BASE_IMAGE" +ARG GPU +FROM rayproject/ray-deps:nightly"$GPU" ARG WHEEL_PATH ARG FIND_LINKS_PATH=".whl" # For Click diff --git a/docker/ray/README.md b/docker/ray/README.md index eba6b6a84..dce6068f6 100644 --- a/docker/ray/README.md +++ b/docker/ray/README.md @@ -12,7 +12,7 @@ everything needed to get started with running Ray! They work for both local deve * `:SHA` - A specific nightly build. ### Suffixes -* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs. +* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs. 
* `-cpu`- These are based off of an `Ubuntu` image. * Tags without a suffix refer to `-cpu` images diff --git a/docker/retag-lambda/cuda_versions.txt b/docker/retag-lambda/cuda_versions.txt deleted file mode 100644 index c88bc7143..000000000 --- a/docker/retag-lambda/cuda_versions.txt +++ /dev/null @@ -1,5 +0,0 @@ -cu112 -cu111 -cu110 -cu102 -cu101 \ No newline at end of file diff --git a/docker/retag-lambda/lambda_function.py b/docker/retag-lambda/lambda_function.py index d11e2e032..19763a270 100644 --- a/docker/retag-lambda/lambda_function.py +++ b/docker/retag-lambda/lambda_function.py @@ -1,5 +1,4 @@ import json -import os import subprocess import boto3 @@ -8,10 +7,6 @@ DOCKER_USER = None DOCKER_PASS = None -def _get_curr_dir(): - return os.path.dirname(os.path.realpath(__file__)) - - def get_secrets(): global DOCKER_PASS, DOCKER_USER secret_name = "dockerRetagLatestCredentials" @@ -41,30 +36,15 @@ def retag(repo: str, source: str, destination: str) -> str: }) -def parse_versions(version_file): - with open(version_file) as f: - file_versions = f.read().splitlines() - return file_versions - - def lambda_handler(event, context): source_image = event["source_tag"] destination_image = event["destination_tag"] total_results = [] - python_versions = parse_versions( - os.path.join(_get_curr_dir(), "python_versions.txt")) - cuda_versions = parse_versions( - os.path.join(_get_curr_dir(), "cuda_versions.txt")) - for repo in ["ray", "ray-ml"]: + for repo in ["ray", "ray-ml", "autoscaler"]: results = [] - for pyversion in python_versions: + for pyversion in ["py36", "py37", "py38", "py39"]: source_tag = f"{source_image}-{pyversion}" destination_tag = f"{destination_image}-{pyversion}" - for cudaversion in cuda_versions: - cuda_source_tag = source_tag + f"-{cudaversion}" - cuda_destination_tag = destination_tag + f"-{cudaversion}" - results.append( - retag(repo, cuda_source_tag, cuda_destination_tag)) results.append(retag(repo, source_tag, destination_tag)) results.append(retag(repo, source_tag, destination_tag + "-cpu")) results.append( @@ -74,13 +54,7 @@ def lambda_handler(event, context): # Retag images without a python version specified (defaults to py37) results = [] - for repo in ["ray", "ray-ml", "ray-deps", "base-deps"]: - for cudaversion in cuda_versions: - source_tag = f"{source_image}-{cudaversion}" - destination_tag = f"{destination_image}-{cudaversion}" - results.append(retag(repo, source_tag, destination_tag)) - - # ray:nightly -> ray:1.x + for repo in ["ray", "ray-ml", "autoscaler", "ray-deps", "base-deps"]: results.append(retag(repo, source_image, destination_image)) results.append(retag(repo, source_image, destination_image + "-cpu")) results.append( diff --git a/docker/retag-lambda/python_versions.txt b/docker/retag-lambda/python_versions.txt deleted file mode 100644 index 558717e70..000000000 --- a/docker/retag-lambda/python_versions.txt +++ /dev/null @@ -1,4 +0,0 @@ -py36 -py37 -py38 -py39 \ No newline at end of file
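
For reference, the image tag matrix restored by this revert follows directly from the pieces above: PY_MATRIX keyed by "-py3x" suffixes, a "-cpu"/"-gpu" axis in place of the per-CUDA-version "cu1xx" tags, and the Python 3.9 skip for the ray-ml/autoscaler images. The sketch below is illustrative only, assuming those definitions; nightly_tags is a hypothetical helper written for this summary, not a function in ci/travis/build-docker-images.py.

from typing import List

# Restored by this revert: Python versions keyed by their image tag suffix.
PY_MATRIX = {
    "-py36": "3.6.12",
    "-py37": "3.7.7",
    "-py38": "3.8.5",
    "-py39": "3.9.5",
}


def nightly_tags(image_name: str) -> List[str]:
    """Enumerate the nightly tags _build_cpu_gpu_images produces for an image."""
    tags = []
    for gpu in ["-cpu", "-gpu"]:
        for py_name, py_version in PY_MATRIX.items():
            # ray-ml and autoscaler images are skipped on Python 3.9
            # (https://github.com/ray-project/ray/issues/16599).
            if image_name in ["ray-ml", "autoscaler"] and py_version.startswith("3.9"):
                continue
            tags.append(f"rayproject/{image_name}:nightly{py_name}{gpu}")
    return tags


# nightly_tags("ray") yields 8 tags (py36-py39 x cpu/gpu), while
# nightly_tags("ray-ml") yields 6 (the py39 variants are skipped).
# push_and_tag_images then aliases the "-py37" variants to the unversioned
# tags, e.g. nightly-py37-cpu -> nightly-cpu and nightly-py37 -> nightly.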