Revert "Revert "[Docker] Support multiple CUDA Versions (#19505)" (#19756)" (#19763)

This reverts commit e58fcca404.
Authored by Amog Kamsetty on 2021-10-26 17:32:56 -07:00; committed by GitHub
parent 47744d282c
commit db863aafc0
16 changed files with 458 additions and 229 deletions


@@ -75,37 +75,69 @@
 # # Upload to latest directory.
 # - if [ "$BUILDKITE_BRANCH" == "master" ]; then python .buildkite/copy_files.py --destination wheels --path ./.whl; fi

-- label: ":docker: Build Images: py36"
+- label: ":docker: Build Images: py36 (1/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions PY36 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions py36 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base

-- label: ":docker: Build Images: py37"
+- label: ":docker: Build Images: py36 (2/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions PY37 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions py36 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base

-- label: ":docker: Build Images: py38"
+- label: ":docker: Build Images: py37 (1/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions PY38 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions py37 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base

-- label: ":docker: Build Images: py39"
+- label: ":docker: Build Images: py37 (2/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions PY39 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions py37 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base
+
+- label: ":docker: Build Images: py38 (1/2)"
+  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
+  commands:
+    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
+    - pip install -q docker aws_requests_auth boto3
+    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
+    - python ./ci/travis/build-docker-images.py --py-versions py38 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base
+
+- label: ":docker: Build Images: py38 (2/2)"
+  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
+  commands:
+    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
+    - pip install -q docker aws_requests_auth boto3
+    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
+    - python ./ci/travis/build-docker-images.py --py-versions py38 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base
+
+- label: ":docker: Build Images: py39 (1/2)"
+  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
+  commands:
+    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
+    - pip install -q docker aws_requests_auth boto3
+    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
+    - python ./ci/travis/build-docker-images.py --py-versions py39 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base
+
+- label: ":docker: Build Images: py39 (2/2)"
+  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
+  commands:
+    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
+    - pip install -q docker aws_requests_auth boto3
+    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
+    - python ./ci/travis/build-docker-images.py --py-versions py39 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base

 - label: ":book: Lint"
   commands:
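Each Python version now gets two Buildkite jobs so that no single job builds all six image variants: job (1/2) covers cpu/cu101/cu102 and job (2/2) covers cu110/cu111/cu112. A minimal sketch of the tag matrix one Python version yields (the nightly-<py>-<device> naming comes from build-docker-images.py below; rayproject/ray stands in for any of the built images):

    device_types = ["cpu", "cu101", "cu102", "cu110", "cu111", "cu112"]
    for device in device_types:
        # e.g. rayproject/ray:nightly-py36-cpu ... rayproject/ray:nightly-py36-cu112
        print(f"rayproject/ray:nightly-py36-{device}")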


@@ -3,11 +3,13 @@ import datetime
 import json
 import functools
 import glob
+import itertools
 import os
 import re
 import shutil
 import subprocess
 import sys
+from collections import defaultdict
 from typing import List, Tuple

 import docker
@@ -24,18 +26,39 @@ DOCKER_HUB_DESCRIPTION = {
         "https://hub.docker.com/r/rayproject/ray"),
     "ray": "Official Docker Images for Ray, the distributed computing API.",
     "ray-ml": "Developer ready Docker Image for Ray.",
+    "autoscaler": (
+        "Deprecated image, please use: "
+        "https://hub.docker.com/repository/docker/rayproject/ray-ml")
 }

 PY_MATRIX = {
-    "-py36": "3.6.12",
-    "-py37": "3.7.7",
-    "-py38": "3.8.5",
-    "-py39": "3.9.5"
+    "py36": "3.6.12",
+    "py37": "3.7.7",
+    "py38": "3.8.5",
+    "py39": "3.9.5"
 }

+BASE_IMAGES = {
+    "cu112": "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04",
+    "cu111": "nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04",
+    "cu110": "nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04",
+    "cu102": "nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04",
+    "cu101": "nvidia/cuda:10.1-cudnn8-devel-ubuntu18.04",
+    "cpu": "ubuntu:focal",
+}
+
+CUDA_FULL = {
+    "cu112": "CUDA 11.2",
+    "cu111": "CUDA 11.1",
+    "cu110": "CUDA 11.0",
+    "cu102": "CUDA 10.2",
+    "cu101": "CUDA 10.1"
+}
+
+# The CUDA version to use for the ML Docker image.
+ML_CUDA_VERSION = "cu112"
+
+DEFAULT_PYTHON_VERSION = "py37"
+
+IMAGE_NAMES = list(DOCKER_HUB_DESCRIPTION.keys())


 def _get_branch():
     branch = (os.environ.get("TRAVIS_BRANCH")
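Taken together, PY_MATRIX and BASE_IMAGES define the build matrix the rest of the script walks; a quick self-contained sketch of its size (key lists copied from the dictionaries above):

    import itertools

    py_versions = ["py36", "py37", "py38", "py39"]                       # PY_MATRIX keys
    device_types = ["cpu", "cu101", "cu102", "cu110", "cu111", "cu112"]  # BASE_IMAGES keys

    # 4 Python versions x 6 device types = 24 variants per image name.
    print(len(list(itertools.product(py_versions, device_types))))  # 24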
@@ -119,83 +142,117 @@ def _check_if_docker_files_modified():
     return affected


-def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]:
-    built_images = []
-    for gpu in ["-cpu", "-gpu"]:
-        for py_name, py_version in PY_MATRIX.items():
-            # TODO(https://github.com/ray-project/ray/issues/16599):
-            # remove below after supporting ray-ml images with Python 3.9
-            if image_name in ["ray-ml", "autoscaler"
-                              ] and py_version.startswith("3.9"):
-                print(f"{image_name} image is currently unsupported with "
-                      "Python 3.9")
-                continue
-
-            build_args = {}
-            build_args["PYTHON_VERSION"] = py_version
-            # I.e. "-py36"[-1] == 6
-            build_args["PYTHON_MINOR_VERSION"] = py_name[-1]
-
-            if image_name == "base-deps":
-                build_args["BASE_IMAGE"] = (
-                    "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04"
-                    if gpu == "-gpu" else "ubuntu:focal")
-            else:
-                # NOTE(ilr) This is a bit of an abuse of the name "GPU"
-                build_args["GPU"] = f"{py_name}{gpu}"
-
-            if image_name in ["ray", "ray-deps", "ray-worker-container"]:
-                wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"])
-                build_args["WHEEL_PATH"] = f".whl/{wheel}"
-                # Add pip option "--find-links .whl/" to ensure ray-cpp wheel
-                # can be found.
-                build_args["FIND_LINKS_PATH"] = ".whl"
-
-            tagged_name = f"rayproject/{image_name}:nightly{py_name}{gpu}"
-            for i in range(2):
-                cleanup = DOCKER_CLIENT.containers.prune().get(
-                    "SpaceReclaimed")
-                if cleanup is not None:
-                    print(f"Cleaned up {cleanup / (2**20)}MB")
-                output = DOCKER_CLIENT.api.build(
-                    path=os.path.join(_get_root_dir(), "docker", image_name),
-                    tag=tagged_name,
-                    nocache=no_cache,
-                    buildargs=build_args)
-
-                cmd_output = []
-                try:
-                    start = datetime.datetime.now()
-                    current_iter = start
-                    for line in output:
-                        cmd_output.append(line.decode("utf-8"))
-                        if datetime.datetime.now(
-                        ) - current_iter >= datetime.timedelta(minutes=5):
-                            current_iter = datetime.datetime.now()
-                            elapsed = datetime.datetime.now() - start
-                            print(f"Still building {tagged_name} after "
-                                  f"{elapsed.seconds} seconds")
-                            if elapsed >= datetime.timedelta(minutes=15):
-                                print("Additional build output:")
-                                print(*cmd_output, sep="\n")
-                                # Clear cmd_output after printing, so the next
-                                # iteration will not print out the same lines.
-                                cmd_output = []
-                except Exception as e:
-                    print(f"FAILURE with error {e}")
-
-                if len(DOCKER_CLIENT.api.images(tagged_name)) == 0:
-                    print(f"ERROR building: {tagged_name}. Output below:")
-                    print(*cmd_output, sep="\n")
-                    if (i == 1):
-                        raise Exception("FAILED TO BUILD IMAGE")
-                    print("TRYING AGAIN")
-                else:
-                    break
-
-            print("BUILT: ", tagged_name)
-            built_images.append(tagged_name)
-    return built_images
+def _build_docker_image(image_name: str,
+                        py_version: str,
+                        image_type: str,
+                        no_cache=True):
+    """Builds Docker image with the provided info.
+
+    image_name (str): The name of the image to build. Must be one of
+        IMAGE_NAMES.
+    py_version (str): The Python version to build the image for.
+        Must be one of PY_MATRIX.keys()
+    image_type (str): The image type to build. Must be one of
+        BASE_IMAGES.keys()
+    no_cache (bool): If True, don't use caching when building the image.
+    """
+
+    if image_name not in IMAGE_NAMES:
+        raise ValueError(
+            f"The provided image name {image_name} is not "
+            f"recognized. Image names must be one of {IMAGE_NAMES}")
+
+    if py_version not in PY_MATRIX.keys():
+        raise ValueError(f"The provided python version {py_version} is not "
+                         f"recognized. Python version must be one of"
+                         f" {PY_MATRIX.keys()}")
+
+    if image_type not in BASE_IMAGES.keys():
+        raise ValueError(f"The provided CUDA version {image_type} is not "
+                         f"recognized. CUDA version must be one of"
+                         f" {BASE_IMAGES.keys()}")
+
+    # TODO(https://github.com/ray-project/ray/issues/16599):
+    # remove below after supporting ray-ml images with Python 3.9
+    if image_name == "ray-ml" and py_version == "py39":
+        print(f"{image_name} image is currently unsupported with "
+              "Python 3.9")
+        return
+
+    build_args = {}
+    build_args["PYTHON_VERSION"] = PY_MATRIX[py_version]
+    # I.e. "py36"[-1] == 6
+    build_args["PYTHON_MINOR_VERSION"] = py_version[-1]
+
+    device_tag = f"{image_type}"
+
+    if image_name == "base-deps":
+        base_image = BASE_IMAGES[image_type]
+    else:
+        base_image = f"-{py_version}-{device_tag}"
+
+    if image_name != "ray-worker-container":
+        build_args["BASE_IMAGE"] = base_image
+
+    if image_name in ["ray", "ray-deps", "ray-worker-container"]:
+        wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"])
+        build_args["WHEEL_PATH"] = f".whl/{wheel}"
+        # Add pip option "--find-links .whl/" to ensure ray-cpp wheel
+        # can be found.
+        build_args["FIND_LINKS_PATH"] = ".whl"
+
+    tagged_name = f"rayproject/{image_name}:nightly-{py_version}-{device_tag}"
+    for i in range(2):
+        cleanup = DOCKER_CLIENT.containers.prune().get("SpaceReclaimed")
+        if cleanup is not None:
+            print(f"Cleaned up {cleanup / (2 ** 20)}MB")
+
+        labels = {
+            "image-name": image_name,
+            "python-version": PY_MATRIX[py_version]
+        }
+        if image_type in CUDA_FULL:
+            labels["cuda-version"] = CUDA_FULL[image_type]
+
+        output = DOCKER_CLIENT.api.build(
+            path=os.path.join(_get_root_dir(), "docker", image_name),
+            tag=tagged_name,
+            nocache=no_cache,
+            labels=labels,
+            buildargs=build_args)
+
+        cmd_output = []
+        try:
+            start = datetime.datetime.now()
+            current_iter = start
+            for line in output:
+                cmd_output.append(line.decode("utf-8"))
+                if datetime.datetime.now(
+                ) - current_iter >= datetime.timedelta(minutes=5):
+                    current_iter = datetime.datetime.now()
+                    elapsed = datetime.datetime.now() - start
+                    print(f"Still building {tagged_name} after "
+                          f"{elapsed.seconds} seconds")
+                    if elapsed >= datetime.timedelta(minutes=15):
+                        print("Additional build output:")
+                        print(*cmd_output, sep="\n")
+                        # Clear cmd_output after printing, so the next
+                        # iteration will not print out the same lines.
+                        cmd_output = []
+        except Exception as e:
+            print(f"FAILURE with error {e}")

+        if len(DOCKER_CLIENT.api.images(tagged_name)) == 0:
+            print(f"ERROR building: {tagged_name}. Output below:")
+            print(*cmd_output, sep="\n")
+            if i == 1:
+                raise Exception("FAILED TO BUILD IMAGE")
+            print("TRYING AGAIN")
+        else:
+            break
+
+    print("BUILT: ", tagged_name)


 def copy_wheels(human_build):
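A hedged usage sketch (not part of the diff): with a running Docker daemon, DOCKER_CLIENT initialized, and wheels staged under .whl/, a single variant build might look like this; the argument values are illustrative:

    _build_docker_image("ray", py_version="py38", image_type="cu112")
    # Expected result: an image tagged rayproject/ray:nightly-py38-cu112,
    # labeled with python-version=3.8.5 and cuda-version="CUDA 11.2".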
@@ -218,41 +275,66 @@ def copy_wheels(human_build):
     shutil.copy(source, ray_worker_container_dst)


-def build_or_pull_base_images(rebuild_base_images: bool = True) -> List[str]:
-    """Returns images to tag and build"""
-    DOCKER_CLIENT.api.pull(repository="rayproject/base-deps", tag="nightly")
-
-    age = DOCKER_CLIENT.api.inspect_image("rayproject/base-deps:nightly")[
-        "Created"]
-    short_date = datetime.datetime.strptime(age.split("T")[0], "%Y-%m-%d")
-    is_stale = (
-        datetime.datetime.now() - short_date) > datetime.timedelta(days=14)
-
-    print("Pulling images for caching")
-
-    DOCKER_CLIENT.api.pull(
-        repository="rayproject/base-deps", tag="nightly-cpu")
-    DOCKER_CLIENT.api.pull(
-        repository="rayproject/base-deps", tag="nightly-gpu")
-
-    DOCKER_CLIENT.api.pull(repository="rayproject/ray-deps", tag="nightly-gpu")
-    DOCKER_CLIENT.api.pull(repository="rayproject/ray-deps", tag="nightly-cpu")
-
-    # TODO(ilr) See if any caching happens
-    if (rebuild_base_images or is_stale or _release_build()):
-        for image in ["base-deps", "ray-deps"]:
-            _build_cpu_gpu_images(image, no_cache=False)
+def check_staleness(repository, tag):
+    DOCKER_CLIENT.api.pull(repository=repository, tag=tag)
+
+    age = DOCKER_CLIENT.api.inspect_image(f"{repository}:{tag}")["Created"]
+    short_date = datetime.datetime.strptime(age.split("T")[0], "%Y-%m-%d")
+    is_stale = (
+        datetime.datetime.now() - short_date) > datetime.timedelta(days=14)
+    return is_stale
+
+
+def build_for_all_versions(image_name, py_versions, image_types, **kwargs):
+    """Builds the given Docker image for all Python & CUDA versions"""
+    for py_version in py_versions:
+        for image_type in image_types:
+            _build_docker_image(
+                image_name,
+                py_version=py_version,
+                image_type=image_type,
+                **kwargs)
+
+
+def build_base_images(py_versions, image_types):
+    build_for_all_versions(
+        "base-deps", py_versions, image_types, no_cache=False)
+    build_for_all_versions(
+        "ray-deps", py_versions, image_types, no_cache=False)
+
+
+def build_or_pull_base_images(py_versions: List[str],
+                              image_types: List[str],
+                              rebuild_base_images: bool = True) -> bool:
+    """Returns images to tag and build."""
+    repositories = ["rayproject/base-deps", "rayproject/ray-deps"]
+    tags = [
+        f"nightly-{py_version}-{image_type}"
+        for py_version, image_type in itertools.product(
+            py_versions, image_types)
+    ]
+
+    try:
+        is_stale = check_staleness(repositories[0], tags[0])
+
+        # We still pull even if we have to rebuild the base images to help with
+        # caching.
+        for repository in repositories:
+            for tag in tags:
+                DOCKER_CLIENT.api.pull(repository=repository, tag=tag)
+    except Exception as e:
+        print(e)
+        is_stale = True
+
+    if rebuild_base_images or _release_build() or is_stale:
+        build_base_images(py_versions, image_types)
         return True
     else:
         print("Just pulling images!")
         return False
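The staleness rule in check_staleness is pure date arithmetic on the image's Created timestamp; a self-contained sketch of the same comparison:

    import datetime

    def is_stale(created_iso: str, days: int = 14) -> bool:
        # Mirrors check_staleness: only the date part of the ISO timestamp
        # is compared against the rebuild window.
        short_date = datetime.datetime.strptime(
            created_iso.split("T")[0], "%Y-%m-%d")
        return (datetime.datetime.now() - short_date) > datetime.timedelta(days=days)

    print(is_stale("2021-01-01T00:00:00Z"))  # True once older than two weeks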


-def build_ray():
-    return _build_cpu_gpu_images("ray")
-
-
-def build_ray_ml():
+def prep_ray_ml():
     root_dir = _get_root_dir()
     requirement_files = glob.glob(
         f"{_get_root_dir()}/python/**/requirements*.txt", recursive=True)
@@ -261,11 +343,6 @@ def build_ray_ml():
     # Install atari roms script
     shutil.copy(f"{_get_root_dir()}/rllib/utils/install_atari_roms.sh",
                 os.path.join(root_dir, "docker/ray-ml/"))
-
-    ray_ml_images = _build_cpu_gpu_images("ray-ml")
-    for img in ray_ml_images:
-        tag = img.split(":")[-1]
-        DOCKER_CLIENT.api.tag(
-            image=img, repository="rayproject/autoscaler", tag=tag)


 def _get_docker_creds() -> Tuple[str, str]:
@@ -274,39 +351,52 @@ def _get_docker_creds() -> Tuple[str, str]:
     return DOCKER_USERNAME, docker_password


-def build_ray_worker_container():
-    return _build_cpu_gpu_images("ray-worker-container")
+def _docker_push(image, tag):
+    print(f"PUSHING: {image}:{tag}, result:")
+
+    # This docker API is janky. Without "stream=True" it returns a
+    # massive string filled with every progress bar update, which can
+    # cause CI to back up.
+    #
+    # With stream=True, it's a line-at-a-time generator of the same
+    # info. So we can slow it down by printing every couple hundred
+    # lines
+    i = 0
+    for progress_line in DOCKER_CLIENT.api.push(image, tag=tag, stream=True):
+        if i % 100 == 0:
+            print(progress_line)
+
+
+def _tag_and_push(full_image_name, old_tag, new_tag, merge_build=False):
+    # Do not tag release builds because they are no longer up to
+    # date after the branch cut.
+    if "nightly" in new_tag and _release_build():
+        return
+    if old_tag != new_tag:
+        DOCKER_CLIENT.api.tag(
+            image=f"{full_image_name}:{old_tag}",
+            repository=full_image_name,
+            tag=new_tag)
+    if not merge_build:
+        print("This is a PR Build! On a merge build, we would normally push "
+              f"to: {full_image_name}:{new_tag}")
+    else:
+        _docker_push(full_image_name, new_tag)
+
+
+def _create_new_tags(all_tags, old_str, new_str):
+    new_tags = []
+    for full_tag in all_tags:
+        new_tag = full_tag.replace(old_str, new_str)
+        new_tags.append(new_tag)
+    return new_tags


 # For non-release builds, push "nightly" & "sha"
 # For release builds, push "nightly" & "latest" & "x.x.x"
-def push_and_tag_images(push_base_images: bool, merge_build: bool = False):
-    def docker_push(image, tag):
-        # Do not tag release builds because they are no longer up to
-        # date after the branch cut.
-        if "nightly" in tag and _release_build():
-            return
-        if merge_build:
-            print(f"PUSHING: {image}:{tag}, result:")
-
-            # This docker API is janky. Without "stream=True" it returns a
-            # massive string filled with every progress bar update, which can
-            # cause CI to back up.
-            #
-            # With stream=True, it's a line-at-a-time generator of the same
-            # info. So we can slow it down by printing every couple hundred
-            # lines
-            i = 0
-            for progress_line in DOCKER_CLIENT.api.push(
-                    image, tag=tag, stream=True):
-                if i % 100 == 0:
-                    print(progress_line)
-        else:
-            print(
-                "This is a PR Build! On a merge build, we would normally push "
-                f"to: {image}:{tag}")
-
-    def get_new_tag(old_tag, new_tag):
-        return old_tag.replace("nightly", new_tag)
-
+def push_and_tag_images(py_versions: List[str],
+                        image_types: List[str],
+                        push_base_images: bool,
+                        merge_build: bool = False):
     date_tag = datetime.datetime.now().strftime("%Y-%m-%d")
     sha_tag = _get_commit_sha()
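_create_new_tags is a plain string substitution over a list of tags; re-implemented here for illustration, with the two substitutions the push logic below relies on:

    def _create_new_tags(all_tags, old_str, new_str):
        return [tag.replace(old_str, new_str) for tag in all_tags]

    print(_create_new_tags(["nightly-py37-cpu"], "-cpu", ""))
    # ['nightly-py37']  -- the suffix-less CPU alias
    print(_create_new_tags(["nightly-py37-cu112"], "-cu112", "-gpu"))
    # ['nightly-py37-gpu']  -- the -gpu alias for the ML CUDA version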
@@ -316,61 +406,97 @@ def push_and_tag_images(push_base_images: bool, merge_build: bool = False):
         date_tag = release_name
         sha_tag = release_name

-    image_list = ["ray", "ray-ml", "autoscaler"]
+    image_list = ["ray", "ray-ml"]
     if push_base_images:
         image_list.extend(["base-deps", "ray-deps"])

-    for image in image_list:
-        for py_name, py_version in PY_MATRIX.items():
-            # TODO(https://github.com/ray-project/ray/issues/16599):
-            # remove below after supporting ray-ml images with Python 3.9
-            if image in ["ray-ml", "autoscaler"
-                         ] and py_version.startswith("3.9"):
-                print(
-                    f"{image} image is currently unsupported with Python 3.9")
-                continue
-
-            full_image = f"rayproject/{image}"
-
-            # Tag "nightly-py3x" from "nightly-py3x-cpu"
-            DOCKER_CLIENT.api.tag(
-                image=f"{full_image}:nightly{py_name}-cpu",
-                repository=full_image,
-                tag=f"nightly{py_name}")
-
-            for arch_tag in ["-cpu", "-gpu", ""]:
-                full_arch_tag = f"nightly{py_name}{arch_tag}"
-
-                # Tag and push rayproject/<image>:nightly<py_tag><arch_tag>
-                docker_push(full_image, full_arch_tag)
-
-                # Ex: specific_tag == "1.0.1" or "<sha>" or "<date>"
-                specific_tag = get_new_tag(
-                    full_arch_tag, date_tag if "-deps" in image else sha_tag)
-
-                # Tag and push rayproject/<image>:<sha/date><py_tag><arch_tag>
-                DOCKER_CLIENT.api.tag(
-                    image=f"{full_image}:{full_arch_tag}",
-                    repository=full_image,
-                    tag=specific_tag)
-                docker_push(full_image, specific_tag)
-
-                if "-py37" in py_name:
-                    non_python_specific_tag = specific_tag.replace("-py37", "")
-                    DOCKER_CLIENT.api.tag(
-                        image=f"{full_image}:{full_arch_tag}",
-                        repository=full_image,
-                        tag=non_python_specific_tag)
-                    # Tag and push rayproject/<image>:<sha/date><arch_tag>
-                    docker_push(full_image, non_python_specific_tag)
-
-                    non_python_nightly_tag = full_arch_tag.replace("-py37", "")
-                    DOCKER_CLIENT.api.tag(
-                        image=f"{full_image}:{full_arch_tag}",
-                        repository=full_image,
-                        tag=non_python_nightly_tag)
-                    # Tag and push rayproject/<image>:nightly<arch_tag>
-                    docker_push(full_image, non_python_nightly_tag)
+    for image_name in image_list:
+        full_image_name = f"rayproject/{image_name}"
+
+        # Mapping from old tags to new tags.
+        # These are the tags we will push.
+        # The key is the base nightly tag, and the values are all the
+        # tags that will be pushed for it.
+        tag_mapping = defaultdict(list)
+        for py_name in py_versions:
+            for image_type in image_types:
+                if image_name == "ray-ml" and image_type != ML_CUDA_VERSION:
+                    print("ML Docker image is not built for the following "
+                          f"device type: {image_type}")
+                    continue
+
+                # TODO(https://github.com/ray-project/ray/issues/16599):
+                # remove below after supporting ray-ml images with Python 3.9
+                if image_name in ["ray-ml"
+                                  ] and PY_MATRIX[py_name].startswith("3.9"):
+                    print(f"{image_name} image is currently "
+                          f"unsupported with Python 3.9")
+                    continue
+
+                tag = f"nightly-{py_name}-{image_type}"
+                tag_mapping[tag].append(tag)
+
+        # If no device is specified, it should map to CPU image.
+        # "-gpu" tag should refer to the ML_CUDA_VERSION
+        for old_tag in tag_mapping.keys():
+            if "cpu" in old_tag:
+                new_tags = _create_new_tags(
+                    tag_mapping[old_tag], old_str="-cpu", new_str="")
+                tag_mapping[old_tag].extend(new_tags)
+            elif ML_CUDA_VERSION in old_tag:
+                new_tags = _create_new_tags(
+                    tag_mapping[old_tag],
+                    old_str=f"-{ML_CUDA_VERSION}",
+                    new_str="-gpu")
+                tag_mapping[old_tag].extend(new_tags)
+
+        # No Python version specified should refer to DEFAULT_PYTHON_VERSION
+        for old_tag in tag_mapping.keys():
+            if DEFAULT_PYTHON_VERSION in old_tag:
+                new_tags = _create_new_tags(
+                    tag_mapping[old_tag],
+                    old_str=f"-{DEFAULT_PYTHON_VERSION}",
+                    new_str="")
+                tag_mapping[old_tag].extend(new_tags)
+
+        # For all tags, create Date/Sha tags
+        for old_tag in tag_mapping.keys():
+            new_tags = _create_new_tags(
+                tag_mapping[old_tag],
+                old_str="nightly",
+                new_str=date_tag if "-deps" in image_name else sha_tag)
+            tag_mapping[old_tag].extend(new_tags)
+
+        # Sanity checking.
+        for old_tag in tag_mapping.keys():
+            if DEFAULT_PYTHON_VERSION in old_tag:
+                if "-cpu" in old_tag:
+                    assert "nightly-cpu" in tag_mapping[old_tag]
+                    assert "nightly" in tag_mapping[old_tag]
+
+                    if "-deps" in image_name:
+                        assert f"{date_tag}-cpu" in tag_mapping[old_tag]
+                        assert f"{date_tag}" in tag_mapping[old_tag]
+                    else:
+                        assert f"{sha_tag}-cpu" in tag_mapping[old_tag]
+                        assert f"{sha_tag}" in tag_mapping[old_tag]
+
+                elif ML_CUDA_VERSION in old_tag:
+                    assert "nightly-gpu" in tag_mapping[old_tag]
+
+                    if "-deps" in image_name:
+                        assert f"{date_tag}-gpu" in tag_mapping[old_tag]
+                    else:
+                        assert f"{sha_tag}-gpu" in tag_mapping[old_tag]
+
+        print(f"These tags will be created for {image_name}: ", tag_mapping)
+
+        # Tag and push all images.
+        for old_tag in tag_mapping.keys():
+            for new_tag in tag_mapping[old_tag]:
+                _tag_and_push(
+                    full_image_name,
+                    old_tag=old_tag,
+                    new_tag=new_tag,
+                    merge_build=merge_build)


 # Push infra here:
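To make the fan-out concrete, here is the full alias expansion for one rayproject/ray variant, with abc123 standing in for the real commit SHA (illustrative value):

    tags = ["nightly-py37-cu112"]
    tags += [t.replace("-cu112", "-gpu") for t in tags]     # ML_CUDA_VERSION alias
    tags += [t.replace("-py37", "") for t in tags]          # default Python dropped
    tags += [t.replace("nightly", "abc123") for t in tags]  # SHA variants
    print(tags)
    # ['nightly-py37-cu112', 'nightly-py37-gpu', 'nightly-cu112', 'nightly-gpu',
    #  'abc123-py37-cu112', 'abc123-py37-gpu', 'abc123-cu112', 'abc123-gpu']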
@@ -409,22 +535,30 @@

 # Build base-deps/ray-deps only on file change, 2 weeks, per release
-# Build ray, ray-ml, autoscaler every time
+# Build ray, ray-ml every time
 # build-docker-images.py --py-versions PY37 --build-type PR --rebuild-all
 MERGE = "MERGE"
 HUMAN = "HUMAN"
 PR = "PR"
 BUILDKITE = "BUILDKITE"
 BUILD_TYPES = [MERGE, HUMAN, PR, BUILDKITE]

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--py-versions",
-        choices=["PY36", "PY37", "PY38", "PY39"],
-        default="PY37",
+        choices=list(PY_MATRIX.keys()),
+        default="py37",
         nargs="*",
         help="Which python versions to build. "
-        "Must be in (PY36, PY37, PY38, PY39)")
+        "Must be in (py36, py37, py38, py39)")
+    parser.add_argument(
+        "--device-types",
+        choices=list(BASE_IMAGES.keys()),
+        default=None,
+        nargs="*",
+        help="Which device types (CPU/CUDA versions) to build images for. "
+        "If not specified, images will be built for all device types.")
     parser.add_argument(
         "--build-type",
         choices=BUILD_TYPES,
@@ -448,26 +582,47 @@ if __name__ == "__main__":
     py_versions = args.py_versions
     py_versions = py_versions if isinstance(py_versions,
                                             list) else [py_versions]
-    for key in set(PY_MATRIX.keys()):
-        if key[1:].upper() not in py_versions:
-            PY_MATRIX.pop(key)
-
-    assert len(PY_MATRIX) == len(
-        py_versions
-    ), f"Length of PY_MATRIX != args {PY_MATRIX} : {args.py_versions}"
-
-    print("Building the following python versions: ", PY_MATRIX)
+    image_types = args.device_types if args.device_types else list(
+        BASE_IMAGES.keys())
+
+    assert set(list(CUDA_FULL.keys()) + ["cpu"]) == set(BASE_IMAGES.keys())
+
+    # Make sure the python images and cuda versions we build here are
+    # consistent with the ones used with fix-latest-docker.sh script.
+    py_version_file = os.path.join(_get_root_dir(), "docker/retag-lambda",
+                                   "python_versions.txt")
+    with open(py_version_file) as f:
+        py_file_versions = f.read().splitlines()
+        assert set(PY_MATRIX.keys()) == set(py_file_versions), \
+            (PY_MATRIX.keys(), py_file_versions)
+
+    cuda_version_file = os.path.join(_get_root_dir(), "docker/retag-lambda",
+                                     "cuda_versions.txt")
+    with open(cuda_version_file) as f:
+        cuda_file_versions = f.read().splitlines()
+        assert set(BASE_IMAGES.keys()) == set(cuda_file_versions + ["cpu"]),\
+            (BASE_IMAGES.keys(), cuda_file_versions + ["cpu"])
+
+    print("Building the following python versions: ",
+          [PY_MATRIX[py_version] for py_version in py_versions])
+    print("Building images for the following devices: ", image_types)
     print("Building base images: ", args.base)

     build_type = args.build_type
     is_buildkite = build_type == BUILDKITE

     if build_type == BUILDKITE:
         if os.environ.get("BUILDKITE_PULL_REQUEST", "") == "false":
             build_type = MERGE
         else:
             build_type = PR

     if build_type == HUMAN:
+        # If manually triggered, request user for branch and SHA value to use.
         _configure_human_version()
-    if (build_type in {HUMAN, MERGE} or is_buildkite
+    if (build_type in {HUMAN, MERGE, BUILDKITE}
             or _check_if_docker_files_modified()):
         DOCKER_CLIENT = docker.from_env()
         is_merge = build_type == MERGE
@ -478,25 +633,31 @@ if __name__ == "__main__":
username, password = _get_docker_creds() username, password = _get_docker_creds()
DOCKER_CLIENT.api.login(username=username, password=password) DOCKER_CLIENT.api.login(username=username, password=password)
copy_wheels(build_type == HUMAN) copy_wheels(build_type == HUMAN)
base_images_built = build_or_pull_base_images(args.base) is_base_images_built = build_or_pull_base_images(
py_versions, image_types, args.base)
if args.only_build_worker_container: if args.only_build_worker_container:
build_ray_worker_container() build_for_all_versions("ray-worker-container", py_versions,
image_types)
# TODO Currently don't push ray_worker_container # TODO Currently don't push ray_worker_container
else: else:
build_ray() # Build Ray Docker images.
build_ray_ml() build_for_all_versions("ray", py_versions, image_types)
if build_type in {MERGE, PR}:
valid_branch = _valid_branch() if ML_CUDA_VERSION in image_types:
if (not valid_branch) and is_merge: # Build Ray ML Docker images only if ML_CUDA_VERSION is
print(f"Invalid Branch found: {_get_branch()}") # specified.
push_and_tag_images(base_images_built, valid_branch prep_ray_ml()
and is_merge) # Only build ML Docker for the ML_CUDA_VERSION
build_for_all_versions(
"ray-ml", py_versions, image_types=[ML_CUDA_VERSION])
if build_type in {MERGE, PR}: if build_type in {MERGE, PR}:
valid_branch = _valid_branch() valid_branch = _valid_branch()
if (not valid_branch) and is_merge: if (not valid_branch) and is_merge:
print(f"Invalid Branch found: {_get_branch()}") print(f"Invalid Branch found: {_get_branch()}")
push_and_tag_images(base_images_built, valid_branch push_and_tag_images(py_versions, image_types,
is_base_images_built, valid_branch
and is_merge) and is_merge)
# TODO(ilr) Re-Enable Push READMEs by using a normal password # TODO(ilr) Re-Enable Push READMEs by using a normal password


@@ -290,7 +290,7 @@ Image releases are `tagged` using the following format:

 - A specific nightly build (uses a SHA from the Github ``master``).

-Each tag has `variants` that add or change functionality:
+Some tags also have `variants` that add or change functionality:

 .. list-table::
    :widths: 16 40
@@ -298,10 +298,12 @@ Each tag has `variants` that add or change functionality:

    * - Variant
      - Description
-   * - -gpu
-     - These are based off of an NVIDIA CUDA image. They require the Nvidia Docker Runtime.
    * - -cpu
      - These are based off of an Ubuntu image.
+   * - -cuXX
+     - These are based off of an NVIDIA CUDA image with the specified CUDA version. They require the Nvidia Docker Runtime.
+   * - -gpu
+     - Aliases to a specific ``-cuXX`` tagged image.
    * - <no tag>
      - Aliases to ``-cpu`` tagged images
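For users, the practical effect of the new scheme is that a concrete CUDA version can be pinned; a sketch using the docker SDK (image tag chosen for illustration):

    import docker

    client = docker.from_env()
    # Pull the CUDA 11.2 variant of the nightly Python 3.8 image; the -gpu
    # tag resolves to the same image, since cu112 is the ML CUDA version.
    client.images.pull("rayproject/ray", tag="nightly-py38-cu112")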


@@ -1,7 +1,7 @@
 # DEPRECATED -- Please use [`rayproject/ray-ml`](https://hub.docker.com/repository/docker/rayproject/ray-ml)

 ## About
 This image used to be the base image for the Ray autoscaler, but it has been replaced by [`rayproject/ray-ml`](https://hub.docker.com/repository/docker/rayproject/ray-ml).
-Please use that instead, *this image will be removed in the near future*.
+Please use that instead, *this image is deprecated*.

 ## Tags


@ -1,6 +1,6 @@
# The base-deps Docker image installs main libraries needed to run Ray # The base-deps Docker image installs main libraries needed to run Ray
# The GPU option is nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04 # The GPU options are NVIDIA CUDA developer images.
ARG BASE_IMAGE="ubuntu:focal" ARG BASE_IMAGE="ubuntu:focal"
FROM ${BASE_IMAGE} FROM ${BASE_IMAGE}
# FROM directive resets ARG # FROM directive resets ARG


@@ -14,7 +14,7 @@ This image has the system-level dependencies for `Ray` and the `Ray Autoscaler`
 * `:DATE` - A specific build.

 ### Suffixes
-* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images


@@ -29,8 +29,6 @@ AWS_ACCESS_KEY_ID=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.AccessK
 AWS_SECRET_ACCESS_KEY=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.SecretAccessKey)
 AWS_SESSION_TOKEN=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.SessionToken)

 echo -e "Invoking this lambda!\nView logs at https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups"
 AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN AWS_SECURITY_TOKEN='' aws \
 lambda invoke --function-name DockerTagLatest \


@@ -1,5 +1,5 @@
-ARG GPU=""
-FROM rayproject/base-deps:nightly"$GPU"
+ARG BASE_IMAGE=""
+FROM rayproject/base-deps:nightly"$BASE_IMAGE"
 # If this arg is not "autoscaler" then no autoscaler requirements will be included
 ARG AUTOSCALER="autoscaler"
 ARG WHEEL_PATH
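The renamed BASE_IMAGE arg now carries a full -<py>-<device> tag suffix rather than the old GPU marker; a sketch of how _build_docker_image (above) composes the FROM reference for dependent images:

    py_version, device_tag = "py38", "cu112"    # illustrative values
    base_image = f"-{py_version}-{device_tag}"  # passed as BASE_IMAGE
    print(f"rayproject/base-deps:nightly{base_image}")
    # rayproject/base-deps:nightly-py38-cu112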


@@ -13,7 +13,7 @@ This has the python-level dependencies for `Ray` and the `Ray Autoscaler`. The `
 * `:DATE` - A specific build.

 ### Suffixes
-* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images


@@ -1,5 +1,5 @@
-ARG GPU
-FROM rayproject/ray:nightly"$GPU"
+ARG BASE_IMAGE
+FROM rayproject/ray:nightly"$BASE_IMAGE"
 ARG PYTHON_MINOR_VERSION=7

 # We have to uninstall wrapt this way for Tensorflow compatibility
@@ -8,6 +8,7 @@ COPY requirements_dl.txt ./
 COPY requirements_ml_docker.txt ./
 COPY requirements_rllib.txt ./
 COPY requirements_tune.txt ./requirements_tune.txt
+COPY requirements_upstream.txt ./
 COPY install_atari_roms.sh ./install_atari_roms.sh

 RUN sudo apt-get update \
@@ -23,11 +24,13 @@ RUN sudo apt-get update \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_rllib.txt \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_tune.txt \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \
+    && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_upstream.txt \
     # Remove dataclasses & typing because they are included in Python > 3.6
     && if [ $(python -c 'import sys; print(sys.version_info.minor)') != "6" ]; then \
         $HOME/anaconda3/bin/pip uninstall dataclasses typing -y; fi \
     && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \
     && sudo rm requirements_tune.txt && sudo rm requirements_rllib.txt \
+    && sudo rm requirements_upstream.txt \
     && sudo apt-get clean

 # Make sure tfp is installed correctly and matches tf version.


@@ -11,9 +11,7 @@ This image is an extension of the [`rayproject/ray`](https://hub.docker.com/repo
 * `:SHA` - A specific nightly build.

 ### Suffixes
 * `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
-* `-cpu`- These are based off of an `Ubuntu` image.
-* Tags without a suffix refer to `-cpu` images

 ## Other Images
 * [`rayproject/ray`](https://hub.docker.com/repository/docker/rayproject/ray) - Ray and all of its dependencies.


@@ -1,5 +1,5 @@
-ARG GPU
-FROM rayproject/ray-deps:nightly"$GPU"
+ARG BASE_IMAGE
+FROM rayproject/ray-deps:nightly"$BASE_IMAGE"
 ARG WHEEL_PATH
 ARG FIND_LINKS_PATH=".whl"
 # For Click


@@ -12,7 +12,7 @@ everything needed to get started with running Ray! They work for both local deve
 * `:SHA` - A specific nightly build.

 ### Suffixes
-* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images


@@ -0,0 +1,5 @@
+cu112
+cu111
+cu110
+cu102
+cu101


@@ -1,4 +1,5 @@
 import json
+import os
 import subprocess

 import boto3
@@ -7,6 +8,10 @@ DOCKER_USER = None
 DOCKER_PASS = None


+def _get_curr_dir():
+    return os.path.dirname(os.path.realpath(__file__))
+
+
 def get_secrets():
     global DOCKER_PASS, DOCKER_USER
     secret_name = "dockerRetagLatestCredentials"
@@ -36,15 +41,30 @@ def retag(repo: str, source: str, destination: str) -> str:
         })


+def parse_versions(version_file):
+    with open(version_file) as f:
+        file_versions = f.read().splitlines()
+    return file_versions
+
+
 def lambda_handler(event, context):
     source_image = event["source_tag"]
     destination_image = event["destination_tag"]
     total_results = []
-    for repo in ["ray", "ray-ml", "autoscaler"]:
+    python_versions = parse_versions(
+        os.path.join(_get_curr_dir(), "python_versions.txt"))
+    cuda_versions = parse_versions(
+        os.path.join(_get_curr_dir(), "cuda_versions.txt"))
+    for repo in ["ray", "ray-ml"]:
         results = []
-        for pyversion in ["py36", "py37", "py38", "py39"]:
+        for pyversion in python_versions:
             source_tag = f"{source_image}-{pyversion}"
             destination_tag = f"{destination_image}-{pyversion}"
+            for cudaversion in cuda_versions:
+                cuda_source_tag = source_tag + f"-{cudaversion}"
+                cuda_destination_tag = destination_tag + f"-{cudaversion}"
+                results.append(
+                    retag(repo, cuda_source_tag, cuda_destination_tag))
             results.append(retag(repo, source_tag, destination_tag))
             results.append(retag(repo, source_tag, destination_tag + "-cpu"))
             results.append(
@@ -54,7 +74,13 @@ def lambda_handler(event, context):

     # Retag images without a python version specified (defaults to py37)
     results = []
-    for repo in ["ray", "ray-ml", "autoscaler", "ray-deps", "base-deps"]:
+    for repo in ["ray", "ray-ml", "ray-deps", "base-deps"]:
+        for cudaversion in cuda_versions:
+            source_tag = f"{source_image}-{cudaversion}"
+            destination_tag = f"{destination_image}-{cudaversion}"
+            results.append(retag(repo, source_tag, destination_tag))
+        # ray:nightly -> ray:1.x
        results.append(retag(repo, source_image, destination_image))
         results.append(retag(repo, source_image, destination_image + "-cpu"))
         results.append(
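A hedged invocation sketch for the updated handler (event keys as read above; the destination tag is illustrative): this retags every python/cuda variant of ray and ray-ml, plus the version-less and -cpu/-cuXXX aliases of all four repositories.

    event = {"source_tag": "nightly", "destination_tag": "1.9.0"}
    lambda_handler(event, context=None)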


@@ -0,0 +1,4 @@
+py36
+py37
+py38
+py39