This reverts commit f0053d405b.

This commit is contained in:
parent 2ec9a70e24
commit e58fcca404

16 changed files with 229 additions and 458 deletions
@@ -75,69 +75,37 @@
   # # Upload to latest directory.
   # - if [ "$BUILDKITE_BRANCH" == "master" ]; then python .buildkite/copy_files.py --destination wheels --path ./.whl; fi
 
-- label: ":docker: Build Images: py36 (1/2)"
+- label: ":docker: Build Images: py36"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions py36 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions PY36 --build-type BUILDKITE --build-base
 
-- label: ":docker: Build Images: py36 (2/2)"
+- label: ":docker: Build Images: py37"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions py36 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions PY37 --build-type BUILDKITE --build-base
 
-- label: ":docker: Build Images: py37 (1/2)"
+- label: ":docker: Build Images: py38"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions py37 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions PY38 --build-type BUILDKITE --build-base
 
-- label: ":docker: Build Images: py37 (2/2)"
+- label: ":docker: Build Images: py39"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions py37 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base
-
-- label: ":docker: Build Images: py38 (1/2)"
-  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
-  commands:
-    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
-    - pip install -q docker aws_requests_auth boto3
-    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions py38 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base
-
-- label: ":docker: Build Images: py38 (2/2)"
-  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
-  commands:
-    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
-    - pip install -q docker aws_requests_auth boto3
-    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions py38 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base
-
-- label: ":docker: Build Images: py39 (1/2)"
-  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
-  commands:
-    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
-    - pip install -q docker aws_requests_auth boto3
-    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions py39 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base
-
-- label: ":docker: Build Images: py39 (2/2)"
-  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
-  commands:
-    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
-    - pip install -q docker aws_requests_auth boto3
-    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions py39 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions PY39 --build-type BUILDKITE --build-base
 
 - label: ":book: Lint"
   commands:
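The revert collapses the sharded per-device-type jobs back into a single job per Python version; each restored job builds its CPU and GPU variants in one process. A rough sketch of the job matrices the two layouts imply (Python, illustrative only, not part of the pipeline):

# Illustrative only: job counts implied by the two pipeline layouts.
py_versions = ["py36", "py37", "py38", "py39"]

# Pre-revert: two sharded jobs per Python version, split by device type.
pre_revert_jobs = [(py, shard)
                   for py in py_versions
                   for shard in (["cpu", "cu101", "cu102"],
                                 ["cu110", "cu111", "cu112"])]

# Post-revert: one job per Python version; CPU and GPU built in-process.
post_revert_jobs = [(py, ["cpu", "gpu"]) for py in py_versions]

print(len(pre_revert_jobs), len(post_revert_jobs))  # 8 4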
@@ -3,13 +3,11 @@ import datetime
 import json
 import functools
 import glob
-import itertools
 import os
 import re
 import shutil
 import subprocess
 import sys
-from collections import defaultdict
 from typing import List, Tuple
 
 import docker
@@ -26,39 +24,18 @@ DOCKER_HUB_DESCRIPTION = {
         "https://hub.docker.com/r/rayproject/ray"),
     "ray": "Official Docker Images for Ray, the distributed computing API.",
     "ray-ml": "Developer ready Docker Image for Ray.",
+    "autoscaler": (
+        "Deprecated image, please use: "
+        "https://hub.docker.com/repository/docker/rayproject/ray-ml")
 }
 
 PY_MATRIX = {
-    "py36": "3.6.12",
-    "py37": "3.7.7",
-    "py38": "3.8.5",
-    "py39": "3.9.5"
+    "-py36": "3.6.12",
+    "-py37": "3.7.7",
+    "-py38": "3.8.5",
+    "-py39": "3.9.5"
 }
 
-BASE_IMAGES = {
-    "cu112": "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04",
-    "cu111": "nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04",
-    "cu110": "nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04",
-    "cu102": "nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04",
-    "cu101": "nvidia/cuda:10.1-cudnn8-devel-ubuntu18.04",
-    "cpu": "ubuntu:focal",
-}
-
-CUDA_FULL = {
-    "cu112": "CUDA 11.2",
-    "cu111": "CUDA 11.1",
-    "cu110": "CUDA 11.0",
-    "cu102": "CUDA 10.2",
-    "cu101": "CUDA 10.1"
-}
-
-# The CUDA version to use for the ML Docker image.
-ML_CUDA_VERSION = "cu112"
-
-DEFAULT_PYTHON_VERSION = "py37"
-
-IMAGE_NAMES = list(DOCKER_HUB_DESCRIPTION.keys())
-
 
 def _get_branch():
     branch = (os.environ.get("TRAVIS_BRANCH")
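The two sides of this hunk imply different tag grammars: the reverted-away code composes rayproject/<image>:nightly-<py>-<device> from PY_MATRIX plus BASE_IMAGES, while the restored code folds the leading dash into the PY_MATRIX keys and appends -cpu/-gpu. A minimal sketch of both schemes (helper names are illustrative, not from the diff):

# Illustrative only: how each side of this diff composes an image tag.
def pre_revert_tag(image, py_version, device_type):
    # Pre-revert: explicit device type, e.g. "cu112" or "cpu".
    return f"rayproject/{image}:nightly-{py_version}-{device_type}"

def post_revert_tag(image, py_name, gpu):
    # Post-revert: py_name already carries the dash; gpu is "-cpu" or "-gpu".
    return f"rayproject/{image}:nightly{py_name}{gpu}"

assert pre_revert_tag("ray", "py37", "cu112") == "rayproject/ray:nightly-py37-cu112"
assert post_revert_tag("ray", "-py37", "-gpu") == "rayproject/ray:nightly-py37-gpu"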
@@ -142,117 +119,83 @@ def _check_if_docker_files_modified():
     return affected
 
 
-def _build_docker_image(image_name: str,
-                        py_version: str,
-                        image_type: str,
-                        no_cache=True):
-    """Builds Docker image with the provided info.
+def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]:
+    built_images = []
+    for gpu in ["-cpu", "-gpu"]:
+        for py_name, py_version in PY_MATRIX.items():
+            # TODO(https://github.com/ray-project/ray/issues/16599):
+            # remove below after supporting ray-ml images with Python 3.9
+            if image_name in ["ray-ml", "autoscaler"
+                              ] and py_version.startswith("3.9"):
+                print(f"{image_name} image is currently unsupported with "
+                      "Python 3.9")
+                continue
 
-    image_name (str): The name of the image to build. Must be one of
-        IMAGE_NAMES.
-    py_version (str): The Python version to build the image for.
-        Must be one of PY_MATRIX.keys()
-    image_type (str): The image type to build. Must be one of
-        BASE_IMAGES.keys()
-    no_cache (bool): If True, don't use caching when building the image.
-    """
+            build_args = {}
+            build_args["PYTHON_VERSION"] = py_version
+            # I.e. "-py36"[-1] == 6
+            build_args["PYTHON_MINOR_VERSION"] = py_name[-1]
 
-    if image_name not in IMAGE_NAMES:
-        raise ValueError(
-            f"The provided image name {image_name} is not "
-            f"recognized. Image names must be one of {IMAGE_NAMES}")
+            if image_name == "base-deps":
+                build_args["BASE_IMAGE"] = (
+                    "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04"
+                    if gpu == "-gpu" else "ubuntu:focal")
+            else:
+                # NOTE(ilr) This is a bit of an abuse of the name "GPU"
+                build_args["GPU"] = f"{py_name}{gpu}"
 
-    if py_version not in PY_MATRIX.keys():
-        raise ValueError(f"The provided python version {py_version} is not "
-                         f"recognized. Python version must be one of"
-                         f" {PY_MATRIX.keys()}")
+            if image_name in ["ray", "ray-deps", "ray-worker-container"]:
+                wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"])
+                build_args["WHEEL_PATH"] = f".whl/{wheel}"
+                # Add pip option "--find-links .whl/" to ensure ray-cpp wheel
+                # can be found.
+                build_args["FIND_LINKS_PATH"] = ".whl"
 
-    if image_type not in BASE_IMAGES.keys():
-        raise ValueError(f"The provided CUDA version {image_type} is not "
-                         f"recognized. CUDA version must be one of"
-                         f" {image_type.keys()}")
+            tagged_name = f"rayproject/{image_name}:nightly{py_name}{gpu}"
+            for i in range(2):
+                cleanup = DOCKER_CLIENT.containers.prune().get(
+                    "SpaceReclaimed")
+                if cleanup is not None:
+                    print(f"Cleaned up {cleanup / (2**20)}MB")
+                output = DOCKER_CLIENT.api.build(
+                    path=os.path.join(_get_root_dir(), "docker", image_name),
+                    tag=tagged_name,
+                    nocache=no_cache,
+                    buildargs=build_args)
 
-    # TODO(https://github.com/ray-project/ray/issues/16599):
-    # remove below after supporting ray-ml images with Python 3.9
-    if image_name == "ray-ml" and py_version == "py39":
-        print(f"{image_name} image is currently unsupported with "
-              "Python 3.9")
-        return
+                cmd_output = []
+                try:
+                    start = datetime.datetime.now()
+                    current_iter = start
+                    for line in output:
+                        cmd_output.append(line.decode("utf-8"))
+                        if datetime.datetime.now(
+                        ) - current_iter >= datetime.timedelta(minutes=5):
+                            current_iter = datetime.datetime.now()
+                            elapsed = datetime.datetime.now() - start
+                            print(f"Still building {tagged_name} after "
+                                  f"{elapsed.seconds} seconds")
+                            if elapsed >= datetime.timedelta(minutes=15):
+                                print("Additional build output:")
+                                print(*cmd_output, sep="\n")
+                                # Clear cmd_output after printing, so the next
+                                # iteration will not print out the same lines.
+                                cmd_output = []
+                except Exception as e:
+                    print(f"FAILURE with error {e}")
 
-    build_args = {}
-    build_args["PYTHON_VERSION"] = PY_MATRIX[py_version]
-    # I.e. "py36"[-1] == 6
-    build_args["PYTHON_MINOR_VERSION"] = py_version[-1]
+                if len(DOCKER_CLIENT.api.images(tagged_name)) == 0:
+                    print(f"ERROR building: {tagged_name}. Output below:")
+                    print(*cmd_output, sep="\n")
+                    if (i == 1):
+                        raise Exception("FAILED TO BUILD IMAGE")
+                    print("TRYING AGAIN")
+                else:
+                    break
 
-    device_tag = f"{image_type}"
-
-    if image_name == "base-deps":
-        base_image = BASE_IMAGES[image_type]
-    else:
-        base_image = f"-{py_version}-{device_tag}"
-
-    if image_name != "ray-worker-container":
-        build_args["BASE_IMAGE"] = base_image
-
-    if image_name in ["ray", "ray-deps", "ray-worker-container"]:
-        wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"])
-        build_args["WHEEL_PATH"] = f".whl/{wheel}"
-        # Add pip option "--find-links .whl/" to ensure ray-cpp wheel
-        # can be found.
-        build_args["FIND_LINKS_PATH"] = ".whl"
-
-    tagged_name = f"rayproject/{image_name}:nightly-{py_version}-{device_tag}"
-
-    for i in range(2):
-        cleanup = DOCKER_CLIENT.containers.prune().get("SpaceReclaimed")
-        if cleanup is not None:
-            print(f"Cleaned up {cleanup / (2 ** 20)}MB")
-
-        labels = {
-            "image-name": image_name,
-            "python-version": PY_MATRIX[py_version]
-        }
-        if image_type in CUDA_FULL:
-            labels["cuda-version"] = CUDA_FULL[image_type]
-
-        output = DOCKER_CLIENT.api.build(
-            path=os.path.join(_get_root_dir(), "docker", image_name),
-            tag=tagged_name,
-            nocache=no_cache,
-            labels=labels,
-            buildargs=build_args)
-
-        cmd_output = []
-        try:
-            start = datetime.datetime.now()
-            current_iter = start
-            for line in output:
-                cmd_output.append(line.decode("utf-8"))
-                if datetime.datetime.now(
-                ) - current_iter >= datetime.timedelta(minutes=5):
-                    current_iter = datetime.datetime.now()
-                    elapsed = datetime.datetime.now() - start
-                    print(f"Still building {tagged_name} after "
-                          f"{elapsed.seconds} seconds")
-                    if elapsed >= datetime.timedelta(minutes=15):
-                        print("Additional build output:")
-                        print(*cmd_output, sep="\n")
-                        # Clear cmd_output after printing, so the next
-                        # iteration will not print out the same lines.
-                        cmd_output = []
-        except Exception as e:
-            print(f"FAILURE with error {e}")
-
-        if len(DOCKER_CLIENT.api.images(tagged_name)) == 0:
-            print(f"ERROR building: {tagged_name}. Output below:")
-            print(*cmd_output, sep="\n")
-            if i == 1:
-                raise Exception("FAILED TO BUILD IMAGE")
-            print("TRYING AGAIN")
-        else:
-            break
-
-    print("BUILT: ", tagged_name)
+            print("BUILT: ", tagged_name)
+            built_images.append(tagged_name)
+    return built_images
 
 
 def copy_wheels(human_build):
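Both versions of the build loop share the same safeguards: each image build is attempted at most twice (for i in range(2)), and the streamed build output emits a heartbeat at most every five minutes so long builds stay visible without flooding CI logs. A minimal standalone sketch of the heartbeat pattern (the input list is illustrative; the real code iterates over DOCKER_CLIENT.api.build(...)):

import datetime

def stream_with_heartbeat(lines, name="rayproject/ray:nightly"):
    # Collect output quietly; print progress at most every 5 minutes.
    cmd_output = []
    start = datetime.datetime.now()
    current_iter = start
    for line in lines:
        cmd_output.append(line)
        if datetime.datetime.now() - current_iter >= datetime.timedelta(
                minutes=5):
            current_iter = datetime.datetime.now()
            elapsed = datetime.datetime.now() - start
            print(f"Still building {name} after {elapsed.seconds} seconds")
    return cmd_output

print(len(stream_with_heartbeat(["step 1/9", "step 2/9"])))  # 2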
@@ -275,66 +218,41 @@ def copy_wheels(human_build):
     shutil.copy(source, ray_worker_container_dst)
 
 
-def check_staleness(repository, tag):
-    DOCKER_CLIENT.api.pull(repository=repository, tag=tag)
+def build_or_pull_base_images(rebuild_base_images: bool = True) -> List[str]:
+    """Returns images to tag and build"""
+    DOCKER_CLIENT.api.pull(repository="rayproject/base-deps", tag="nightly")
 
-    age = DOCKER_CLIENT.api.inspect_image(f"{repository}:{tag}")["Created"]
+    age = DOCKER_CLIENT.api.inspect_image("rayproject/base-deps:nightly")[
+        "Created"]
     short_date = datetime.datetime.strptime(age.split("T")[0], "%Y-%m-%d")
     is_stale = (
         datetime.datetime.now() - short_date) > datetime.timedelta(days=14)
-    return is_stale
 
+    print("Pulling images for caching")
 
-def build_for_all_versions(image_name, py_versions, image_types, **kwargs):
-    """Builds the given Docker image for all Python & CUDA versions"""
-    for py_version in py_versions:
-        for image_type in image_types:
-            _build_docker_image(
-                image_name,
-                py_version=py_version,
-                image_type=image_type,
-                **kwargs)
+    DOCKER_CLIENT.api.pull(
+        repository="rayproject/base-deps", tag="nightly-cpu")
+    DOCKER_CLIENT.api.pull(
+        repository="rayproject/base-deps", tag="nightly-gpu")
 
+    DOCKER_CLIENT.api.pull(repository="rayproject/ray-deps", tag="nightly-gpu")
+    DOCKER_CLIENT.api.pull(repository="rayproject/ray-deps", tag="nightly-cpu")
 
-def build_base_images(py_versions, image_types):
-    build_for_all_versions(
-        "base-deps", py_versions, image_types, no_cache=False)
-    build_for_all_versions(
-        "ray-deps", py_versions, image_types, no_cache=False)
-
-
-def build_or_pull_base_images(py_versions: List[str],
-                              image_types: List[str],
-                              rebuild_base_images: bool = True) -> bool:
-    """Returns images to tag and build."""
-    repositories = ["rayproject/base-deps", "rayproject/ray-deps"]
-    tags = [
-        f"nightly-{py_version}-{image_type}"
-        for py_version, image_type in itertools.product(
-            py_versions, image_types)
-    ]
-
-    try:
-        is_stale = check_staleness(repositories[0], tags[0])
-
-        # We still pull even if we have to rebuild the base images to help with
-        # caching.
-        for repository in repositories:
-            for tag in tags:
-                DOCKER_CLIENT.api.pull(repository=repository, tag=tag)
-    except Exception as e:
-        print(e)
-        is_stale = True
-
-    if rebuild_base_images or _release_build() or is_stale:
-        build_base_images(py_versions, image_types)
+    # TODO(ilr) See if any caching happens
+    if (rebuild_base_images or is_stale or _release_build()):
+        for image in ["base-deps", "ray-deps"]:
+            _build_cpu_gpu_images(image, no_cache=False)
         return True
     else:
         print("Just pulling images!")
         return False
 
 
-def prep_ray_ml():
+def build_ray():
+    return _build_cpu_gpu_images("ray")
+
+
+def build_ray_ml():
     root_dir = _get_root_dir()
     requirement_files = glob.glob(
         f"{_get_root_dir()}/python/**/requirements*.txt", recursive=True)
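Both sides of this hunk keep the same staleness rule: a pulled base image older than 14 days forces a rebuild. A standalone sketch of that check (the timestamp is hypothetical; the real code reads it from inspect_image(...)["Created"]):

import datetime

def is_image_stale(created: str, max_age_days: int = 14) -> bool:
    # Docker reports e.g. "2021-06-01T12:34:56.789Z"; keep the date part only.
    short_date = datetime.datetime.strptime(created.split("T")[0], "%Y-%m-%d")
    return (datetime.datetime.now() - short_date) > datetime.timedelta(
        days=max_age_days)

print(is_image_stale("2021-06-01T12:34:56.789Z"))  # hypothetical timestamp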
@@ -343,6 +261,11 @@ def prep_ray_ml():
     # Install atari roms script
     shutil.copy(f"{_get_root_dir()}/rllib/utils/install_atari_roms.sh",
                 os.path.join(root_dir, "docker/ray-ml/"))
+    ray_ml_images = _build_cpu_gpu_images("ray-ml")
+    for img in ray_ml_images:
+        tag = img.split(":")[-1]
+        DOCKER_CLIENT.api.tag(
+            image=img, repository="rayproject/autoscaler", tag=tag)
 
 
 def _get_docker_creds() -> Tuple[str, str]:
@@ -351,52 +274,39 @@ def _get_docker_creds() -> Tuple[str, str]:
     return DOCKER_USERNAME, docker_password
 
 
-def _docker_push(image, tag):
-    print(f"PUSHING: {image}:{tag}, result:")
-    # This docker API is janky. Without "stream=True" it returns a
-    # massive string filled with every progress bar update, which can
-    # cause CI to back up.
-    #
-    # With stream=True, it's a line-at-a-time generator of the same
-    # info. So we can slow it down by printing every couple hundred
-    # lines
-    i = 0
-    for progress_line in DOCKER_CLIENT.api.push(image, tag=tag, stream=True):
-        if i % 100 == 0:
-            print(progress_line)
-
-
-def _tag_and_push(full_image_name, old_tag, new_tag, merge_build=False):
-    # Do not tag release builds because they are no longer up to
-    # date after the branch cut.
-    if "nightly" in new_tag and _release_build():
-        return
-    if old_tag != new_tag:
-        DOCKER_CLIENT.api.tag(
-            image=f"{full_image_name}:{old_tag}",
-            repository=full_image_name,
-            tag=new_tag)
-    if not merge_build:
-        print("This is a PR Build! On a merge build, we would normally push"
-              f"to: {full_image_name}:{new_tag}")
-    else:
-        _docker_push(full_image_name, new_tag)
-
-
-def _create_new_tags(all_tags, old_str, new_str):
-    new_tags = []
-    for full_tag in all_tags:
-        new_tag = full_tag.replace(old_str, new_str)
-        new_tags.append(new_tag)
-    return new_tags
+def build_ray_worker_container():
+    return _build_cpu_gpu_images("ray-worker-container")
 
 
 # For non-release builds, push "nightly" & "sha"
 # For release builds, push "nightly" & "latest" & "x.x.x"
-def push_and_tag_images(py_versions: List[str],
-                        image_types: List[str],
-                        push_base_images: bool,
-                        merge_build: bool = False):
+def push_and_tag_images(push_base_images: bool, merge_build: bool = False):
+    def docker_push(image, tag):
+        # Do not tag release builds because they are no longer up to
+        # date after the branch cut.
+        if "nightly" in tag and _release_build():
+            return
+        if merge_build:
+            print(f"PUSHING: {image}:{tag}, result:")
+            # This docker API is janky. Without "stream=True" it returns a
+            # massive string filled with every progress bar update, which can
+            # cause CI to back up.
+            #
+            # With stream=True, it's a line-at-a-time generator of the same
+            # info. So we can slow it down by printing every couple hundred
+            # lines
+            i = 0
+            for progress_line in DOCKER_CLIENT.api.push(
+                    image, tag=tag, stream=True):
+                if i % 100 == 0:
+                    print(progress_line)
+        else:
+            print(
+                "This is a PR Build! On a merge build, we would normally push "
+                f"to: {image}:{tag}")
+
+    def get_new_tag(old_tag, new_tag):
+        return old_tag.replace("nightly", new_tag)
 
     date_tag = datetime.datetime.now().strftime("%Y-%m-%d")
     sha_tag = _get_commit_sha()
@@ -406,97 +316,61 @@ def push_and_tag_images(py_versions: List[str],
         date_tag = release_name
         sha_tag = release_name
 
-    image_list = ["ray", "ray-ml"]
+    image_list = ["ray", "ray-ml", "autoscaler"]
     if push_base_images:
         image_list.extend(["base-deps", "ray-deps"])
 
-    for image_name in image_list:
-        full_image_name = f"rayproject/{image_name}"
+    for image in image_list:
+        for py_name, py_version in PY_MATRIX.items():
+            # TODO(https://github.com/ray-project/ray/issues/16599):
+            # remove below after supporting ray-ml images with Python 3.9
+            if image in ["ray-ml", "autoscaler"
+                         ] and py_version.startswith("3.9"):
+                print(
+                    f"{image} image is currently unsupported with Python 3.9")
+                continue
 
-        # Mapping from old tags to new tags.
-        # These are the tags we will push.
-        # The key is the full image name, and the values are all the tags
-        # for that image.
-        tag_mapping = defaultdict(list)
-        for py_name in py_versions:
-            for image_type in image_types:
-                if image_name == "ray-ml" and image_type != ML_CUDA_VERSION:
-                    print("ML Docker image is not built for the following "
-                          f"device type: {image_type}")
-                    continue
+            full_image = f"rayproject/{image}"
 
-                # TODO(https://github.com/ray-project/ray/issues/16599):
-                # remove below after supporting ray-ml images with Python 3.9
-                if image_name in ["ray-ml"
-                                  ] and PY_MATRIX[py_name].startswith("3.9"):
-                    print(f"{image_name} image is currently "
-                          f"unsupported with Python 3.9")
-                    continue
+            # Tag "nightly-py3x" from "nightly-py3x-cpu"
+            DOCKER_CLIENT.api.tag(
+                image=f"{full_image}:nightly{py_name}-cpu",
+                repository=full_image,
+                tag=f"nightly{py_name}")
 
-                tag = f"nightly-{py_name}-{image_type}"
-                tag_mapping[tag].append(tag)
+            for arch_tag in ["-cpu", "-gpu", ""]:
+                full_arch_tag = f"nightly{py_name}{arch_tag}"
 
-        # If no device is specified, it should map to CPU image.
-        # "-gpu" tag should refer to the ML_CUDA_VERSION
-        for old_tag in tag_mapping.keys():
-            if "cpu" in old_tag:
-                new_tags = _create_new_tags(
-                    tag_mapping[old_tag], old_str="-cpu", new_str="")
-                tag_mapping[old_tag].extend(new_tags)
-            elif ML_CUDA_VERSION in old_tag:
-                new_tags = _create_new_tags(
-                    tag_mapping[old_tag],
-                    old_str=f"-{ML_CUDA_VERSION}",
-                    new_str="-gpu")
-                tag_mapping[old_tag].extend(new_tags)
+                # Tag and push rayproject/<image>:nightly<py_tag><arch_tag>
+                docker_push(full_image, full_arch_tag)
 
-        # No Python version specified should refer to DEFAULT_PYTHON_VERSION
-        for old_tag in tag_mapping.keys():
-            if DEFAULT_PYTHON_VERSION in old_tag:
-                new_tags = _create_new_tags(
-                    tag_mapping[old_tag],
-                    old_str=f"-{DEFAULT_PYTHON_VERSION}",
-                    new_str="")
-                tag_mapping[old_tag].extend(new_tags)
+                # Ex: specific_tag == "1.0.1" or "<sha>" or "<date>"
+                specific_tag = get_new_tag(
+                    full_arch_tag, date_tag if "-deps" in image else sha_tag)
 
-        # For all tags, create Date/Sha tags
-        for old_tag in tag_mapping.keys():
-            new_tags = _create_new_tags(
-                tag_mapping[old_tag],
-                old_str="nightly",
-                new_str=date_tag if "-deps" in image_name else sha_tag)
-            tag_mapping[old_tag].extend(new_tags)
+                # Tag and push rayproject/<image>:<sha/date><py_tag><arch_tag>
+                DOCKER_CLIENT.api.tag(
+                    image=f"{full_image}:{full_arch_tag}",
+                    repository=full_image,
+                    tag=specific_tag)
+                docker_push(full_image, specific_tag)
 
-        # Sanity checking.
-        for old_tag in tag_mapping.keys():
-            if DEFAULT_PYTHON_VERSION in old_tag:
-                if "-cpu" in old_tag:
-                    assert "nightly-cpu" in tag_mapping[old_tag]
-                    assert "nightly" in tag_mapping[old_tag]
-                    if "-deps" in image_name:
-                        assert f"{date_tag}-cpu" in tag_mapping[old_tag]
-                        assert f"{date_tag}" in tag_mapping[old_tag]
-                    else:
-                        assert f"{sha_tag}-cpu" in tag_mapping[old_tag]
-                        assert f"{sha_tag}" in tag_mapping[old_tag]
-                elif ML_CUDA_VERSION in old_tag:
-                    assert "nightly-gpu" in tag_mapping[old_tag]
-                    if "-deps" in image_name:
-                        assert f"{date_tag}-gpu" in tag_mapping[old_tag]
-                    else:
-                        assert f"{sha_tag}-gpu" in tag_mapping[old_tag]
+                if "-py37" in py_name:
+                    non_python_specific_tag = specific_tag.replace("-py37", "")
+                    DOCKER_CLIENT.api.tag(
+                        image=f"{full_image}:{full_arch_tag}",
+                        repository=full_image,
+                        tag=non_python_specific_tag)
+                    # Tag and push rayproject/<image>:<sha/date><arch_tag>
+                    docker_push(full_image, non_python_specific_tag)
 
-        print(f"These tags will be created for {image_name}: ", tag_mapping)
-
-        # Tag and push all images.
-        for old_tag in tag_mapping.keys():
-            for new_tag in tag_mapping[old_tag]:
-                _tag_and_push(
-                    full_image_name,
-                    old_tag=old_tag,
-                    new_tag=new_tag,
-                    merge_build=merge_build)
+                    non_python_nightly_tag = full_arch_tag.replace("-py37", "")
+                    DOCKER_CLIENT.api.tag(
+                        image=f"{full_image}:{full_arch_tag}",
+                        repository=full_image,
+                        tag=non_python_nightly_tag)
+                    # Tag and push rayproject/<image>:nightly<arch_tag>
+                    docker_push(full_image, non_python_nightly_tag)
 
 
 # Push infra here:
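The restored loop expands each pushed image into a family of aliases: nightly-py37-cpu also becomes nightly-py37, nightly-cpu, and bare nightly, plus date- or sha-prefixed variants. A toy sketch of that expansion for one image (illustrative names, no Docker calls):

# Illustrative only: the alias family pushed for one image.
def alias_tags(py_name="-py37", sha="abc123"):
    tags = []
    for arch in ["-cpu", "-gpu", ""]:
        nightly = f"nightly{py_name}{arch}"
        tags.append(nightly)
        tags.append(nightly.replace("nightly", sha))
        if py_name == "-py37":
            # py37 is the default: also publish tags without the py suffix.
            tags.append(nightly.replace("-py37", ""))
            tags.append(nightly.replace("nightly", sha).replace("-py37", ""))
    return tags

print(alias_tags())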
@@ -535,30 +409,22 @@ def push_readmes(merge_build: bool):
 
 
 # Build base-deps/ray-deps only on file change, 2 weeks, per release
-# Build ray, ray-ml every time
+# Build ray, ray-ml, autoscaler every time
+# build-docker-images.py --py-versions PY37 --build-type PR --rebuild-all
 MERGE = "MERGE"
 HUMAN = "HUMAN"
 PR = "PR"
 BUILDKITE = "BUILDKITE"
 BUILD_TYPES = [MERGE, HUMAN, PR, BUILDKITE]
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--py-versions",
-        choices=list(PY_MATRIX.keys()),
-        default="py37",
+        choices=["PY36", "PY37", "PY38", "PY39"],
+        default="PY37",
         nargs="*",
         help="Which python versions to build. "
-        "Must be in (py36, py37, py38, py39)")
-    parser.add_argument(
-        "--device-types",
-        choices=list(BASE_IMAGES.keys()),
-        default=None,
-        nargs="*",
-        help="Which device types (CPU/CUDA versions) to build images for. "
-        "If not specified, images will be built for all device types.")
+        "Must be in (PY36, PY37, PY38, PY39)")
     parser.add_argument(
         "--build-type",
         choices=BUILD_TYPES,
@@ -582,47 +448,26 @@ if __name__ == "__main__":
     py_versions = args.py_versions
     py_versions = py_versions if isinstance(py_versions,
                                             list) else [py_versions]
+    for key in set(PY_MATRIX.keys()):
+        if key[1:].upper() not in py_versions:
+            PY_MATRIX.pop(key)
+    assert len(PY_MATRIX) == len(
+        py_versions
+    ), f"Length of PY_MATRIX != args {PY_MATRIX} : {args.py_versions}"
 
-    image_types = args.device_types if args.device_types else list(
-        BASE_IMAGES.keys())
-
-    assert set(list(CUDA_FULL.keys()) + ["cpu"]) == set(BASE_IMAGES.keys())
-
-    # Make sure the python images and cuda versions we build here are
-    # consistent with the ones used with fix-latest-docker.sh script.
-    py_version_file = os.path.join(_get_root_dir(), "docker/retag-lambda",
-                                   "python_versions.txt")
-    with open(py_version_file) as f:
-        py_file_versions = f.read().splitlines()
-        assert set(PY_MATRIX.keys()) == set(py_file_versions), \
-            (PY_MATRIX.keys(), py_file_versions)
-
-    cuda_version_file = os.path.join(_get_root_dir(), "docker/retag-lambda",
-                                     "cuda_versions.txt")
-
-    with open(cuda_version_file) as f:
-        cuda_file_versions = f.read().splitlines()
-        assert set(BASE_IMAGES.keys()) == set(cuda_file_versions + ["cpu"]),\
-            (BASE_IMAGES.keys(), cuda_file_versions + ["cpu"])
-
-    print("Building the following python versions: ",
-          [PY_MATRIX[py_version] for py_version in py_versions])
-    print("Building images for the following devices: ", image_types)
+    print("Building the following python versions: ", PY_MATRIX)
     print("Building base images: ", args.base)
 
     build_type = args.build_type
+    is_buildkite = build_type == BUILDKITE
 
     if build_type == BUILDKITE:
         if os.environ.get("BUILDKITE_PULL_REQUEST", "") == "false":
             build_type = MERGE
         else:
             build_type = PR
 
     if build_type == HUMAN:
-        # If manually triggered, request user for branch and SHA value to use.
         _configure_human_version()
-    if (build_type in {HUMAN, MERGE, BUILDKITE}
+    if (build_type in {HUMAN, MERGE} or is_buildkite
             or _check_if_docker_files_modified()):
         DOCKER_CLIENT = docker.from_env()
         is_merge = build_type == MERGE
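The restored entrypoint prunes the global PY_MATRIX in place so only the requested versions are built; the uppercase CLI choices line up with the dashed keys via key[1:].upper(). A self-contained sketch of that pruning:

# Illustrative only: "-py37"[1:].upper() == "PY37", matching the CLI choices.
PY_MATRIX = {"-py36": "3.6.12", "-py37": "3.7.7",
             "-py38": "3.8.5", "-py39": "3.9.5"}
py_versions = ["PY37"]

for key in set(PY_MATRIX.keys()):
    if key[1:].upper() not in py_versions:
        PY_MATRIX.pop(key)

assert PY_MATRIX == {"-py37": "3.7.7"}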
@@ -633,31 +478,25 @@ if __name__ == "__main__":
             username, password = _get_docker_creds()
             DOCKER_CLIENT.api.login(username=username, password=password)
         copy_wheels(build_type == HUMAN)
-        is_base_images_built = build_or_pull_base_images(
-            py_versions, image_types, args.base)
-
+        base_images_built = build_or_pull_base_images(args.base)
         if args.only_build_worker_container:
-            build_for_all_versions("ray-worker-container", py_versions,
-                                   image_types)
+            build_ray_worker_container()
             # TODO Currently don't push ray_worker_container
         else:
-            # Build Ray Docker images.
-            build_for_all_versions("ray", py_versions, image_types)
-
-            if ML_CUDA_VERSION in image_types:
-                # Build Ray ML Docker images only if ML_CUDA_VERSION is
-                # specified.
-                prep_ray_ml()
-                # Only build ML Docker for the ML_CUDA_VERSION
-                build_for_all_versions(
-                    "ray-ml", py_versions, image_types=[ML_CUDA_VERSION])
+            build_ray()
+            build_ray_ml()
+            if build_type in {MERGE, PR}:
+                valid_branch = _valid_branch()
+                if (not valid_branch) and is_merge:
+                    print(f"Invalid Branch found: {_get_branch()}")
+                push_and_tag_images(base_images_built, valid_branch
+                                    and is_merge)
 
-        if build_type in {MERGE, PR}:
-            valid_branch = _valid_branch()
-            if (not valid_branch) and is_merge:
-                print(f"Invalid Branch found: {_get_branch()}")
-            push_and_tag_images(py_versions, image_types,
-                                is_base_images_built, valid_branch
-                                and is_merge)
 
     # TODO(ilr) Re-Enable Push READMEs by using a normal password
@@ -290,7 +290,7 @@ Image releases are `tagged` using the following format:
 - A specific nightly build (uses a SHA from the Github ``master``).
 
 
-Some tags also have `variants` that add or change functionality:
+Each tag has `variants` that add or change functionality:
 
 .. list-table::
    :widths: 16 40
@@ -298,12 +298,10 @@ Some tags also have `variants` that add or change functionality:
 
    * - Variant
      - Description
+   * - -gpu
+     - These are based off of an NVIDIA CUDA image. They require the Nvidia Docker Runtime.
    * - -cpu
      - These are based off of an Ubuntu image.
-   * - -cuXX
-     - These are based off of an NVIDIA CUDA image with the specified CUDA version. They require the Nvidia Docker Runtime.
-   * - -gpu
-     - Aliases to a specific ``-cuXX`` tagged image.
    * - <no tag>
      - Aliases to ``-cpu`` tagged images
 
@@ -1,7 +1,7 @@
 # DEPRECATED -- Please use [`rayproject/ray-ml`](https://hub.docker.com/repository/docker/rayproject/ray-ml)
 ## About
 This image used to be the base image for the Ray autoscaler, but it has been replaced by [`rayproject/ray-ml`](https://hub.docker.com/repository/docker/rayproject/ray-ml).
-Please use that instead, *this image is deprecated*.
+Please use that instead, *this image will be removed in the near future*.
 
 
 ## Tags
@@ -1,6 +1,6 @@
 # The base-deps Docker image installs main libraries needed to run Ray
 
-# The GPU options are NVIDIA CUDA developer images.
+# The GPU option is nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04
 ARG BASE_IMAGE="ubuntu:focal"
 FROM ${BASE_IMAGE}
 # FROM directive resets ARG
@@ -14,7 +14,7 @@ This image has the system-level dependencies for `Ray` and the `Ray Autoscaler`
 * `:DATE` - A specific build.
 
 ### Suffixes
-* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images
 
|
@ -29,6 +29,8 @@ AWS_ACCESS_KEY_ID=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.AccessK
|
|||
AWS_SECRET_ACCESS_KEY=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.SecretAccessKey)
|
||||
AWS_SESSION_TOKEN=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.SessionToken)
|
||||
|
||||
|
||||
|
||||
echo -e "Invoking this lambda!\nView logs at https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups"
|
||||
AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN AWS_SECURITY_TOKEN='' aws \
|
||||
lambda invoke --function-name DockerTagLatest \
|
||||
|
|
|
@@ -1,5 +1,5 @@
-ARG BASE_IMAGE=""
-FROM rayproject/base-deps:nightly"$BASE_IMAGE"
+ARG GPU=""
+FROM rayproject/base-deps:nightly"$GPU"
 # If this arg is not "autoscaler" then no autoscaler requirements will be included
 ARG AUTOSCALER="autoscaler"
 ARG WHEEL_PATH
@@ -13,7 +13,7 @@ This has the python-level dependencies for `Ray` and the `Ray Autoscaler`. The `
 * `:DATE` - A specific build.
 
 ### Suffixes
-* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images
 
|
@ -1,5 +1,5 @@
|
|||
ARG BASE_IMAGE
|
||||
FROM rayproject/ray:nightly"$BASE_IMAGE"
|
||||
ARG GPU
|
||||
FROM rayproject/ray:nightly"$GPU"
|
||||
ARG PYTHON_MINOR_VERSION=7
|
||||
|
||||
# We have to uninstall wrapt this way for Tensorflow compatibility
|
||||
|
@@ -8,7 +8,6 @@ COPY requirements_dl.txt ./
 COPY requirements_ml_docker.txt ./
 COPY requirements_rllib.txt ./
 COPY requirements_tune.txt ./requirements_tune.txt
-COPY requirements_upstream.txt ./
 COPY install_atari_roms.sh ./install_atari_roms.sh
 
 RUN sudo apt-get update \
@@ -24,13 +23,11 @@ RUN sudo apt-get update \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_rllib.txt \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_tune.txt \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \
-    && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_upstream.txt \
     # Remove dataclasses & typing because they are included in Python > 3.6
     && if [ $(python -c 'import sys; print(sys.version_info.minor)') != "6" ]; then \
         $HOME/anaconda3/bin/pip uninstall dataclasses typing -y; fi \
     && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \
     && sudo rm requirements_tune.txt && sudo rm requirements_rllib.txt \
-    && sudo rm requirements_upstream.txt \
     && sudo apt-get clean
 
 # Make sure tfp is installed correctly and matches tf version.
@@ -11,7 +11,9 @@ This image is an extension of the [`rayproject/ray`](https://hub.docker.com/repo
 * `:SHA` - A specific nightly build.
 
 ### Suffixes
 * `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-cpu`- These are based off of an `Ubuntu` image.
+* Tags without a suffix refer to `-cpu` images
 
 ## Other Images
 * [`rayproject/ray`](https://hub.docker.com/repository/docker/rayproject/ray) - Ray and all of its dependencies.
@@ -1,5 +1,5 @@
-ARG BASE_IMAGE
-FROM rayproject/ray-deps:nightly"$BASE_IMAGE"
+ARG GPU
+FROM rayproject/ray-deps:nightly"$GPU"
 ARG WHEEL_PATH
 ARG FIND_LINKS_PATH=".whl"
 # For Click
@@ -12,7 +12,7 @@ everything needed to get started with running Ray! They work for both local deve
 * `:SHA` - A specific nightly build.
 
 ### Suffixes
-* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images
 
@@ -1,5 +0,0 @@
-cu112
-cu111
-cu110
-cu102
-cu101
@@ -1,5 +1,4 @@
-import json
 import os
 import subprocess
 
 import boto3
@@ -8,10 +7,6 @@ DOCKER_USER = None
 DOCKER_PASS = None
 
 
-def _get_curr_dir():
-    return os.path.dirname(os.path.realpath(__file__))
-
-
 def get_secrets():
     global DOCKER_PASS, DOCKER_USER
     secret_name = "dockerRetagLatestCredentials"
@@ -41,30 +36,15 @@ def retag(repo: str, source: str, destination: str) -> str:
     })
 
 
-def parse_versions(version_file):
-    with open(version_file) as f:
-        file_versions = f.read().splitlines()
-    return file_versions
-
-
 def lambda_handler(event, context):
     source_image = event["source_tag"]
     destination_image = event["destination_tag"]
     total_results = []
-    python_versions = parse_versions(
-        os.path.join(_get_curr_dir(), "python_versions.txt"))
-    cuda_versions = parse_versions(
-        os.path.join(_get_curr_dir(), "cuda_versions.txt"))
-    for repo in ["ray", "ray-ml"]:
+    for repo in ["ray", "ray-ml", "autoscaler"]:
         results = []
-        for pyversion in python_versions:
+        for pyversion in ["py36", "py37", "py38", "py39"]:
             source_tag = f"{source_image}-{pyversion}"
             destination_tag = f"{destination_image}-{pyversion}"
-            for cudaversion in cuda_versions:
-                cuda_source_tag = source_tag + f"-{cudaversion}"
-                cuda_destination_tag = destination_tag + f"-{cudaversion}"
-                results.append(
-                    retag(repo, cuda_source_tag, cuda_destination_tag))
-            results.append(retag(repo, source_tag, destination_tag))
+            results.append(retag(repo, source_tag, destination_tag))
+            results.append(retag(repo, source_tag, destination_tag + "-cpu"))
             results.append(
@@ -74,13 +54,7 @@ def lambda_handler(event, context):
 
     # Retag images without a python version specified (defaults to py37)
     results = []
-    for repo in ["ray", "ray-ml", "ray-deps", "base-deps"]:
-        for cudaversion in cuda_versions:
-            source_tag = f"{source_image}-{cudaversion}"
-            destination_tag = f"{destination_image}-{cudaversion}"
-            results.append(retag(repo, source_tag, destination_tag))
-
-        # ray:nightly -> ray:1.x
-        results.append(retag(repo, source_image, destination_image))
+    for repo in ["ray", "ray-ml", "autoscaler", "ray-deps", "base-deps"]:
+        results.append(retag(repo, source_image, destination_image))
+        results.append(retag(repo, source_image, destination_image + "-cpu"))
         results.append(
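The restored handler fans one source tag out across repositories, hard-coded Python versions, and -cpu/-gpu suffixes instead of reading version files. A toy sketch of the retag pairs it generates (illustrative, no Docker calls):

# Illustrative only: the (repo, source, destination) triples being retagged.
def retag_pairs(source="nightly", dest="1.0.1"):
    pairs = []
    for repo in ["ray", "ray-ml", "autoscaler"]:
        for py in ["py36", "py37", "py38", "py39"]:
            src, dst = f"{source}-{py}", f"{dest}-{py}"
            pairs.append((repo, src, dst))
            pairs.append((repo, src, dst + "-cpu"))
            pairs.append((repo, f"{src}-gpu", f"{dst}-gpu"))
    return pairs

print(len(retag_pairs()))  # 3 repos x 4 pythons x 3 variants = 36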
@@ -1,4 +0,0 @@
-py36
-py37
-py38
-py39