Revert "Revert "[Docker] Support multiple CUDA Versions (#19505)" (#19756)" (#19763)

This reverts commit e58fcca404.
Authored by Amog Kamsetty on 2021-10-26 17:32:56 -07:00; committed by GitHub
parent 47744d282c
commit db863aafc0
16 changed files with 458 additions and 229 deletions


@@ -75,37 +75,69 @@
 # # Upload to latest directory.
 # - if [ "$BUILDKITE_BRANCH" == "master" ]; then python .buildkite/copy_files.py --destination wheels --path ./.whl; fi

-- label: ":docker: Build Images: py36"
+- label: ":docker: Build Images: py36 (1/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions PY36 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions py36 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base

-- label: ":docker: Build Images: py37"
+- label: ":docker: Build Images: py36 (2/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions PY37 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions py36 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base

-- label: ":docker: Build Images: py38"
+- label: ":docker: Build Images: py37 (1/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions PY38 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions py37 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base

-- label: ":docker: Build Images: py39"
+- label: ":docker: Build Images: py37 (2/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   commands:
     - LINUX_WHEELS=1 ./ci/travis/ci.sh build
     - pip install -q docker aws_requests_auth boto3
     - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/travis/build-docker-images.py --py-versions PY39 --build-type BUILDKITE --build-base
+    - python ./ci/travis/build-docker-images.py --py-versions py37 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base
+
+- label: ":docker: Build Images: py38 (1/2)"
+  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
+  commands:
+    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
+    - pip install -q docker aws_requests_auth boto3
+    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
+    - python ./ci/travis/build-docker-images.py --py-versions py38 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base
+
+- label: ":docker: Build Images: py38 (2/2)"
+  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
+  commands:
+    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
+    - pip install -q docker aws_requests_auth boto3
+    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
+    - python ./ci/travis/build-docker-images.py --py-versions py38 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base
+
+- label: ":docker: Build Images: py39 (1/2)"
+  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
+  commands:
+    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
+    - pip install -q docker aws_requests_auth boto3
+    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
+    - python ./ci/travis/build-docker-images.py --py-versions py39 --device-types cpu cu101 cu102 --build-type BUILDKITE --build-base
+
+- label: ":docker: Build Images: py39 (2/2)"
+  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
+  commands:
+    - LINUX_WHEELS=1 ./ci/travis/ci.sh build
+    - pip install -q docker aws_requests_auth boto3
+    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
+    - python ./ci/travis/build-docker-images.py --py-versions py39 --device-types cu110 cu111 cu112 --build-type BUILDKITE --build-base

 - label: ":book: Lint"
   commands:
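Each Python version now gets two Buildkite jobs so that no single job builds all six image variants: job (1/2) covers cpu/cu101/cu102 and job (2/2) covers cu110/cu111/cu112. A minimal sketch of the tag matrix one Python version yields (the nightly-<py>-<device> naming comes from build-docker-images.py below; rayproject/ray stands in for any of the built images):

    device_types = ["cpu", "cu101", "cu102", "cu110", "cu111", "cu112"]
    for device in device_types:
        # e.g. rayproject/ray:nightly-py36-cpu ... rayproject/ray:nightly-py36-cu112
        print(f"rayproject/ray:nightly-py36-{device}")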


@@ -3,11 +3,13 @@ import datetime
 import json
 import functools
 import glob
+import itertools
 import os
 import re
 import shutil
 import subprocess
 import sys
+from collections import defaultdict
 from typing import List, Tuple

 import docker
@@ -24,18 +26,39 @@ DOCKER_HUB_DESCRIPTION = {
         "https://hub.docker.com/r/rayproject/ray"),
     "ray": "Official Docker Images for Ray, the distributed computing API.",
     "ray-ml": "Developer ready Docker Image for Ray.",
+    "autoscaler": (
+        "Deprecated image, please use: "
+        "https://hub.docker.com/repository/docker/rayproject/ray-ml")
 }

 PY_MATRIX = {
-    "-py36": "3.6.12",
-    "-py37": "3.7.7",
-    "-py38": "3.8.5",
-    "-py39": "3.9.5"
+    "py36": "3.6.12",
+    "py37": "3.7.7",
+    "py38": "3.8.5",
+    "py39": "3.9.5"
 }

+BASE_IMAGES = {
+    "cu112": "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04",
+    "cu111": "nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04",
+    "cu110": "nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04",
+    "cu102": "nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04",
+    "cu101": "nvidia/cuda:10.1-cudnn8-devel-ubuntu18.04",
+    "cpu": "ubuntu:focal",
+}
+
+CUDA_FULL = {
+    "cu112": "CUDA 11.2",
+    "cu111": "CUDA 11.1",
+    "cu110": "CUDA 11.0",
+    "cu102": "CUDA 10.2",
+    "cu101": "CUDA 10.1"
+}
+
+# The CUDA version to use for the ML Docker image.
+ML_CUDA_VERSION = "cu112"
+
+DEFAULT_PYTHON_VERSION = "py37"
+
+IMAGE_NAMES = list(DOCKER_HUB_DESCRIPTION.keys())


 def _get_branch():
     branch = (os.environ.get("TRAVIS_BRANCH")
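Taken together, PY_MATRIX and BASE_IMAGES define the build matrix the rest of the script walks; a quick self-contained sketch of its size (key lists copied from the dictionaries above):

    import itertools

    py_versions = ["py36", "py37", "py38", "py39"]                       # PY_MATRIX keys
    device_types = ["cpu", "cu101", "cu102", "cu110", "cu111", "cu112"]  # BASE_IMAGES keys

    # 4 Python versions x 6 device types = 24 variants per image name.
    print(len(list(itertools.product(py_versions, device_types))))  # 24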
@@ -119,83 +142,117 @@ def _check_if_docker_files_modified():
     return affected


-def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]:
-    built_images = []
-    for gpu in ["-cpu", "-gpu"]:
-        for py_name, py_version in PY_MATRIX.items():
-            # TODO(https://github.com/ray-project/ray/issues/16599):
-            # remove below after supporting ray-ml images with Python 3.9
-            if image_name in ["ray-ml", "autoscaler"
-                              ] and py_version.startswith("3.9"):
-                print(f"{image_name} image is currently unsupported with "
-                      "Python 3.9")
-                continue
-
-            build_args = {}
-            build_args["PYTHON_VERSION"] = py_version
-            # I.e. "-py36"[-1] == 6
-            build_args["PYTHON_MINOR_VERSION"] = py_name[-1]
-
-            if image_name == "base-deps":
-                build_args["BASE_IMAGE"] = (
-                    "nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04"
-                    if gpu == "-gpu" else "ubuntu:focal")
-            else:
-                # NOTE(ilr) This is a bit of an abuse of the name "GPU"
-                build_args["GPU"] = f"{py_name}{gpu}"
-
-            if image_name in ["ray", "ray-deps", "ray-worker-container"]:
-                wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"])
-                build_args["WHEEL_PATH"] = f".whl/{wheel}"
-                # Add pip option "--find-links .whl/" to ensure ray-cpp wheel
-                # can be found.
-                build_args["FIND_LINKS_PATH"] = ".whl"
-
-            tagged_name = f"rayproject/{image_name}:nightly{py_name}{gpu}"
-            for i in range(2):
-                cleanup = DOCKER_CLIENT.containers.prune().get(
-                    "SpaceReclaimed")
-                if cleanup is not None:
-                    print(f"Cleaned up {cleanup / (2**20)}MB")
-                output = DOCKER_CLIENT.api.build(
-                    path=os.path.join(_get_root_dir(), "docker", image_name),
-                    tag=tagged_name,
-                    nocache=no_cache,
-                    buildargs=build_args)
-
-                cmd_output = []
-                try:
-                    start = datetime.datetime.now()
-                    current_iter = start
-                    for line in output:
-                        cmd_output.append(line.decode("utf-8"))
-                        if datetime.datetime.now(
-                        ) - current_iter >= datetime.timedelta(minutes=5):
-                            current_iter = datetime.datetime.now()
-                            elapsed = datetime.datetime.now() - start
-                            print(f"Still building {tagged_name} after "
-                                  f"{elapsed.seconds} seconds")
-                            if elapsed >= datetime.timedelta(minutes=15):
-                                print("Additional build output:")
-                                print(*cmd_output, sep="\n")
-                                # Clear cmd_output after printing, so the next
-                                # iteration will not print out the same lines.
-                                cmd_output = []
-                except Exception as e:
-                    print(f"FAILURE with error {e}")
-
-                if len(DOCKER_CLIENT.api.images(tagged_name)) == 0:
-                    print(f"ERROR building: {tagged_name}. Output below:")
-                    print(*cmd_output, sep="\n")
-                    if (i == 1):
-                        raise Exception("FAILED TO BUILD IMAGE")
-                    print("TRYING AGAIN")
-                else:
-                    break
-
-            print("BUILT: ", tagged_name)
-            built_images.append(tagged_name)
-    return built_images
+def _build_docker_image(image_name: str,
+                        py_version: str,
+                        image_type: str,
+                        no_cache=True):
+    """Builds Docker image with the provided info.
+
+    image_name (str): The name of the image to build. Must be one of
+        IMAGE_NAMES.
+    py_version (str): The Python version to build the image for.
+        Must be one of PY_MATRIX.keys()
+    image_type (str): The image type to build. Must be one of
+        BASE_IMAGES.keys()
+    no_cache (bool): If True, don't use caching when building the image.
+    """
+
+    if image_name not in IMAGE_NAMES:
+        raise ValueError(
+            f"The provided image name {image_name} is not "
+            f"recognized. Image names must be one of {IMAGE_NAMES}")
+
+    if py_version not in PY_MATRIX.keys():
+        raise ValueError(f"The provided python version {py_version} is not "
+                         f"recognized. Python version must be one of"
+                         f" {PY_MATRIX.keys()}")
+
+    if image_type not in BASE_IMAGES.keys():
+        raise ValueError(f"The provided CUDA version {image_type} is not "
+                         f"recognized. CUDA version must be one of"
+                         f" {BASE_IMAGES.keys()}")
+
+    # TODO(https://github.com/ray-project/ray/issues/16599):
+    # remove below after supporting ray-ml images with Python 3.9
+    if image_name == "ray-ml" and py_version == "py39":
+        print(f"{image_name} image is currently unsupported with "
+              "Python 3.9")
+        return
+
+    build_args = {}
+    build_args["PYTHON_VERSION"] = PY_MATRIX[py_version]
+    # I.e. "py36"[-1] == 6
+    build_args["PYTHON_MINOR_VERSION"] = py_version[-1]
+
+    device_tag = f"{image_type}"
+
+    if image_name == "base-deps":
+        base_image = BASE_IMAGES[image_type]
+    else:
+        base_image = f"-{py_version}-{device_tag}"
+
+    if image_name != "ray-worker-container":
+        build_args["BASE_IMAGE"] = base_image
+
+    if image_name in ["ray", "ray-deps", "ray-worker-container"]:
+        wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"])
+        build_args["WHEEL_PATH"] = f".whl/{wheel}"
+        # Add pip option "--find-links .whl/" to ensure ray-cpp wheel
+        # can be found.
+        build_args["FIND_LINKS_PATH"] = ".whl"
+
+    tagged_name = f"rayproject/{image_name}:nightly-{py_version}-{device_tag}"
+    for i in range(2):
+        cleanup = DOCKER_CLIENT.containers.prune().get("SpaceReclaimed")
+        if cleanup is not None:
+            print(f"Cleaned up {cleanup / (2 ** 20)}MB")
+
+        labels = {
+            "image-name": image_name,
+            "python-version": PY_MATRIX[py_version]
+        }
+        if image_type in CUDA_FULL:
+            labels["cuda-version"] = CUDA_FULL[image_type]
+
+        output = DOCKER_CLIENT.api.build(
+            path=os.path.join(_get_root_dir(), "docker", image_name),
+            tag=tagged_name,
+            nocache=no_cache,
+            labels=labels,
+            buildargs=build_args)
+
+        cmd_output = []
+        try:
+            start = datetime.datetime.now()
+            current_iter = start
+            for line in output:
+                cmd_output.append(line.decode("utf-8"))
+                if datetime.datetime.now(
+                ) - current_iter >= datetime.timedelta(minutes=5):
+                    current_iter = datetime.datetime.now()
+                    elapsed = datetime.datetime.now() - start
+                    print(f"Still building {tagged_name} after "
+                          f"{elapsed.seconds} seconds")
+                    if elapsed >= datetime.timedelta(minutes=15):
+                        print("Additional build output:")
+                        print(*cmd_output, sep="\n")
+                        # Clear cmd_output after printing, so the next
+                        # iteration will not print out the same lines.
+                        cmd_output = []
+        except Exception as e:
+            print(f"FAILURE with error {e}")

+        if len(DOCKER_CLIENT.api.images(tagged_name)) == 0:
+            print(f"ERROR building: {tagged_name}. Output below:")
+            print(*cmd_output, sep="\n")
+            if i == 1:
+                raise Exception("FAILED TO BUILD IMAGE")
+            print("TRYING AGAIN")
+        else:
+            break
+
+    print("BUILT: ", tagged_name)


 def copy_wheels(human_build):
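A hedged usage sketch (not part of the diff): with a running Docker daemon, DOCKER_CLIENT initialized, and wheels staged under .whl/, a single variant build might look like this; the argument values are illustrative:

    _build_docker_image("ray", py_version="py38", image_type="cu112")
    # Expected result: an image tagged rayproject/ray:nightly-py38-cu112,
    # labeled with python-version=3.8.5 and cuda-version="CUDA 11.2".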
@@ -218,41 +275,66 @@ def copy_wheels(human_build):
     shutil.copy(source, ray_worker_container_dst)


-def build_or_pull_base_images(rebuild_base_images: bool = True) -> List[str]:
-    """Returns images to tag and build"""
-    DOCKER_CLIENT.api.pull(repository="rayproject/base-deps", tag="nightly")
-
-    age = DOCKER_CLIENT.api.inspect_image("rayproject/base-deps:nightly")[
-        "Created"]
-    short_date = datetime.datetime.strptime(age.split("T")[0], "%Y-%m-%d")
-    is_stale = (
-        datetime.datetime.now() - short_date) > datetime.timedelta(days=14)
-
-    print("Pulling images for caching")
-
-    DOCKER_CLIENT.api.pull(
-        repository="rayproject/base-deps", tag="nightly-cpu")
-    DOCKER_CLIENT.api.pull(
-        repository="rayproject/base-deps", tag="nightly-gpu")
-
-    DOCKER_CLIENT.api.pull(repository="rayproject/ray-deps", tag="nightly-gpu")
-    DOCKER_CLIENT.api.pull(repository="rayproject/ray-deps", tag="nightly-cpu")
-
-    # TODO(ilr) See if any caching happens
-    if (rebuild_base_images or is_stale or _release_build()):
-        for image in ["base-deps", "ray-deps"]:
-            _build_cpu_gpu_images(image, no_cache=False)
+def check_staleness(repository, tag):
+    DOCKER_CLIENT.api.pull(repository=repository, tag=tag)
+
+    age = DOCKER_CLIENT.api.inspect_image(f"{repository}:{tag}")["Created"]
+    short_date = datetime.datetime.strptime(age.split("T")[0], "%Y-%m-%d")
+    is_stale = (
+        datetime.datetime.now() - short_date) > datetime.timedelta(days=14)
+    return is_stale
+
+
+def build_for_all_versions(image_name, py_versions, image_types, **kwargs):
+    """Builds the given Docker image for all Python & CUDA versions"""
+    for py_version in py_versions:
+        for image_type in image_types:
+            _build_docker_image(
+                image_name,
+                py_version=py_version,
+                image_type=image_type,
+                **kwargs)
+
+
+def build_base_images(py_versions, image_types):
+    build_for_all_versions(
+        "base-deps", py_versions, image_types, no_cache=False)
+    build_for_all_versions(
+        "ray-deps", py_versions, image_types, no_cache=False)
+
+
+def build_or_pull_base_images(py_versions: List[str],
+                              image_types: List[str],
+                              rebuild_base_images: bool = True) -> bool:
+    """Returns images to tag and build."""
+    repositories = ["rayproject/base-deps", "rayproject/ray-deps"]
+    tags = [
+        f"nightly-{py_version}-{image_type}"
+        for py_version, image_type in itertools.product(
+            py_versions, image_types)
+    ]
+
+    try:
+        is_stale = check_staleness(repositories[0], tags[0])
+
+        # We still pull even if we have to rebuild the base images to help with
+        # caching.
+        for repository in repositories:
+            for tag in tags:
+                DOCKER_CLIENT.api.pull(repository=repository, tag=tag)
+    except Exception as e:
+        print(e)
+        is_stale = True
+
+    if rebuild_base_images or _release_build() or is_stale:
+        build_base_images(py_versions, image_types)
         return True
     else:
         print("Just pulling images!")
         return False
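The staleness rule in check_staleness is pure date arithmetic on the image's Created timestamp; a self-contained sketch of the same comparison:

    import datetime

    def is_stale(created_iso: str, days: int = 14) -> bool:
        # Mirrors check_staleness: only the date part of the ISO timestamp
        # is compared against the rebuild window.
        short_date = datetime.datetime.strptime(
            created_iso.split("T")[0], "%Y-%m-%d")
        return (datetime.datetime.now() - short_date) > datetime.timedelta(days=days)

    print(is_stale("2021-01-01T00:00:00Z"))  # True once older than two weeks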


-def build_ray():
-    return _build_cpu_gpu_images("ray")
-
-
-def build_ray_ml():
+def prep_ray_ml():
     root_dir = _get_root_dir()
     requirement_files = glob.glob(
         f"{_get_root_dir()}/python/**/requirements*.txt", recursive=True)
@@ -261,11 +343,6 @@ def build_ray_ml():
     # Install atari roms script
     shutil.copy(f"{_get_root_dir()}/rllib/utils/install_atari_roms.sh",
                 os.path.join(root_dir, "docker/ray-ml/"))
-
-    ray_ml_images = _build_cpu_gpu_images("ray-ml")
-    for img in ray_ml_images:
-        tag = img.split(":")[-1]
-        DOCKER_CLIENT.api.tag(
-            image=img, repository="rayproject/autoscaler", tag=tag)


 def _get_docker_creds() -> Tuple[str, str]:
@@ -274,39 +351,52 @@ def _get_docker_creds() -> Tuple[str, str]:
     return DOCKER_USERNAME, docker_password


-def build_ray_worker_container():
-    return _build_cpu_gpu_images("ray-worker-container")
+def _docker_push(image, tag):
+    print(f"PUSHING: {image}:{tag}, result:")
+
+    # This docker API is janky. Without "stream=True" it returns a
+    # massive string filled with every progress bar update, which can
+    # cause CI to back up.
+    #
+    # With stream=True, it's a line-at-a-time generator of the same
+    # info. So we can slow it down by printing every couple hundred
+    # lines
+    i = 0
+    for progress_line in DOCKER_CLIENT.api.push(image, tag=tag, stream=True):
+        if i % 100 == 0:
+            print(progress_line)
+
+
+def _tag_and_push(full_image_name, old_tag, new_tag, merge_build=False):
+    # Do not tag release builds because they are no longer up to
+    # date after the branch cut.
+    if "nightly" in new_tag and _release_build():
+        return
+    if old_tag != new_tag:
+        DOCKER_CLIENT.api.tag(
+            image=f"{full_image_name}:{old_tag}",
+            repository=full_image_name,
+            tag=new_tag)
+    if not merge_build:
+        print("This is a PR Build! On a merge build, we would normally push "
+              f"to: {full_image_name}:{new_tag}")
+    else:
+        _docker_push(full_image_name, new_tag)
+
+
+def _create_new_tags(all_tags, old_str, new_str):
+    new_tags = []
+    for full_tag in all_tags:
+        new_tag = full_tag.replace(old_str, new_str)
+        new_tags.append(new_tag)
+    return new_tags


 # For non-release builds, push "nightly" & "sha"
 # For release builds, push "nightly" & "latest" & "x.x.x"
-def push_and_tag_images(push_base_images: bool, merge_build: bool = False):
-    def docker_push(image, tag):
-        # Do not tag release builds because they are no longer up to
-        # date after the branch cut.
-        if "nightly" in tag and _release_build():
-            return
-        if merge_build:
-            print(f"PUSHING: {image}:{tag}, result:")
-
-            # This docker API is janky. Without "stream=True" it returns a
-            # massive string filled with every progress bar update, which can
-            # cause CI to back up.
-            #
-            # With stream=True, it's a line-at-a-time generator of the same
-            # info. So we can slow it down by printing every couple hundred
-            # lines
-            i = 0
-            for progress_line in DOCKER_CLIENT.api.push(
-                    image, tag=tag, stream=True):
-                if i % 100 == 0:
-                    print(progress_line)
-        else:
-            print(
-                "This is a PR Build! On a merge build, we would normally push "
-                f"to: {image}:{tag}")
-
-    def get_new_tag(old_tag, new_tag):
-        return old_tag.replace("nightly", new_tag)
-
+def push_and_tag_images(py_versions: List[str],
+                        image_types: List[str],
+                        push_base_images: bool,
+                        merge_build: bool = False):
     date_tag = datetime.datetime.now().strftime("%Y-%m-%d")
     sha_tag = _get_commit_sha()
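_create_new_tags is a plain string substitution over a list of tags; re-implemented here for illustration, with the two substitutions the push logic below relies on:

    def _create_new_tags(all_tags, old_str, new_str):
        return [tag.replace(old_str, new_str) for tag in all_tags]

    print(_create_new_tags(["nightly-py37-cpu"], "-cpu", ""))
    # ['nightly-py37']  -- the suffix-less CPU alias
    print(_create_new_tags(["nightly-py37-cu112"], "-cu112", "-gpu"))
    # ['nightly-py37-gpu']  -- the -gpu alias for the ML CUDA version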
@@ -316,61 +406,97 @@ def push_and_tag_images(push_base_images: bool, merge_build: bool = False):
         date_tag = release_name
         sha_tag = release_name

-    image_list = ["ray", "ray-ml", "autoscaler"]
+    image_list = ["ray", "ray-ml"]
     if push_base_images:
         image_list.extend(["base-deps", "ray-deps"])

-    for image in image_list:
-        for py_name, py_version in PY_MATRIX.items():
-            # TODO(https://github.com/ray-project/ray/issues/16599):
-            # remove below after supporting ray-ml images with Python 3.9
-            if image in ["ray-ml", "autoscaler"
-                         ] and py_version.startswith("3.9"):
-                print(
-                    f"{image} image is currently unsupported with Python 3.9")
-                continue
-
-            full_image = f"rayproject/{image}"
-
-            # Tag "nightly-py3x" from "nightly-py3x-cpu"
-            DOCKER_CLIENT.api.tag(
-                image=f"{full_image}:nightly{py_name}-cpu",
-                repository=full_image,
-                tag=f"nightly{py_name}")
-
-            for arch_tag in ["-cpu", "-gpu", ""]:
-                full_arch_tag = f"nightly{py_name}{arch_tag}"
-
-                # Tag and push rayproject/<image>:nightly<py_tag><arch_tag>
-                docker_push(full_image, full_arch_tag)
-
-                # Ex: specific_tag == "1.0.1" or "<sha>" or "<date>"
-                specific_tag = get_new_tag(
-                    full_arch_tag, date_tag if "-deps" in image else sha_tag)
-
-                # Tag and push rayproject/<image>:<sha/date><py_tag><arch_tag>
-                DOCKER_CLIENT.api.tag(
-                    image=f"{full_image}:{full_arch_tag}",
-                    repository=full_image,
-                    tag=specific_tag)
-                docker_push(full_image, specific_tag)
-
-                if "-py37" in py_name:
-                    non_python_specific_tag = specific_tag.replace("-py37", "")
-                    DOCKER_CLIENT.api.tag(
-                        image=f"{full_image}:{full_arch_tag}",
-                        repository=full_image,
-                        tag=non_python_specific_tag)
-                    # Tag and push rayproject/<image>:<sha/date><arch_tag>
-                    docker_push(full_image, non_python_specific_tag)
-
-                    non_python_nightly_tag = full_arch_tag.replace("-py37", "")
-                    DOCKER_CLIENT.api.tag(
-                        image=f"{full_image}:{full_arch_tag}",
-                        repository=full_image,
-                        tag=non_python_nightly_tag)
-                    # Tag and push rayproject/<image>:nightly<arch_tag>
-                    docker_push(full_image, non_python_nightly_tag)
+    for image_name in image_list:
+        full_image_name = f"rayproject/{image_name}"
+
+        # Mapping from old tags to new tags.
+        # These are the tags we will push.
+        # The key is the base nightly tag, and the values are all the
+        # tags that will be pushed for it.
+        tag_mapping = defaultdict(list)
+        for py_name in py_versions:
+            for image_type in image_types:
+                if image_name == "ray-ml" and image_type != ML_CUDA_VERSION:
+                    print("ML Docker image is not built for the following "
+                          f"device type: {image_type}")
+                    continue
+
+                # TODO(https://github.com/ray-project/ray/issues/16599):
+                # remove below after supporting ray-ml images with Python 3.9
+                if image_name in ["ray-ml"
+                                  ] and PY_MATRIX[py_name].startswith("3.9"):
+                    print(f"{image_name} image is currently "
+                          f"unsupported with Python 3.9")
+                    continue
+
+                tag = f"nightly-{py_name}-{image_type}"
+                tag_mapping[tag].append(tag)
+
+        # If no device is specified, it should map to CPU image.
+        # "-gpu" tag should refer to the ML_CUDA_VERSION
+        for old_tag in tag_mapping.keys():
+            if "cpu" in old_tag:
+                new_tags = _create_new_tags(
+                    tag_mapping[old_tag], old_str="-cpu", new_str="")
+                tag_mapping[old_tag].extend(new_tags)
+            elif ML_CUDA_VERSION in old_tag:
+                new_tags = _create_new_tags(
+                    tag_mapping[old_tag],
+                    old_str=f"-{ML_CUDA_VERSION}",
+                    new_str="-gpu")
+                tag_mapping[old_tag].extend(new_tags)
+
+        # No Python version specified should refer to DEFAULT_PYTHON_VERSION
+        for old_tag in tag_mapping.keys():
+            if DEFAULT_PYTHON_VERSION in old_tag:
+                new_tags = _create_new_tags(
+                    tag_mapping[old_tag],
+                    old_str=f"-{DEFAULT_PYTHON_VERSION}",
+                    new_str="")
+                tag_mapping[old_tag].extend(new_tags)
+
+        # For all tags, create Date/Sha tags
+        for old_tag in tag_mapping.keys():
+            new_tags = _create_new_tags(
+                tag_mapping[old_tag],
+                old_str="nightly",
+                new_str=date_tag if "-deps" in image_name else sha_tag)
+            tag_mapping[old_tag].extend(new_tags)
+
+        # Sanity checking.
+        for old_tag in tag_mapping.keys():
+            if DEFAULT_PYTHON_VERSION in old_tag:
+                if "-cpu" in old_tag:
+                    assert "nightly-cpu" in tag_mapping[old_tag]
+                    assert "nightly" in tag_mapping[old_tag]
+
+                    if "-deps" in image_name:
+                        assert f"{date_tag}-cpu" in tag_mapping[old_tag]
+                        assert f"{date_tag}" in tag_mapping[old_tag]
+                    else:
+                        assert f"{sha_tag}-cpu" in tag_mapping[old_tag]
+                        assert f"{sha_tag}" in tag_mapping[old_tag]
+
+                elif ML_CUDA_VERSION in old_tag:
+                    assert "nightly-gpu" in tag_mapping[old_tag]
+
+                    if "-deps" in image_name:
+                        assert f"{date_tag}-gpu" in tag_mapping[old_tag]
+                    else:
+                        assert f"{sha_tag}-gpu" in tag_mapping[old_tag]
+
+        print(f"These tags will be created for {image_name}: ", tag_mapping)
+
+        # Tag and push all images.
+        for old_tag in tag_mapping.keys():
+            for new_tag in tag_mapping[old_tag]:
+                _tag_and_push(
+                    full_image_name,
+                    old_tag=old_tag,
+                    new_tag=new_tag,
+                    merge_build=merge_build)


 # Push infra here:
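To make the fan-out concrete, here is the full alias expansion for one rayproject/ray variant, with abc123 standing in for the real commit SHA (illustrative value):

    tags = ["nightly-py37-cu112"]
    tags += [t.replace("-cu112", "-gpu") for t in tags]     # ML_CUDA_VERSION alias
    tags += [t.replace("-py37", "") for t in tags]          # default Python dropped
    tags += [t.replace("nightly", "abc123") for t in tags]  # SHA variants
    print(tags)
    # ['nightly-py37-cu112', 'nightly-py37-gpu', 'nightly-cu112', 'nightly-gpu',
    #  'abc123-py37-cu112', 'abc123-py37-gpu', 'abc123-cu112', 'abc123-gpu']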
@@ -409,22 +535,30 @@

 # Build base-deps/ray-deps only on file change, 2 weeks, per release
-# Build ray, ray-ml, autoscaler every time
+# Build ray, ray-ml every time
 # build-docker-images.py --py-versions PY37 --build-type PR --rebuild-all
 MERGE = "MERGE"
 HUMAN = "HUMAN"
 PR = "PR"
 BUILDKITE = "BUILDKITE"
 BUILD_TYPES = [MERGE, HUMAN, PR, BUILDKITE]

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--py-versions",
-        choices=["PY36", "PY37", "PY38", "PY39"],
-        default="PY37",
+        choices=list(PY_MATRIX.keys()),
+        default="py37",
         nargs="*",
         help="Which python versions to build. "
-        "Must be in (PY36, PY37, PY38, PY39)")
+        "Must be in (py36, py37, py38, py39)")
+    parser.add_argument(
+        "--device-types",
+        choices=list(BASE_IMAGES.keys()),
+        default=None,
+        nargs="*",
+        help="Which device types (CPU/CUDA versions) to build images for. "
+        "If not specified, images will be built for all device types.")
     parser.add_argument(
         "--build-type",
         choices=BUILD_TYPES,
@@ -448,26 +582,47 @@ if __name__ == "__main__":
     py_versions = args.py_versions
     py_versions = py_versions if isinstance(py_versions,
                                             list) else [py_versions]
-    for key in set(PY_MATRIX.keys()):
-        if key[1:].upper() not in py_versions:
-            PY_MATRIX.pop(key)
-
-    assert len(PY_MATRIX) == len(
-        py_versions
-    ), f"Length of PY_MATRIX != args {PY_MATRIX} : {args.py_versions}"
-
-    print("Building the following python versions: ", PY_MATRIX)
+    image_types = args.device_types if args.device_types else list(
+        BASE_IMAGES.keys())
+
+    assert set(list(CUDA_FULL.keys()) + ["cpu"]) == set(BASE_IMAGES.keys())
+
+    # Make sure the python images and cuda versions we build here are
+    # consistent with the ones used with fix-latest-docker.sh script.
+    py_version_file = os.path.join(_get_root_dir(), "docker/retag-lambda",
+                                   "python_versions.txt")
+    with open(py_version_file) as f:
+        py_file_versions = f.read().splitlines()
+        assert set(PY_MATRIX.keys()) == set(py_file_versions), \
+            (PY_MATRIX.keys(), py_file_versions)
+
+    cuda_version_file = os.path.join(_get_root_dir(), "docker/retag-lambda",
+                                     "cuda_versions.txt")
+    with open(cuda_version_file) as f:
+        cuda_file_versions = f.read().splitlines()
+        assert set(BASE_IMAGES.keys()) == set(cuda_file_versions + ["cpu"]),\
+            (BASE_IMAGES.keys(), cuda_file_versions + ["cpu"])
+
+    print("Building the following python versions: ",
+          [PY_MATRIX[py_version] for py_version in py_versions])
+    print("Building images for the following devices: ", image_types)
     print("Building base images: ", args.base)

     build_type = args.build_type
     is_buildkite = build_type == BUILDKITE

     if build_type == BUILDKITE:
         if os.environ.get("BUILDKITE_PULL_REQUEST", "") == "false":
             build_type = MERGE
         else:
             build_type = PR

     if build_type == HUMAN:
+        # If manually triggered, request user for branch and SHA value to use.
         _configure_human_version()
-    if (build_type in {HUMAN, MERGE} or is_buildkite
+    if (build_type in {HUMAN, MERGE, BUILDKITE}
             or _check_if_docker_files_modified()):
         DOCKER_CLIENT = docker.from_env()
         is_merge = build_type == MERGE
@ -478,25 +633,31 @@ if __name__ == "__main__":
username, password = _get_docker_creds() username, password = _get_docker_creds()
DOCKER_CLIENT.api.login(username=username, password=password) DOCKER_CLIENT.api.login(username=username, password=password)
copy_wheels(build_type == HUMAN) copy_wheels(build_type == HUMAN)
base_images_built = build_or_pull_base_images(args.base) is_base_images_built = build_or_pull_base_images(
py_versions, image_types, args.base)
if args.only_build_worker_container: if args.only_build_worker_container:
build_ray_worker_container() build_for_all_versions("ray-worker-container", py_versions,
image_types)
# TODO Currently don't push ray_worker_container # TODO Currently don't push ray_worker_container
else: else:
build_ray() # Build Ray Docker images.
build_ray_ml() build_for_all_versions("ray", py_versions, image_types)
if build_type in {MERGE, PR}:
valid_branch = _valid_branch() if ML_CUDA_VERSION in image_types:
if (not valid_branch) and is_merge: # Build Ray ML Docker images only if ML_CUDA_VERSION is
print(f"Invalid Branch found: {_get_branch()}") # specified.
push_and_tag_images(base_images_built, valid_branch prep_ray_ml()
and is_merge) # Only build ML Docker for the ML_CUDA_VERSION
build_for_all_versions(
"ray-ml", py_versions, image_types=[ML_CUDA_VERSION])
if build_type in {MERGE, PR}: if build_type in {MERGE, PR}:
valid_branch = _valid_branch() valid_branch = _valid_branch()
if (not valid_branch) and is_merge: if (not valid_branch) and is_merge:
print(f"Invalid Branch found: {_get_branch()}") print(f"Invalid Branch found: {_get_branch()}")
push_and_tag_images(base_images_built, valid_branch push_and_tag_images(py_versions, image_types,
is_base_images_built, valid_branch
and is_merge) and is_merge)
# TODO(ilr) Re-Enable Push READMEs by using a normal password # TODO(ilr) Re-Enable Push READMEs by using a normal password


@@ -290,7 +290,7 @@ Image releases are `tagged` using the following format:

 - A specific nightly build (uses a SHA from the Github ``master``).

-Each tag has `variants` that add or change functionality:
+Some tags also have `variants` that add or change functionality:

 .. list-table::
    :widths: 16 40
@@ -298,10 +298,12 @@ Each tag has `variants` that add or change functionality:

    * - Variant
      - Description
-   * - -gpu
-     - These are based off of an NVIDIA CUDA image. They require the Nvidia Docker Runtime.
    * - -cpu
      - These are based off of an Ubuntu image.
+   * - -cuXX
+     - These are based off of an NVIDIA CUDA image with the specified CUDA version. They require the Nvidia Docker Runtime.
+   * - -gpu
+     - Aliases to a specific ``-cuXX`` tagged image.
    * - <no tag>
      - Aliases to ``-cpu`` tagged images
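For users, the practical effect of the new scheme is that a concrete CUDA version can be pinned; a sketch using the docker SDK (image tag chosen for illustration):

    import docker

    client = docker.from_env()
    # Pull the CUDA 11.2 variant of the nightly Python 3.8 image; the -gpu
    # tag resolves to the same image, since cu112 is the ML CUDA version.
    client.images.pull("rayproject/ray", tag="nightly-py38-cu112")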


@@ -1,7 +1,7 @@
 # DEPRECATED -- Please use [`rayproject/ray-ml`](https://hub.docker.com/repository/docker/rayproject/ray-ml)

 ## About
 This image used to be the base image for the Ray autoscaler, but it has been replaced by [`rayproject/ray-ml`](https://hub.docker.com/repository/docker/rayproject/ray-ml).
-Please use that instead, *this image will be removed in the near future*.
+Please use that instead, *this image is deprecated*.

 ## Tags


@ -1,6 +1,6 @@
# The base-deps Docker image installs main libraries needed to run Ray # The base-deps Docker image installs main libraries needed to run Ray
# The GPU option is nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04 # The GPU options are NVIDIA CUDA developer images.
ARG BASE_IMAGE="ubuntu:focal" ARG BASE_IMAGE="ubuntu:focal"
FROM ${BASE_IMAGE} FROM ${BASE_IMAGE}
# FROM directive resets ARG # FROM directive resets ARG


@@ -14,7 +14,7 @@ This image has the system-level dependencies for `Ray` and the `Ray Autoscaler`
 * `:DATE` - A specific build.

 ### Suffixes
-* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images


@@ -29,8 +29,6 @@ AWS_ACCESS_KEY_ID=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.AccessK
 AWS_SECRET_ACCESS_KEY=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.SecretAccessKey)
 AWS_SESSION_TOKEN=$(echo "$ASSUME_ROLE_CREDENTIALS" | jq -r .Credentials.SessionToken)

 echo -e "Invoking this lambda!\nView logs at https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups"
 AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN AWS_SECURITY_TOKEN='' aws \
 lambda invoke --function-name DockerTagLatest \


@@ -1,5 +1,5 @@
-ARG GPU=""
-FROM rayproject/base-deps:nightly"$GPU"
+ARG BASE_IMAGE=""
+FROM rayproject/base-deps:nightly"$BASE_IMAGE"
 # If this arg is not "autoscaler" then no autoscaler requirements will be included
 ARG AUTOSCALER="autoscaler"
 ARG WHEEL_PATH
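The renamed BASE_IMAGE arg now carries a full -<py>-<device> tag suffix rather than the old GPU marker; a sketch of how _build_docker_image (above) composes the FROM reference for dependent images:

    py_version, device_tag = "py38", "cu112"    # illustrative values
    base_image = f"-{py_version}-{device_tag}"  # passed as BASE_IMAGE
    print(f"rayproject/base-deps:nightly{base_image}")
    # rayproject/base-deps:nightly-py38-cu112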


@@ -13,7 +13,7 @@ This has the python-level dependencies for `Ray` and the `Ray Autoscaler`. The `
 * `:DATE` - A specific build.

 ### Suffixes
-* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images


@@ -1,5 +1,5 @@
-ARG GPU
-FROM rayproject/ray:nightly"$GPU"
+ARG BASE_IMAGE
+FROM rayproject/ray:nightly"$BASE_IMAGE"
 ARG PYTHON_MINOR_VERSION=7

 # We have to uninstall wrapt this way for Tensorflow compatibility
@@ -8,6 +8,7 @@ COPY requirements_dl.txt ./
 COPY requirements_ml_docker.txt ./
 COPY requirements_rllib.txt ./
 COPY requirements_tune.txt ./requirements_tune.txt
+COPY requirements_upstream.txt ./
 COPY install_atari_roms.sh ./install_atari_roms.sh

 RUN sudo apt-get update \
@@ -23,11 +24,13 @@ RUN sudo apt-get update \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_rllib.txt \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_tune.txt \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \
+    && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_upstream.txt \
     # Remove dataclasses & typing because they are included in Python > 3.6
     && if [ $(python -c 'import sys; print(sys.version_info.minor)') != "6" ]; then \
         $HOME/anaconda3/bin/pip uninstall dataclasses typing -y; fi \
     && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \
     && sudo rm requirements_tune.txt && sudo rm requirements_rllib.txt \
+    && sudo rm requirements_upstream.txt \
     && sudo apt-get clean

 # Make sure tfp is installed correctly and matches tf version.


@@ -11,9 +11,7 @@ This image is an extension of the [`rayproject/ray`](https://hub.docker.com/repo
 * `:SHA` - A specific nightly build.

 ### Suffixes
 * `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
-* `-cpu`- These are based off of an `Ubuntu` image.
-* Tags without a suffix refer to `-cpu` images

 ## Other Images
 * [`rayproject/ray`](https://hub.docker.com/repository/docker/rayproject/ray) - Ray and all of its dependencies.


@@ -1,5 +1,5 @@
-ARG GPU
-FROM rayproject/ray-deps:nightly"$GPU"
+ARG BASE_IMAGE
+FROM rayproject/ray-deps:nightly"$BASE_IMAGE"
 ARG WHEEL_PATH
 ARG FIND_LINKS_PATH=".whl"
 # For Click


@@ -12,7 +12,7 @@ everything needed to get started with running Ray! They work for both local deve
 * `:SHA` - A specific nightly build.

 ### Suffixes
-* `-gpu` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
+* `-cuXXX` - These are based off of an `NVIDIA CUDA` image. They require the [Nvidia Docker Runtime](https://github.com/NVIDIA/nvidia-docker) to be installed on the host for the container to access GPUs.
 * `-cpu`- These are based off of an `Ubuntu` image.
 * Tags without a suffix refer to `-cpu` images


@@ -0,0 +1,5 @@
+cu112
+cu111
+cu110
+cu102
+cu101


@@ -1,4 +1,5 @@
 import json
+import os
 import subprocess

 import boto3
@@ -7,6 +8,10 @@ DOCKER_USER = None
 DOCKER_PASS = None


+def _get_curr_dir():
+    return os.path.dirname(os.path.realpath(__file__))
+
+
 def get_secrets():
     global DOCKER_PASS, DOCKER_USER
     secret_name = "dockerRetagLatestCredentials"
@@ -36,15 +41,30 @@ def retag(repo: str, source: str, destination: str) -> str:
         })


+def parse_versions(version_file):
+    with open(version_file) as f:
+        file_versions = f.read().splitlines()
+    return file_versions
+
+
 def lambda_handler(event, context):
     source_image = event["source_tag"]
     destination_image = event["destination_tag"]
     total_results = []
-    for repo in ["ray", "ray-ml", "autoscaler"]:
+    python_versions = parse_versions(
+        os.path.join(_get_curr_dir(), "python_versions.txt"))
+    cuda_versions = parse_versions(
+        os.path.join(_get_curr_dir(), "cuda_versions.txt"))
+    for repo in ["ray", "ray-ml"]:
         results = []
-        for pyversion in ["py36", "py37", "py38", "py39"]:
+        for pyversion in python_versions:
             source_tag = f"{source_image}-{pyversion}"
             destination_tag = f"{destination_image}-{pyversion}"
+            for cudaversion in cuda_versions:
+                cuda_source_tag = source_tag + f"-{cudaversion}"
+                cuda_destination_tag = destination_tag + f"-{cudaversion}"
+                results.append(
+                    retag(repo, cuda_source_tag, cuda_destination_tag))
             results.append(retag(repo, source_tag, destination_tag))
             results.append(retag(repo, source_tag, destination_tag + "-cpu"))
             results.append(
@@ -54,7 +74,13 @@ def lambda_handler(event, context):

     # Retag images without a python version specified (defaults to py37)
     results = []
-    for repo in ["ray", "ray-ml", "autoscaler", "ray-deps", "base-deps"]:
+    for repo in ["ray", "ray-ml", "ray-deps", "base-deps"]:
+        for cudaversion in cuda_versions:
+            source_tag = f"{source_image}-{cudaversion}"
+            destination_tag = f"{destination_image}-{cudaversion}"
+            results.append(retag(repo, source_tag, destination_tag))
+        # ray:nightly -> ray:1.x
        results.append(retag(repo, source_image, destination_image))
         results.append(retag(repo, source_image, destination_image + "-cpu"))
         results.append(
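A hedged invocation sketch for the updated handler (event keys as read above; the destination tag is illustrative): this retags every python/cuda variant of ray and ray-ml, plus the version-less and -cpu/-cuXXX aliases of all four repositories.

    event = {"source_tag": "nightly", "destination_tag": "1.9.0"}
    lambda_handler(event, context=None)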


@@ -0,0 +1,4 @@
+py36
+py37
+py38
+py39