mirror of https://github.com/vale981/ray · synced 2025-03-06 02:21:39 -05:00
[docker] Support non-root container (#11407)
commit 9920933e31 · parent 62c7ab5182
10 changed files with 71 additions and 45 deletions
@@ -10,8 +10,23 @@ ENV TZ=America/Los_Angeles
 ENV PATH "/root/anaconda3/bin:$PATH"
 ARG DEBIAN_FRONTEND=noninteractive
 ARG PYTHON_VERSION=3.7.7
-RUN apt-get update -y && apt-get upgrade -y \
-    && apt-get install -y \
+
+ARG RAY_UID=1000
+ARG RAY_GID=100
+
+RUN apt-get update -y \
+    && apt-get install -y sudo tzdata \
+    && useradd -ms /bin/bash -d /home/ray ray --uid $RAY_UID --gid $RAY_GID \
+    && usermod -aG sudo ray \
+    && echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+USER $RAY_UID
+ENV HOME=/home/ray
+
+RUN sudo apt-get update -y && sudo apt-get upgrade -y \
+    && sudo apt-get install -y \
     git \
     wget \
     cmake \
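The hunk above (from what appears to be the base image's Dockerfile) creates a dedicated `ray` user before anything else is installed: `RAY_GID=100` reuses the stock `users` group that Debian/Ubuntu images already ship (so no `groupadd` is needed), and the passwordless-sudo line keeps the later `apt-get` layers working once `USER $RAY_UID` takes effect. A minimal sketch for checking the result locally — the image tag is hypothetical, built by you from this Dockerfile, and a local docker daemon is assumed:

```python
# Sketch: verify a locally built image runs as the non-root user created
# above. The tag "ray-nonroot-test" is hypothetical; build it yourself from
# the Dockerfile in this diff.
import subprocess


def container_identity(image: str) -> tuple:
    """Return (uid, gid) reported by a throwaway container."""
    uid = subprocess.check_output(["docker", "run", "--rm", image, "id", "-u"])
    gid = subprocess.check_output(["docker", "run", "--rm", image, "id", "-g"])
    return int(uid), int(gid)


if __name__ == "__main__":
    uid, gid = container_identity("ray-nonroot-test")
    # RAY_UID/RAY_GID defaults from the Dockerfile above.
    assert (uid, gid) == (1000, 100), (uid, gid)
```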
@@ -26,7 +41,7 @@ RUN apt-get update -y && apt-get upgrade -y \
     -O /tmp/miniconda.sh \
     && /bin/bash /tmp/miniconda.sh -b -u -p $HOME/anaconda3 \
    && $HOME/anaconda3/bin/conda init \
-    && echo 'export PATH=$HOME/anaconda3/bin:$PATH' > /etc/profile.d/conda.sh \
+    && echo 'export PATH=$HOME/anaconda3/bin:$PATH' >> /home/ray/.bashrc \
     && rm /tmp/miniconda.sh \
     && $HOME/anaconda3/bin/conda install -y \
     libgcc python=$PYTHON_VERSION \
@@ -42,15 +57,15 @@ RUN apt-get update -y && apt-get upgrade -y \
     # AttributeError: 'numpy.ufunc' object has no attribute '__module__'
     && $HOME/anaconda3/bin/pip uninstall -y dask \
     # We install cmake temporarily to get psutil
-    && apt-get autoremove -y cmake \
+    && sudo apt-get autoremove -y cmake \
     # Either install kubectl or remove wget
     && (if [ "$AUTOSCALER" = "autoscaler" ]; \
-    then wget -O - -q https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \
-    && touch /etc/apt/sources.list.d/kubernetes.list \
-    && echo "deb http://apt.kubernetes.io/ kubernetes-xenial main" | tee -a /etc/apt/sources.list.d/kubernetes.list \
-    && apt-get update \
-    && apt-get install kubectl; \
-    else apt-get autoremove -y wget; \
+    then wget -O - -q https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - \
+    && sudo touch /etc/apt/sources.list.d/kubernetes.list \
+    && echo "deb http://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee -a /etc/apt/sources.list.d/kubernetes.list \
+    && sudo apt-get update \
+    && sudo apt-get install kubectl; \
+    else sudo apt-get autoremove -y wget; \
     fi;) \
-    && rm -rf /var/lib/apt/lists/* \
-    && apt-get clean
+    && sudo rm -rf /var/lib/apt/lists/* \
+    && sudo apt-get clean
@@ -17,4 +17,4 @@ RUN $HOME/anaconda3/bin/pip --no-cache-dir install $(basename $WHEEL_PATH)[all]
     "azure-mgmt-compute==12.0.0" \
     "azure-mgmt-msi==1.0.0" \
     "azure-mgmt-network==10.1.0"; fi) \
-    && $HOME/anaconda3/bin/pip uninstall ray -y && rm $(basename $WHEEL_PATH)
+    && $HOME/anaconda3/bin/pip uninstall ray -y && sudo rm $(basename $WHEEL_PATH)
@@ -7,15 +7,15 @@ COPY requirements_ml_docker.txt ./
 COPY requirements_rllib.txt ./
 COPY requirements_tune.txt ./
 
-RUN apt-get update \
-    && apt-get install -y gcc \
+RUN sudo apt-get update \
+    && sudo apt-get install -y gcc \
     cmake \
     libgtk2.0-dev \
     zlib1g-dev \
     libgl1-mesa-dev \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements.txt \
     && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_ml_docker.txt \
-    && rm requirements.txt && rm requirements_ml_docker.txt \
-    && apt-get remove cmake gcc -y \
-    && apt-get clean
+    && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \
+    && sudo apt-get remove cmake gcc -y \
+    && sudo apt-get clean
 
@@ -6,4 +6,4 @@ ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 COPY $WHEEL_PATH .
 RUN $HOME/anaconda3/bin/pip --no-cache-dir install `basename $WHEEL_PATH`[all] \
-    && rm `basename $WHEEL_PATH`
+    && sudo rm `basename $WHEEL_PATH`
@@ -25,6 +25,8 @@ from ray.autoscaler._private.subprocess_output_util import (
 from ray.autoscaler._private.cli_logger import cli_logger, cf
 from ray.util.debug import log_once
 
+from ray.autoscaler._private.constants import RAY_HOME
+
 logger = logging.getLogger(__name__)
 
 # How long to wait for a node to start, in seconds
@@ -190,7 +192,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
             logger.warning("'rsync_filter' detected but is currently "
                            "unsupported for k8s.")
         if target.startswith("~"):
-            target = "/root" + target[1:]
+            target = RAY_HOME + target[1:]
 
         try:
             flags = "-aqz" if is_rsync_silent() else "-avz"
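All four rsync paths in `KubernetesCommandRunner` (this hunk and the three below) get the same one-line change: a `~`-relative target is now anchored at `RAY_HOME` rather than `/root`. Pulled out of context, the rewrite is just this — a standalone sketch, with `RAY_HOME` taken from the constant added further down in this commit:

```python
# The tilde rewrite used in the surrounding hunks, as a standalone function.
RAY_HOME = "/home/ray"


def rewrite_target(target: str) -> str:
    """Anchor a '~'-relative rsync target at the container user's home."""
    if target.startswith("~"):
        return RAY_HOME + target[1:]
    return target


assert rewrite_target("~/ray_bootstrap_config.yaml") == \
    "/home/ray/ray_bootstrap_config.yaml"
assert rewrite_target("/etc/ray.conf") == "/etc/ray.conf"  # absolute: untouched
```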
@@ -206,7 +208,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
                 "rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
                 UserWarning)
             if target.startswith("~"):
-                target = "/root" + target[1:]
+                target = RAY_HOME + target[1:]
 
             self.process_runner.check_call(self.kubectl + [
                 "cp", source, "{}/{}:{}".format(self.namespace, self.node_id,
@@ -215,7 +217,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
 
     def run_rsync_down(self, source, target, options=None):
         if target.startswith("~"):
-            target = "/root" + target[1:]
+            target = RAY_HOME + target[1:]
 
         try:
             flags = "-aqz" if is_rsync_silent() else "-avz"
@@ -231,7 +233,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
                 "rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
                 UserWarning)
             if target.startswith("~"):
-                target = "/root" + target[1:]
+                target = RAY_HOME + target[1:]
 
             self.process_runner.check_call(self.kubectl + [
                 "cp", "{}/{}:{}".format(self.namespace, self.node_id, source),
@@ -699,6 +701,16 @@ class DockerCommandRunner(CommandRunnerInterface):
                 cleaned_bind_mounts.pop(mnt, None)
 
         if not self._check_container_status():
+            # Get home directory
+            image_env = self.ssh_command_runner.run(
+                "docker inspect -f '{{json .Config.Env}}' " + image,
+                with_output=True).decode().strip()
+            home_directory = "/root"
+            for env_var in json.loads(image_env):
+                if env_var.startswith("HOME="):
+                    home_directory = env_var.split("HOME=")[1]
+                    break
+
             start_command = docker_start_cmds(
                 self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
                 self.container_name,
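`DockerCommandRunner` no longer assumes the container user is root: before starting the container it reads `HOME` out of the image's environment via `docker inspect`. The same parsing logic as a self-contained function you can run against a local image — the diff itself executes the command on the remote host through `self.ssh_command_runner.run`, which this sketch replaces with a local `subprocess` call:

```python
import json
import subprocess


def image_home_directory(image: str) -> str:
    """Mirror of the HOME detection above, against a local docker daemon."""
    image_env = subprocess.check_output(
        ["docker", "inspect", "-f", "{{json .Config.Env}}", image])
    home_directory = "/root"  # fallback when the image does not set HOME
    for env_var in json.loads(image_env.decode().strip()):
        if env_var.startswith("HOME="):
            home_directory = env_var.split("HOME=")[1]
            break
    return home_directory
```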
@@ -706,7 +718,7 @@ class DockerCommandRunner(CommandRunnerInterface):
                     "run_options", []) + self.docker_config.get(
                         f"{'head' if as_head else 'worker'}_run_options",
                         []) + self._configure_runtime(),
-                self.ssh_command_runner.cluster_name)
+                self.ssh_command_runner.cluster_name, home_directory)
             self.run(start_command, run_env="host")
         else:
             running_image = self.run(
@@ -41,3 +41,6 @@ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE = 1000
 BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
 # Max number of retries to create an EC2 node (retry different subnet)
 BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5)
+
+# ray home path in the container image
+RAY_HOME = "/home/ray"
@@ -65,15 +65,15 @@ def check_docker_image(cname):
 
 
 def docker_start_cmds(user, image, mount_dict, container_name, user_options,
-                      cluster_name):
+                      cluster_name, home_directory):
     # Imported here due to circular dependency.
     from ray.autoscaler.sdk import get_docker_host_mount_location
     docker_mount_prefix = get_docker_host_mount_location(cluster_name)
     mount = {f"{docker_mount_prefix}/{dst}": dst for dst in mount_dict}
 
-    # TODO(ilr) Move away from defaulting to /root/
     mount_flags = " ".join([
-        "-v {src}:{dest}".format(src=k, dest=v.replace("~/", "/root/"))
+        "-v {src}:{dest}".format(
+            src=k, dest=v.replace("~/", home_directory + "/"))
         for k, v in mount.items()
     ])
 
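With `home_directory` threaded through to `docker_start_cmds`, bind-mount destinations written as `~/...` resolve under the image's actual home rather than a hard-coded `/root/`, which is what retires the `TODO(ilr)` above. A sketch of the effect, with illustrative paths (the real source prefix comes from `get_docker_host_mount_location(cluster_name)`):

```python
# Illustrative values only; not ray's actual mount layout.
def mount_flags(mount: dict, home_directory: str) -> str:
    return " ".join([
        "-v {src}:{dest}".format(
            src=k, dest=v.replace("~/", home_directory + "/"))
        for k, v in mount.items()
    ])


flags = mount_flags({"/tmp/ray_tmp_mount/cluster/~/mnt": "~/mnt"}, "/home/ray")
assert flags == "-v /tmp/ray_tmp_mount/cluster/~/mnt:/home/ray/mnt"
```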
@@ -17,21 +17,3 @@ class StaroidCommandRunner(KubernetesCommandRunner):
         if kube_api_server is not None:
             self.kubectl.extend(["--server", kube_api_server])
             os.environ["KUBE_API_SERVER"] = kube_api_server
-
-    def _rewrite_target_home_dir(self, target):
-        # Staroid forces containers to run non-root permission. Ray docker
-        # image does not have a support for non-root user at the moment.
-        # Use /tmp/ray as a home directory until docker image supports
-        # non-root user.
-
-        if target.startswith("~/"):
-            return "/home/ray" + target[1:]
-        return target
-
-    def run_rsync_up(self, source, target, options=None):
-        target = self._rewrite_target_home_dir(target)
-        super().run_rsync_up(source, target, options)
-
-    def run_rsync_down(self, source, target, options=None):
-        target = self._rewrite_target_home_dir(target)
-        super().run_rsync_down(source, target, options)
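The Staroid-specific workaround can be deleted because the base `KubernetesCommandRunner` now performs the same rewrite via `RAY_HOME`. A quick check that the two behaviors agree on typical targets (not a full equivalence proof; a bare `~` without a trailing slash is handled only by the new code):

```python
RAY_HOME = "/home/ray"


def base_rewrite(target):  # new behavior in KubernetesCommandRunner
    return RAY_HOME + target[1:] if target.startswith("~") else target


def staroid_rewrite(target):  # the helper deleted above
    return "/home/ray" + target[1:] if target.startswith("~/") else target


for t in ("~/job.yaml", "~/a/b", "/absolute/path"):
    assert base_rewrite(t) == staroid_rewrite(t)
```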
@@ -390,6 +390,7 @@ class AutoscalingTest(unittest.TestCase):
         # Two initial calls to docker cp, one before run, two final calls to cp
         runner.respond_to_call(".State.Running",
                                ["false", "false", "false", "true", "true"])
+        runner.respond_to_call("json .Config.Env", ["[]"])
         commands.get_or_create_head_node(
             SMALL_CLUSTER,
             config_path,
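Every test that boots a (mock) container now needs one extra canned response, because `DockerCommandRunner` issues an additional `docker inspect -f '{{json .Config.Env}}'` call per node; `["[]"]` means "no env vars", so HOME falls back to `/root`. A hypothetical stand-in for `MockProcessRunner` (whose real implementation lives in ray's test utilities) showing the shape of the contract:

```python
# Hypothetical mock illustrating respond_to_call(); not ray's actual
# MockProcessRunner, just the pattern the hunks below rely on.
class CannedProcessRunner:
    def __init__(self):
        self._responses = {}  # substring pattern -> queue of outputs

    def respond_to_call(self, pattern, outputs):
        self._responses.setdefault(pattern, []).extend(outputs)

    def check_output(self, cmd):
        joined = " ".join(cmd)
        for pattern, outputs in self._responses.items():
            if pattern in joined and outputs:
                return outputs.pop(0).encode()
        return b""


runner = CannedProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]"])
out = runner.check_output(
    ["docker", "inspect", "-f", "{{json .Config.Env}}", "img"])
assert out == b"[]"
```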
@@ -968,6 +969,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(SMALL_CLUSTER)
         self.provider = MockProvider()
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
         autoscaler = StandardAutoscaler(
             config_path,
             LoadMetrics(),
@@ -1006,6 +1008,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(SMALL_CLUSTER)
         self.provider = MockProvider()
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
         autoscaler = StandardAutoscaler(
             config_path,
             LoadMetrics(),
@@ -1159,6 +1162,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(SMALL_CLUSTER)
         self.provider = MockProvider()
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
         lm = LoadMetrics()
         autoscaler = StandardAutoscaler(
             config_path,
@@ -1227,6 +1231,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(config)
         self.provider = MockProvider(cache_stopped=False)
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
         lm = LoadMetrics()
         autoscaler = StandardAutoscaler(
             config_path,
@@ -1269,6 +1274,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(config)
         self.provider = MockProvider(cache_stopped=True)
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(3)])
         lm = LoadMetrics()
         autoscaler = StandardAutoscaler(
             config_path,
@@ -1335,6 +1341,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(config)
         self.provider = MockProvider(cache_stopped=True)
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(13)])
         lm = LoadMetrics()
         autoscaler = StandardAutoscaler(
             config_path,
@@ -1385,6 +1392,7 @@ class AutoscalingTest(unittest.TestCase):
         config["max_workers"] = 2
         config_path = self.write_config(config)
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
         lm = LoadMetrics()
         autoscaler = StandardAutoscaler(
             config_path,
@@ -1438,6 +1446,7 @@ class AutoscalingTest(unittest.TestCase):
         config["max_workers"] = 2
         config_path = self.write_config(config)
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
         lm = LoadMetrics()
         autoscaler = StandardAutoscaler(
             config_path,
@@ -1484,6 +1493,7 @@ class AutoscalingTest(unittest.TestCase):
         from ray.autoscaler._private import util
         util._hash_cache = {}
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
         lm = LoadMetrics()
         autoscaler = StandardAutoscaler(
             config_path,
@@ -1002,6 +1002,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(MULTI_WORKER_CLUSTER)
         self.provider = MockProvider()
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]"])
         get_or_create_head_node(
             MULTI_WORKER_CLUSTER,
             config_path,
@@ -1272,6 +1273,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(config)
         self.provider = MockProvider()
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
         autoscaler = StandardAutoscaler(
             config_path,
             LoadMetrics(),
@@ -1353,6 +1355,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(config)
         self.provider = MockProvider()
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
         autoscaler = StandardAutoscaler(
             config_path,
             LoadMetrics(),
@@ -1405,6 +1408,7 @@ class AutoscalingTest(unittest.TestCase):
         config_path = self.write_config(config)
         self.provider = MockProvider()
         runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
         autoscaler = StandardAutoscaler(
             config_path,
             LoadMetrics(),