mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
[kuberay][autoscaler] Improve CPU, GPU, and memory detection. (#26219)
This PR improves the autoscaler's resource detection logic
This commit is contained in:
parent
34d1e580cb
commit
7d3ceb222c
4 changed files with 110 additions and 59 deletions
|
@ -1,10 +1,10 @@
|
|||
import decimal
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
from contextlib import suppress
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import kubernetes
|
||||
import requests
|
||||
|
||||
from ray.autoscaler._private.constants import (
|
||||
|
@ -29,8 +29,6 @@ RAYCLUSTER_FETCH_RETRY_S = 5
|
|||
# Used as the name of the "head node type" by the autoscaler.
|
||||
_HEAD_GROUP_NAME = "head-group"
|
||||
|
||||
_GPU_WARNING_LOGGED = False
|
||||
|
||||
|
||||
class AutoscalingConfigProducer:
|
||||
"""Produces an autoscaling config by reading data from the RayCluster CR.
|
||||
|
@ -259,15 +257,14 @@ def _get_num_cpus(
|
|||
k8s_resource_limits: Dict[str, str],
|
||||
group_name: str,
|
||||
) -> int:
|
||||
if "num_cpus" in ray_start_params:
|
||||
return int(ray_start_params["num_cpus"])
|
||||
"""Get CPU annotation from ray_start_params or k8s_resource_limits,
|
||||
with priority for ray_start_params.
|
||||
"""
|
||||
if "num-cpus" in ray_start_params:
|
||||
return int(ray_start_params["num-cpus"])
|
||||
elif "cpu" in k8s_resource_limits:
|
||||
cpu_str = str(k8s_resource_limits["cpu"])
|
||||
if cpu_str[-1] == "m":
|
||||
# For example, '500m' rounds up to 1.
|
||||
return math.ceil(int(cpu_str[:-1]) / 1000)
|
||||
else:
|
||||
return int(cpu_str)
|
||||
cpu_quantity: str = k8s_resource_limits["cpu"]
|
||||
return _round_up_k8s_quantity(cpu_quantity)
|
||||
else:
|
||||
# Getting the number of CPUs is important, so raise an error if we can't do it.
|
||||
raise ValueError(
|
||||
|
@ -280,13 +277,14 @@ def _get_num_cpus(
|
|||
def _get_memory(
|
||||
ray_start_params: Dict[str, str], k8s_resource_limits: Dict[str, Any]
|
||||
) -> Optional[int]:
|
||||
"""Get memory resource annotation from ray_start_params, if it is set there.
|
||||
|
||||
TODO, maybe: Consider container resource limits as in
|
||||
https://github.com/ray-project/ray/pull/14567/files
|
||||
"""Get memory resource annotation from ray_start_params or k8s_resource_limits,
|
||||
with priority for ray_start_params.
|
||||
"""
|
||||
if "memory" in ray_start_params:
|
||||
return int(ray_start_params["memory"])
|
||||
elif "memory" in k8s_resource_limits:
|
||||
memory_quantity: str = k8s_resource_limits["memory"]
|
||||
return _round_up_k8s_quantity(memory_quantity)
|
||||
return None
|
||||
|
||||
|
||||
|
@ -295,34 +293,44 @@ def _get_num_gpus(
|
|||
k8s_resource_limits: Dict[str, Any],
|
||||
group_name: str,
|
||||
) -> Optional[int]:
|
||||
"""Read the number of GPUs from the Ray start params.
|
||||
|
||||
Potential TODO: Read GPU info from the container spec, here and in the
|
||||
Ray Operator.
|
||||
"""Get GPU annotation from ray_start_params or k8s_resource_limits,
|
||||
with priority for ray_start_params.
|
||||
"""
|
||||
|
||||
if "num-gpus" in ray_start_params:
|
||||
return int(ray_start_params["num-gpus"])
|
||||
|
||||
# Issue a warning if GPUs are present in the container spec but not in the
|
||||
# ray start params.
|
||||
# TODO: Consider reading GPU info from container spec.
|
||||
else:
|
||||
for key in k8s_resource_limits:
|
||||
global _GPU_WARNING_LOGGED
|
||||
if "gpu" in key and not _GPU_WARNING_LOGGED:
|
||||
with suppress(Exception):
|
||||
if int(k8s_resource_limits[key]) > 0:
|
||||
logger.warning(
|
||||
f"Detected GPUs in container resources for group {group_name}."
|
||||
"To ensure Ray and the autoscaler are aware of the GPUs,"
|
||||
" set the `--num-gpus` rayStartParam."
|
||||
)
|
||||
_GPU_WARNING_LOGGED = True
|
||||
break
|
||||
|
||||
# e.g. nvidia.com/gpu
|
||||
if key.endswith("gpu"):
|
||||
# Typically, this is a string representing an integer, e.g. "1".
|
||||
gpu_resource_quantity = k8s_resource_limits[key]
|
||||
# Convert to int, making no assumptions on the gpu_resource_quantity,
|
||||
# besides that it's valid as a K8s resource quantity.
|
||||
num_gpus = _round_up_k8s_quantity(gpu_resource_quantity)
|
||||
if num_gpus > 0:
|
||||
# Only one GPU type supported for now, break out on first
|
||||
# "/gpu" match.
|
||||
return num_gpus
|
||||
return None
|
||||
|
||||
|
||||
def _round_up_k8s_quantity(quantity: str) -> int:
|
||||
"""Rounds a Kubernetes resource quantity up to the nearest integer.
|
||||
|
||||
Args:
|
||||
quantity: Resource quantity as a string in the canonical K8s form.
|
||||
|
||||
Returns:
|
||||
The quantity, rounded up, as an integer.
|
||||
"""
|
||||
resource_decimal: decimal.Decimal = kubernetes.utils.quantity.parse_quantity(
|
||||
quantity
|
||||
)
|
||||
rounded = resource_decimal.to_integral_value(rounding=decimal.ROUND_UP)
|
||||
return int(rounded)
|
||||
|
||||
|
||||
def _get_custom_resources(
|
||||
ray_start_params: Dict[str, Any], group_name: str
|
||||
) -> Dict[str, int]:
|
||||
|
|
|
@ -71,7 +71,7 @@ def _setup_logging() -> None:
|
|||
filename=ray_constants.MONITOR_LOG_FILE_NAME, # monitor.log
|
||||
max_bytes=ray_constants.LOGGING_ROTATE_BYTES,
|
||||
backup_count=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
|
||||
logger_name="ray", # Root of the logging hierachy for Ray code.
|
||||
logger_name="ray", # Root of the logging hierarchy for Ray code.
|
||||
)
|
||||
# Logs will also be written to the container's stdout.
|
||||
# The stdout handler was set up in the cli entry point.
|
||||
|
|
|
@ -41,7 +41,6 @@ from ray.autoscaler._private.commands import (
|
|||
)
|
||||
from ray.autoscaler._private.constants import RAY_PROCESSES
|
||||
from ray.autoscaler._private.fake_multi_node.node_provider import FAKE_HEAD_NODE_ID
|
||||
from ray.autoscaler._private.kuberay.run_autoscaler import run_kuberay_autoscaler
|
||||
from ray.dashboard.modules.job.cli import job_cli_group
|
||||
from ray.experimental.state.api import get_log, list_logs
|
||||
from ray.experimental.state.common import DEFAULT_RPC_TIMEOUT, DEFAULT_LOG_LIMIT
|
||||
|
@ -2292,6 +2291,10 @@ def kuberay_autoscaler(cluster_name: str, cluster_namespace: str) -> None:
|
|||
KubeRay cluster configs.
|
||||
`ray kuberay-autoscaler` is NOT a public CLI.
|
||||
"""
|
||||
# Delay import to avoid introducing Ray core dependency on the Python Kubernetes
|
||||
# client.
|
||||
from ray.autoscaler._private.kuberay.run_autoscaler import run_kuberay_autoscaler
|
||||
|
||||
run_kuberay_autoscaler(cluster_name, cluster_namespace)
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import copy
|
||||
from pathlib import Path
|
||||
import requests
|
||||
from typing import Any, Dict, Optional
|
||||
|
@ -10,20 +11,30 @@ import yaml
|
|||
from ray.autoscaler._private.kuberay.autoscaling_config import (
|
||||
_derive_autoscaling_config_from_ray_cr,
|
||||
AutoscalingConfigProducer,
|
||||
_round_up_k8s_quantity,
|
||||
)
|
||||
|
||||
AUTOSCALING_CONFIG_MODULE_PATH = "ray.autoscaler._private.kuberay.autoscaling_config"
|
||||
|
||||
|
||||
def _get_basic_ray_cr() -> dict:
|
||||
"""Returns the example Ray CR included in the Ray documentation."""
|
||||
"""Returns the example Ray CR included in the Ray documentation,
|
||||
modified to include a GPU worker group.
|
||||
"""
|
||||
cr_path = str(
|
||||
Path(__file__).resolve().parents[2]
|
||||
/ "autoscaler"
|
||||
/ "kuberay"
|
||||
/ "ray-cluster.complete.yaml"
|
||||
)
|
||||
return yaml.safe_load(open(cr_path).read())
|
||||
config = yaml.safe_load(open(cr_path).read())
|
||||
gpu_group = copy.deepcopy(config["spec"]["workerGroupSpecs"][0])
|
||||
gpu_group["groupName"] = "gpu-group"
|
||||
gpu_group["template"]["spec"]["containers"][0]["resources"]["limits"].setdefault(
|
||||
"nvidia.com/gpu", 3
|
||||
)
|
||||
config["spec"]["workerGroupSpecs"].append(gpu_group)
|
||||
return config
|
||||
|
||||
|
||||
def _get_basic_autoscaling_config() -> dict:
|
||||
|
@ -44,6 +55,7 @@ def _get_basic_autoscaling_config() -> dict:
|
|||
"node_config": {},
|
||||
"resources": {
|
||||
"CPU": 1,
|
||||
"memory": 1000000000,
|
||||
"Custom1": 1,
|
||||
"Custom2": 5,
|
||||
},
|
||||
|
@ -54,10 +66,24 @@ def _get_basic_autoscaling_config() -> dict:
|
|||
"node_config": {},
|
||||
"resources": {
|
||||
"CPU": 1,
|
||||
"memory": 536870912,
|
||||
"Custom2": 5,
|
||||
"Custom3": 1,
|
||||
},
|
||||
},
|
||||
# Same as "small-group" with a GPU entry added.
|
||||
"gpu-group": {
|
||||
"max_workers": 300,
|
||||
"min_workers": 1,
|
||||
"node_config": {},
|
||||
"resources": {
|
||||
"CPU": 1,
|
||||
"memory": 536870912,
|
||||
"Custom2": 5,
|
||||
"Custom3": 1,
|
||||
"GPU": 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
"auth": {},
|
||||
"cluster_synced_files": [],
|
||||
|
@ -69,7 +95,7 @@ def _get_basic_autoscaling_config() -> dict:
|
|||
"head_start_ray_commands": [],
|
||||
"idle_timeout_minutes": 5,
|
||||
"initialization_commands": [],
|
||||
"max_workers": 300,
|
||||
"max_workers": 600,
|
||||
"setup_commands": [],
|
||||
"upscaling_speed": 1,
|
||||
"worker_nodes": {},
|
||||
|
@ -99,19 +125,25 @@ def _get_no_cpu_error() -> str:
|
|||
)
|
||||
|
||||
|
||||
def _get_ray_cr_memory_and_gpu() -> dict:
|
||||
"""CR with memory and gpu rayStartParams."""
|
||||
def _get_ray_cr_with_overrides() -> dict:
|
||||
"""CR with memory, cpu, and gpu overrides from rayStartParams."""
|
||||
cr = _get_basic_ray_cr()
|
||||
cr["spec"]["workerGroupSpecs"][0]["rayStartParams"]["memory"] = "300000000"
|
||||
cr["spec"]["workerGroupSpecs"][0]["rayStartParams"]["num-gpus"] = "1"
|
||||
# num-gpus rayStartParam with no gpus in container limits
|
||||
cr["spec"]["workerGroupSpecs"][0]["rayStartParams"]["num-gpus"] = "100"
|
||||
# num-gpus rayStartParam overriding gpus in container limits
|
||||
cr["spec"]["workerGroupSpecs"][1]["rayStartParams"]["num-gpus"] = "100"
|
||||
cr["spec"]["workerGroupSpecs"][0]["rayStartParams"]["num-cpus"] = "100"
|
||||
return cr
|
||||
|
||||
|
||||
def _get_autoscaling_config_memory_and_gpu() -> dict:
|
||||
def _get_autoscaling_config_with_overrides() -> dict:
|
||||
"""Autoscaling config with memory, cpu, and gpu overrides."""
|
||||
config = _get_basic_autoscaling_config()
|
||||
config["available_node_types"]["small-group"]["resources"]["memory"] = 300000000
|
||||
config["available_node_types"]["small-group"]["resources"]["GPU"] = 1
|
||||
config["available_node_types"]["small-group"]["resources"]["GPU"] = 100
|
||||
config["available_node_types"]["small-group"]["resources"]["CPU"] = 100
|
||||
config["available_node_types"]["gpu-group"]["resources"]["GPU"] = 100
|
||||
return config
|
||||
|
||||
|
||||
|
@ -151,6 +183,21 @@ def _get_autoscaling_config_with_options() -> dict:
|
|||
return config
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input,output",
|
||||
[
|
||||
# There's no particular discipline to these test cases.
|
||||
("100m", 1),
|
||||
("15001m", 16),
|
||||
("2", 2),
|
||||
("100Mi", 104857600),
|
||||
("1G", 1000000000),
|
||||
],
|
||||
)
|
||||
def test_resource_quantity(input: str, output: int):
|
||||
assert _round_up_k8s_quantity(input) == output, output
|
||||
|
||||
|
||||
PARAM_ARGS = ",".join(
|
||||
[
|
||||
"ray_cr_in",
|
||||
|
@ -182,20 +229,12 @@ TEST_DATA = (
|
|||
id="no-cpu-error",
|
||||
),
|
||||
pytest.param(
|
||||
_get_ray_cr_memory_and_gpu(),
|
||||
_get_autoscaling_config_memory_and_gpu(),
|
||||
_get_ray_cr_with_overrides(),
|
||||
_get_autoscaling_config_with_overrides(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
id="memory-and-gpu",
|
||||
),
|
||||
pytest.param(
|
||||
_get_ray_cr_missing_gpu_arg(),
|
||||
_get_basic_autoscaling_config(),
|
||||
None,
|
||||
None,
|
||||
_get_gpu_complaint(),
|
||||
id="gpu-complaint",
|
||||
id="overrides",
|
||||
),
|
||||
pytest.param(
|
||||
_get_ray_cr_with_autoscaler_options(),
|
||||
|
@ -239,7 +278,8 @@ def test_cr_image_consistency():
|
|||
cr = _get_basic_ray_cr()
|
||||
|
||||
group_specs = [cr["spec"]["headGroupSpec"]] + cr["spec"]["workerGroupSpecs"]
|
||||
assert len(group_specs) == 2
|
||||
# Head, CPU group, GPU group.
|
||||
assert len(group_specs) == 3
|
||||
|
||||
ray_containers = [
|
||||
group_spec["template"]["spec"]["containers"][0] for group_spec in group_specs
|
||||
|
|
Loading…
Add table
Reference in a new issue