[Autoscaler][Local Node Provider] Log a warning if max_workers < len(worker_ips) (#24635)

Logs a warning when a user sets `max_workers` for the local node provider to a value less than the number of available worker IPs.

Also removes the defaults of 0 for `min_workers` and `max_workers` from the example configs, to help prevent users from inadvertently setting `max_workers=0` again.
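For context, a minimal sketch of the kind of local provider config that now triggers the warning (the cluster name and IP addresses here are made-up illustrations): two worker IPs are listed, but max_workers is 0, so no worker node will ever join.

cluster_name: example
max_workers: 0
provider:
    type: local
    head_ip: 192.168.0.1
    worker_ips: [192.168.0.2, 192.168.0.3]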
Dmitri Gekhtman 2022-05-10 10:03:29 -07:00 committed by GitHub
parent 4c1f27118a
commit 29eebdfef2
4 changed files with 39 additions and 6 deletions


@@ -100,6 +100,13 @@ def prepare_manual(config: Dict[str, Any]) -> Dict[str, Any]:
    else:
        node_type["max_workers"] = max_workers
    if max_workers < num_ips:
        cli_logger.warning(
            f"The value of `max_workers` supplied ({max_workers}) is less"
            f" than the number of available worker ips ({num_ips})."
            f" At most {max_workers} Ray worker nodes will connect to the cluster."
        )
    return config
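Outside of prepare_manual, the new guard boils down to the following self-contained sketch (check_max_workers and its warn parameter are hypothetical stand-ins for the surrounding prepare_manual context and cli_logger.warning):

def check_max_workers(max_workers, worker_ips, warn=print):
    # Warn when the configured cap is below the number of listed IPs,
    # since the extra machines can never join the cluster.
    num_ips = len(worker_ips)
    if max_workers < num_ips:
        warn(
            f"The value of `max_workers` supplied ({max_workers}) is less"
            f" than the number of available worker ips ({num_ips})."
            f" At most {max_workers} Ray worker nodes will connect to the cluster."
        )

For example, check_max_workers(0, ["192.168.0.2", "192.168.0.3"]) prints the same message the autoscaler now logs.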


@@ -1,11 +1,7 @@
cluster_name: default
min_workers: 0
max_workers: 0
docker:
    image: ""
    container_name: ""
upscaling_speed: 1.0
idle_timeout_minutes: 5
provider:
    type: local
    head_ip: YOUR_HEAD_NODE_HOSTNAME


@@ -45,12 +45,14 @@ auth:
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
# Typically, min_workers == max_workers == len(worker_ips).
min_workers: 0
# This field is optional.
min_workers: TYPICALLY_THE_NUMBER_OF_WORKER_IPS
# The maximum number of worker nodes to launch in addition to the head node.
# This takes precedence over min_workers.
# Typically, min_workers == max_workers == len(worker_ips).
max_workers: 0
# This field is optional.
max_workers: TYPICALLY_THE_NUMBER_OF_WORKER_IPS
# The default behavior for manually managed clusters is
# min_workers == max_workers == len(worker_ips),
# meaning that Ray is started on all available nodes of the cluster.
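To make the placeholders concrete: for a manually managed cluster with three worker machines, these fields would typically be filled in as follows (the IP addresses are illustrative):

min_workers: 3
max_workers: 3
provider:
    type: local
    head_ip: 192.168.0.1
    worker_ips: [192.168.0.2, 192.168.0.3, 192.168.0.4]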


@@ -1,5 +1,6 @@
import jsonschema
import logging
import mock
import os
import sys
import tempfile
@@ -262,6 +263,33 @@ class AutoscalingConfigTest(unittest.TestCase):
        # Check that worker config numbers were clipped to 3.
        assert prepared_config == expected_prepared

        not_enough_workers_config = copy.deepcopy(base_config)
        # Max workers is less than the three available ips.
        # The user has probably made an error. Make sure we log a warning.
        not_enough_workers_config["max_workers"] = 0
        not_enough_workers_config["min_workers"] = 0
        # Patch cli_logger.warning so the test can assert that the warning
        # was emitted with the expected message.
        with mock.patch(
            "ray.autoscaler._private.local.config.cli_logger.warning"
        ) as warning:
            prepared_config = prepare_config(not_enough_workers_config)
            warning.assert_called_with(
                "The value of `max_workers` supplied (0) is less"
                " than the number of available worker ips (3)."
                " At most 0 Ray worker nodes will connect to the cluster."
            )
        expected_prepared = yaml.safe_load(EXPECTED_LOCAL_CONFIG_STR)
        # We logged a warning.
        # However, prepare_config does not repair the strange config setting:
        expected_prepared["max_workers"] = 0
        expected_prepared["available_node_types"]["local.cluster.node"][
            "max_workers"
        ] = 0
        expected_prepared["available_node_types"]["local.cluster.node"][
            "min_workers"
        ] = 0
        assert prepared_config == expected_prepared

    def testValidateNetworkConfig(self):
        web_yaml = (
            "https://raw.githubusercontent.com/ray-project/ray/"