Mirror of https://github.com/vale981/ray (synced 2025-03-07 02:51:39 -05:00)
[autoscaler] Create provider exactly once (#10703)
Co-authored-by: Alex Wu <itswu.alex@gmail.com>
This commit is contained in:
Parent: 67bf396ae7
Commit: eb025ea8cb
2 changed files with 13 additions and 5 deletions
|
@ -64,6 +64,9 @@ class StandardAutoscaler:
|
||||||
process_runner=subprocess,
|
process_runner=subprocess,
|
||||||
update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
|
update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
|
||||||
self.config_path = config_path
|
self.config_path = config_path
|
||||||
|
# Keep this before self.reset (self.provider needs to be created
|
||||||
|
# exactly once).
|
||||||
|
self.provider = None
|
||||||
self.reset(errors_fatal=True)
|
self.reset(errors_fatal=True)
|
||||||
self.load_metrics = load_metrics
|
self.load_metrics = load_metrics
|
||||||
|
|
||||||
|
@ -250,6 +253,7 @@ class StandardAutoscaler:
|
||||||
self.should_update(node_id) for node_id in nodes):
|
self.should_update(node_id) for node_id in nodes):
|
||||||
if node_id is not None:
|
if node_id is not None:
|
||||||
resources = self._node_resources(node_id)
|
resources = self._node_resources(node_id)
|
||||||
|
logger.debug(f"{node_id}: Starting new thread runner.")
|
||||||
T.append(
|
T.append(
|
||||||
threading.Thread(
|
threading.Thread(
|
||||||
target=self.spawn_updater,
|
target=self.spawn_updater,
|
||||||
|
@ -295,7 +299,7 @@ class StandardAutoscaler:
|
||||||
self.config = new_config
|
self.config = new_config
|
||||||
self.runtime_hash = new_runtime_hash
|
self.runtime_hash = new_runtime_hash
|
||||||
self.file_mounts_contents_hash = new_file_mounts_contents_hash
|
self.file_mounts_contents_hash = new_file_mounts_contents_hash
|
||||||
|
if not self.provider:
|
||||||
self.provider = get_node_provider(self.config["provider"],
|
self.provider = get_node_provider(self.config["provider"],
|
||||||
self.config["cluster_name"])
|
self.config["cluster_name"])
|
||||||
# Check whether we can enable the resource demand scheduler.
|
# Check whether we can enable the resource demand scheduler.
|
||||||
|
@ -462,6 +466,8 @@ class StandardAutoscaler:
|
||||||
|
|
||||||
def spawn_updater(self, node_id, init_commands, ray_start_commands,
|
def spawn_updater(self, node_id, init_commands, ray_start_commands,
|
||||||
node_resources, docker_config):
|
node_resources, docker_config):
|
||||||
|
logger.info(f"Creating new (spawn_updater) updater thread for node"
|
||||||
|
f" {node_id}.")
|
||||||
updater = NodeUpdaterThread(
|
updater = NodeUpdaterThread(
|
||||||
node_id=node_id,
|
node_id=node_id,
|
||||||
provider_config=self.config["provider"],
|
provider_config=self.config["provider"],
|
||||||
|
@ -492,6 +498,8 @@ class StandardAutoscaler:
|
||||||
return False
|
return False
|
||||||
if self.num_failed_updates.get(node_id, 0) > 0: # TODO(ekl) retry?
|
if self.num_failed_updates.get(node_id, 0) > 0: # TODO(ekl) retry?
|
||||||
return False
|
return False
|
||||||
|
logger.debug(f"{node_id} is not being updated and "
|
||||||
|
"passes config check (can_update=True).")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def launch_new_node(self, count: int, node_type: Optional[str]) -> None:
|
def launch_new_node(self, count: int, node_type: Optional[str]) -> None:
|
||||||
|
|
|
@ -455,7 +455,7 @@ def kill_node(config_file, yes, hard, override_cluster_name):
|
||||||
|
|
||||||
def monitor_cluster(cluster_config_file, num_lines, override_cluster_name):
|
def monitor_cluster(cluster_config_file, num_lines, override_cluster_name):
|
||||||
"""Tails the autoscaler logs of a Ray cluster."""
|
"""Tails the autoscaler logs of a Ray cluster."""
|
||||||
cmd = "tail -n {} -f /tmp/ray/session_*/logs/monitor*".format(num_lines)
|
cmd = f"tail -n {num_lines} -f /tmp/ray/session_latest/logs/monitor*"
|
||||||
exec_cluster(
|
exec_cluster(
|
||||||
cluster_config_file,
|
cluster_config_file,
|
||||||
cmd=cmd,
|
cmd=cmd,
|
||||||
|
@ -717,7 +717,7 @@ def get_or_create_head_node(config,
|
||||||
logger, "get_or_create_head_node: "
|
logger, "get_or_create_head_node: "
|
||||||
"Head node up-to-date, IP address is: {}", head_node_ip)
|
"Head node up-to-date, IP address is: {}", head_node_ip)
|
||||||
|
|
||||||
monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
|
monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
|
||||||
if override_cluster_name:
|
if override_cluster_name:
|
||||||
modifiers = " --cluster-name={}".format(
|
modifiers = " --cluster-name={}".format(
|
||||||
quote(override_cluster_name))
|
quote(override_cluster_name))
|
||||||
|
|
Loading…
Add table
Reference in a new issue