mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[autoscaler][hotfix] Update node list after terminating unhealthy nodes (#17992)
* Update nodes; update test. * consistency * lint
This commit is contained in:
parent
5ca28b1cc8
commit
13d5d0f9ef
2 changed files with 13 additions and 5 deletions
|
@ -396,7 +396,10 @@ class StandardAutoscaler:
|
|||
|
||||
if self.disable_node_updaters:
|
||||
# If updaters are unavailable, terminate unhealthy nodes.
|
||||
self.terminate_unhealthy_nodes(nodes, now)
|
||||
nodes_to_terminate = self.get_unhealthy_nodes(nodes, now)
|
||||
if nodes_to_terminate:
|
||||
self._terminate_nodes_and_cleanup(nodes_to_terminate)
|
||||
nodes = self.workers()
|
||||
else:
|
||||
# Attempt to recover unhealthy nodes
|
||||
for node_id in nodes:
|
||||
|
@ -716,8 +719,10 @@ class StandardAutoscaler:
|
|||
return True
|
||||
return False
|
||||
|
||||
def terminate_unhealthy_nodes(self, nodes: List[NodeID], now: float):
|
||||
"""Terminate nodes for which we haven't received a heartbeat on time.
|
||||
def get_unhealthy_nodes(self, nodes: List[NodeID],
|
||||
now: float) -> List[NodeID]:
|
||||
"""Determine nodes for which we haven't received a heartbeat on time.
|
||||
These nodes are subsequently terminated.
|
||||
|
||||
Used when node updaters are not available for recovery.
|
||||
"""
|
||||
|
@ -748,8 +753,7 @@ class StandardAutoscaler:
|
|||
aggregate=operator.add)
|
||||
nodes_to_terminate.append(node_id)
|
||||
|
||||
if nodes_to_terminate:
|
||||
self._terminate_nodes_and_cleanup(nodes_to_terminate)
|
||||
return nodes_to_terminate
|
||||
|
||||
def recover_if_needed(self, node_id, now):
|
||||
if not self.can_update(node_id):
|
||||
|
|
|
@ -204,6 +204,10 @@ class MockProvider(NodeProvider):
|
|||
return self.mock_nodes[node_id].state in ["stopped", "terminated"]
|
||||
|
||||
def node_tags(self, node_id):
|
||||
# Don't assume that node providers can retrieve tags from
|
||||
# terminated nodes.
|
||||
if self.is_terminated(node_id):
|
||||
raise Exception(f"The node with id {node_id} has been terminated!")
|
||||
with self.lock:
|
||||
return self.mock_nodes[node_id].tags
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue