mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
Fix GCP node termination (#23101)
Skips 404 responses on node termination for the GCP node provider. Also resets the internal "self.nodes_to_terminate" state at the start of each autoscaler iteration -- this is necessary for correct cleanup in the event of a failed node termination.
This commit is contained in:
parent
cf512254bb
commit
c707ad8d73
2 changed files with 17 additions and 3 deletions
|
@ -311,6 +311,9 @@ class StandardAutoscaler:
|
|||
# Query the provider to update the list of non-terminated nodes
|
||||
self.non_terminated_nodes = NonTerminatedNodes(self.provider)
|
||||
|
||||
# This will accumulate the nodes we need to terminate.
|
||||
self.nodes_to_terminate = []
|
||||
|
||||
# Update running nodes gauge
|
||||
num_workers = len(self.non_terminated_nodes.worker_ids)
|
||||
self.prom_metrics.running_workers.set(num_workers)
|
||||
|
|
|
@ -4,6 +4,8 @@ from threading import RLock
|
|||
import time
|
||||
import logging
|
||||
|
||||
import googleapiclient
|
||||
|
||||
from ray.autoscaler.node_provider import NodeProvider
|
||||
from ray.autoscaler._private.gcp.config import (
|
||||
bootstrap_gcp,
|
||||
|
@ -175,9 +177,18 @@ class GCPNodeProvider(NodeProvider):
|
|||
def terminate_node(self, node_id: str):
    """Delete the instance backing ``node_id``.

    A 404 response from the delete call is taken to mean the node is
    already gone: it is logged as a warning and not re-raised.

    Args:
        node_id: Identifier of the node to terminate.

    Returns:
        Whatever ``delete_instance`` returns, or ``None`` when the delete
        call answered 404.

    Raises:
        googleapiclient.errors.HttpError: For any HTTP error other than 404.
    """
    with self.lock:
        resource = self._get_resource_depending_on_node_name(node_id)
        # Bind the name up front: on the 404 path nothing is assigned inside
        # the ``try``, and the final ``return`` would otherwise fail with
        # UnboundLocalError instead of skipping the missing node.
        result = None
        try:
            result = resource.delete_instance(
                node_id=node_id,
            )
        except googleapiclient.errors.HttpError as http_error:
            if http_error.resp.status == 404:
                logger.warning(
                    f"Tried to delete the node with id {node_id} "
                    "but it was already gone."
                )
            else:
                raise http_error from None
        return result
|
||||
|
||||
@_retry
|
||||
|
|
Loading…
Add table
Reference in a new issue