Fix GCP node termination (#23101)

Skips 404s on node termination for GCP node provider.
Also resets internal "self.nodes_to_terminate" state at the start of an autoscaler iteration -- that's necessary for correct cleanup in the event of failed node termination.
This commit is contained in:
Dmitri Gekhtman 2022-03-17 09:51:16 -07:00 committed by GitHub
parent cf512254bb
commit c707ad8d73
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 17 additions and 3 deletions

View file

@ -311,6 +311,9 @@ class StandardAutoscaler:
# Query the provider to update the list of non-terminated nodes
self.non_terminated_nodes = NonTerminatedNodes(self.provider)
# This will accumulate the nodes we need to terminate.
self.nodes_to_terminate = []
# Update running nodes gauge
num_workers = len(self.non_terminated_nodes.worker_ids)
self.prom_metrics.running_workers.set(num_workers)

View file

@ -4,6 +4,8 @@ from threading import RLock
import time
import logging
import googleapiclient
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler._private.gcp.config import (
bootstrap_gcp,
@ -175,9 +177,18 @@ class GCPNodeProvider(NodeProvider):
def terminate_node(self, node_id: str):
    """Delete the cloud instance backing ``node_id``.

    The delete request is issued while holding ``self.lock``. A 404 from
    the API is treated as "already deleted": a warning is logged and the
    call succeeds instead of raising.

    Args:
        node_id: Identifier of the node to terminate.

    Returns:
        Whatever ``resource.delete_instance`` returns on success
        (presumably the API operation result -- confirm against the
        resource class), or ``None`` when the node was already gone.

    Raises:
        googleapiclient.errors.HttpError: For any HTTP error other
            than 404.
    """
    with self.lock:
        resource = self._get_resource_depending_on_node_name(node_id)
        # Bind the result up front: on the 404 path the delete call never
        # returns, and returning an unbound local would raise
        # UnboundLocalError instead of skipping the missing node.
        result = None
        try:
            result = resource.delete_instance(
                node_id=node_id,
            )
        except googleapiclient.errors.HttpError as http_error:
            if http_error.resp.status == 404:
                logger.warning(
                    f"Tried to delete the node with id {node_id} "
                    "but it was already gone."
                )
            else:
                # Re-raise the same exception with its traceback intact.
                raise
        return result
@_retry