Fix GCP node termination (#23101)

Ignores HTTP 404 (not found) errors when terminating a node in the GCP node provider, since a 404 means the node is already gone. Also resets the internal "self.nodes_to_terminate" state at the start of each autoscaler iteration -- this is necessary for correct cleanup when a node termination has failed.
2025-03-06 02:21:39 -05:00 · 2022-03-17 09:51:16 -07:00 · 2022-03-17 09:51:16 -07:00 · c707ad8d73
commit c707ad8d73
parent cf512254bb
2 changed files with 17 additions and 3 deletions
--- a/python/ray/autoscaler/_private/autoscaler.py
+++ b/python/ray/autoscaler/_private/autoscaler.py
@@ -311,6 +311,9 @@ class StandardAutoscaler:
        # Query the provider to update the list of non-terminated nodes
        self.non_terminated_nodes = NonTerminatedNodes(self.provider)

+        # This will accumulate the nodes we need to terminate.
+        self.nodes_to_terminate = []
+
        # Update running nodes gauge
        num_workers = len(self.non_terminated_nodes.worker_ids)
        self.prom_metrics.running_workers.set(num_workers)
--- a/python/ray/autoscaler/_private/gcp/node_provider.py
+++ b/python/ray/autoscaler/_private/gcp/node_provider.py
@@ -4,6 +4,8 @@ from threading import RLock
 import time
 import logging

+import googleapiclient
+
 from ray.autoscaler.node_provider import NodeProvider
 from ray.autoscaler._private.gcp.config import (
    bootstrap_gcp,
@@ -175,9 +177,18 @@ class GCPNodeProvider(NodeProvider):
    def terminate_node(self, node_id: str):
        """Delete the instance backing ``node_id``.

        An HTTP 404 from the delete call is treated as "already deleted":
        it is logged as a warning and not propagated.

        Args:
            node_id: Identifier of the node to terminate.

        Returns:
            Whatever ``delete_instance`` returns, or ``None`` when the delete
            call failed with a 404 because the node was already gone.

        Raises:
            googleapiclient.errors.HttpError: For any HTTP error other
                than 404.
        """
        with self.lock:
            resource = self._get_resource_depending_on_node_name(node_id)
            try:
                result = resource.delete_instance(
                    node_id=node_id,
                )
            except googleapiclient.errors.HttpError as http_error:
                if http_error.resp.status != 404:
                    raise http_error from None
                logger.warning(
                    f"Tried to delete the node with id {node_id} "
                    "but it was already gone."
                )
                # Nothing was deleted, so there is no operation result.
                # Bind the name explicitly; otherwise the return below would
                # raise UnboundLocalError on this path.
                result = None
            return result

    @_retry