[autoscaler/k8s] Handle unavailable k8s API (#12283)

This commit is contained in:
Gekho457 2020-11-24 12:13:15 -05:00 committed by GitHub
parent 462c7fb575
commit e66ddab190
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -1,5 +1,6 @@
from collections import defaultdict, namedtuple
from typing import Any, Optional, Dict, List
from urllib3.exceptions import MaxRetryError
import copy
import logging
import math
@@ -130,7 +131,13 @@ class StandardAutoscaler:
if _internal_kv_initialized():
_internal_kv_put(
DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True)
self.num_failures += 1
# Don't abort the autoscaler if the K8s API server is down.
# https://github.com/ray-project/ray/issues/12255
is_k8s_connection_error = (
self.config["provider"]["type"] == "kubernetes"
and isinstance(e, MaxRetryError))
if not is_k8s_connection_error:
self.num_failures += 1
if self.num_failures > self.max_failures:
logger.critical("StandardAutoscaler: "
"Too many errors, abort.")