[autoscaler/k8s] Handle unavailable k8s API (#12283)

This commit is contained in:
Gekho457 2020-11-24 12:13:15 -05:00 committed by GitHub
parent 462c7fb575
commit e66ddab190
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -1,5 +1,6 @@
from collections import defaultdict, namedtuple from collections import defaultdict, namedtuple
from typing import Any, Optional, Dict, List from typing import Any, Optional, Dict, List
from urllib3.exceptions import MaxRetryError
import copy import copy
import logging import logging
import math import math
@@ -130,7 +131,13 @@ class StandardAutoscaler:
if _internal_kv_initialized(): if _internal_kv_initialized():
_internal_kv_put( _internal_kv_put(
DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True) DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True)
self.num_failures += 1 # Don't abort the autoscaler if the K8s API server is down.
# https://github.com/ray-project/ray/issues/12255
is_k8s_connection_error = (
self.config["provider"]["type"] == "kubernetes"
and isinstance(e, MaxRetryError))
if not is_k8s_connection_error:
self.num_failures += 1
if self.num_failures > self.max_failures: if self.num_failures > self.max_failures:
logger.critical("StandardAutoscaler: " logger.critical("StandardAutoscaler: "
"Too many errors, abort.") "Too many errors, abort.")