mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[autoscaler/k8s] Handle unavailable k8s API (#12283)
This commit is contained in:
parent
462c7fb575
commit
e66ddab190
1 changed file with 8 additions and 1 deletion
|
@@ -1,5 +1,6 @@
|
|||
from collections import defaultdict, namedtuple
|
||||
from typing import Any, Optional, Dict, List
|
||||
from urllib3.exceptions import MaxRetryError
|
||||
import copy
|
||||
import logging
|
||||
import math
|
||||
|
@@ -130,6 +131,12 @@ class StandardAutoscaler:
|
|||
if _internal_kv_initialized():
|
||||
_internal_kv_put(
|
||||
DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True)
|
||||
# Don't abort the autoscaler if the K8s API server is down.
|
||||
# https://github.com/ray-project/ray/issues/12255
|
||||
is_k8s_connection_error = (
|
||||
self.config["provider"]["type"] == "kubernetes"
|
||||
and isinstance(e, MaxRetryError))
|
||||
if not is_k8s_connection_error:
|
||||
self.num_failures += 1
|
||||
if self.num_failures > self.max_failures:
|
||||
logger.critical("StandardAutoscaler: "
|
||||
|
|
Loading…
Add table
Reference in a new issue