mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[autoscaler/k8s] Handle unavailable k8s API (#12283)
This commit is contained in:
parent
462c7fb575
commit
e66ddab190
1 changed file with 8 additions and 1 deletion
|
@ -1,5 +1,6 @@
|
||||||
from collections import defaultdict, namedtuple
|
from collections import defaultdict, namedtuple
|
||||||
from typing import Any, Optional, Dict, List
|
from typing import Any, Optional, Dict, List
|
||||||
|
from urllib3.exceptions import MaxRetryError
|
||||||
import copy
|
import copy
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
|
@ -130,7 +131,13 @@ class StandardAutoscaler:
|
||||||
if _internal_kv_initialized():
|
if _internal_kv_initialized():
|
||||||
_internal_kv_put(
|
_internal_kv_put(
|
||||||
DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True)
|
DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True)
|
||||||
self.num_failures += 1
|
# Don't abort the autoscaler if the K8s API server is down.
|
||||||
|
# https://github.com/ray-project/ray/issues/12255
|
||||||
|
is_k8s_connection_error = (
|
||||||
|
self.config["provider"]["type"] == "kubernetes"
|
||||||
|
and isinstance(e, MaxRetryError))
|
||||||
|
if not is_k8s_connection_error:
|
||||||
|
self.num_failures += 1
|
||||||
if self.num_failures > self.max_failures:
|
if self.num_failures > self.max_failures:
|
||||||
logger.critical("StandardAutoscaler: "
|
logger.critical("StandardAutoscaler: "
|
||||||
"Too many errors, abort.")
|
"Too many errors, abort.")
|
||||||
|
|
Loading…
Add table
Reference in a new issue