Operator does not retry monitor on failure. (#22792)

This commit is contained in:
Dmitri Gekhtman 2022-03-02 23:37:03 -08:00 committed by GitHub
parent 207d93a52c
commit 991a62dd47
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 20 additions and 4 deletions

1
.github/CODEOWNERS vendored
View file

@ -11,6 +11,7 @@
/src/ray/protobuf/gcs.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen
/src/ray/protobuf/gcs_service.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen
/dashboard/modules/snapshot @wuisawesome @ijrsvt @edoakes @alanwguo @architkulkarni
/python/ray/autoscaler/_private/monitor.py @wuisawesome @DmitriGekhtman
# Metrics
/src/ray/stats/metric_defs.h @ericl @scv119 @rkooo567

View file

@ -146,6 +146,7 @@ class Monitor:
prefix_cluster_info: bool = False,
monitor_ip: Optional[str] = None,
stop_event: Optional[Event] = None,
retry_on_failure: bool = True,
):
if not use_gcs_for_bootstrap():
# Initialize the Redis clients.
@ -209,6 +210,7 @@ class Monitor:
self.prefix_cluster_info = prefix_cluster_info
# Can be used to signal graceful exit from monitor loop.
self.stop_event = stop_event # type: Optional[Event]
self.retry_on_failure = retry_on_failure
self.autoscaling_config = autoscaling_config
self.autoscaler = None
# If set, we are in a manually created cluster (non-autoscaling) and
@ -405,7 +407,11 @@ class Monitor:
ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True
)
except Exception:
logger.exception("Monitor: Execution exception. Trying again...")
# By default, do not exit the monitor on failure.
if self.retry_on_failure:
logger.exception("Monitor: Execution exception. Trying again...")
else:
raise
# Wait for an autoscaler update interval before processing the next
# round of messages.

View file

@ -115,6 +115,7 @@ class RayCluster:
redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
prefix_cluster_info=True,
stop_event=self.monitor_stop_event,
retry_on_failure=False,
)
mtr.run()

View file

@ -249,11 +249,19 @@ class KubernetesOperatorTest(unittest.TestCase):
pod_spec["containers"][0]["image"] = IMAGE
pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY
# Use a custom Redis port for one of the clusters.
example_cluster_config["spec"]["headStartRayCommands"][1] += " --port 6400"
# Use a custom port for one of the clusters.
new_head_start_cmd = example_cluster_config["spec"]["headStartRayCommands"][
1
].replace("6379", "6380")
new_worker_start_cmd = example_cluster_config["spec"][
"workerStartRayCommands"
][1].replace("6379", "6380")
example_cluster_config["spec"]["headStartRayCommands"][
1
] = new_head_start_cmd
example_cluster_config["spec"]["workerStartRayCommands"][
1
] = " ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6400"
] = new_worker_start_cmd
# Dump to temporary files
yaml.dump(example_cluster_config, example_cluster_file)