From 991a62dd4782e17a519ce29495b6e837f37f19f3 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman <62982571+DmitriGekhtman@users.noreply.github.com> Date: Wed, 2 Mar 2022 23:37:03 -0800 Subject: [PATCH] Operator does not retry monitor on failure. (#22792) --- .github/CODEOWNERS | 1 + python/ray/autoscaler/_private/monitor.py | 8 +++++++- python/ray/ray_operator/operator.py | 1 + .../kubernetes_e2e/test_k8s_operator_basic.py | 14 +++++++++++--- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 271e5aecc..d46fb2a24 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,6 +11,7 @@ /src/ray/protobuf/gcs.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen /src/ray/protobuf/gcs_service.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen /dashboard/modules/snapshot @wuisawesome @ijrsvt @edoakes @alanwguo @architkulkarni +/python/ray/autoscaler/_private/monitor.py @wuisawesome @DmitriGekhtman # Metrics /src/ray/stats/metric_defs.h @ericl @scv119 @rkooo567 diff --git a/python/ray/autoscaler/_private/monitor.py b/python/ray/autoscaler/_private/monitor.py index 137c1cf06..e411cb831 100644 --- a/python/ray/autoscaler/_private/monitor.py +++ b/python/ray/autoscaler/_private/monitor.py @@ -146,6 +146,7 @@ class Monitor: prefix_cluster_info: bool = False, monitor_ip: Optional[str] = None, stop_event: Optional[Event] = None, + retry_on_failure: bool = True, ): if not use_gcs_for_bootstrap(): # Initialize the Redis clients. @@ -209,6 +210,7 @@ class Monitor: self.prefix_cluster_info = prefix_cluster_info # Can be used to signal graceful exit from monitor loop. self.stop_event = stop_event # type: Optional[Event] + self.retry_on_failure = retry_on_failure self.autoscaling_config = autoscaling_config self.autoscaler = None # If set, we are in a manually created cluster (non-autoscaling) and @@ -405,7 +407,11 @@ class Monitor: ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True ) except Exception: - logger.exception("Monitor: Execution exception. Trying again...") + # By default, do not exit the monitor on failure. + if self.retry_on_failure: + logger.exception("Monitor: Execution exception. Trying again...") + else: + raise # Wait for a autoscaler update interval before processing the next # round of messages. diff --git a/python/ray/ray_operator/operator.py b/python/ray/ray_operator/operator.py index 068471bd5..c1d14cef1 100644 --- a/python/ray/ray_operator/operator.py +++ b/python/ray/ray_operator/operator.py @@ -115,6 +115,7 @@ class RayCluster: redis_password=ray_constants.REDIS_DEFAULT_PASSWORD, prefix_cluster_info=True, stop_event=self.monitor_stop_event, + retry_on_failure=False, ) mtr.run() diff --git a/python/ray/tests/kubernetes_e2e/test_k8s_operator_basic.py b/python/ray/tests/kubernetes_e2e/test_k8s_operator_basic.py index f59a7c556..13d0aa71c 100644 --- a/python/ray/tests/kubernetes_e2e/test_k8s_operator_basic.py +++ b/python/ray/tests/kubernetes_e2e/test_k8s_operator_basic.py @@ -249,11 +249,19 @@ class KubernetesOperatorTest(unittest.TestCase): pod_spec["containers"][0]["image"] = IMAGE pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY - # Use a custom Redis port for one of the clusters. - example_cluster_config["spec"]["headStartRayCommands"][1] += " --port 6400" + # Use a custom port for one of the clusters. + new_head_start_cmd = example_cluster_config["spec"]["headStartRayCommands"][ + 1 + ].replace("6379", "6380") + new_worker_start_cmd = example_cluster_config["spec"][ + "workerStartRayCommands" + ][1].replace("6379", "6380") + example_cluster_config["spec"]["headStartRayCommands"][ + 1 + ] = new_head_start_cmd example_cluster_config["spec"]["workerStartRayCommands"][ 1 - ] = " ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6400" + ] = new_worker_start_cmd # Dump to temporary files yaml.dump(example_cluster_config, example_cluster_file)