mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
Operator does not retry monitor on failure. (#22792)
This commit is contained in:
parent
207d93a52c
commit
991a62dd47
4 changed files with 20 additions and 4 deletions
1
.github/CODEOWNERS
vendored
1
.github/CODEOWNERS
vendored
|
@ -11,6 +11,7 @@
|
|||
/src/ray/protobuf/gcs.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen
|
||||
/src/ray/protobuf/gcs_service.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen
|
||||
/dashboard/modules/snapshot @wuisawesome @ijrsvt @edoakes @alanwguo @architkulkarni
|
||||
/python/ray/autoscaler/_private/monitor.py @wuisawesome @DmitriGekhtman
|
||||
|
||||
# Metrics
|
||||
/src/ray/stats/metric_defs.h @ericl @scv119 @rkooo567
|
||||
|
|
|
@ -146,6 +146,7 @@ class Monitor:
|
|||
prefix_cluster_info: bool = False,
|
||||
monitor_ip: Optional[str] = None,
|
||||
stop_event: Optional[Event] = None,
|
||||
retry_on_failure: bool = True,
|
||||
):
|
||||
if not use_gcs_for_bootstrap():
|
||||
# Initialize the Redis clients.
|
||||
|
@ -209,6 +210,7 @@ class Monitor:
|
|||
self.prefix_cluster_info = prefix_cluster_info
|
||||
# Can be used to signal graceful exit from monitor loop.
|
||||
self.stop_event = stop_event # type: Optional[Event]
|
||||
self.retry_on_failure = retry_on_failure
|
||||
self.autoscaling_config = autoscaling_config
|
||||
self.autoscaler = None
|
||||
# If set, we are in a manually created cluster (non-autoscaling) and
|
||||
|
@ -405,7 +407,11 @@ class Monitor:
|
|||
ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True
|
||||
)
|
||||
except Exception:
|
||||
logger.exception("Monitor: Execution exception. Trying again...")
|
||||
# By default, do not exit the monitor on failure.
|
||||
if self.retry_on_failure:
|
||||
logger.exception("Monitor: Execution exception. Trying again...")
|
||||
else:
|
||||
raise
|
||||
|
||||
# Wait for an autoscaler update interval before processing the next
|
||||
# round of messages.
|
||||
|
|
|
@ -115,6 +115,7 @@ class RayCluster:
|
|||
redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
|
||||
prefix_cluster_info=True,
|
||||
stop_event=self.monitor_stop_event,
|
||||
retry_on_failure=False,
|
||||
)
|
||||
mtr.run()
|
||||
|
||||
|
|
|
@ -249,11 +249,19 @@ class KubernetesOperatorTest(unittest.TestCase):
|
|||
pod_spec["containers"][0]["image"] = IMAGE
|
||||
pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY
|
||||
|
||||
# Use a custom Redis port for one of the clusters.
|
||||
example_cluster_config["spec"]["headStartRayCommands"][1] += " --port 6400"
|
||||
# Use a custom port for one of the clusters.
|
||||
new_head_start_cmd = example_cluster_config["spec"]["headStartRayCommands"][
|
||||
1
|
||||
].replace("6379", "6380")
|
||||
new_worker_start_cmd = example_cluster_config["spec"][
|
||||
"workerStartRayCommands"
|
||||
][1].replace("6379", "6380")
|
||||
example_cluster_config["spec"]["headStartRayCommands"][
|
||||
1
|
||||
] = new_head_start_cmd
|
||||
example_cluster_config["spec"]["workerStartRayCommands"][
|
||||
1
|
||||
] = " ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6400"
|
||||
] = new_worker_start_cmd
|
||||
|
||||
# Dump to temporary files
|
||||
yaml.dump(example_cluster_config, example_cluster_file)
|
||||
|
|
Loading…
Add table
Reference in a new issue