mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
Operator does not retry monitor on failure. (#22792)
This commit is contained in:
parent
207d93a52c
commit
991a62dd47
4 changed files with 20 additions and 4 deletions
1
.github/CODEOWNERS
vendored
1
.github/CODEOWNERS
vendored
|
@ -11,6 +11,7 @@
|
||||||
/src/ray/protobuf/gcs.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen
|
/src/ray/protobuf/gcs.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen
|
||||||
/src/ray/protobuf/gcs_service.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen
|
/src/ray/protobuf/gcs_service.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen
|
||||||
/dashboard/modules/snapshot @wuisawesome @ijrsvt @edoakes @alanwguo @architkulkarni
|
/dashboard/modules/snapshot @wuisawesome @ijrsvt @edoakes @alanwguo @architkulkarni
|
||||||
|
/python/ray/autoscaler/_private/monitor.py @wuisawesome @DmitriGekhtman
|
||||||
|
|
||||||
# Metrics
|
# Metrics
|
||||||
/src/ray/stats/metric_defs.h @ericl @scv119 @rkooo567
|
/src/ray/stats/metric_defs.h @ericl @scv119 @rkooo567
|
||||||
|
|
|
@ -146,6 +146,7 @@ class Monitor:
|
||||||
prefix_cluster_info: bool = False,
|
prefix_cluster_info: bool = False,
|
||||||
monitor_ip: Optional[str] = None,
|
monitor_ip: Optional[str] = None,
|
||||||
stop_event: Optional[Event] = None,
|
stop_event: Optional[Event] = None,
|
||||||
|
retry_on_failure: bool = True,
|
||||||
):
|
):
|
||||||
if not use_gcs_for_bootstrap():
|
if not use_gcs_for_bootstrap():
|
||||||
# Initialize the Redis clients.
|
# Initialize the Redis clients.
|
||||||
|
@ -209,6 +210,7 @@ class Monitor:
|
||||||
self.prefix_cluster_info = prefix_cluster_info
|
self.prefix_cluster_info = prefix_cluster_info
|
||||||
# Can be used to signal graceful exit from monitor loop.
|
# Can be used to signal graceful exit from monitor loop.
|
||||||
self.stop_event = stop_event # type: Optional[Event]
|
self.stop_event = stop_event # type: Optional[Event]
|
||||||
|
self.retry_on_failure = retry_on_failure
|
||||||
self.autoscaling_config = autoscaling_config
|
self.autoscaling_config = autoscaling_config
|
||||||
self.autoscaler = None
|
self.autoscaler = None
|
||||||
# If set, we are in a manually created cluster (non-autoscaling) and
|
# If set, we are in a manually created cluster (non-autoscaling) and
|
||||||
|
@ -405,7 +407,11 @@ class Monitor:
|
||||||
ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True
|
ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
# By default, do not exit the monitor on failure.
|
||||||
|
if self.retry_on_failure:
|
||||||
logger.exception("Monitor: Execution exception. Trying again...")
|
logger.exception("Monitor: Execution exception. Trying again...")
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
# Wait for an autoscaler update interval before processing the next
|
# Wait for an autoscaler update interval before processing the next
|
||||||
# round of messages.
|
# round of messages.
|
||||||
|
|
|
@ -115,6 +115,7 @@ class RayCluster:
|
||||||
redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
|
redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
|
||||||
prefix_cluster_info=True,
|
prefix_cluster_info=True,
|
||||||
stop_event=self.monitor_stop_event,
|
stop_event=self.monitor_stop_event,
|
||||||
|
retry_on_failure=False,
|
||||||
)
|
)
|
||||||
mtr.run()
|
mtr.run()
|
||||||
|
|
||||||
|
|
|
@ -249,11 +249,19 @@ class KubernetesOperatorTest(unittest.TestCase):
|
||||||
pod_spec["containers"][0]["image"] = IMAGE
|
pod_spec["containers"][0]["image"] = IMAGE
|
||||||
pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY
|
pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY
|
||||||
|
|
||||||
# Use a custom Redis port for one of the clusters.
|
# Use a custom port for one of the clusters.
|
||||||
example_cluster_config["spec"]["headStartRayCommands"][1] += " --port 6400"
|
new_head_start_cmd = example_cluster_config["spec"]["headStartRayCommands"][
|
||||||
|
1
|
||||||
|
].replace("6379", "6380")
|
||||||
|
new_worker_start_cmd = example_cluster_config["spec"][
|
||||||
|
"workerStartRayCommands"
|
||||||
|
][1].replace("6379", "6380")
|
||||||
|
example_cluster_config["spec"]["headStartRayCommands"][
|
||||||
|
1
|
||||||
|
] = new_head_start_cmd
|
||||||
example_cluster_config["spec"]["workerStartRayCommands"][
|
example_cluster_config["spec"]["workerStartRayCommands"][
|
||||||
1
|
1
|
||||||
] = " ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6400"
|
] = new_worker_start_cmd
|
||||||
|
|
||||||
# Dump to temporary files
|
# Dump to temporary files
|
||||||
yaml.dump(example_cluster_config, example_cluster_file)
|
yaml.dump(example_cluster_config, example_cluster_file)
|
||||||
|
|
Loading…
Add table
Reference in a new issue