Mirror of https://github.com/vale981/ray, synced 2025-03-06 02:21:39 -05:00
[autoscaler] Event summarizer reports launch failure (#20814)
Partly addresses #20774 by registering node launch failures in the driver logs via the event summarizer, so users can tell from the driver logs that a launch failed. Also pushes the node creation exception (traceback) to the driver logs, throttled to at most once per minute (interval_s=60 in the node launcher).
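For illustration, a minimal sketch of what the summarizer does with these events (not part of the change; it assumes a Ray build containing this commit, and the node type name "m4.large" is made up):

import operator

from ray.autoscaler._private.event_summarizer import EventSummarizer

summarizer = EventSummarizer()

# Repeated launch failures of the same (hypothetical) node type are
# aggregated by template, so the summary shows one line with the total count.
summarizer.add(
    "Failed to launch {} nodes of type m4.large.",
    quantity=2,
    aggregate=operator.add)
summarizer.add(
    "Failed to launch {} nodes of type m4.large.",
    quantity=3,
    aggregate=operator.add)

# The traceback-style message is throttled by key: within interval_s, only
# the first call queues a message.
summarizer.add_once_per_interval(
    message="Node creation failed. See autoscaler logs for further details.",
    key="Failed to create node.",
    interval_s=60)
summarizer.add_once_per_interval(
    message="Node creation failed. See autoscaler logs for further details.",
    key="Failed to create node.",
    interval_s=60)

print(summarizer.summary())
# ['Failed to launch 5 nodes of type m4.large.',
#  'Node creation failed. See autoscaler logs for further details.']
summarizer.clear()

The monitor then prefixes each returned line with ray_constants.LOG_PREFIX_EVENT_SUMMARY before logging it, which is what routes the message into the driver logs.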
parent ea1d081aac
commit 94883f61b1

6 changed files with 84 additions and 25 deletions
@@ -235,7 +235,8 @@ class StandardAutoscaler:
                 index=i,
                 pending=self.pending_launches,
                 node_types=self.available_node_types,
-                prom_metrics=self.prom_metrics)
+                prom_metrics=self.prom_metrics,
+                event_summarizer=self.event_summarizer)
             node_launcher.daemon = True
             node_launcher.start()
@@ -1,5 +1,6 @@
 from typing import Any, Callable, Dict, List
 import time
+from threading import RLock


 class EventSummarizer:
@@ -13,6 +14,10 @@ class EventSummarizer:
         # added here, until its TTL expires.
         self.throttled_messages: Dict[str, float] = {}

+        # Event summarizer is used by the main thread and
+        # by node launcher child threads.
+        self.lock = RLock()
+
     def add(self, template: str, *, quantity: Any,
             aggregate: Callable[[Any, Any], Any]) -> None:
         """Add a log message, which will be combined by template.
@@ -24,14 +29,15 @@ class EventSummarizer:
         quantities. The result is inserted into the template to
         produce the final log message.
         """
-        # Enforce proper sentence structure.
-        if not template.endswith("."):
-            template += "."
-        if template in self.events_by_key:
-            self.events_by_key[template] = aggregate(
-                self.events_by_key[template], quantity)
-        else:
-            self.events_by_key[template] = quantity
+        with self.lock:
+            # Enforce proper sentence structure.
+            if not template.endswith("."):
+                template += "."
+            if template in self.events_by_key:
+                self.events_by_key[template] = aggregate(
+                    self.events_by_key[template], quantity)
+            else:
+                self.events_by_key[template] = quantity

     def add_once_per_interval(self, message: str, key: str, interval_s: int):
         """Add a log message, which is throttled once per interval by a key.
@@ -41,24 +47,27 @@ class EventSummarizer:
             key (str): The key to use to deduplicate the message.
             interval_s (int): Throttling interval in seconds.
         """
-        if key not in self.throttled_messages:
-            self.throttled_messages[key] = time.time() + interval_s
-            self.messages_to_send.append(message)
+        with self.lock:
+            if key not in self.throttled_messages:
+                self.throttled_messages[key] = time.time() + interval_s
+                self.messages_to_send.append(message)

     def summary(self) -> List[str]:
         """Generate the aggregated log summary of all added events."""
-        out = []
-        for template, quantity in self.events_by_key.items():
-            out.append(template.format(quantity))
-        out.extend(self.messages_to_send)
+        with self.lock:
+            out = []
+            for template, quantity in self.events_by_key.items():
+                out.append(template.format(quantity))
+            out.extend(self.messages_to_send)
         return out

     def clear(self) -> None:
         """Clear the events added."""
-        self.events_by_key.clear()
-        self.messages_to_send.clear()
-        # Expire any messages that have reached their TTL. This allows them
-        # to be sent again.
-        for k, t in list(self.throttled_messages.items()):
-            if time.time() > t:
-                del self.throttled_messages[k]
+        with self.lock:
+            self.events_by_key.clear()
+            self.messages_to_send.clear()
+            # Expire any messages that have reached their TTL. This allows them
+            # to be sent again.
+            for k, t in list(self.throttled_messages.items()):
+                if time.time() > t:
+                    del self.throttled_messages[k]
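A brief note on the locking added above (my reading of the change, not text from the commit): the summarizer is shared between the monitor's main thread, which calls summary() and clear(), and the NodeLauncher child threads, which call add() and add_once_per_interval(); the RLock keeps those read-modify-write updates from racing. A minimal sketch of the concurrent-add pattern (thread count and node type name are made up):

import operator
import threading

from ray.autoscaler._private.event_summarizer import EventSummarizer

summarizer = EventSummarizer()

def launcher_thread():
    # Stand-in for a NodeLauncher thread reporting failures.
    for _ in range(1000):
        summarizer.add(
            "Failed to launch {} nodes of type cpu-worker.",
            quantity=1,
            aggregate=operator.add)

threads = [threading.Thread(target=launcher_thread) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# The lock keeps the read-modify-write in add() atomic, so no updates are
# lost and the monitor-side summary() sees the full count.
assert summarizer.summary() == [
    "Failed to launch 4000 nodes of type cpu-worker."
]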
@@ -327,8 +327,11 @@ class Monitor:
             status["autoscaler_report"] = asdict(self.autoscaler.summary())

             for msg in self.event_summarizer.summary():
-                logger.info("{}{}".format(
-                    ray_constants.LOG_PREFIX_EVENT_SUMMARY, msg))
+                # Need to prefix each line of the message for the lines to
+                # get pushed to the driver logs.
+                for line in msg.split("\n"):
+                    logger.info("{}{}".format(
+                        ray_constants.LOG_PREFIX_EVENT_SUMMARY, line))
             self.event_summarizer.clear()

         as_json = json.dumps(status)
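A short gloss on the hunk above (my reading of the comment in it; the prefix value below is a placeholder, not the actual ray_constants constant): only log lines that start with the event-summary prefix are forwarded to the driver logs, so a multi-line message such as a traceback has to be split and prefixed line by line.

# Placeholder prefix for illustration; the real value lives in ray_constants.
LOG_PREFIX_EVENT_SUMMARY = ":event_summary:"

msg = ("Node creation failed. See the traceback below.\n"
       "Traceback (most recent call last):\n"
       "  ...")

# Prefix every physical line so each one is picked up for the driver logs.
for line in msg.split("\n"):
    print("{}{}".format(LOG_PREFIX_EVENT_SUMMARY, line))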
@@ -1,7 +1,9 @@
 from typing import Any, Optional, Dict
 import copy
 import logging
+import operator
 import threading
+import traceback
 import time

 from ray.autoscaler.tags import (TAG_RAY_LAUNCH_CONFIG, TAG_RAY_NODE_STATUS,
@@ -21,6 +23,7 @@ class NodeLauncher(threading.Thread):
                  provider,
                  queue,
                  pending,
+                 event_summarizer,
                  prom_metrics=None,
                  node_types=None,
                  index=None,
@@ -32,6 +35,7 @@ class NodeLauncher(threading.Thread):
         self.provider = provider
         self.node_types = node_types
         self.index = str(index) if index is not None else ""
+        self.event_summarizer = event_summarizer
         super(NodeLauncher, self).__init__(*args, **kwargs)

     def _launch_node(self, config: Dict[str, Any], count: int,
@@ -84,6 +88,18 @@ class NodeLauncher(threading.Thread):
             except Exception:
                 self.prom_metrics.node_launch_exceptions.inc()
                 self.prom_metrics.failed_create_nodes.inc(count)
+                self.event_summarizer.add(
+                    "Failed to launch {} nodes of type " + node_type + ".",
+                    quantity=count,
+                    aggregate=operator.add)
+                # Log traceback from failed node creation only once per minute
+                # to avoid spamming driver logs with tracebacks.
+                self.event_summarizer.add_once_per_interval(
+                    message="Node creation failed. See the traceback below."
+                    " See autoscaler logs for further details.\n"
+                    f"{traceback.format_exc()}",
+                    key="Failed to create node.",
+                    interval_s=60)
                 logger.exception("Launch failed")
             finally:
                 self.pending.dec(node_type, count)
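One detail worth calling out in the hunk above (my reading; the node type names below are made up): node_type is concatenated into the template string itself, so failures aggregate per node type rather than into one global counter. A minimal sketch:

import operator

from ray.autoscaler._private.event_summarizer import EventSummarizer

summarizer = EventSummarizer()

# Failures of different (hypothetical) node types use different templates,
# so they remain separate lines in the summary.
summarizer.add(
    "Failed to launch {} nodes of type cpu-worker.",
    quantity=2,
    aggregate=operator.add)
summarizer.add(
    "Failed to launch {} nodes of type gpu-worker.",
    quantity=1,
    aggregate=operator.add)

print(summarizer.summary())
# ['Failed to launch 2 nodes of type cpu-worker.',
#  'Failed to launch 1 nodes of type gpu-worker.']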
@@ -1259,6 +1259,34 @@ class AutoscalingTest(unittest.TestCase):
         runner.assert_has_call("172.0.0.4", pattern="rsync")
         runner.clear_history()

+    def testSummarizerFailedCreate(self):
+        """Checks that event summarizer reports failed node creation.
+        """
+        config = copy.deepcopy(SMALL_CLUSTER)
+        config_path = self.write_config(config)
+        self.provider = MockProvider()
+        self.provider.error_creates = True
+        runner = MockProcessRunner()
+        mock_metrics = Mock(spec=AutoscalerPrometheusMetrics())
+        autoscaler = MockAutoscaler(
+            config_path,
+            LoadMetrics(),
+            MockNodeInfoStub(),
+            max_failures=0,
+            process_runner=runner,
+            update_interval_s=0,
+            prom_metrics=mock_metrics)
+        assert len(self.provider.non_terminated_nodes({})) == 0
+        autoscaler.update()
+
+        # Expect the next two messages in the logs.
+        msg = "Failed to launch 2 nodes of type ray-legacy-worker-node-type."
+
+        def expected_message_logged():
+            return msg in autoscaler.event_summarizer.summary()
+
+        self.waitFor(expected_message_logged)
+
     def testReadonlyNodeProvider(self):
         config = copy.deepcopy(SMALL_CLUSTER)
         config_path = self.write_config(config)
@@ -14,6 +14,7 @@ from ray.tests.test_autoscaler import (MockProvider, MockProcessRunner,
                                        MockNodeInfoStub, mock_raylet_id,
                                        MockAutoscaler)
 from ray.tests.test_resource_demand_scheduler import MULTI_WORKER_CLUSTER
+from ray.autoscaler._private.event_summarizer import EventSummarizer
 from ray.autoscaler._private.providers import (
     _NODE_PROVIDERS,
     _clear_provider_cache,
@@ -195,6 +196,7 @@ class Simulator:
         # Manually create a node launcher. Note that we won't start it as a
         # separate thread.
         self.node_launcher = NodeLauncher(
+            event_summarizer=EventSummarizer(),
             provider=self.autoscaler.provider,
             queue=self.autoscaler.launch_queue,
             index=0,