mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
[autoscaler] AWS Autoscaler CloudWatch Dashboard support (#20266)
These changes add a set of improvements to enable automatic creation and update of CloudWatch dashboards when provisioning AWS Autoscaler clusters. These improvements allow AWS Autoscaler users to: 1. Get rapid insights into their cluster state via CloudWatch dashboards. 2. Update their CloudWatch dashboard JSON configuration files at `ray up` execution time. Notes: 1. This PR is a follow-up to #18619 and adds dashboard support.
This commit is contained in:
parent
6420c75fd2
commit
71fae21e8e
7 changed files with 563 additions and 240 deletions
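At its core, the dashboard support added below reduces to a single CloudWatch API call made from the head node during `ray up`. Here is a minimal standalone sketch of that call; the cluster name, configured dashboard name, and widget list are illustrative placeholders, not values from this PR:

    import json

    import boto3

    # Illustrative inputs; the helper reads these from the cluster config's
    # provider > cloudwatch > dashboard section and the dashboard JSON file.
    cluster_name = "my-cluster"
    configured_name = "example-dashboard-name"
    widgets = []  # widget objects parsed from the dashboard config file

    # Dashboards are created per cluster: "{cluster_name}-{configured name}".
    dashboard_name = cluster_name + "-" + configured_name

    cloudwatch = boto3.client("cloudwatch")  # region from the environment here
    response = cloudwatch.put_dashboard(
        DashboardName=dashboard_name,
        DashboardBody=json.dumps({"widgets": widgets}))

    # put_dashboard reports config problems via DashboardValidationMessages
    # instead of raising, so callers must check it explicitly; the new helper
    # raises if any messages are present.
    for issue in response.get("DashboardValidationMessages", []):
        print(issue["DataPath"], issue["Message"])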
@@ -21,10 +21,9 @@ CLOUDWATCH_CONFIG_HASH_TAG_BASE = "cloudwatch-config-hash"
 
 
 class CloudwatchHelper:
-    def __init__(self, provider_config: Dict[str, Any], node_ids: List[str],
+    def __init__(self, provider_config: Dict[str, Any], node_id: str,
                  cluster_name: str) -> None:
-        # dedupe and sort node IDs to support deterministic unit test stubs
-        self.node_ids = sorted(set(node_ids))
+        self.node_id = node_id
         self.cluster_name = cluster_name
         self.provider_config = provider_config
         region = provider_config["region"]
@@ -41,10 +40,13 @@ class CloudwatchHelper:
             is_head_node: whether this node is the head node.
         """
         if CloudwatchHelper.cloudwatch_config_exists(self.provider_config,
-                                                     "config"):
-            self._update_cloudwatch_agent_config(is_head_node)
+                                                     "agent"):
+            self._update_cloudwatch_config(is_head_node, "agent")
+        if CloudwatchHelper.cloudwatch_config_exists(self.provider_config,
+                                                     "dashboard"):
+            self._update_cloudwatch_config(is_head_node, "dashboard")
 
-    def _ec2_health_check_waiter(self, node_ids: List[str]) -> None:
+    def _ec2_health_check_waiter(self, node_id: str) -> None:
         # wait for all EC2 instance checks to complete
         try:
             logger.info(
@@ -52,59 +54,101 @@
                 "configuring Unified Cloudwatch Agent. This may take a few "
                 "minutes...")
             waiter = self.ec2_client.get_waiter("instance_status_ok")
-            waiter.wait(InstanceIds=node_ids)
+            waiter.wait(InstanceIds=[node_id])
         except botocore.exceptions.WaiterError as e:
             logger.error(
                 "Failed while waiting for EC2 instance checks to complete: {}".
                 format(e.message))
             raise e
 
-    def _update_cloudwatch_agent_config(self, is_head_node: bool) -> None:
-        """ check whether update operations are needed.
+    def _update_cloudwatch_config(self, is_head_node: bool,
+                                  config_type: str) -> None:
+        """Update remote CloudWatch configs at Parameter Store,
+        update hash tag value on node and perform associated operations
+        at CloudWatch console if local CloudWatch configs change.
 
         Args:
             is_head_node: whether this node is the head node.
+            config_type: CloudWatch config file type.
         """
         cwa_installed = self._setup_cwa()
-        param_name = self._get_ssm_param_name()
+        param_name = self._get_ssm_param_name(config_type)
         if cwa_installed:
             if is_head_node:
                 cw_config_ssm = self._set_cloudwatch_ssm_config_param(
-                    param_name)
-                cur_cw_config_hash = self._sha1_hash_file()
+                    param_name, config_type)
+                cur_cw_config_hash = self._sha1_hash_file(config_type)
                 ssm_cw_config_hash = self._sha1_hash_json(cw_config_ssm)
                 # check if user updated Unified Cloudwatch Agent config file.
                 # if so, perform corresponding actions.
                 if cur_cw_config_hash != ssm_cw_config_hash:
                     logger.info(
-                        "Unified Cloudwatch Agent config file has changed.")
-                    self._upload_config_to_ssm_and_set_hash_tag()
-                    self._restart_cloudwatch_agent()
+                        "Cloudwatch {} config file has changed.".format(
+                            config_type))
+                    self._upload_config_to_ssm_and_set_hash_tag(config_type)
+                    if config_type == "agent":
+                        self._restart_cloudwatch_agent()
+                    elif config_type == "dashboard":
+                        self._put_cloudwatch_dashboard()
             else:
-                head_node_hash = self._get_head_node_config_hash()
-                cur_node_hash = self._get_cur_node_config_hash()
+                head_node_hash = self._get_head_node_config_hash(config_type)
+                cur_node_hash = self._get_cur_node_config_hash(config_type)
                 if head_node_hash != cur_node_hash:
                     logger.info(
-                        "Unified Cloudwatch Agent config file has changed.")
-                    self._restart_cloudwatch_agent()
+                        "Cloudwatch {} config file has changed.".format(
+                            config_type))
+                    if config_type == "agent":
+                        self._restart_cloudwatch_agent()
                     self._update_cloudwatch_hash_tag_value(
-                        self.node_ids, head_node_hash)
+                        self.node_id, head_node_hash, config_type)
 
-    def _send_command_to_nodes(self, document_name: str, parameters: List[str],
-                               node_ids: List[str]) -> Dict[str, Any]:
+    def _put_cloudwatch_dashboard(self) -> Dict[str, Any]:
+        """put dashboard to cloudwatch console"""
+
+        cloudwatch_config = self.provider_config["cloudwatch"]
+        dashboard_config = cloudwatch_config \
+            .get("dashboard", {})
+        dashboard_name_cluster = dashboard_config.get("name",
+                                                      self.cluster_name)
+        dashboard_name = self.cluster_name + "-" + dashboard_name_cluster
+
+        widgets = self._replace_dashboard_config_variables()
+
+        response = self.cloudwatch_client.put_dashboard(
+            DashboardName=dashboard_name,
+            DashboardBody=json.dumps({
+                "widgets": widgets
+            }))
+        issue_count = len(response.get("DashboardValidationMessages", []))
+        if issue_count > 0:
+            for issue in response.get("DashboardValidationMessages"):
+                logging.error("Error in dashboard config: {} - {}".format(
+                    issue["Message"], issue["DataPath"]))
+            raise Exception(
+                "Errors in dashboard configuration: {} issues raised".format(
+                    issue_count))
+        else:
+            logger.info("Successfully put dashboard to CloudWatch console")
+        return response
+
+    def _send_command_to_node(self, document_name: str, parameters: List[str],
+                              node_id: str) -> Dict[str, Any]:
         """ send SSM command to the given nodes """
         logger.debug("Sending SSM command to {} node(s). Document name: {}. "
-                     "Parameters: {}.".format(
-                         len(node_ids), document_name, parameters))
+                     "Parameters: {}.".format(node_id, document_name,
+                                              parameters))
         response = self.ssm_client.send_command(
-            InstanceIds=node_ids,
+            InstanceIds=[node_id],
             DocumentName=document_name,
             Parameters=parameters,
-            MaxConcurrency=str(min(len(node_ids), 100)),
+            MaxConcurrency=str(min(len(node_id), 1)),
             MaxErrors="0")
         return response
 
     def _ssm_command_waiter(self,
                             document_name: str,
                             parameters: List[str],
-                            node_ids: List[str],
+                            node_id: str,
                             retry_failed: bool = True) -> bool:
         """ wait for SSM command to complete on all cluster nodes """
@@ -113,76 +157,72 @@
         # exist instead of failing immediately, and by resubmitting
         # any failed command until all retry attempts are exhausted
         # by default.
-        response = self._send_command_to_nodes(document_name, parameters,
-                                               node_ids)
+        response = self._send_command_to_node(document_name, parameters,
+                                              node_id)
         command_id = response["Command"]["CommandId"]
 
         cloudwatch_config = self.provider_config["cloudwatch"]
         agent_retryer_config = cloudwatch_config \
-            .get("agent", {}) \
+            .get("agent") \
             .get("retryer", {})
         max_attempts = agent_retryer_config.get("max_attempts", 120)
         delay_seconds = agent_retryer_config.get("delay_seconds", 30)
         num_attempts = 0
         cmd_invocation_res = {}
-        for node_id in node_ids:
-            while True:
-                num_attempts += 1
-                logger.debug("Listing SSM command ID {} invocations on node {}"
-                             .format(command_id, node_id))
-                response = self.ssm_client.list_command_invocations(
-                    CommandId=command_id,
-                    InstanceId=node_id,
-                )
-                cmd_invocations = response["CommandInvocations"]
-                if not cmd_invocations:
-                    logger.debug(
-                        "SSM Command ID {} invocation does not exist. If "
-                        "the command was just started, it may take a "
-                        "few seconds to register.".format(command_id))
-                else:
-                    if len(cmd_invocations) > 1:
-                        logger.warning(
-                            "Expected to find 1 SSM command invocation with "
-                            "ID {} on node {} but found {}: {}".format(
-                                command_id,
-                                node_id,
-                                len(cmd_invocations),
-                                cmd_invocations,
-                            ))
-                    cmd_invocation = cmd_invocations[0]
-                    if cmd_invocation["Status"] == "Success":
-                        logger.debug(
-                            "SSM Command ID {} completed successfully."
-                            .format(command_id))
-                        cmd_invocation_res[node_id] = True
-                        break
-                    if num_attempts >= max_attempts:
-                        logger.error(
-                            "Max attempts for command {} exceeded on node {}"
-                            .format(command_id, node_id))
-                        raise botocore.exceptions.WaiterError(
-                            name="ssm_waiter",
-                            reason="Max attempts exceeded",
-                            last_response=cmd_invocation,
-                        )
-                    if cmd_invocation["Status"] == "Failed":
-                        logger.debug(f"SSM Command ID {command_id} failed.")
-                        if retry_failed:
-                            logger.debug(
-                                f"Retrying in {delay_seconds} seconds.")
-                            response = self._send_command_to_nodes(
-                                document_name, parameters, [node_id])
-                            command_id = response["Command"]["CommandId"]
-                            logger.debug("Sent SSM command ID {} to node {}"
-                                         .format(command_id, node_id))
-                        else:
-                            logger.debug(
-                                f"Ignoring Command ID {command_id} failure.")
-                            cmd_invocation_res[node_id] = False
-                            break
-                time.sleep(delay_seconds)
+        while True:
+            num_attempts += 1
+            logger.debug("Listing SSM command ID {} invocations on node {}"
+                         .format(command_id, node_id))
+            response = self.ssm_client.list_command_invocations(
+                CommandId=command_id,
+                InstanceId=node_id,
+            )
+            cmd_invocations = response["CommandInvocations"]
+            if not cmd_invocations:
+                logger.debug("SSM Command ID {} invocation does not exist. If "
+                             "the command was just started, it may take a "
+                             "few seconds to register.".format(command_id))
+            else:
+                if len(cmd_invocations) > 1:
+                    logger.warning(
+                        "Expected to find 1 SSM command invocation with "
+                        "ID {} on node {} but found {}: {}".format(
+                            command_id,
+                            node_id,
+                            len(cmd_invocations),
+                            cmd_invocations,
+                        ))
+                cmd_invocation = cmd_invocations[0]
+                if cmd_invocation["Status"] == "Success":
+                    logger.debug("SSM Command ID {} completed successfully."
+                                 .format(command_id))
+                    cmd_invocation_res[node_id] = True
+                    break
+                if num_attempts >= max_attempts:
+                    logger.error(
+                        "Max attempts for command {} exceeded on node {}"
+                        .format(command_id, node_id))
+                    raise botocore.exceptions.WaiterError(
+                        name="ssm_waiter",
+                        reason="Max attempts exceeded",
+                        last_response=cmd_invocation,
+                    )
+                if cmd_invocation["Status"] == "Failed":
+                    logger.debug(f"SSM Command ID {command_id} failed.")
+                    if retry_failed:
+                        logger.debug(f"Retrying in {delay_seconds} seconds.")
+                        response = self._send_command_to_node(
+                            document_name, parameters, node_id)
+                        command_id = response["Command"]["CommandId"]
+                        logger.debug("Sent SSM command ID {} to node {}"
+                                     .format(command_id, node_id))
+                    else:
+                        logger.debug(
+                            f"Ignoring Command ID {command_id} failure.")
+                        cmd_invocation_res[node_id] = False
+                        break
+            time.sleep(delay_seconds)
 
         return cmd_invocation_res
 
     def _replace_config_variables(self, string: str, node_id: str,
@@ -211,7 +251,6 @@
         modified strings in the collection (which is not necessarily equal to
         the number of variables replaced).
         """
-        modified_value_count = 0
         for key in collection:
             if type(collection) is dict:
                 value = collection.get(key)
@@ -222,49 +261,48 @@
             if type(value) is str:
                 collection[index_key] = self._replace_config_variables(
                     value, node_id, cluster_name, region)
-                modified_value_count += (collection[index_key] != value)
             elif type(value) is dict or type(value) is list:
-                collection[index_key], modified_count = self. \
+                collection[index_key] = self. \
                     _replace_all_config_variables(
                         value, node_id, cluster_name, region)
-                modified_value_count += modified_count
-        return collection, modified_value_count
+        return collection
 
-    def _load_config_file(self) -> Dict[str, Any]:
+    def _load_config_file(self, config_type: str) -> Dict[str, Any]:
         """load JSON config file"""
         cloudwatch_config = self.provider_config["cloudwatch"]
-        json_config_file_section = cloudwatch_config.get("agent", {})
+        json_config_file_section = cloudwatch_config.get(config_type, {})
         json_config_file_path = json_config_file_section.get("config", {})
         json_config_path = os.path.abspath(json_config_file_path)
         with open(json_config_path) as f:
             data = json.load(f)
         return data
 
-    def _set_cloudwatch_ssm_config_param(self, parameter_name: str) -> str:
+    def _set_cloudwatch_ssm_config_param(self, parameter_name: str,
+                                         config_type: str) -> str:
         """
         get cloudwatch config for the given param and config type from SSM
-        if it exists, returns empty str if not.
+        if it exists, put it in the SSM param store if not
         """
         try:
             parameter_value = self._get_ssm_param(parameter_name)
         except botocore.exceptions.ClientError as e:
             if e.response["Error"]["Code"] == "ParameterNotFound":
                 logger.info(
-                    "Unified Cloudwatch Agent config file is not found "
+                    "Cloudwatch {} config file is not found "
                     "at SSM parameter store. "
-                    "Checking for Unified Cloudwatch Agent installation")
+                    "Checking for Unified CloudWatch Agent installation".
+                    format(config_type))
                 return self._get_default_empty_config_file_hash()
             else:
-                logger.info(
-                    "Failed to fetch Unified Cloudwatch Agent config from SSM "
-                    "parameter store.")
+                logger.info("Failed to fetch CloudWatch {} config from SSM "
+                            "parameter store.".format(config_type))
                 logger.error(e)
                 raise e
         return parameter_value
 
     def _get_default_empty_config_file_hash(self):
-        default_cwa_config = "{}"
-        parameter_value = self._sha1_hash_json(default_cwa_config)
+        default_cw_config = "{}"
+        parameter_value = self._sha1_hash_json(default_cw_config)
         return parameter_value
 
     def _get_ssm_param(self, parameter_name: str) -> str:
@@ -286,48 +324,54 @@
         sha1_res = hash.hexdigest()
         return sha1_res
 
-    def _sha1_hash_file(self) -> str:
+    def _sha1_hash_file(self, config_type: str) -> str:
         """calculate the config file sha1 hash"""
-        config = self._replace_cwa_config_variables()
+        if config_type == "agent":
+            config = self._replace_cwa_config_variables()
+        if config_type == "dashboard":
+            config = self._replace_dashboard_config_variables()
         value = json.dumps(config)
         sha1_res = self._sha1_hash_json(value)
         return sha1_res
 
-    def _upload_config_to_ssm_and_set_hash_tag(self):
-        """This function should only be called by head node"""
-        data = self._replace_cwa_config_variables()
-        sha1_hash_value = self._sha1_hash_file()
-        self._upload_config_to_ssm(data)
-        self._update_cloudwatch_hash_tag_value(self.node_ids, sha1_hash_value)
+    def _upload_config_to_ssm_and_set_hash_tag(self, config_type: str):
+        if config_type == "agent":
+            data = self._replace_cwa_config_variables()
+        if config_type == "dashboard":
+            data = self._replace_dashboard_config_variables()
+        sha1_hash_value = self._sha1_hash_file(config_type)
+        self._upload_config_to_ssm(data, config_type)
+        self._update_cloudwatch_hash_tag_value(self.node_id, sha1_hash_value,
+                                               config_type)
 
-    def _add_cwa_installed_tag(self, node_ids: List[str]) -> None:
+    def _add_cwa_installed_tag(self, node_id: str) -> None:
         self.ec2_client.create_tags(
-            Resources=node_ids,
+            Resources=[node_id],
             Tags=[{
                 "Key": CLOUDWATCH_AGENT_INSTALLED_TAG,
                 "Value": "True"
             }])
         logger.info("Successfully add Unified Cloudwatch Agent installed "
-                    "tag on {}".format(node_ids))
+                    "tag on {}".format(node_id))
 
-    def _update_cloudwatch_hash_tag_value(self, node_ids: List[str],
-                                          sha1_hash_value: str):
-        hash_key_value = "-".join([CLOUDWATCH_CONFIG_HASH_TAG_BASE, "agent"])
+    def _update_cloudwatch_hash_tag_value(
+            self, node_id: str, sha1_hash_value: str, config_type: str):
+        hash_key_value = "-".join(
+            [CLOUDWATCH_CONFIG_HASH_TAG_BASE, config_type])
         self.ec2_client.create_tags(
-            Resources=node_ids,
+            Resources=[node_id],
             Tags=[{
                 "Key": hash_key_value,
                 "Value": sha1_hash_value
             }])
-        logger.info(
-            "Successfully update Unified Cloudwatch Agent hash tag on {}".
-            format(node_ids))
+        logger.info("Successfully update cloudwatch {} hash tag on {}".format(
+            config_type, node_id))
 
-    def _get_ssm_param_name(self) -> str:
+    def _get_ssm_param_name(self, config_type: str) -> str:
         """return the parameter name for cloudwatch configs"""
         ssm_config_param_name = \
             "AmazonCloudWatch-" + "ray_{}_config_{}". \
-            format("agent", self.cluster_name)
+            format(config_type, self.cluster_name)
         return ssm_config_param_name
 
     def _put_ssm_param(self, parameter: Dict[str, Any],
@@ -341,8 +385,8 @@
             Tier="Intelligent-Tiering",
         )
 
-    def _upload_config_to_ssm(self, param: Dict[str, Any]):
-        param_name = self._get_ssm_param_name()
+    def _upload_config_to_ssm(self, param: Dict[str, Any], config_type: str):
+        param_name = self._get_ssm_param_name(config_type)
         self._put_ssm_param(param, param_name)
 
     def _replace_cwa_config_variables(self) -> Dict[str, Any]:
@@ -350,28 +394,45 @@
         replace known variable occurrences in
         Unified Cloudwatch Agent config file
         """
-        cwa_config = self._load_config_file()
+        cwa_config = self._load_config_file("agent")
         self._replace_all_config_variables(
             cwa_config,
-            self.node_ids[0],
+            self.node_id,
             self.cluster_name,
             self.provider_config["region"],
         )
         return cwa_config
 
+    def _replace_dashboard_config_variables(self) -> Dict[str, Any]:
+        """
+        replace known variable occurrences in CloudWatch Dashboard config file
+        """
+        data = self._load_config_file("dashboard")
+        widgets = []
+        for item in data:
+            item_out = self._replace_all_config_variables(
+                item,
+                self.node_id,
+                self.cluster_name,
+                self.provider_config["region"],
+            )
+            item_out = copy.deepcopy(item)
+            widgets.append(item_out)
+        return widgets
+
     def _restart_cloudwatch_agent(self) -> None:
         """restart Unified Cloudwatch Agent"""
-        cwa_param_name = self._get_ssm_param_name()
+        cwa_param_name = self._get_ssm_param_name("agent")
         logger.info(
             "Restarting Unified Cloudwatch Agent package on {} node(s)."
-            .format(len(self.node_ids)))
+            .format((self.node_id)))
         self._stop_cloudwatch_agent()
         self._start_cloudwatch_agent(cwa_param_name)
 
     def _stop_cloudwatch_agent(self) -> None:
-        """stop Unified Cloudwatch Agent"""
-        logger.info("Stopping Unified Cloudwatch Agent package on {} node(s)."
-                    .format(len(self.node_ids)))
+        """stop Unified CloudWatch Agent """
+        logger.info("Stopping Unified CloudWatch Agent package on node {}."
+                    .format(self.node_id))
         parameters_stop_cwa = {
             "action": ["stop"],
             "mode": ["ec2"],
@@ -381,16 +442,16 @@
         self._ssm_command_waiter(
             "AmazonCloudWatch-ManageAgent",
             parameters_stop_cwa,
-            self.node_ids,
+            self.node_id,
             False,
         )
-        logger.info("Unified Cloudwatch Agent stopped on {} node(s).".format(
-            len(self.node_ids)))
+        logger.info("Unified CloudWatch Agent stopped on node {}.".format(
+            self.node_id))
 
     def _start_cloudwatch_agent(self, cwa_param_name: str) -> None:
-        """start Unified Cloudwatch Agent"""
-        logger.info("Starting Unified Cloudwatch Agent package on {} node(s)."
-                    .format(len(self.node_ids)))
+        """start Unified CloudWatch Agent"""
+        logger.info("Starting Unified CloudWatch Agent package on node {}."
+                    .format(self.node_id))
         parameters_start_cwa = {
             "action": ["configure"],
             "mode": ["ec2"],
@@ -399,21 +460,21 @@
             "optionalRestart": ["yes"],
         }
         self._ssm_command_waiter("AmazonCloudWatch-ManageAgent",
-                                 parameters_start_cwa, self.node_ids)
-        logger.info(
-            "Unified Cloudwatch Agent started successfully on {} node(s)."
-            .format(len(self.node_ids)))
+                                 parameters_start_cwa, self.node_id)
+        logger.info("Unified CloudWatch Agent started successfully on node {}."
+                    .format(self.node_id))
 
     def _setup_cwa(self) -> bool:
         cwa_installed = self._check_cwa_installed_ec2_tag()
         if cwa_installed == "False":
-            res_cwa_installed = self._ensure_cwa_installed_ssm(self.node_ids)
+            res_cwa_installed = self._ensure_cwa_installed_ssm(self.node_id)
             return res_cwa_installed
         else:
             return True
 
-    def _get_head_node_config_hash(self) -> str:
-        hash_key_value = "-".join([CLOUDWATCH_CONFIG_HASH_TAG_BASE, "agent"])
+    def _get_head_node_config_hash(self, config_type: str) -> str:
+        hash_key_value = "-".join(
+            [CLOUDWATCH_CONFIG_HASH_TAG_BASE, config_type])
         filters = copy.deepcopy(
             self._get_current_cluster_session_nodes(self.cluster_name))
         filters.append({
|
@ -432,11 +493,12 @@ class CloudwatchHelper:
|
|||
"{} Error caught when getting value of {} tag on head node".
|
||||
format(e.response["Error"], hash_key_value))
|
||||
|
||||
def _get_cur_node_config_hash(self) -> str:
|
||||
hash_key_value = "-".join([CLOUDWATCH_CONFIG_HASH_TAG_BASE, "agent"])
|
||||
def _get_cur_node_config_hash(self, config_type: str) -> str:
|
||||
hash_key_value = "-".join(
|
||||
[CLOUDWATCH_CONFIG_HASH_TAG_BASE, config_type])
|
||||
try:
|
||||
response = self.ec2_client.describe_instances(
|
||||
InstanceIds=self.node_ids)
|
||||
InstanceIds=[self.node_id])
|
||||
reservations = response["Reservations"]
|
||||
message = "More than 1 response received from " \
|
||||
"describing current node"
|
||||
|
@@ -447,9 +509,9 @@
             hash_value = self._get_default_empty_config_file_hash()
             for tag in tags:
                 if tag["Key"] == hash_key_value:
-                    logger.info("Successfully get Unified Cloudwatch Agent "
-                                "hash tag value from node {}".format(
-                                    self.node_ids))
+                    logger.info(
+                        "Successfully get cloudwatch {} hash tag value from "
+                        "node {}".format(config_type, self.node_id))
                     hash_value = tag["Value"]
             return hash_value
         except botocore.exceptions.ClientError as e:
@@ -457,36 +519,29 @@
                 "{} Error caught when getting hash tag {} tag".format(
                     e.response["Error"], hash_key_value))
 
-    def _ensure_cwa_installed_ssm(self, node_ids: List[str]) -> bool:
+    def _ensure_cwa_installed_ssm(self, node_id: str) -> bool:
         """
         Check if Unified Cloudwatch Agent is installed via ssm run command.
 
         If not, notify user to use an AMI with
         the Unified CloudWatch Agent installed.
         """
         logger.info("Checking Unified Cloudwatch Agent "
-                    "status on {} nodes".format(len(node_ids)))
+                    "status on node {}".format(node_id))
         parameters_status_cwa = {
             "action": ["status"],
             "mode": ["ec2"],
         }
-        self._ec2_health_check_waiter(node_ids)
+        self._ec2_health_check_waiter(node_id)
         cmd_invocation_res = self._ssm_command_waiter(
-            "AmazonCloudWatch-ManageAgent", parameters_status_cwa, node_ids,
+            "AmazonCloudWatch-ManageAgent", parameters_status_cwa, node_id,
             False)
-        uninstalled_nodes = []
-        installed_nodes = []
-        for node_id, res in cmd_invocation_res.items():
-            if not res:
-                uninstalled_nodes.append(node_id)
-            else:
-                installed_nodes.append(node_id)
-        if len(uninstalled_nodes) > 0:
+        cwa_installed = cmd_invocation_res.get(node_id, False)
+        if not cwa_installed:
             logger.warning(
                 "Unified CloudWatch Agent not installed on {}. "
                 "Ray logs, metrics not picked up. "
                 "Please use an AMI with Unified CloudWatch Agent installed."
-                .format(uninstalled_nodes))
+                .format(node_id))
             return False
         else:
             return True
@@ -504,11 +559,12 @@
 
     def _check_cwa_installed_ec2_tag(self) -> List[str]:
         """
         Check if Unified Cloudwatch Agent is installed.
         Filtering all nodes to get nodes
         without Unified CloudWatch Agent installed
         """
         try:
             response = self.ec2_client.describe_instances(
-                InstanceIds=self.node_ids)
+                InstanceIds=[self.node_id])
             reservations = response["Reservations"]
             message = "More than 1 response received from " \
                       "describing current node"
@@ -519,15 +575,15 @@
             cwa_installed = str(False)
             for tag in tags:
                 if tag["Key"] == CLOUDWATCH_AGENT_INSTALLED_TAG:
-                    logger.info("Unified Cloudwatch Agent is installed on "
-                                "node {}".format(self.node_ids))
+                    logger.info("Unified CloudWatch Agent is installed on "
+                                "node {}".format(self.node_id))
                     cwa_installed = tag["Value"]
             return cwa_installed
         except botocore.exceptions.ClientError as e:
             logger.warning(
-                "{} Error caught when getting Unified Cloudwatch Agent status "
-                "based on {} tag".format(e.response["Error"],
-                                         CLOUDWATCH_AGENT_INSTALLED_TAG))
+                "{} Error caught when getting Unified CloudWatch Agent "
+                "status based on {} tag".format(
+                    e.response["Error"], CLOUDWATCH_AGENT_INSTALLED_TAG))
 
     @staticmethod
     def resolve_instance_profile_name(
@@ -545,7 +601,7 @@
         doesn't exist.
         """
         cwa_cfg_exists = CloudwatchHelper.cloudwatch_config_exists(
-            config, "config")
+            config, "agent")
         return CLOUDWATCH_RAY_INSTANCE_PROFILE if cwa_cfg_exists \
             else default_instance_profile_name
 
@@ -563,7 +619,7 @@
         default ray iam role name if cloudwatch config file doesn't exist.
         """
         cwa_cfg_exists = CloudwatchHelper.cloudwatch_config_exists(
-            config, "config")
+            config, "agent")
         return CLOUDWATCH_RAY_IAM_ROLE if cwa_cfg_exists \
             else default_iam_role_name
 
@@ -583,7 +639,7 @@
         cluster config file.
         """
         cwa_cfg_exists = CloudwatchHelper.cloudwatch_config_exists(
-            config, "config")
+            config, "agent")
         if cwa_cfg_exists:
             cloudwatch_managed_policy = {
                 "Version": "2012-10-17",
@@ -624,7 +680,7 @@
 
     @staticmethod
     def cloudwatch_config_exists(config: Dict[str, Any],
-                                 config_key_name: str) -> bool:
+                                 config_type: str) -> bool:
         """Check if CloudWatch configuration was specified by the user
         in their cluster config file.
 
@@ -633,12 +689,11 @@
 
         Args:
             config: provider section of cluster config file.
-            config_key_name: config file name.
+            config_type: type of CloudWatch config file.
 
         Returns:
             True if config file is specified by user.
             False if config file is not specified.
         """
-        cfg = config.get("cloudwatch", {}).get("agent",
-                                               {}).get(config_key_name)
+        cfg = config.get("cloudwatch", {}).get(config_type, {}).get("config")
         return bool(cfg)
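The update logic above hinges on two small conventions that now take a `config_type` argument: the SSM parameter name under which each config is cached, and the SHA-1 hash used to detect local changes. A standalone sketch of both, assuming the same naming and hashing scheme as `_get_ssm_param_name` and `_sha1_hash_json` in the diff:

    import hashlib
    import json

    def ssm_param_name(config_type: str, cluster_name: str) -> str:
        # One cached config per type per cluster, e.g.
        # "AmazonCloudWatch-ray_dashboard_config_my-cluster".
        return "AmazonCloudWatch-" + "ray_{}_config_{}".format(
            config_type, cluster_name)

    def config_hash(config: dict) -> str:
        # Hash the serialized JSON; comparing this against the hash of the
        # SSM-cached copy decides whether an upload, agent restart, or
        # dashboard put is needed.
        return hashlib.sha1(json.dumps(config).encode("utf-8")).hexdigest()

    print(ssm_param_name("dashboard", "my-cluster"))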
@@ -361,7 +361,7 @@ class AWSNodeProvider(NodeProvider):
                     "Value": v,
                 })
             if CloudwatchHelper.cloudwatch_config_exists(self.provider_config,
-                                                         "config"):
+                                                         "agent"):
                 cwa_installed = self._check_ami_cwa_installation(node_config)
                 if cwa_installed:
                     tag_pairs.extend([{
@@ -307,7 +307,7 @@ class NodeUpdater:
             from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper \
                 import CloudwatchHelper
             CloudwatchHelper(self.provider.provider_config,
-                             [self.node_id], self.provider.cluster_name). \
+                             self.node_id, self.provider.cluster_name). \
                 update_from_config(self.is_head_node)
 
         if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
@@ -3,10 +3,6 @@
       "metrics_collection_interval":60,
       "run_as_user":"root"
    },
-   "csm":{
-      "memory_limit_in_mb":20,
-      "port":31000
-   },
    "logs":{
       "metrics_collected": {
          "prometheus": {
@@ -116,60 +112,72 @@
          }
       }
    },
-   "metrics":{
-      "append_dimensions":{
-         "AutoScalingGroupName":"${aws:AutoScalingGroupName}",
-         "InstanceId":"${aws:InstanceId}"
-      },
-      "metrics_collected":{
-         "collectd":{
-            "metrics_aggregation_interval":60
-         },
-         "cpu":{
-            "measurement":[
-               "usage_active",
-               "usage_system",
-               "usage_user",
-               "usage_idle",
-               "time_active",
-               "time_system",
-               "time_user",
-               "time_idle"
-            ]
-         },
-         "processes":{
-            "measurement":[
-               "processes_running",
-               "processes_sleeping",
-               "processes_zombies",
-               "processes_dead",
-               "processes_total"
-            ],
-            "metrics_collection_interval":60,
-            "resources":[
-               "*"
-            ]
-         },
-         "disk":{
-            "measurement":[
-               "disk_used_percent"
-            ],
-            "metrics_collection_interval":60,
-            "resources":[
-               "*"
-            ]
-         },
-         "mem":{
-            "measurement":[
-               "mem_used_percent"
-            ],
-            "metrics_collection_interval":60
-         },
-         "statsd":{
-            "metrics_aggregation_interval":60,
-            "metrics_collection_interval":10,
-            "service_address":":8125"
-         }
-      }
-   }
-}
+   "metrics": {
+      "namespace": "{cluster_name}-ray-CWAgent",
+      "aggregation_dimensions": [
+         [
+            "InstanceId"
+         ]
+      ],
+      "append_dimensions": {
+         "AutoScalingGroupName": "${aws:AutoScalingGroupName}",
+         "InstanceId": "${aws:InstanceId}"
+      },
+      "metrics_collected": {
+         "collectd": {
+            "metrics_aggregation_interval": 60
+         },
+         "cpu": {
+            "measurement": [
+               "usage_active",
+               "usage_system",
+               "usage_user",
+               "usage_idle",
+               "time_active",
+               "time_system",
+               "time_user",
+               "time_idle"
+            ],
+            "resources": [
+               "*"
+            ]
+         },
+         "processes": {
+            "measurement": [
+               "processes_running",
+               "processes_sleeping",
+               "processes_zombies",
+               "processes_dead",
+               "processes_total"
+            ],
+            "metrics_collection_interval": 60,
+            "resources": [
+               "*"
+            ]
+         },
+         "disk": {
+            "measurement": [
+               "disk_used_percent"
+            ],
+            "metrics_collection_interval": 60,
+            "resources": [
+               "/"
+            ]
+         },
+         "mem": {
+            "measurement": [
+               "mem_used_percent"
+            ],
+            "metrics_collection_interval": 60,
+            "resources": [
+               "*"
+            ]
+         },
+         "statsd": {
+            "metrics_aggregation_interval": 60,
+            "metrics_collection_interval": 10,
+            "service_address": ":8125"
+         }
+      }
+   }
+}
@@ -0,0 +1,238 @@
+[
+    {
+        "type":"explorer",
+        "x":12,
+        "y":18,
+        "width":12,
+        "height":6,
+        "properties": {
+            "metrics": [
+                {
+                    "metricName": "CPUUtilization",
+                    "resourceType": "AWS::EC2::Instance",
+                    "stat": "Average"
+                }
+            ],
+            "aggregateBy": {
+                "key": "*",
+                "func": "SUM"
+            },
+            "labels": [
+                {
+                    "key": "cloudwatch-agent-installed",
+                    "value": "True"
+                },
+                {
+                    "key": "ray-cluster-name",
+                    "value": "{cluster_name}"
+                }
+            ],
+            "widgetOptions": {
+                "legend": {
+                    "position": "bottom"
+                },
+                "view": "timeSeries",
+                "stacked": false,
+                "rowsPerPage": 1,
+                "widgetsPerRow": 1
+            },
+            "title":"Cluster CPU Utilization"
+        }
+    },
+    {
+        "type":"explorer",
+        "x":0,
+        "y":18,
+        "width":12,
+        "height":6,
+        "properties": {
+            "metrics": [
+                {
+                    "metricName": "CPUUtilization",
+                    "resourceType": "AWS::EC2::Instance",
+                    "stat": "Average"
+                }
+            ],
+            "aggregateBy": {
+                "key": "*",
+                "func": "AVG"
+            },
+            "labels": [
+                {
+                    "key": "cloudwatch-agent-installed",
+                    "value": "True"
+                },
+                {
+                    "key": "ray-cluster-name",
+                    "value": "{cluster_name}"
+                }
+            ],
+            "widgetOptions": {
+                "legend": {
+                    "position": "bottom"
+                },
+                "view": "timeSeries",
+                "stacked": false,
+                "rowsPerPage": 1,
+                "widgetsPerRow": 1
+            },
+            "title":"Single Node CPU Utilization (Avg and Max)"
+        }
+    },
+    {
+        "type":"metric",
+        "x":12,
+        "y":6,
+        "width":12,
+        "height":6,
+        "properties":{
+            "view":"timeSeries",
+            "metrics":[
+                [ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process sum", "id": "e1" } ],
+                [ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process sum", "id": "e2" } ]
+            ],
+            "region":"{region}",
+            "stat":"Average",
+            "period":60,
+            "title":"Cluster Processes"
+        }
+    },
+    {
+        "type":"metric",
+        "x":0,
+        "y":6,
+        "width":12,
+        "height":6,
+        "properties":{
+            "view":"timeSeries",
+            "metrics":[
+                [ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process average", "id": "e3" } ],
+                [ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process average", "id": "e4" } ],
+                [ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process maximum", "id": "e5" } ],
+                [ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process maximum", "id": "e6" } ]
+            ],
+            "region":"{region}",
+            "stat":"Average",
+            "period":60,
+            "title":"Single Node Processes (Avg and Max)"
+        }
+    },
+    {
+        "type":"metric",
+        "x":12,
+        "y":12,
+        "width":12,
+        "height":6,
+        "properties":{
+            "view":"timeSeries",
+            "stacked":false,
+            "metrics":[
+                [ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Average', 300))", "label": "cluster disk used percent sum", "id": "e7", "period": 300 } ]
+
+            ],
+            "region":"{region}",
+            "title":"Cluster Disk Usage"
+        }
+    },
+    {
+        "type":"metric",
+        "x":0,
+        "y":12,
+        "width":12,
+        "height":6,
+        "properties":{
+            "view":"timeSeries",
+            "stacked":false,
+            "metrics":[
+                [ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Average', 300))", "id": "e8", "label": "cluster disk used percent average", "period": 300 } ],
+                [ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Maximum', 300))", "id": "e9", "label": "cluster disk used percent maximum", "period": 300 } ]
+
+            ],
+            "region":"{region}",
+            "title":"Single Node Disk Usage (Avg and Max)"
+        }
+    },
+    {
+        "type":"metric",
+        "x":12,
+        "y":18,
+        "width":12,
+        "height":6,
+        "properties": {
+            "metrics": [
+                [ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Average', 300))", "id": "e10", "label": "cluster mem used percent sum", "period": 300 } ]
+
+            ],
+            "view": "timeSeries",
+            "stacked": false,
+            "region": "{region}",
+            "stat": "Maximum",
+            "period": 300,
+            "start": "-PT2H",
+            "end": "P0D",
+            "title": "Cluster Memory Usage"
+        }
+    },
+    {
+        "type":"metric",
+        "x":0,
+        "y":18,
+        "width":12,
+        "height":6,
+        "properties": {
+            "metrics": [
+                [ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Average', 300))", "id": "e11", "label": "cluster mem used percent average", "period": 300 } ],
+                [ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Maximum', 300))", "id": "e12", "label": "cluster mem used percent maximum", "period": 300 } ]
+            ],
+            "view": "timeSeries",
+            "stacked": false,
+            "region": "{region}",
+            "stat": "Maximum",
+            "period": 300,
+            "start": "-PT2H",
+            "end": "P0D",
+            "title": "Single Node Memory Usage (Avg and Max)"
+        }
+    },
+    {
+        "height": 6,
+        "width": 12,
+        "y": 0,
+        "x": 0,
+        "type": "metric",
+        "properties": {
+            "metrics": [
+                [ { "expression": "SUM(SEARCH('{{cluster_name}-ray-prometheus,instance} ray_node_cpu_count', 'Maximum', 300))", "label": "cluster cpu sum", "id": "e13" } ]
+            ],
+            "view": "timeSeries",
+            "stacked": false,
+            "region": "{region}",
+            "stat": "Maximum",
+            "period": 300,
+            "start": "-PT2H",
+            "end": "P0D",
+            "title": "Cluster CPUs"
+        }
+    },
+    {
+        "height": 6,
+        "width": 12,
+        "y": 0,
+        "x": 12,
+        "type": "metric",
+        "properties": {
+            "metrics": [
+                [ { "expression": "SUM(SEARCH('{{cluster_name}-ray-prometheus,instance} object_store_available_memory', 'Average', 300))", "label": "cluster object store available memory sum", "id": "e14" } ]
+            ],
+            "view": "timeSeries",
+            "stacked": false,
+            "region": "{region}",
+            "stat": "Maximum",
+            "period": 300,
+            "start": "-PT2H",
+            "end": "P0D",
+            "title": "Cluster Object Store Available Memory"
+        }
+    }
+]
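Note the `{cluster_name}` and `{region}` placeholders throughout the widget definitions above: `_replace_dashboard_config_variables` resolves them against the running cluster before the dashboard body is uploaded. Conceptually it amounts to the following simplified sketch (the PR's actual helper walks nested dicts and lists string-by-string rather than round-tripping through a JSON string):

    import json

    def fill_placeholders(widget: dict, cluster_name: str, region: str) -> dict:
        # Serialize, substitute the documented placeholders, and parse back.
        text = json.dumps(widget)
        text = text.replace("{cluster_name}", cluster_name)
        text = text.replace("{region}", region)
        return json.loads(text)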
@@ -14,8 +14,9 @@ provider:
     # We depend on AWS Systems Manager (SSM) to deploy CloudWatch configuration updates to your cluster,
     # with relevant configuration created or updated in the SSM Parameter Store during `ray up`.
 
-    # The `AmazonCloudWatch-ray_agent_config_{cluster_name}` SSM Parameter Store Config Key is used to
-    # store a remote cache of the last Unified CloudWatch Agent config applied.
+    # We support two CloudWatch related config type under this cloudwatch section: agent and dashboard.
+    # The `AmazonCloudWatch-ray_{config_type}_config_{cluster_name}` SSM Parameter Store Config Key is used to
+    # store a remote cache of the last Unified CloudWatch config applied.
 
     # Every time you run `ray up` to update your cluster, we compare your local CloudWatch config file contents
     # to the SSM Parameter Store's contents for that config and, if they differ, then the associated CloudWatch
@@ -39,6 +40,17 @@ provider:
                 max_attempts: 120
                 # Seconds to wait between each Unified CloudWatch Agent SSM config update attempt.
                 delay_seconds: 30
+        # For CloudWatch Dashboard config files, we will also replace references to
+        # `{region}` with your cluster's region name, and `{cluster_name}` with your cluster name.
+        dashboard:
+            # CloudWatch Dashboard name
+            # Per-cluster level dashboard is created and dashboard name will be
+            # `{your_cluster_name}-example-dashboard-name` as default
+            name: "example-dashboard-name"
+            # The CloudWatch Dashboard is defined via the config file described
+            # at https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/CloudWatch-Dashboard-Body-Structure.html.
+            # Path to the CloudWatch Dashboard config file
+            config: "cloudwatch/example-cloudwatch-dashboard-config.json"
 
 # How Ray will authenticate with newly launched nodes.
 auth:
@@ -201,6 +201,16 @@
                                 "description": "Seconds to wait between each Unified CloudWatch Agent installation attempt."
                             }
                         }
                     },
+                    "dashboard": {
+                        "name": {
+                            "type": ["string", "null"],
+                            "description": "User defined CloudWatch Dashboard name."
+                        },
+                        "config": {
+                            "type": ["string", "null"],
+                            "description": "Path to CloudWatch Dashboard config file. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/CloudWatch-Dashboard-Body-Structure.html for additional details."
+                        }
+                    }
                 }
             }