[autoscaler] AWS Autoscaler CloudWatch Dashboard support (#20266)

These changes enable automatic creation and update of CloudWatch dashboards when provisioning AWS Autoscaler clusters. They allow AWS Autoscaler users to:

1. Get rapid insights into their cluster state via CloudWatch dashboards.
2. Update their CloudWatch dashboard JSON configuration files at `ray up` execution time; see the config sketch below.
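For illustration, here is a minimal sketch of the provider config shape these changes consume: the `cloudwatch` provider section with its new `dashboard` subsection. Region and file paths are illustrative; the full example YAML appears later in this diff.

```python
# Hedged sketch of the "cloudwatch" provider section read by CloudwatchHelper.
provider_config = {
    "region": "us-west-2",
    "cloudwatch": {
        "agent": {
            "config": "cloudwatch/example-cloudwatch-agent-config.json",
        },
        "dashboard": {
            "name": "example-dashboard-name",
            "config": "cloudwatch/example-cloudwatch-dashboard-config.json",
        },
    },
}
```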

Notes:
1. This is a follow-up to #18619 and adds CloudWatch dashboard support.
Zyiqin-Miranda 2022-01-10 10:18:53 -08:00 committed by GitHub
parent 6420c75fd2
commit 71fae21e8e
7 changed files with 563 additions and 240 deletions

View file

@@ -21,10 +21,9 @@ CLOUDWATCH_CONFIG_HASH_TAG_BASE = "cloudwatch-config-hash"
class CloudwatchHelper:
def __init__(self, provider_config: Dict[str, Any], node_ids: List[str],
def __init__(self, provider_config: Dict[str, Any], node_id: str,
cluster_name: str) -> None:
# dedupe and sort node IDs to support deterministic unit test stubs
self.node_ids = sorted(set(node_ids))
self.node_id = node_id
self.cluster_name = cluster_name
self.provider_config = provider_config
region = provider_config["region"]
@@ -41,10 +40,13 @@ class CloudwatchHelper:
is_head_node: whether this node is the head node.
"""
if CloudwatchHelper.cloudwatch_config_exists(self.provider_config,
"config"):
self._update_cloudwatch_agent_config(is_head_node)
"agent"):
self._update_cloudwatch_config(is_head_node, "agent")
if CloudwatchHelper.cloudwatch_config_exists(self.provider_config,
"dashboard"):
self._update_cloudwatch_config(is_head_node, "dashboard")
def _ec2_health_check_waiter(self, node_ids: List[str]) -> None:
def _ec2_health_check_waiter(self, node_id: str) -> None:
# wait for all EC2 instance checks to complete
try:
logger.info(
@@ -52,59 +54,101 @@ class CloudwatchHelper:
"configuring Unified Cloudwatch Agent. This may take a few "
"minutes...")
waiter = self.ec2_client.get_waiter("instance_status_ok")
waiter.wait(InstanceIds=node_ids)
waiter.wait(InstanceIds=[node_id])
except botocore.exceptions.WaiterError as e:
logger.error(
"Failed while waiting for EC2 instance checks to complete: {}".
format(e.message))
raise e
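For reference, a standalone sketch of the waiter pattern used above; the region and instance ID are illustrative:

```python
import boto3
from botocore.exceptions import WaiterError

ec2 = boto3.client("ec2", region_name="us-west-2")
waiter = ec2.get_waiter("instance_status_ok")
try:
    # Blocks until the instance passes both EC2 status checks.
    waiter.wait(InstanceIds=["i-0123456789abcdef0"])
except WaiterError as e:
    print("EC2 instance checks did not complete:", e)
```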
def _update_cloudwatch_agent_config(self, is_head_node: bool) -> None:
""" check whether update operations are needed.
def _update_cloudwatch_config(self, is_head_node: bool,
config_type: str) -> None:
"""Update remote CloudWatch configs at Parameter Store,
update hash tag value on node and perform associated operations
at CloudWatch console if local CloudWatch configs change.
Args:
is_head_node: whether this node is the head node.
config_type: CloudWatch config file type.
"""
cwa_installed = self._setup_cwa()
param_name = self._get_ssm_param_name()
param_name = self._get_ssm_param_name(config_type)
if cwa_installed:
if is_head_node:
cw_config_ssm = self._set_cloudwatch_ssm_config_param(
param_name)
cur_cw_config_hash = self._sha1_hash_file()
param_name, config_type)
cur_cw_config_hash = self._sha1_hash_file(config_type)
ssm_cw_config_hash = self._sha1_hash_json(cw_config_ssm)
# check if user updated Unified Cloudwatch Agent config file.
# if so, perform corresponding actions.
if cur_cw_config_hash != ssm_cw_config_hash:
logger.info(
"Unified Cloudwatch Agent config file has changed.")
self._upload_config_to_ssm_and_set_hash_tag()
self._restart_cloudwatch_agent()
"Cloudwatch {} config file has changed.".format(
config_type))
self._upload_config_to_ssm_and_set_hash_tag(config_type)
if config_type == "agent":
self._restart_cloudwatch_agent()
elif config_type == "dashboard":
self._put_cloudwatch_dashboard()
else:
head_node_hash = self._get_head_node_config_hash()
cur_node_hash = self._get_cur_node_config_hash()
head_node_hash = self._get_head_node_config_hash(config_type)
cur_node_hash = self._get_cur_node_config_hash(config_type)
if head_node_hash != cur_node_hash:
logger.info(
"Unified Cloudwatch Agent config file has changed.")
self._restart_cloudwatch_agent()
"Cloudwatch {} config file has changed.".format(
config_type))
if config_type == "agent":
self._restart_cloudwatch_agent()
self._update_cloudwatch_hash_tag_value(
self.node_ids, head_node_hash)
self.node_id, head_node_hash, config_type)
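The drift check above boils down to comparing SHA-1 hashes of the local config file against the copy cached in SSM (or against the head node's hash tag). A minimal sketch of that comparison, with helper and variable names hypothetical:

```python
import hashlib
import json

def sha1_hash_json(value: str) -> str:
    # Mirrors the helper's hashing: SHA-1 over the serialized JSON string.
    h = hashlib.sha1()
    h.update(value.encode("utf-8"))
    return h.hexdigest()

local_hash = sha1_hash_json(json.dumps({"widgets": []}))
remote_hash = sha1_hash_json("{}")  # hash of the default empty config
if local_hash != remote_hash:
    print("config changed: re-upload to SSM and refresh agent/dashboard")
```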
def _send_command_to_nodes(self, document_name: str, parameters: List[str],
node_ids: List[str]) -> Dict[str, Any]:
def _put_cloudwatch_dashboard(self) -> Dict[str, Any]:
"""put dashboard to cloudwatch console"""
cloudwatch_config = self.provider_config["cloudwatch"]
dashboard_config = cloudwatch_config \
.get("dashboard", {})
dashboard_name_cluster = dashboard_config.get("name",
self.cluster_name)
dashboard_name = self.cluster_name + "-" + dashboard_name_cluster
widgets = self._replace_dashboard_config_variables()
response = self.cloudwatch_client.put_dashboard(
DashboardName=dashboard_name,
DashboardBody=json.dumps({
"widgets": widgets
}))
issue_count = len(response.get("DashboardValidationMessages", []))
if issue_count > 0:
for issue in response.get("DashboardValidationMessages"):
logging.error("Error in dashboard config: {} - {}".format(
issue["Message"], issue["DataPath"]))
raise Exception(
"Errors in dashboard configuration: {} issues raised".format(
issue_count))
else:
logger.info("Successfully put dashboard to CloudWatch console")
return response
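The underlying boto3 call here is `CloudWatch.Client.put_dashboard`. A minimal standalone sketch, with the dashboard name and region illustrative:

```python
import boto3
import json

cloudwatch = boto3.client("cloudwatch", region_name="us-west-2")
response = cloudwatch.put_dashboard(
    DashboardName="my-cluster-example-dashboard-name",
    DashboardBody=json.dumps({"widgets": []}),
)
# put_dashboard can succeed while still reporting validation issues.
for issue in response.get("DashboardValidationMessages", []):
    print(issue["DataPath"], issue["Message"])
```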
def _send_command_to_node(self, document_name: str, parameters: List[str],
node_id: str) -> Dict[str, Any]:
""" send SSM command to the given nodes """
logger.debug("Sending SSM command to {} node(s). Document name: {}. "
"Parameters: {}.".format(
len(node_ids), document_name, parameters))
"Parameters: {}.".format(node_id, document_name,
parameters))
response = self.ssm_client.send_command(
InstanceIds=node_ids,
InstanceIds=[node_id],
DocumentName=document_name,
Parameters=parameters,
MaxConcurrency=str(min(len(node_ids), 100)),
MaxConcurrency=str(min(len(node_id), 1)),
MaxErrors="0")
return response
def _ssm_command_waiter(self,
document_name: str,
parameters: List[str],
node_ids: List[str],
node_id: str,
retry_failed: bool = True) -> bool:
""" wait for SSM command to complete on all cluster nodes """
@@ -113,76 +157,72 @@ class CloudwatchHelper:
# exist instead of failing immediately, and by resubmitting
# any failed command until all retry attempts are exhausted
# by default.
response = self._send_command_to_nodes(document_name, parameters,
node_ids)
response = self._send_command_to_node(document_name, parameters,
node_id)
command_id = response["Command"]["CommandId"]
cloudwatch_config = self.provider_config["cloudwatch"]
agent_retryer_config = cloudwatch_config \
.get("agent", {}) \
.get("agent") \
.get("retryer", {})
max_attempts = agent_retryer_config.get("max_attempts", 120)
delay_seconds = agent_retryer_config.get("delay_seconds", 30)
num_attempts = 0
cmd_invocation_res = {}
for node_id in node_ids:
while True:
num_attempts += 1
logger.debug("Listing SSM command ID {} invocations on node {}"
.format(command_id, node_id))
response = self.ssm_client.list_command_invocations(
CommandId=command_id,
InstanceId=node_id,
)
cmd_invocations = response["CommandInvocations"]
if not cmd_invocations:
logger.debug(
"SSM Command ID {} invocation does not exist. If "
"the command was just started, it may take a "
"few seconds to register.".format(command_id))
else:
if len(cmd_invocations) > 1:
logger.warning(
"Expected to find 1 SSM command invocation with "
"ID {} on node {} but found {}: {}".format(
command_id,
node_id,
len(cmd_invocations),
cmd_invocations,
))
cmd_invocation = cmd_invocations[0]
if cmd_invocation["Status"] == "Success":
while True:
num_attempts += 1
logger.debug("Listing SSM command ID {} invocations on node {}"
.format(command_id, node_id))
response = self.ssm_client.list_command_invocations(
CommandId=command_id,
InstanceId=node_id,
)
cmd_invocations = response["CommandInvocations"]
if not cmd_invocations:
logger.debug("SSM Command ID {} invocation does not exist. If "
"the command was just started, it may take a "
"few seconds to register.".format(command_id))
else:
if len(cmd_invocations) > 1:
logger.warning(
"Expected to find 1 SSM command invocation with "
"ID {} on node {} but found {}: {}".format(
command_id,
node_id,
len(cmd_invocations),
cmd_invocations,
))
cmd_invocation = cmd_invocations[0]
if cmd_invocation["Status"] == "Success":
logger.debug("SSM Command ID {} completed successfully."
.format(command_id))
cmd_invocation_res[node_id] = True
break
if num_attempts >= max_attempts:
logger.error(
"Max attempts for command {} exceeded on node {}"
.format(command_id, node_id))
raise botocore.exceptions.WaiterError(
name="ssm_waiter",
reason="Max attempts exceeded",
last_response=cmd_invocation,
)
if cmd_invocation["Status"] == "Failed":
logger.debug(f"SSM Command ID {command_id} failed.")
if retry_failed:
logger.debug(f"Retrying in {delay_seconds} seconds.")
response = self._send_command_to_node(
document_name, parameters, node_id)
command_id = response["Command"]["CommandId"]
logger.debug("Sent SSM command ID {} to node {}"
.format(command_id, node_id))
else:
logger.debug(
"SSM Command ID {} completed successfully."
.format(command_id))
cmd_invocation_res[node_id] = True
f"Ignoring Command ID {command_id} failure.")
cmd_invocation_res[node_id] = False
break
if num_attempts >= max_attempts:
logger.error(
"Max attempts for command {} exceeded on node {}"
.format(command_id, node_id))
raise botocore.exceptions.WaiterError(
name="ssm_waiter",
reason="Max attempts exceeded",
last_response=cmd_invocation,
)
if cmd_invocation["Status"] == "Failed":
logger.debug(f"SSM Command ID {command_id} failed.")
if retry_failed:
logger.debug(
f"Retrying in {delay_seconds} seconds.")
response = self._send_command_to_nodes(
document_name, parameters, [node_id])
command_id = response["Command"]["CommandId"]
logger.debug("Sent SSM command ID {} to node {}"
.format(command_id, node_id))
else:
logger.debug(
f"Ignoring Command ID {command_id} failure.")
cmd_invocation_res[node_id] = False
break
time.sleep(delay_seconds)
time.sleep(delay_seconds)
return cmd_invocation_res
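A condensed sketch of the send-and-poll pattern implemented above, using the real SSM APIs; the instance ID is hypothetical and the 30-second sleep matches the default `delay_seconds`:

```python
import time

import boto3

ssm = boto3.client("ssm", region_name="us-west-2")
instance_id = "i-0123456789abcdef0"
cmd = ssm.send_command(
    InstanceIds=[instance_id],
    DocumentName="AmazonCloudWatch-ManageAgent",
    Parameters={"action": ["status"], "mode": ["ec2"]},
)
command_id = cmd["Command"]["CommandId"]
while True:
    invocations = ssm.list_command_invocations(
        CommandId=command_id, InstanceId=instance_id)["CommandInvocations"]
    # The invocation may take a few seconds to register after send_command.
    if invocations and invocations[0]["Status"] in ("Success", "Failed"):
        break
    time.sleep(30)
```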
def _replace_config_variables(self, string: str, node_id: str,
@@ -211,7 +251,6 @@ class CloudwatchHelper:
modified strings in the collection (which is not necessarily equal to
the number of variables replaced).
"""
modified_value_count = 0
for key in collection:
if type(collection) is dict:
value = collection.get(key)
@@ -222,49 +261,48 @@ class CloudwatchHelper:
if type(value) is str:
collection[index_key] = self._replace_config_variables(
value, node_id, cluster_name, region)
modified_value_count += (collection[index_key] != value)
elif type(value) is dict or type(value) is list:
collection[index_key], modified_count = self. \
collection[index_key] = self. \
_replace_all_config_variables(
value, node_id, cluster_name, region)
modified_value_count += modified_count
return collection, modified_value_count
return collection
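A simplified, self-contained version of this recursive substitution (it returns new objects rather than mutating in place). The `{instance_id}` placeholder is an assumption here; only `{cluster_name}` and `{region}` are documented for dashboards in the example YAML below:

```python
from typing import Any

def replace_variables(obj: Any, node_id: str, cluster_name: str,
                      region: str) -> Any:
    # Walk nested dicts/lists and substitute placeholders in every string.
    if isinstance(obj, str):
        return (obj.replace("{instance_id}", node_id)
                   .replace("{cluster_name}", cluster_name)
                   .replace("{region}", region))
    if isinstance(obj, dict):
        return {k: replace_variables(v, node_id, cluster_name, region)
                for k, v in obj.items()}
    if isinstance(obj, list):
        return [replace_variables(v, node_id, cluster_name, region)
                for v in obj]
    return obj
```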
def _load_config_file(self) -> Dict[str, Any]:
def _load_config_file(self, config_type: str) -> Dict[str, Any]:
"""load JSON config file"""
cloudwatch_config = self.provider_config["cloudwatch"]
json_config_file_section = cloudwatch_config.get("agent", {})
json_config_file_section = cloudwatch_config.get(config_type, {})
json_config_file_path = json_config_file_section.get("config", {})
json_config_path = os.path.abspath(json_config_file_path)
with open(json_config_path) as f:
data = json.load(f)
return data
def _set_cloudwatch_ssm_config_param(self, parameter_name: str) -> str:
def _set_cloudwatch_ssm_config_param(self, parameter_name: str,
config_type: str) -> str:
"""
get cloudwatch config for the given param and config type from SSM
if it exists, returns empty str if not.
if it exists, put it in the SSM param store if not
"""
try:
parameter_value = self._get_ssm_param(parameter_name)
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "ParameterNotFound":
logger.info(
"Unified Cloudwatch Agent config file is not found "
"Cloudwatch {} config file is not found "
"at SSM parameter store. "
"Checking for Unified Cloudwatch Agent installation")
"Checking for Unified CloudWatch Agent installation".
format(config_type))
return self._get_default_empty_config_file_hash()
else:
logger.info(
"Failed to fetch Unified Cloudwatch Agent config from SSM "
"parameter store.")
logger.info("Failed to fetch CloudWatch {} config from SSM "
"parameter store.".format(config_type))
logger.error(e)
raise e
return parameter_value
def _get_default_empty_config_file_hash(self):
default_cwa_config = "{}"
parameter_value = self._sha1_hash_json(default_cwa_config)
default_cw_config = "{}"
parameter_value = self._sha1_hash_json(default_cw_config)
return parameter_value
def _get_ssm_param(self, parameter_name: str) -> str:
@@ -286,48 +324,54 @@ class CloudwatchHelper:
sha1_res = hash.hexdigest()
return sha1_res
def _sha1_hash_file(self) -> str:
def _sha1_hash_file(self, config_type: str) -> str:
"""calculate the config file sha1 hash"""
config = self._replace_cwa_config_variables()
if config_type == "agent":
config = self._replace_cwa_config_variables()
if config_type == "dashboard":
config = self._replace_dashboard_config_variables()
value = json.dumps(config)
sha1_res = self._sha1_hash_json(value)
return sha1_res
def _upload_config_to_ssm_and_set_hash_tag(self):
"""This function should only be called by head node"""
data = self._replace_cwa_config_variables()
sha1_hash_value = self._sha1_hash_file()
self._upload_config_to_ssm(data)
self._update_cloudwatch_hash_tag_value(self.node_ids, sha1_hash_value)
def _upload_config_to_ssm_and_set_hash_tag(self, config_type: str):
if config_type == "agent":
data = self._replace_cwa_config_variables()
if config_type == "dashboard":
data = self._replace_dashboard_config_variables()
sha1_hash_value = self._sha1_hash_file(config_type)
self._upload_config_to_ssm(data, config_type)
self._update_cloudwatch_hash_tag_value(self.node_id, sha1_hash_value,
config_type)
def _add_cwa_installed_tag(self, node_ids: List[str]) -> None:
def _add_cwa_installed_tag(self, node_id: str) -> None:
self.ec2_client.create_tags(
Resources=node_ids,
Resources=[node_id],
Tags=[{
"Key": CLOUDWATCH_AGENT_INSTALLED_TAG,
"Value": "True"
}])
logger.info("Successfully add Unified Cloudwatch Agent installed "
"tag on {}".format(node_ids))
"tag on {}".format(node_id))
def _update_cloudwatch_hash_tag_value(self, node_ids: List[str],
sha1_hash_value: str):
hash_key_value = "-".join([CLOUDWATCH_CONFIG_HASH_TAG_BASE, "agent"])
def _update_cloudwatch_hash_tag_value(
self, node_id: str, sha1_hash_value: str, config_type: str):
hash_key_value = "-".join(
[CLOUDWATCH_CONFIG_HASH_TAG_BASE, config_type])
self.ec2_client.create_tags(
Resources=node_ids,
Resources=[node_id],
Tags=[{
"Key": hash_key_value,
"Value": sha1_hash_value
}])
logger.info(
"Successfully update Unified Cloudwatch Agent hash tag on {}".
format(node_ids))
logger.info("Successfully update cloudwatch {} hash tag on {}".format(
config_type, node_id))
def _get_ssm_param_name(self) -> str:
def _get_ssm_param_name(self, config_type: str) -> str:
"""return the parameter name for cloudwatch configs"""
ssm_config_param_name = \
"AmazonCloudWatch-" + "ray_{}_config_{}". \
format("agent", self.cluster_name)
format(config_type, self.cluster_name)
return ssm_config_param_name
def _put_ssm_param(self, parameter: Dict[str, Any],
@@ -341,8 +385,8 @@ class CloudwatchHelper:
Tier="Intelligent-Tiering",
)
def _upload_config_to_ssm(self, param: Dict[str, Any]):
param_name = self._get_ssm_param_name()
def _upload_config_to_ssm(self, param: Dict[str, Any], config_type: str):
param_name = self._get_ssm_param_name(config_type)
self._put_ssm_param(param, param_name)
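The upload itself is a standard SSM `put_parameter` call. A sketch using the parameter naming scheme above, with the cluster name hypothetical:

```python
import boto3
import json

ssm = boto3.client("ssm", region_name="us-west-2")
ssm.put_parameter(
    # "AmazonCloudWatch-ray_{config_type}_config_{cluster_name}"
    Name="AmazonCloudWatch-ray_dashboard_config_my-cluster",
    Type="String",
    Value=json.dumps({"widgets": []}),
    Overwrite=True,
    Tier="Intelligent-Tiering",
)
```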
def _replace_cwa_config_variables(self) -> Dict[str, Any]:
@@ -350,28 +394,45 @@ class CloudwatchHelper:
replace known variable occurrences in
Unified Cloudwatch Agent config file
"""
cwa_config = self._load_config_file()
cwa_config = self._load_config_file("agent")
self._replace_all_config_variables(
cwa_config,
self.node_ids[0],
self.node_id,
self.cluster_name,
self.provider_config["region"],
)
return cwa_config
def _replace_dashboard_config_variables(self) -> Dict[str, Any]:
"""
replace known variable occurrences in CloudWatch Dashboard config file
"""
data = self._load_config_file("dashboard")
widgets = []
for item in data:
item_out = self._replace_all_config_variables(
item,
self.node_id,
self.cluster_name,
self.provider_config["region"],
)
item_out = copy.deepcopy(item)
widgets.append(item_out)
return widgets
def _restart_cloudwatch_agent(self) -> None:
"""restart Unified Cloudwatch Agent"""
cwa_param_name = self._get_ssm_param_name()
cwa_param_name = self._get_ssm_param_name("agent")
logger.info(
"Restarting Unified Cloudwatch Agent package on {} node(s)."
.format(len(self.node_ids)))
.format((self.node_id)))
self._stop_cloudwatch_agent()
self._start_cloudwatch_agent(cwa_param_name)
def _stop_cloudwatch_agent(self) -> None:
"""stop Unified Cloudwatch Agent"""
logger.info("Stopping Unified Cloudwatch Agent package on {} node(s)."
.format(len(self.node_ids)))
"""stop Unified CloudWatch Agent """
logger.info("Stopping Unified CloudWatch Agent package on node {}."
.format(self.node_id))
parameters_stop_cwa = {
"action": ["stop"],
"mode": ["ec2"],
@@ -381,16 +442,16 @@ class CloudwatchHelper:
self._ssm_command_waiter(
"AmazonCloudWatch-ManageAgent",
parameters_stop_cwa,
self.node_ids,
self.node_id,
False,
)
logger.info("Unified Cloudwatch Agent stopped on {} node(s).".format(
len(self.node_ids)))
logger.info("Unified CloudWatch Agent stopped on node {}.".format(
self.node_id))
def _start_cloudwatch_agent(self, cwa_param_name: str) -> None:
"""start Unified Cloudwatch Agent"""
logger.info("Starting Unified Cloudwatch Agent package on {} node(s)."
.format(len(self.node_ids)))
"""start Unified CloudWatch Agent"""
logger.info("Starting Unified CloudWatch Agent package on node {}."
.format(self.node_id))
parameters_start_cwa = {
"action": ["configure"],
"mode": ["ec2"],
@@ -399,21 +460,21 @@ class CloudwatchHelper:
"optionalRestart": ["yes"],
}
self._ssm_command_waiter("AmazonCloudWatch-ManageAgent",
parameters_start_cwa, self.node_ids)
logger.info(
"Unified Cloudwatch Agent started successfully on {} node(s)."
.format(len(self.node_ids)))
parameters_start_cwa, self.node_id)
logger.info("Unified CloudWatch Agent started successfully on node {}."
.format(self.node_id))
def _setup_cwa(self) -> bool:
cwa_installed = self._check_cwa_installed_ec2_tag()
if cwa_installed == "False":
res_cwa_installed = self._ensure_cwa_installed_ssm(self.node_ids)
res_cwa_installed = self._ensure_cwa_installed_ssm(self.node_id)
return res_cwa_installed
else:
return True
def _get_head_node_config_hash(self) -> str:
hash_key_value = "-".join([CLOUDWATCH_CONFIG_HASH_TAG_BASE, "agent"])
def _get_head_node_config_hash(self, config_type: str) -> str:
hash_key_value = "-".join(
[CLOUDWATCH_CONFIG_HASH_TAG_BASE, config_type])
filters = copy.deepcopy(
self._get_current_cluster_session_nodes(self.cluster_name))
filters.append({
@@ -432,11 +493,12 @@ class CloudwatchHelper:
"{} Error caught when getting value of {} tag on head node".
format(e.response["Error"], hash_key_value))
def _get_cur_node_config_hash(self) -> str:
hash_key_value = "-".join([CLOUDWATCH_CONFIG_HASH_TAG_BASE, "agent"])
def _get_cur_node_config_hash(self, config_type: str) -> str:
hash_key_value = "-".join(
[CLOUDWATCH_CONFIG_HASH_TAG_BASE, config_type])
try:
response = self.ec2_client.describe_instances(
InstanceIds=self.node_ids)
InstanceIds=[self.node_id])
reservations = response["Reservations"]
message = "More than 1 response received from " \
"describing current node"
@@ -447,9 +509,9 @@ class CloudwatchHelper:
hash_value = self._get_default_empty_config_file_hash()
for tag in tags:
if tag["Key"] == hash_key_value:
logger.info("Successfully get Unified Cloudwatch Agent "
"hash tag value from node {}".format(
self.node_ids))
logger.info(
"Successfully get cloudwatch {} hash tag value from "
"node {}".format(config_type, self.node_id))
hash_value = tag["Value"]
return hash_value
except botocore.exceptions.ClientError as e:
@@ -457,36 +519,29 @@ class CloudwatchHelper:
"{} Error caught when getting hash tag {} tag".format(
e.response["Error"], hash_key_value))
def _ensure_cwa_installed_ssm(self, node_ids: List[str]) -> bool:
def _ensure_cwa_installed_ssm(self, node_id: str) -> bool:
"""
Check if Unified Cloudwatch Agent is installed via ssm run command.
If not, notify user to use an AMI with
the Unified CloudWatch Agent installed.
"""
logger.info("Checking Unified Cloudwatch Agent "
"status on {} nodes".format(len(node_ids)))
"status on node {}".format(node_id))
parameters_status_cwa = {
"action": ["status"],
"mode": ["ec2"],
}
self._ec2_health_check_waiter(node_ids)
self._ec2_health_check_waiter(node_id)
cmd_invocation_res = self._ssm_command_waiter(
"AmazonCloudWatch-ManageAgent", parameters_status_cwa, node_ids,
"AmazonCloudWatch-ManageAgent", parameters_status_cwa, node_id,
False)
uninstalled_nodes = []
installed_nodes = []
for node_id, res in cmd_invocation_res.items():
if not res:
uninstalled_nodes.append(node_id)
else:
installed_nodes.append(node_id)
if len(uninstalled_nodes) > 0:
cwa_installed = cmd_invocation_res.get(node_id, False)
if not cwa_installed:
logger.warning(
"Unified CloudWatch Agent not installed on {}. "
"Ray logs, metrics not picked up. "
"Please use an AMI with Unified CloudWatch Agent installed."
.format(uninstalled_nodes))
.format(node_id))
return False
else:
return True
@@ -504,11 +559,12 @@ class CloudwatchHelper:
def _check_cwa_installed_ec2_tag(self) -> List[str]:
"""
Check if Unified Cloudwatch Agent is installed.
Filtering all nodes to get nodes
without Unified CloudWatch Agent installed
"""
try:
response = self.ec2_client.describe_instances(
InstanceIds=self.node_ids)
InstanceIds=[self.node_id])
reservations = response["Reservations"]
message = "More than 1 response received from " \
"describing current node"
@@ -519,15 +575,15 @@ class CloudwatchHelper:
cwa_installed = str(False)
for tag in tags:
if tag["Key"] == CLOUDWATCH_AGENT_INSTALLED_TAG:
logger.info("Unified Cloudwatch Agent is installed on "
"node {}".format(self.node_ids))
logger.info("Unified CloudWatch Agent is installed on "
"node {}".format(self.node_id))
cwa_installed = tag["Value"]
return cwa_installed
except botocore.exceptions.ClientError as e:
logger.warning(
"{} Error caught when getting Unified Cloudwatch Agent status "
"based on {} tag".format(e.response["Error"],
CLOUDWATCH_AGENT_INSTALLED_TAG))
"{} Error caught when getting Unified CloudWatch Agent "
"status based on {} tag".format(
e.response["Error"], CLOUDWATCH_AGENT_INSTALLED_TAG))
@staticmethod
def resolve_instance_profile_name(
@@ -545,7 +601,7 @@ class CloudwatchHelper:
doesn't exist.
"""
cwa_cfg_exists = CloudwatchHelper.cloudwatch_config_exists(
config, "config")
config, "agent")
return CLOUDWATCH_RAY_INSTANCE_PROFILE if cwa_cfg_exists \
else default_instance_profile_name
@@ -563,7 +619,7 @@ class CloudwatchHelper:
default ray iam role name if cloudwatch config file doesn't exist.
"""
cwa_cfg_exists = CloudwatchHelper.cloudwatch_config_exists(
config, "config")
config, "agent")
return CLOUDWATCH_RAY_IAM_ROLE if cwa_cfg_exists \
else default_iam_role_name
@@ -583,7 +639,7 @@ class CloudwatchHelper:
cluster config file.
"""
cwa_cfg_exists = CloudwatchHelper.cloudwatch_config_exists(
config, "config")
config, "agent")
if cwa_cfg_exists:
cloudwatch_managed_policy = {
"Version": "2012-10-17",
@@ -624,7 +680,7 @@ class CloudwatchHelper:
@staticmethod
def cloudwatch_config_exists(config: Dict[str, Any],
config_key_name: str) -> bool:
config_type: str) -> bool:
"""Check if CloudWatch configuration was specified by the user
in their cluster config file.
@@ -633,12 +689,11 @@ class CloudwatchHelper:
Args:
config: provider section of cluster config file.
config_key_name: config file name.
config_type: type of CloudWatch config file.
Returns:
True if config file is specified by user.
False if config file is not specified.
"""
cfg = config.get("cloudwatch", {}).get("agent",
{}).get(config_key_name)
cfg = config.get("cloudwatch", {}).get(config_type, {}).get("config")
return bool(cfg)
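The existence check reduces to a nested `dict.get` chain over the provider section; a standalone equivalent:

```python
from typing import Any, Dict

def cloudwatch_config_exists(config: Dict[str, Any],
                             config_type: str) -> bool:
    # True iff the user specified a config file path for this config type.
    return bool(
        config.get("cloudwatch", {}).get(config_type, {}).get("config"))

assert cloudwatch_config_exists(
    {"cloudwatch": {"dashboard": {"config": "d.json"}}}, "dashboard")
assert not cloudwatch_config_exists({}, "agent")
```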

View file

@@ -361,7 +361,7 @@ class AWSNodeProvider(NodeProvider):
"Value": v,
})
if CloudwatchHelper.cloudwatch_config_exists(self.provider_config,
"config"):
"agent"):
cwa_installed = self._check_ami_cwa_installation(node_config)
if cwa_installed:
tag_pairs.extend([{
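The tag written here is what the dashboard's metric explorer widgets later filter on, assuming the tag key matches the `cloudwatch-agent-installed` label used in those widgets. A minimal sketch of the tagging call, with the instance ID hypothetical:

```python
import boto3

ec2 = boto3.client("ec2", region_name="us-west-2")
ec2.create_tags(
    Resources=["i-0123456789abcdef0"],
    Tags=[{"Key": "cloudwatch-agent-installed", "Value": "True"}],
)
```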

View file

@@ -307,7 +307,7 @@ class NodeUpdater:
from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper \
import CloudwatchHelper
CloudwatchHelper(self.provider.provider_config,
[self.node_id], self.provider.cluster_name). \
self.node_id, self.provider.cluster_name). \
update_from_config(self.is_head_node)
if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
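With this change the updater passes a single node ID rather than a one-element list. A hedged sketch of the call site; the node ID, cluster name, and config contents are illustrative:

```python
from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper import \
    CloudwatchHelper

provider_config = {
    "region": "us-west-2",
    "cloudwatch": {
        "dashboard": {
            "config": "cloudwatch/example-cloudwatch-dashboard-config.json",
        },
    },
}
# Node ID and cluster name are hypothetical.
CloudwatchHelper(provider_config, "i-0123456789abcdef0",
                 "my-cluster").update_from_config(is_head_node=True)
```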

View file

@@ -3,10 +3,6 @@
"metrics_collection_interval":60,
"run_as_user":"root"
},
"csm":{
"memory_limit_in_mb":20,
"port":31000
},
"logs":{
"metrics_collected": {
"prometheus": {
@@ -116,60 +112,72 @@
}
}
},
"metrics":{
"append_dimensions":{
"AutoScalingGroupName":"${aws:AutoScalingGroupName}",
"InstanceId":"${aws:InstanceId}"
},
"metrics_collected":{
"collectd":{
"metrics_aggregation_interval":60
},
"cpu":{
"measurement":[
"usage_active",
"usage_system",
"usage_user",
"usage_idle",
"time_active",
"time_system",
"time_user",
"time_idle"
]
},
"processes":{
"measurement":[
"processes_running",
"processes_sleeping",
"processes_zombies",
"processes_dead",
"processes_total"
],
"metrics_collection_interval":60,
"resources":[
"*"
]
},
"disk":{
"measurement":[
"disk_used_percent"
],
"metrics_collection_interval":60,
"resources":[
"*"
]
},
"mem":{
"measurement":[
"mem_used_percent"
],
"metrics_collection_interval":60
},
"statsd":{
"metrics_aggregation_interval":60,
"metrics_collection_interval":10,
"service_address":":8125"
}
}
"metrics": {
"namespace": "{cluster_name}-ray-CWAgent",
"aggregation_dimensions": [
[
"InstanceId"
]
],
"append_dimensions": {
"AutoScalingGroupName": "${aws:AutoScalingGroupName}",
"InstanceId": "${aws:InstanceId}"
},
"metrics_collected": {
"collectd": {
"metrics_aggregation_interval": 60
},
"cpu": {
"measurement": [
"usage_active",
"usage_system",
"usage_user",
"usage_idle",
"time_active",
"time_system",
"time_user",
"time_idle"
],
"resources": [
"*"
]
},
"processes": {
"measurement": [
"processes_running",
"processes_sleeping",
"processes_zombies",
"processes_dead",
"processes_total"
],
"metrics_collection_interval": 60,
"resources": [
"*"
]
},
"disk": {
"measurement": [
"disk_used_percent"
],
"metrics_collection_interval": 60,
"resources": [
"/"
]
},
"mem": {
"measurement": [
"mem_used_percent"
],
"metrics_collection_interval": 60,
"resources": [
"*"
]
},
"statsd": {
"metrics_aggregation_interval": 60,
"metrics_collection_interval": 10,
"service_address": ":8125"
}
}
}
}
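Metrics collected by the agent land in the per-cluster `{cluster_name}-ray-CWAgent` namespace declared above, which is what the dashboard's SEARCH expressions query. A quick way to verify, with the cluster name hypothetical:

```python
import boto3

cloudwatch = boto3.client("cloudwatch", region_name="us-west-2")
response = cloudwatch.list_metrics(
    Namespace="my-cluster-ray-CWAgent", MetricName="mem_used_percent")
for metric in response["Metrics"]:
    print(metric["Dimensions"])
```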

View file

@@ -0,0 +1,238 @@
[
{
"type":"explorer",
"x":12,
"y":18,
"width":12,
"height":6,
"properties": {
"metrics": [
{
"metricName": "CPUUtilization",
"resourceType": "AWS::EC2::Instance",
"stat": "Average"
}
],
"aggregateBy": {
"key": "*",
"func": "SUM"
},
"labels": [
{
"key": "cloudwatch-agent-installed",
"value": "True"
},
{
"key": "ray-cluster-name",
"value": "{cluster_name}"
}
],
"widgetOptions": {
"legend": {
"position": "bottom"
},
"view": "timeSeries",
"stacked": false,
"rowsPerPage": 1,
"widgetsPerRow": 1
},
"title":"Cluster CPU Utilization"
}
},
{
"type":"explorer",
"x":0,
"y":18,
"width":12,
"height":6,
"properties": {
"metrics": [
{
"metricName": "CPUUtilization",
"resourceType": "AWS::EC2::Instance",
"stat": "Average"
}
],
"aggregateBy": {
"key": "*",
"func": "AVG"
},
"labels": [
{
"key": "cloudwatch-agent-installed",
"value": "True"
},
{
"key": "ray-cluster-name",
"value": "{cluster_name}"
}
],
"widgetOptions": {
"legend": {
"position": "bottom"
},
"view": "timeSeries",
"stacked": false,
"rowsPerPage": 1,
"widgetsPerRow": 1
},
"title":"Single Node CPU Utilization (Avg and Max)"
}
},
{
"type":"metric",
"x":12,
"y":6,
"width":12,
"height":6,
"properties":{
"view":"timeSeries",
"metrics":[
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process sum", "id": "e1" } ],
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process sum", "id": "e2" } ]
],
"region":"{region}",
"stat":"Average",
"period":60,
"title":"Cluster Processes"
}
},
{
"type":"metric",
"x":0,
"y":6,
"width":12,
"height":6,
"properties":{
"view":"timeSeries",
"metrics":[
[ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process average", "id": "e3" } ],
[ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process average", "id": "e4" } ],
[ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process maximum", "id": "e5" } ],
[ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process maximum", "id": "e6" } ]
],
"region":"{region}",
"stat":"Average",
"period":60,
"title":"Single Node Processes (Avg and Max)"
}
},
{
"type":"metric",
"x":12,
"y":12,
"width":12,
"height":6,
"properties":{
"view":"timeSeries",
"stacked":false,
"metrics":[
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Average', 300))", "label": "cluster disk used percent sum", "id": "e7", "period": 300 } ]
],
"region":"{region}",
"title":"Cluster Disk Usage"
}
},
{
"type":"metric",
"x":0,
"y":12,
"width":12,
"height":6,
"properties":{
"view":"timeSeries",
"stacked":false,
"metrics":[
[ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Average', 300))", "id": "e8", "label": "cluster disk used percent average", "period": 300 } ],
[ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Maximum', 300))", "id": "e9", "label": "cluster disk used percent maximum", "period": 300 } ]
],
"region":"{region}",
"title":"Single Node Disk Usage (Avg and Max)"
}
},
{
"type":"metric",
"x":12,
"y":18,
"width":12,
"height":6,
"properties": {
"metrics": [
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Average', 300))", "id": "e10", "label": "cluster mem used percent sum", "period": 300 } ]
],
"view": "timeSeries",
"stacked": false,
"region": "{region}",
"stat": "Maximum",
"period": 300,
"start": "-PT2H",
"end": "P0D",
"title": "Cluster Memory Usage"
}
},
{
"type":"metric",
"x":0,
"y":18,
"width":12,
"height":6,
"properties": {
"metrics": [
[ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Average', 300))", "id": "e11", "label": "cluster mem used percent average", "period": 300 } ],
[ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Maximum', 300))", "id": "e12", "label": "cluster mem used percent maximum", "period": 300 } ]
],
"view": "timeSeries",
"stacked": false,
"region": "{region}",
"stat": "Maximum",
"period": 300,
"start": "-PT2H",
"end": "P0D",
"title": "Single Node Memory Usage (Avg and Max)"
}
},
{
"height": 6,
"width": 12,
"y": 0,
"x": 0,
"type": "metric",
"properties": {
"metrics": [
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-prometheus,instance} ray_node_cpu_count', 'Maximum', 300))", "label": "cluster cpu sum", "id": "e13" } ]
],
"view": "timeSeries",
"stacked": false,
"region": "{region}",
"stat": "Maximum",
"period": 300,
"start": "-PT2H",
"end": "P0D",
"title": "Cluster CPUs"
}
},
{
"height": 6,
"width": 12,
"y": 0,
"x": 12,
"type": "metric",
"properties": {
"metrics": [
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-prometheus,instance} object_store_available_memory', 'Average', 300))", "label": "cluster object store available memory sum", "id": "e14" } ]
],
"view": "timeSeries",
"stacked": false,
"region": "{region}",
"stat": "Maximum",
"period": 300,
"start": "-PT2H",
"end": "P0D",
"title": "Cluster Object Store Available Memory"
}
}
]
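Note the doubled brace in expressions like `'{{cluster_name}-ray-CWAgent,InstanceId}'`: after the helper substitutes `{cluster_name}`, the surviving braces form CloudWatch's `{Namespace,Dimension}` search schema. A quick demonstration:

```python
expr = ("SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} "
        "mem_used_percent', 'Average', 300))")
print(expr.replace("{cluster_name}", "my-cluster"))
# SUM(SEARCH('{my-cluster-ray-CWAgent,InstanceId} mem_used_percent',
#            'Average', 300))
```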

View file

@@ -14,8 +14,9 @@ provider:
# We depend on AWS Systems Manager (SSM) to deploy CloudWatch configuration updates to your cluster,
# with relevant configuration created or updated in the SSM Parameter Store during `ray up`.
# The `AmazonCloudWatch-ray_agent_config_{cluster_name}` SSM Parameter Store Config Key is used to
# store a remote cache of the last Unified CloudWatch Agent config applied.
# We support two CloudWatch-related config types under this cloudwatch section: agent and dashboard.
# The `AmazonCloudWatch-ray_{config_type}_config_{cluster_name}` SSM Parameter Store Config Key is used to
# store a remote cache of the last Unified CloudWatch config applied.
# Every time you run `ray up` to update your cluster, we compare your local CloudWatch config file contents
# to the SSM Parameter Store's contents for that config and, if they differ, then the associated CloudWatch
@@ -39,6 +40,17 @@ provider:
max_attempts: 120
# Seconds to wait between each Unified CloudWatch Agent SSM config update attempt.
delay_seconds: 30
# For CloudWatch Dashboard config files, we will also replace references to
# `{region}` with your cluster's region name, and `{cluster_name}` with your cluster name.
dashboard:
# CloudWatch Dashboard name.
# A per-cluster dashboard is created, and its name defaults to
# `{your_cluster_name}-example-dashboard-name`.
name: "example-dashboard-name"
# The CloudWatch Dashboard is defined via the config file described
# at https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/CloudWatch-Dashboard-Body-Structure.html.
# Path to the CloudWatch Dashboard config file
config: "cloudwatch/example-cloudwatch-dashboard-config.json"
# How Ray will authenticate with newly launched nodes.
auth:

View file

@@ -201,6 +201,16 @@
"description": "Seconds to wait between each Unified CloudWatch Agent installation attempt."
}
}
},
"dashboard": {
"name": {
"type": ["string", "null"],
"description": "User defined CloudWatch Dashboard name."
},
"config": {
"type": ["string", "null"],
"description": "Path to CloudWatch Dashboard config file. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/CloudWatch-Dashboard-Body-Structure.html for additional details."
}
}
}
}
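A hedged sketch of validating a user's `dashboard` section against this fragment, assuming the `jsonschema` package and wrapping the fields shown above in a standard object schema:

```python
import jsonschema  # assumed dependency for this sketch

dashboard_schema = {
    "type": "object",
    "properties": {
        "name": {"type": ["string", "null"]},
        "config": {"type": ["string", "null"]},
    },
}
jsonschema.validate(
    {"name": "example-dashboard-name",
     "config": "cloudwatch/example-cloudwatch-dashboard-config.json"},
    dashboard_schema,
)
```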