diff --git a/CHANGELOG.md b/CHANGELOG.md index 63978fa573..28656849b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ CHANGELOG - Add validator that warns against the downsides of disabling in-place updates on compute and login nodes through DevSettings. - Upgrade jmespath to ~=1.0 (from ~=0.10). - Upgrade tabulate to <=0.9.0 (from <=0.8.10). +- Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics. +- Add alarm on missing clustermgtd heartbeat. **BUG FIXES** - Add validation to block updates that change tag order. Blocking such change prevents update failures. diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index 6e68cdaea7..8ac6db4601 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -186,6 +186,11 @@ CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT = 1 DETAILED_MONITORING_ENABLED_DEFAULT = False +# CloudWatch Metrics +CW_METRICS_NAMESPACE = "ParallelCluster" +CW_METRICS_DIMENSION_CLUSTER_NAME = "ClusterName" +CW_METRICS_CLUSTERMGTD_HEARTBEAT = "ClustermgtdHeartbeat" + STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events" PCLUSTER_IMAGE_NAME_REGEX = r"^[-_A-Za-z0-9{][-_A-Za-z0-9\s:{}\.]+[-_A-Za-z0-9}]$" diff --git a/cli/src/pcluster/templates/cdk_builder_utils.py b/cli/src/pcluster/templates/cdk_builder_utils.py index 8c7ee692e3..ebaeb78973 100644 --- a/cli/src/pcluster/templates/cdk_builder_utils.py +++ b/cli/src/pcluster/templates/cdk_builder_utils.py @@ -38,11 +38,13 @@ from pcluster.constants import ( COOKBOOK_PACKAGES_VERSIONS, CW_LOGS_RETENTION_DAYS_DEFAULT, + CW_METRICS_NAMESPACE, IAM_ROLE_PATH, LAMBDA_VPC_ACCESS_MANAGED_POLICY, PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_DYNAMODB_PREFIX, PCLUSTER_NODE_TYPE_TAG, + SLURM, ULTRASERVER_INSTANCE_PREFIX_LIST, ) from pcluster.launch_template_utils import _LaunchTemplateBuilder @@ -742,6 +744,19 @@ def _build_policy(self) -> List[iam.PolicyStatement]: ), ] + if self._config.scheduling.scheduler == SLURM: + policy.extend( + [ + iam.PolicyStatement( + sid="CloudWatch", + actions=["cloudwatch:PutMetricData"], + effect=iam.Effect.ALLOW, + resources=["*"], + conditions={"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}}, + ) + ] + ) + if ( self._config.scheduling.scheduler == "slurm" and self._config.scheduling.settings diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index 66fdf179c5..e9a0ed483f 100644 --- a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -64,6 +64,9 @@ CW_ALARM_PERIOD_DEFAULT, CW_LOG_GROUP_NAME_PREFIX, CW_LOGS_CFN_PARAM_NAME, + CW_METRICS_CLUSTERMGTD_HEARTBEAT, + CW_METRICS_DIMENSION_CLUSTER_NAME, + CW_METRICS_NAMESPACE, DEFAULT_EPHEMERAL_DIR, EFS_PORT, FSX_PORTS, @@ -364,38 +367,64 @@ def _cw_metric_head_node( def _add_head_node_alarms(self): self.head_node_alarms = [] + # Metric-specific configurations (only specify overrides from defaults) metrics_for_alarms = { - "Health": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"), - "Cpu": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"), - "Mem": self._cw_metric_head_node("CWAgent", "mem_used_percent"), - "Disk": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}), + "Health": { + "metric": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"), + "threshold": 0, + }, + "Cpu": { + "metric": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"), + }, + "Mem": { + "metric": self._cw_metric_head_node("CWAgent", "mem_used_percent"), + }, + "Disk": { + "metric": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}), + }, } - for metric_key, metric in metrics_for_alarms.items(): + if self._condition_is_slurm(): + metrics_for_alarms["ClustermgtdHeartbeat"] = { + "metric": self._cw_metric_head_node( + CW_METRICS_NAMESPACE, + CW_METRICS_CLUSTERMGTD_HEARTBEAT, + extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name}, + ), + "evaluation_periods": 10, + "datapoints_to_alarm": 10, + "comparison_operator": cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD, + "threshold": 1, + "treat_missing_data": cloudwatch.TreatMissingData.BREACHING, + } + + for metric_key, alarm_config in metrics_for_alarms.items(): alarm_id = f"HeadNode{metric_key}Alarm" alarm_name = f"{self.stack.stack_name}-HeadNode-{metric_key}" - threshold = 0 if metric_key == "Health" else CW_ALARM_PERCENT_THRESHOLD_DEFAULT - self.head_node_alarms.append( - cloudwatch.Alarm( - scope=self.stack, - id=alarm_id, - alarm_name=alarm_name, - metric=metric, - evaluation_periods=CW_ALARM_EVALUATION_PERIODS_DEFAULT, - threshold=threshold, - comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, - datapoints_to_alarm=CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT, - ) - ) - - self.head_node_alarms.append( - cloudwatch.CompositeAlarm( + alarm = cloudwatch.Alarm( scope=self.stack, - id="HeadNodeAlarm", - composite_alarm_name=f"{self.stack.stack_name}-HeadNode", - alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms), + id=alarm_id, + alarm_name=alarm_name, + metric=alarm_config["metric"], + evaluation_periods=alarm_config.get("evaluation_periods", CW_ALARM_EVALUATION_PERIODS_DEFAULT), + threshold=alarm_config.get("threshold", CW_ALARM_PERCENT_THRESHOLD_DEFAULT), + comparison_operator=alarm_config.get( + "comparison_operator", cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD + ), + datapoints_to_alarm=alarm_config.get("datapoints_to_alarm", CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT), + treat_missing_data=alarm_config.get("treat_missing_data", cloudwatch.TreatMissingData.MISSING), ) + alarm.node.add_dependency(self.wait_condition) + self.head_node_alarms.append(alarm) + + composite_alarm = cloudwatch.CompositeAlarm( + scope=self.stack, + id="HeadNodeAlarm", + composite_alarm_name=f"{self.stack.stack_name}-HeadNode", + alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms), ) + composite_alarm.node.add_dependency(self.wait_condition) + self.head_node_alarms.append(composite_alarm) def _add_iam_resources(self): head_node_iam_resources = HeadNodeIamResources( diff --git a/cli/src/pcluster/templates/cw_dashboard_builder.py b/cli/src/pcluster/templates/cw_dashboard_builder.py index d676739482..fff10bba2c 100644 --- a/cli/src/pcluster/templates/cw_dashboard_builder.py +++ b/cli/src/pcluster/templates/cw_dashboard_builder.py @@ -19,7 +19,12 @@ from pcluster.config.cluster_config import BaseClusterConfig, ExistingFileCache, SharedFsxLustre from pcluster.config.common import SharedStorageType -from pcluster.constants import Feature +from pcluster.constants import ( + CW_METRICS_CLUSTERMGTD_HEARTBEAT, + CW_METRICS_DIMENSION_CLUSTER_NAME, + CW_METRICS_NAMESPACE, + Feature, +) from pcluster.utils import is_feature_supported MAX_WIDTH = 24 @@ -567,9 +572,21 @@ def _add_head_node_instance_metrics_graphs(self): new_pcluster_metric(title="Memory Used Percent", metrics=["mem_used_percent"], namespace="CWAgent"), ] + # Custom Metrics + pcluster_metrics = [] + if self.config.scheduling.scheduler == "slurm": + pcluster_metrics.append( + new_pcluster_metric( + title="Daemons Heartbeats", + metrics=[CW_METRICS_CLUSTERMGTD_HEARTBEAT], + namespace=CW_METRICS_NAMESPACE, + additional_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name}, + ) + ) + # Create graphs for EC2 metrics and CW Agent metrics and update coordinates widgets_list = [] - for metrics_param in ec2_metrics + cwagent_metrics: + for metrics_param in ec2_metrics + cwagent_metrics + pcluster_metrics: metrics_list = self._generate_metrics_list(metrics_param) graph_widget = self._generate_graph_widget(metrics_param.title, metrics_list) widgets_list.append(graph_widget) diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils.py b/cli/tests/pcluster/templates/test_cdk_builder_utils.py index fe8e3ff26f..f5276e5d8e 100644 --- a/cli/tests/pcluster/templates/test_cdk_builder_utils.py +++ b/cli/tests/pcluster/templates/test_cdk_builder_utils.py @@ -22,7 +22,7 @@ SlurmComputeResource, SlurmQueue, ) -from pcluster.constants import PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG +from pcluster.constants import CW_METRICS_NAMESPACE, PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG from pcluster.schemas.cluster_schema import ClusterSchema from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.templates.cdk_builder_utils import ( @@ -593,3 +593,48 @@ def _check_cleanup_role( assert_that("/parallelcluster/" in generated_template[cleanup_resource_old]["Properties"]["Path"]).is_true() assert_that(generated_template[cleanup_resource_old]["Properties"]).does_not_contain_key("RoleName") + + +@pytest.mark.parametrize( + "config_file_name, expect_cloudwatch_permission", + [ + ("config-slurm.yaml", True), + ("config-awsbatch.yaml", False), + ], +) +def test_head_node_policy(mocker, test_datadir, config_file_name, expect_cloudwatch_permission): + """Verify that cloudwatch:PutMetricData is added for slurm scheduler but not for awsbatch.""" + mock_aws_api(mocker) + mocker.patch( + "pcluster.config.cluster_config.HeadNodeNetworking.availability_zone", + new_callable=PropertyMock(return_value="us-east-1a"), + ) + mock_bucket(mocker) + mock_bucket_object_utils(mocker) + + input_yaml = load_yaml_dict(test_datadir / config_file_name) + cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) + generated_template, _ = CDKTemplateBuilder().build_cluster_template( + cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" + ) + + # Find the CloudWatch policy statement in ParallelClusterPoliciesHeadNode + head_node_policies = generated_template["Resources"]["ParallelClusterPoliciesHeadNode"]["Properties"][ + "PolicyDocument" + ]["Statement"] + + cloudwatch_statement = next( + (stmt for stmt in head_node_policies if stmt.get("Sid") == "CloudWatch"), + None, + ) + + if expect_cloudwatch_permission: + assert_that(cloudwatch_statement).is_not_none() + assert_that(cloudwatch_statement["Action"]).is_equal_to("cloudwatch:PutMetricData") + assert_that(cloudwatch_statement["Effect"]).is_equal_to("Allow") + assert_that(cloudwatch_statement["Resource"]).is_equal_to("*") + assert_that(cloudwatch_statement["Condition"]).is_equal_to( + {"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}} + ) + else: + assert_that(cloudwatch_statement).is_none() diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml b/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml new file mode 100644 index 0000000000..719b0b4db0 --- /dev/null +++ b/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml @@ -0,0 +1,20 @@ +Image: + Os: alinux2 +HeadNode: + InstanceType: t3.micro + Ssh: + KeyName: String + Networking: + SubnetId: subnet-12345678 +Scheduling: + Scheduler: awsbatch + AwsBatchQueues: + - Name: queue1 + Networking: + SubnetIds: + - subnet-12345678 + ComputeResources: + - Name: compute_resource1 + InstanceTypes: + - c4.xlarge + MaxvCpus: 10 diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml b/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml new file mode 100644 index 0000000000..c7a52acb66 --- /dev/null +++ b/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml @@ -0,0 +1,28 @@ +Image: + Os: alinux2 +HeadNode: + InstanceType: String + Ssh: + KeyName: String + Networking: + SubnetId: subnet-12345678 + Iam: + S3Access: + - BucketName: testbucketpball + EnableWriteAccess: True +Scheduling: + Scheduler: slurm + SlurmQueues: + - Name: queue1 + ComputeResources: + - Name: compute_resource1 + InstanceType: t3.micro + MinCount: 1 + MaxCount: 5 + Networking: + SubnetIds: + - subnet-12345678 + Iam: + S3Access: + - BucketName: testbucketpball + EnableWriteAccess: True diff --git a/cli/tests/pcluster/templates/test_cluster_stack.py b/cli/tests/pcluster/templates/test_cluster_stack.py index 0553e9b229..8a0f314318 100644 --- a/cli/tests/pcluster/templates/test_cluster_stack.py +++ b/cli/tests/pcluster/templates/test_cluster_stack.py @@ -281,7 +281,7 @@ def test_add_alarms(mocker, config_file_name): "comparison_operator": "GreaterThanThreshold", "evaluation_periods": 1, "datapoints_to_alarm": 1, - "treat_missing_data": None, + "treat_missing_data": "missing", }, "Cpu": { "name": "clustername-HeadNode-Cpu", @@ -291,7 +291,7 @@ def test_add_alarms(mocker, config_file_name): "comparison_operator": "GreaterThanThreshold", "evaluation_periods": 1, "datapoints_to_alarm": 1, - "treat_missing_data": None, + "treat_missing_data": "missing", }, "Mem": { "name": "clustername-HeadNode-Mem", @@ -301,7 +301,7 @@ def test_add_alarms(mocker, config_file_name): "comparison_operator": "GreaterThanThreshold", "evaluation_periods": 1, "datapoints_to_alarm": 1, - "treat_missing_data": None, + "treat_missing_data": "missing", }, "Disk": { "name": "clustername-HeadNode-Disk", @@ -311,19 +311,45 @@ def test_add_alarms(mocker, config_file_name): "comparison_operator": "GreaterThanThreshold", "evaluation_periods": 1, "datapoints_to_alarm": 1, - "treat_missing_data": None, + "treat_missing_data": "missing", }, } + if cluster.scheduling.scheduler == "slurm": + expected_alarms["Clustermgtd-Heartbeat"] = { + "name": "clustername-HeadNode-ClustermgtdHeartbeat", + "metric_name": "ClustermgtdHeartbeat", + "namespace": "ParallelCluster", + "threshold": 1, + "comparison_operator": "LessThanThreshold", + "evaluation_periods": 10, + "datapoints_to_alarm": 10, + "treat_missing_data": "breaching", + } + if cluster.are_alarms_enabled: - # Verify each simple alarm exists with correct details + # Find the HeadNode wait condition resource name + wait_condition_resources = get_resources( + generated_template, type="AWS::CloudFormation::WaitCondition", name_pattern="^HeadNodeWaitCondition" + ) + assert_that(wait_condition_resources).is_length(1) + wait_condition_name = list(wait_condition_resources.keys())[0] + + # Collect simple alarm resource names for composite alarm rule verification + simple_alarm_resource_names = [] + + # Verify each simple alarm exists with correct details and depends on wait condition for _alarm_key, expected in expected_alarms.items(): matched_resources = get_resources( generated_template, type=simple_type, properties={"AlarmName": expected["name"]} ) assert_that(matched_resources).is_length(1) - alarm_properties = list(matched_resources.values())[0]["Properties"] + alarm_resource_name = list(matched_resources.keys())[0] + simple_alarm_resource_names.append(alarm_resource_name) + + alarm_resource = list(matched_resources.values())[0] + alarm_properties = alarm_resource["Properties"] assert_that(alarm_properties.get("AlarmName")).is_equal_to(expected["name"]) assert_that(alarm_properties.get("MetricName")).is_equal_to(expected["metric_name"]) assert_that(alarm_properties.get("Namespace")).is_equal_to(expected["namespace"]) @@ -332,12 +358,25 @@ def test_add_alarms(mocker, config_file_name): assert_that(alarm_properties.get("EvaluationPeriods")).is_equal_to(expected["evaluation_periods"]) assert_that(alarm_properties.get("DatapointsToAlarm")).is_equal_to(expected["datapoints_to_alarm"]) assert_that(alarm_properties.get("TreatMissingData")).is_equal_to(expected["treat_missing_data"]) + assert_that(alarm_resource.get("DependsOn")).contains(wait_condition_name) - # Verify composite alarm exists + # Verify composite alarm exists with correct properties composite_alarms = get_resources(generated_template, type=composite_type) assert_that(composite_alarms).is_length(1) - composite_alarm_properties = list(composite_alarms.values())[0]["Properties"] + composite_alarm_resource = list(composite_alarms.values())[0] + composite_alarm_properties = composite_alarm_resource["Properties"] assert_that(composite_alarm_properties["AlarmName"]).is_equal_to("clustername-HeadNode") + assert_that(composite_alarm_resource.get("DependsOn")).contains(wait_condition_name) + + # Verify composite alarm rule triggers on ANY of the simple alarms (OR logic) + alarm_rule = composite_alarm_properties.get("AlarmRule") + assert_that(alarm_rule).is_not_none() + alarm_rule_str = str(alarm_rule) + for alarm_resource_name in simple_alarm_resource_names: + assert_that(alarm_rule_str).contains(alarm_resource_name) + expected_or_count = len(simple_alarm_resource_names) - 1 + actual_or_count = alarm_rule_str.count(" OR ") + assert_that(actual_or_count).is_equal_to(expected_or_count) else: matched_simple_alarms = get_resources(generated_template, type=simple_type) matched_composite_alarms = get_resources(generated_template, type=composite_type) diff --git a/cli/tests/pcluster/templates/test_cw_dashboard_builder.py b/cli/tests/pcluster/templates/test_cw_dashboard_builder.py index 647a69a182..f60c5c3378 100644 --- a/cli/tests/pcluster/templates/test_cw_dashboard_builder.py +++ b/cli/tests/pcluster/templates/test_cw_dashboard_builder.py @@ -58,7 +58,7 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r if cluster_config.is_cw_dashboard_enabled: assert_that(output_yaml).contains("CloudwatchDashboard") assert_that(output_yaml).contains("Head Node EC2 Metrics") - _verify_head_node_instance_metrics_graphs(output_yaml) + _verify_head_node_instance_metrics_graphs(output_yaml, cluster_config.scheduling.scheduler) if cluster_config.are_alarms_enabled: assert_that(output_yaml).contains("Cluster Alarms") @@ -79,7 +79,7 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r assert_that(output_yaml).does_not_contain("CloudwatchDashboard") assert_that(output_yaml).does_not_contain("Head Node EC2 Metrics") - _verify_alarms(output_yaml, cluster_config.are_alarms_enabled) + _verify_alarms(output_yaml, cluster_config.are_alarms_enabled, cluster_config.scheduling.scheduler) if cluster_config.is_cw_logging_enabled: assert_that(output_yaml).contains("ClusterCWLogGroup") @@ -87,7 +87,7 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r assert_that(output_yaml).does_not_contain("ClusterCWLogGroup") -def _verify_alarms(output_yaml, alarms_enabled): +def _verify_alarms(output_yaml, alarms_enabled, scheduler): if alarms_enabled: assert_that(output_yaml).contains("HeadNodeHealthAlarm") assert_that(output_yaml).contains("StatusCheckFailed") @@ -101,6 +101,13 @@ def _verify_alarms(output_yaml, alarms_enabled): assert_that(output_yaml).contains("HeadNodeDiskAlarm") assert_that(output_yaml).contains("disk_used_percent") + # ClustermgtdHeartbeat alarm is only created for Slurm scheduler + if scheduler == "slurm": + assert_that(output_yaml).contains("HeadNodeClustermgtdHeartbeatAlarm") + assert_that(output_yaml).contains("ClustermgtdHeartbeat") + else: + assert_that(output_yaml).does_not_contain("HeadNodeClustermgtdHeartbeatAlarm") + else: assert_that(output_yaml).does_not_contain("Cluster Alarms") assert_that(output_yaml).does_not_contain("AWS::CloudWatch::Alarm") @@ -129,7 +136,7 @@ def _verify_metric_filter_dimensions(metric_filters): ) -def _verify_head_node_instance_metrics_graphs(output_yaml): +def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler): """Verify CloudWatch graphs within the Head Node Instance Metrics section.""" assert_that(output_yaml).contains("Head Node Instance Metrics") assert_that(output_yaml).contains("CPU Utilization") @@ -139,6 +146,12 @@ def _verify_head_node_instance_metrics_graphs(output_yaml): assert_that(output_yaml).contains("Disk Read/Write Ops") assert_that(output_yaml).contains("Disk Used Percent") assert_that(output_yaml).contains("Memory Used Percent") + # Daemons Heartbeats widget is only created for Slurm scheduler + if scheduler == "slurm": + assert_that(output_yaml).contains("Daemons Heartbeats") + assert_that(output_yaml).contains("ClustermgtdHeartbeat") + else: + assert_that(output_yaml).does_not_contain("Daemons Heartbeats") def _verify_ec2_metrics_conditions(cluster_config, output_yaml): diff --git a/cli/tests/pcluster/utils.py b/cli/tests/pcluster/utils.py index 8e31c445ce..0042e3abdc 100644 --- a/cli/tests/pcluster/utils.py +++ b/cli/tests/pcluster/utils.py @@ -10,6 +10,7 @@ # limitations under the License. import itertools import os +import re from copy import deepcopy import pytest @@ -37,12 +38,18 @@ def load_cluster_model_from_yaml(config_file_name, test_datadir=None): def get_resources( - generated_template: dict, name: str = None, type: str = None, properties: dict = None, deletion_policy: str = None + generated_template: dict, + name: str = None, + name_pattern: str = None, + type: str = None, + properties: dict = None, + deletion_policy: str = None, ): return dict( (res_name, res_value) for res_name, res_value in generated_template.get("Resources", {}).items() if (name is None or res_name == name) + and (name_pattern is None or re.search(name_pattern, res_name)) and (type is None or res_value.get("Type") == type) and (deletion_policy is None or res_value.get("DeletionPolicy") == deletion_policy) and ( diff --git a/tests/iam_policies/cluster-roles.cfn.yaml b/tests/iam_policies/cluster-roles.cfn.yaml index 7ae7f6f0a5..b59ecd540f 100644 --- a/tests/iam_policies/cluster-roles.cfn.yaml +++ b/tests/iam_policies/cluster-roles.cfn.yaml @@ -136,6 +136,13 @@ Resources: - route53:ChangeResourceRecordSets Resource: '*' Effect: Allow + - Action: + - cloudwatch:PutMetricData + Resource: '*' + Effect: Allow + Condition: + StringEquals: + cloudwatch:namespace: 'ParallelCluster' Version: '2012-10-17' Type: AWS::IAM::Role diff --git a/tests/integration-tests/tests/monitoring/test_monitoring.py b/tests/integration-tests/tests/monitoring/test_monitoring.py index 061394d94b..a884c54bd0 100644 --- a/tests/integration-tests/tests/monitoring/test_monitoring.py +++ b/tests/integration-tests/tests/monitoring/test_monitoring.py @@ -10,6 +10,7 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. import datetime +import logging import math import time @@ -33,6 +34,7 @@ def test_monitoring( cw_log_enabled, alarms_enabled, region, + scheduler, pcluster_config_reader, clusters_factory, test_datadir, @@ -48,13 +50,14 @@ def test_monitoring( headnode_instance_id = cluster.get_cluster_instance_ids(node_type="HeadNode")[0] compute_instance_ids = cluster.get_cluster_instance_ids(node_type="Compute") # the MinCount is set to 1, so we should have at least one compute node + logging.info(f"Retrieved compute nodes: {compute_instance_ids}") assert_that(compute_instance_ids).is_not_empty() # test CWAgent metrics # we only perform this test for one of the 3 test conditions # because this test could be time-consuming (we allow some retries to ensure we can get metrics data) if dashboard_enabled and cw_log_enabled: - _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_ids[0]) + _test_metrics(cw_client, headnode_instance_id, compute_instance_ids[0], cluster.cfn_name, scheduler) # test dashboard and alarms _test_dashboard(cw_client, cluster.cfn_name, region, dashboard_enabled, cw_log_enabled) @@ -65,21 +68,34 @@ def test_monitoring( @retry(stop_max_attempt_number=8, wait_fixed=minutes(2)) -def _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_id): +def _test_metrics(cw_client, headnode_instance_id, compute_instance_id, cluster_name, scheduler): # query for the past 20 minutes start_timestamp, end_timestamp = _get_start_end_timestamp(minutes=20) - # test memory and disk metrics are collected for the head node - metrics_response_headnode = _get_metric_data(headnode_instance_id, cw_client, start_timestamp, end_timestamp) + # test memory, disk, and clustermgtd heartbeat metrics are collected for the head node + logging.info(f"Retrieving head node metrics from {start_timestamp} to {end_timestamp}") + metrics_response_headnode = _get_metric_data( + cw_client, start_timestamp, end_timestamp, instance_id=headnode_instance_id, cluster_name=cluster_name + ) + logging.info("Head node metrics retrieved") mem_values = _get_metric_data_values(metrics_response_headnode, "mem") disk_values = _get_metric_data_values(metrics_response_headnode, "disk") + clustermgtd_heartbeat_values = _get_metric_data_values(metrics_response_headnode, "clustermgtd_heartbeat") assert_that(mem_values).is_not_empty() assert_that(disk_values).is_not_empty() + if scheduler == "slurm": + assert_that(clustermgtd_heartbeat_values).is_not_empty() + else: + assert_that(clustermgtd_heartbeat_values).is_empty() + # wait for additional 1 minute to reduce the chance of false negative result for compute nodes - time.sleep(60) + sleep_seconds = 60 + logging.info(f"Waiting {sleep_seconds} seconds for compute node metrics") + time.sleep(sleep_seconds) + # test memory and disk metrics are not collected for compute nodes - metrics_response_compute = _get_metric_data(compute_instance_id, cw_client, start_timestamp, end_timestamp) + metrics_response_compute = _get_metric_data(cw_client, start_timestamp, end_timestamp, compute_instance_id) mem_values = _get_metric_data_values(metrics_response_compute, "mem") disk_values = _get_metric_data_values(metrics_response_compute, "disk") assert_that(mem_values).is_empty() @@ -87,6 +103,8 @@ def _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_id) def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_enabled): + # TODO: This assertion can be removed because the content of cluster dashboard is covered by unit tests. + # At least let's not expand this assertion with more conditions. dashboard_name = "{0}-{1}".format(cluster_name, region) if dashboard_enabled: dashboard_response = cw_client.get_dashboard(DashboardName=dashboard_name) @@ -108,6 +126,8 @@ def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_e def _test_alarms(cw_client, cluster_name, headnode_instance_id, alarms_enabled): + # TODO: This assertion can be removed because the settings of cluster alarms are covered by unit tests. + # At least let's not expand this assertion with more conditions. alarm_response = cw_client.describe_alarms(AlarmNamePrefix=cluster_name) if alarms_enabled: health_alarm_name = f"{cluster_name}-HeadNode-Health" @@ -144,49 +164,71 @@ def _get_start_end_timestamp(minutes): return start_timestamp, end_timestamp_ceil -def _get_metric_data(instance_id, cw_client, start_timestamp, end_timestamp): - metrics_response = cw_client.get_metric_data( - MetricDataQueries=[ - { - "Id": "mem", - "MetricStat": { - "Metric": { - "Namespace": "CWAgent", - "MetricName": "mem_used_percent", - "Dimensions": [ - { - "Name": "InstanceId", - "Value": instance_id, - } - ], - }, - "Period": 60, - "Stat": "Maximum", +def _get_metric_data(cw_client, start_timestamp, end_timestamp, instance_id, cluster_name=None): + """ + Query CloudWatch metrics. + + Args: + cw_client: CloudWatch client + start_timestamp: Start time for the query + end_timestamp: End time for the query + instance_id: EC2 instance ID for CWAgent metrics + cluster_name: Cluster name for ParallelCluster metrics (optional) + """ + queries = [ + { + "Id": "mem", + "MetricStat": { + "Metric": { + "Namespace": "CWAgent", + "MetricName": "mem_used_percent", + "Dimensions": [{"Name": "InstanceId", "Value": instance_id}], + }, + "Period": 60, + "Stat": "Maximum", + }, + }, + { + "Id": "disk", + "MetricStat": { + "Metric": { + "Namespace": "CWAgent", + "MetricName": "disk_used_percent", + "Dimensions": [ + {"Name": "InstanceId", "Value": instance_id}, + {"Name": "path", "Value": "/"}, + ], }, + "Period": 60, + "Stat": "Maximum", }, + }, + ] + + if cluster_name: + queries.append( { - "Id": "disk", + "Id": "clustermgtd_heartbeat", "MetricStat": { "Metric": { - "Namespace": "CWAgent", - "MetricName": "disk_used_percent", + "Namespace": "ParallelCluster", + "MetricName": "ClustermgtdHeartbeat", "Dimensions": [ - { - "Name": "InstanceId", - "Value": instance_id, - }, - {"Name": "path", "Value": "/"}, + {"Name": "ClusterName", "Value": cluster_name}, + {"Name": "InstanceId", "Value": instance_id}, ], }, "Period": 60, - "Stat": "Maximum", + "Stat": "Sum", }, - }, - ], + } + ) + + return cw_client.get_metric_data( + MetricDataQueries=queries, StartTime=start_timestamp, EndTime=end_timestamp, ) - return metrics_response def _get_metric_data_values(response, query_id):