diff --git a/CHANGELOG.md b/CHANGELOG.md index b4b4458e50..95a78e21a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,6 @@ CHANGELOG - Upgrade jmespath to ~=1.0 (from ~=0.10). - Upgrade tabulate to <=0.9.0 (from <=0.8.10). - Add support for p6-b300 instances for all OSs except AL2. -- Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics. - Add alarm on missing clustermgtd heartbeat. **BUG FIXES** diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index 94604f930b..748b598a03 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -191,6 +191,7 @@ # CloudWatch Metrics CW_METRICS_NAMESPACE = "ParallelCluster" CW_METRICS_DIMENSION_CLUSTER_NAME = "ClusterName" +CW_METRICS_DIMENSION_INSTANCE_ID = "InstanceId" CW_METRICS_CLUSTERMGTD_HEARTBEAT = "ClustermgtdHeartbeat" STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events" diff --git a/cli/src/pcluster/templates/cdk_builder_utils.py b/cli/src/pcluster/templates/cdk_builder_utils.py index ebaeb78973..8c7ee692e3 100644 --- a/cli/src/pcluster/templates/cdk_builder_utils.py +++ b/cli/src/pcluster/templates/cdk_builder_utils.py @@ -38,13 +38,11 @@ from pcluster.constants import ( COOKBOOK_PACKAGES_VERSIONS, CW_LOGS_RETENTION_DAYS_DEFAULT, - CW_METRICS_NAMESPACE, IAM_ROLE_PATH, LAMBDA_VPC_ACCESS_MANAGED_POLICY, PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_DYNAMODB_PREFIX, PCLUSTER_NODE_TYPE_TAG, - SLURM, ULTRASERVER_INSTANCE_PREFIX_LIST, ) from pcluster.launch_template_utils import _LaunchTemplateBuilder @@ -744,19 +742,6 @@ def _build_policy(self) -> List[iam.PolicyStatement]: ), ] - if self._config.scheduling.scheduler == SLURM: - policy.extend( - [ - iam.PolicyStatement( - sid="CloudWatch", - actions=["cloudwatch:PutMetricData"], - effect=iam.Effect.ALLOW, - resources=["*"], - conditions={"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}}, - ) - ] - ) - if ( self._config.scheduling.scheduler == "slurm" and self._config.scheduling.settings diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index e9a0ed483f..c27b06e34d 100644 --- a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -66,6 +66,7 @@ CW_LOGS_CFN_PARAM_NAME, CW_METRICS_CLUSTERMGTD_HEARTBEAT, CW_METRICS_DIMENSION_CLUSTER_NAME, + CW_METRICS_DIMENSION_INSTANCE_ID, CW_METRICS_NAMESPACE, DEFAULT_EPHEMERAL_DIR, EFS_PORT, @@ -384,8 +385,34 @@ def _add_head_node_alarms(self): }, } - if self._condition_is_slurm(): - metrics_for_alarms["ClustermgtdHeartbeat"] = { + # These alarms required Cw logging enabled because they are based on CW Metrics Filters. + if self._condition_is_slurm() and self.config.is_cw_logging_enabled: + # Create metric filter to extract heartbeat metric from clustermgtd event logs + clustermgtd_heartbeat_metric_filter = logs.CfnMetricFilter( + scope=self.stack, + id=f"{CW_METRICS_CLUSTERMGTD_HEARTBEAT}Filter", + filter_pattern='{ $.event-type = "clustermgtd-heartbeat" }', + log_group_name=self.log_group_name, + metric_transformations=[ + logs.CfnMetricFilter.MetricTransformationProperty( + metric_namespace=CW_METRICS_NAMESPACE, + metric_name=CW_METRICS_CLUSTERMGTD_HEARTBEAT, + metric_value="1", + unit="Count", + dimensions=[ + logs.CfnMetricFilter.DimensionProperty( + key=CW_METRICS_DIMENSION_CLUSTER_NAME, value="$.cluster-name" + ), + logs.CfnMetricFilter.DimensionProperty( + key=CW_METRICS_DIMENSION_INSTANCE_ID, value="$.instance-id" + ), + ], + ) + ], + ) + clustermgtd_heartbeat_metric_filter.add_depends_on(self.log_group) + + metrics_for_alarms[CW_METRICS_CLUSTERMGTD_HEARTBEAT] = { "metric": self._cw_metric_head_node( CW_METRICS_NAMESPACE, CW_METRICS_CLUSTERMGTD_HEARTBEAT, diff --git a/cli/src/pcluster/templates/cw_dashboard_builder.py b/cli/src/pcluster/templates/cw_dashboard_builder.py index fff10bba2c..03769b8a6e 100644 --- a/cli/src/pcluster/templates/cw_dashboard_builder.py +++ b/cli/src/pcluster/templates/cw_dashboard_builder.py @@ -574,7 +574,7 @@ def _add_head_node_instance_metrics_graphs(self): # Custom Metrics pcluster_metrics = [] - if self.config.scheduling.scheduler == "slurm": + if self.config.scheduling.scheduler == "slurm" and self.config.is_cw_logging_enabled: pcluster_metrics.append( new_pcluster_metric( title="Daemons Heartbeats", diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml b/cli/tests/pcluster/example_configs/slurm.logging_disabled.yaml similarity index 54% rename from cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml rename to cli/tests/pcluster/example_configs/slurm.logging_disabled.yaml index 719b0b4db0..b8d75e7dc7 100644 --- a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml +++ b/cli/tests/pcluster/example_configs/slurm.logging_disabled.yaml @@ -2,19 +2,21 @@ Image: Os: alinux2 HeadNode: InstanceType: t3.micro - Ssh: - KeyName: String Networking: SubnetId: subnet-12345678 + Ssh: + KeyName: ec2-key-name Scheduling: - Scheduler: awsbatch - AwsBatchQueues: + Scheduler: slurm + SlurmQueues: - Name: queue1 Networking: SubnetIds: - subnet-12345678 ComputeResources: - - Name: compute_resource1 - InstanceTypes: - - c4.xlarge - MaxvCpus: 10 + - Name: compute-resource1 + InstanceType: c5.2xlarge +Monitoring: + Logs: + CloudWatch: + Enabled: false diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils.py b/cli/tests/pcluster/templates/test_cdk_builder_utils.py index f5276e5d8e..fe8e3ff26f 100644 --- a/cli/tests/pcluster/templates/test_cdk_builder_utils.py +++ b/cli/tests/pcluster/templates/test_cdk_builder_utils.py @@ -22,7 +22,7 @@ SlurmComputeResource, SlurmQueue, ) -from pcluster.constants import CW_METRICS_NAMESPACE, PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG +from pcluster.constants import PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG from pcluster.schemas.cluster_schema import ClusterSchema from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.templates.cdk_builder_utils import ( @@ -593,48 +593,3 @@ def _check_cleanup_role( assert_that("/parallelcluster/" in generated_template[cleanup_resource_old]["Properties"]["Path"]).is_true() assert_that(generated_template[cleanup_resource_old]["Properties"]).does_not_contain_key("RoleName") - - -@pytest.mark.parametrize( - "config_file_name, expect_cloudwatch_permission", - [ - ("config-slurm.yaml", True), - ("config-awsbatch.yaml", False), - ], -) -def test_head_node_policy(mocker, test_datadir, config_file_name, expect_cloudwatch_permission): - """Verify that cloudwatch:PutMetricData is added for slurm scheduler but not for awsbatch.""" - mock_aws_api(mocker) - mocker.patch( - "pcluster.config.cluster_config.HeadNodeNetworking.availability_zone", - new_callable=PropertyMock(return_value="us-east-1a"), - ) - mock_bucket(mocker) - mock_bucket_object_utils(mocker) - - input_yaml = load_yaml_dict(test_datadir / config_file_name) - cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template, _ = CDKTemplateBuilder().build_cluster_template( - cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" - ) - - # Find the CloudWatch policy statement in ParallelClusterPoliciesHeadNode - head_node_policies = generated_template["Resources"]["ParallelClusterPoliciesHeadNode"]["Properties"][ - "PolicyDocument" - ]["Statement"] - - cloudwatch_statement = next( - (stmt for stmt in head_node_policies if stmt.get("Sid") == "CloudWatch"), - None, - ) - - if expect_cloudwatch_permission: - assert_that(cloudwatch_statement).is_not_none() - assert_that(cloudwatch_statement["Action"]).is_equal_to("cloudwatch:PutMetricData") - assert_that(cloudwatch_statement["Effect"]).is_equal_to("Allow") - assert_that(cloudwatch_statement["Resource"]).is_equal_to("*") - assert_that(cloudwatch_statement["Condition"]).is_equal_to( - {"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}} - ) - else: - assert_that(cloudwatch_statement).is_none() diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml b/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml deleted file mode 100644 index c7a52acb66..0000000000 --- a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml +++ /dev/null @@ -1,28 +0,0 @@ -Image: - Os: alinux2 -HeadNode: - InstanceType: String - Ssh: - KeyName: String - Networking: - SubnetId: subnet-12345678 - Iam: - S3Access: - - BucketName: testbucketpball - EnableWriteAccess: True -Scheduling: - Scheduler: slurm - SlurmQueues: - - Name: queue1 - ComputeResources: - - Name: compute_resource1 - InstanceType: t3.micro - MinCount: 1 - MaxCount: 5 - Networking: - SubnetIds: - - subnet-12345678 - Iam: - S3Access: - - BucketName: testbucketpball - EnableWriteAccess: True diff --git a/cli/tests/pcluster/templates/test_cluster_stack.py b/cli/tests/pcluster/templates/test_cluster_stack.py index 8a0f314318..78d824ec9d 100644 --- a/cli/tests/pcluster/templates/test_cluster_stack.py +++ b/cli/tests/pcluster/templates/test_cluster_stack.py @@ -250,6 +250,7 @@ def test_add_efs_shared_storage(mocker, test_datadir, config_file_name, expected [ "slurm.required.yaml", "slurm.full.yaml", + "slurm.logging_disabled.yaml", "awsbatch.simple.yaml", "awsbatch.full.yaml", ], @@ -315,7 +316,7 @@ def test_add_alarms(mocker, config_file_name): }, } - if cluster.scheduling.scheduler == "slurm": + if cluster.scheduling.scheduler == "slurm" and cluster.is_cw_logging_enabled: expected_alarms["Clustermgtd-Heartbeat"] = { "name": "clustername-HeadNode-ClustermgtdHeartbeat", "metric_name": "ClustermgtdHeartbeat", diff --git a/cli/tests/pcluster/templates/test_cw_dashboard_builder.py b/cli/tests/pcluster/templates/test_cw_dashboard_builder.py index f60c5c3378..e26f9a4088 100644 --- a/cli/tests/pcluster/templates/test_cw_dashboard_builder.py +++ b/cli/tests/pcluster/templates/test_cw_dashboard_builder.py @@ -58,7 +58,9 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r if cluster_config.is_cw_dashboard_enabled: assert_that(output_yaml).contains("CloudwatchDashboard") assert_that(output_yaml).contains("Head Node EC2 Metrics") - _verify_head_node_instance_metrics_graphs(output_yaml, cluster_config.scheduling.scheduler) + _verify_head_node_instance_metrics_graphs( + output_yaml, cluster_config.scheduling.scheduler, cluster_config.is_cw_logging_enabled + ) if cluster_config.are_alarms_enabled: assert_that(output_yaml).contains("Cluster Alarms") @@ -79,7 +81,12 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r assert_that(output_yaml).does_not_contain("CloudwatchDashboard") assert_that(output_yaml).does_not_contain("Head Node EC2 Metrics") - _verify_alarms(output_yaml, cluster_config.are_alarms_enabled, cluster_config.scheduling.scheduler) + _verify_alarms( + output_yaml, + cluster_config.are_alarms_enabled, + cluster_config.scheduling.scheduler, + cluster_config.is_cw_logging_enabled, + ) if cluster_config.is_cw_logging_enabled: assert_that(output_yaml).contains("ClusterCWLogGroup") @@ -87,7 +94,7 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r assert_that(output_yaml).does_not_contain("ClusterCWLogGroup") -def _verify_alarms(output_yaml, alarms_enabled, scheduler): +def _verify_alarms(output_yaml, alarms_enabled, scheduler, is_cw_logging_enabled): if alarms_enabled: assert_that(output_yaml).contains("HeadNodeHealthAlarm") assert_that(output_yaml).contains("StatusCheckFailed") @@ -102,7 +109,7 @@ def _verify_alarms(output_yaml, alarms_enabled, scheduler): assert_that(output_yaml).contains("disk_used_percent") # ClustermgtdHeartbeat alarm is only created for Slurm scheduler - if scheduler == "slurm": + if scheduler == "slurm" and is_cw_logging_enabled: assert_that(output_yaml).contains("HeadNodeClustermgtdHeartbeatAlarm") assert_that(output_yaml).contains("ClustermgtdHeartbeat") else: @@ -130,13 +137,15 @@ def _verify_metric_filter_dimensions(metric_filters): ) expected_dimensions = [{"Key": "ClusterName", "Value": "$.cluster-name"}] + if name == "ClustermgtdHeartbeatFilter": + expected_dimensions.append({"Key": "InstanceId", "Value": "$.instance-id"}) assert_that(dimensions, description=f"{name} should have dimensions {expected_dimensions}").is_equal_to( expected_dimensions ) -def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler): +def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler, is_cw_logging_enabled): """Verify CloudWatch graphs within the Head Node Instance Metrics section.""" assert_that(output_yaml).contains("Head Node Instance Metrics") assert_that(output_yaml).contains("CPU Utilization") @@ -146,8 +155,8 @@ def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler): assert_that(output_yaml).contains("Disk Read/Write Ops") assert_that(output_yaml).contains("Disk Used Percent") assert_that(output_yaml).contains("Memory Used Percent") - # Daemons Heartbeats widget is only created for Slurm scheduler - if scheduler == "slurm": + # Daemons Heartbeats widget is only created for Slurm scheduler with logging enabled + if scheduler == "slurm" and is_cw_logging_enabled: assert_that(output_yaml).contains("Daemons Heartbeats") assert_that(output_yaml).contains("ClustermgtdHeartbeat") else: diff --git a/tests/iam_policies/cluster-roles.cfn.yaml b/tests/iam_policies/cluster-roles.cfn.yaml index b59ecd540f..7ae7f6f0a5 100644 --- a/tests/iam_policies/cluster-roles.cfn.yaml +++ b/tests/iam_policies/cluster-roles.cfn.yaml @@ -136,13 +136,6 @@ Resources: - route53:ChangeResourceRecordSets Resource: '*' Effect: Allow - - Action: - - cloudwatch:PutMetricData - Resource: '*' - Effect: Allow - Condition: - StringEquals: - cloudwatch:namespace: 'ParallelCluster' Version: '2012-10-17' Type: AWS::IAM::Role diff --git a/tests/integration-tests/network_template_builder.py b/tests/integration-tests/network_template_builder.py index dce243d95c..796a3150ba 100644 --- a/tests/integration-tests/network_template_builder.py +++ b/tests/integration-tests/network_template_builder.py @@ -241,12 +241,6 @@ def __build_vpc_endpoints(self, subnet_id, route_table_ids): type=VPCEndpointConfig.EndpointType.INTERFACE, enable_private_dns=True, ), - VPCEndpointConfig( - name="MonitoringEndpoint", - service_name=f"com.amazonaws.{region}.monitoring", - type=VPCEndpointConfig.EndpointType.INTERFACE, - enable_private_dns=True, - ), VPCEndpointConfig( name="CFNEndpoint", service_name=prefix + f"com.amazonaws.{region}.cloudformation",