From ce93dafa69755528042263fd4d2f7973e29f0c04 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 4 Feb 2026 11:26:17 -0500 Subject: [PATCH 1/3] Revert "[Test] In test `test_cluster_in_no_internet_subnet` add VPC endpoint for CloudWatch Metrics (com.amazonaws.$region.monitoring), which is now required by clustermgtd to put metrics." This reverts commit 3aacea38d78f1cbac709a7c2f4dc8995129f2662. We must revert this commit because the VPC Endpoint for CloudWatch Monitoring is not needed anymore. We introduced it when we introduced for the first time a call to PutMetricData from clustermgtd. Now we moved to an approach where we make no requests to CloudWatch Monitoring, so the endpoint is not required anymore. --- tests/integration-tests/network_template_builder.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/integration-tests/network_template_builder.py b/tests/integration-tests/network_template_builder.py index dce243d95c..796a3150ba 100644 --- a/tests/integration-tests/network_template_builder.py +++ b/tests/integration-tests/network_template_builder.py @@ -241,12 +241,6 @@ def __build_vpc_endpoints(self, subnet_id, route_table_ids): type=VPCEndpointConfig.EndpointType.INTERFACE, enable_private_dns=True, ), - VPCEndpointConfig( - name="MonitoringEndpoint", - service_name=f"com.amazonaws.{region}.monitoring", - type=VPCEndpointConfig.EndpointType.INTERFACE, - enable_private_dns=True, - ), VPCEndpointConfig( name="CFNEndpoint", service_name=prefix + f"com.amazonaws.{region}.cloudformation", From 5ec6e2fa1785ef91aa294382399b3e0c956e9a83 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 4 Feb 2026 11:33:11 -0500 Subject: [PATCH 2/3] Revert "[Observability] Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics." This reverts commit 9329636e60c3b95de7dba1a6a5d97a22d218e792. We must revert this commit because we do not need the permission `cloudwatch:PutMetricData` anymore. 
We introduced such extra permissions when we introduced a PutMetricData request from the head node to let clustermgtd publish a metric. However, we now changed the approach moving to metric filters rather than explicit PutMetricData, so the permission is not required anymore. --- CHANGELOG.md | 1 - cli/src/pcluster/constants.py | 5 -- .../pcluster/templates/cdk_builder_utils.py | 15 ------ .../templates/test_cdk_builder_utils.py | 47 +------------------ .../config-awsbatch.yaml | 20 -------- .../test_head_node_policy/config-slurm.yaml | 28 ----------- tests/iam_policies/cluster-roles.cfn.yaml | 7 --- 7 files changed, 1 insertion(+), 122 deletions(-) delete mode 100644 cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml delete mode 100644 cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index b4b4458e50..95a78e21a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,6 @@ CHANGELOG - Upgrade jmespath to ~=1.0 (from ~=0.10). - Upgrade tabulate to <=0.9.0 (from <=0.8.10). - Add support for p6-b300 instances for all OSs except AL2. -- Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics. - Add alarm on missing clustermgtd heartbeat. 
**BUG FIXES** diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index 94604f930b..f684772b18 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -188,11 +188,6 @@ CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT = 1 DETAILED_MONITORING_ENABLED_DEFAULT = False -# CloudWatch Metrics -CW_METRICS_NAMESPACE = "ParallelCluster" -CW_METRICS_DIMENSION_CLUSTER_NAME = "ClusterName" -CW_METRICS_CLUSTERMGTD_HEARTBEAT = "ClustermgtdHeartbeat" - STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events" PCLUSTER_IMAGE_NAME_REGEX = r"^[-_A-Za-z0-9{][-_A-Za-z0-9\s:{}\.]+[-_A-Za-z0-9}]$" diff --git a/cli/src/pcluster/templates/cdk_builder_utils.py b/cli/src/pcluster/templates/cdk_builder_utils.py index ebaeb78973..8c7ee692e3 100644 --- a/cli/src/pcluster/templates/cdk_builder_utils.py +++ b/cli/src/pcluster/templates/cdk_builder_utils.py @@ -38,13 +38,11 @@ from pcluster.constants import ( COOKBOOK_PACKAGES_VERSIONS, CW_LOGS_RETENTION_DAYS_DEFAULT, - CW_METRICS_NAMESPACE, IAM_ROLE_PATH, LAMBDA_VPC_ACCESS_MANAGED_POLICY, PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_DYNAMODB_PREFIX, PCLUSTER_NODE_TYPE_TAG, - SLURM, ULTRASERVER_INSTANCE_PREFIX_LIST, ) from pcluster.launch_template_utils import _LaunchTemplateBuilder @@ -744,19 +742,6 @@ def _build_policy(self) -> List[iam.PolicyStatement]: ), ] - if self._config.scheduling.scheduler == SLURM: - policy.extend( - [ - iam.PolicyStatement( - sid="CloudWatch", - actions=["cloudwatch:PutMetricData"], - effect=iam.Effect.ALLOW, - resources=["*"], - conditions={"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}}, - ) - ] - ) - if ( self._config.scheduling.scheduler == "slurm" and self._config.scheduling.settings diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils.py b/cli/tests/pcluster/templates/test_cdk_builder_utils.py index f5276e5d8e..fe8e3ff26f 100644 --- a/cli/tests/pcluster/templates/test_cdk_builder_utils.py +++ b/cli/tests/pcluster/templates/test_cdk_builder_utils.py @@ -22,7 
+22,7 @@ SlurmComputeResource, SlurmQueue, ) -from pcluster.constants import CW_METRICS_NAMESPACE, PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG +from pcluster.constants import PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG from pcluster.schemas.cluster_schema import ClusterSchema from pcluster.templates.cdk_builder import CDKTemplateBuilder from pcluster.templates.cdk_builder_utils import ( @@ -593,48 +593,3 @@ def _check_cleanup_role( assert_that("/parallelcluster/" in generated_template[cleanup_resource_old]["Properties"]["Path"]).is_true() assert_that(generated_template[cleanup_resource_old]["Properties"]).does_not_contain_key("RoleName") - - -@pytest.mark.parametrize( - "config_file_name, expect_cloudwatch_permission", - [ - ("config-slurm.yaml", True), - ("config-awsbatch.yaml", False), - ], -) -def test_head_node_policy(mocker, test_datadir, config_file_name, expect_cloudwatch_permission): - """Verify that cloudwatch:PutMetricData is added for slurm scheduler but not for awsbatch.""" - mock_aws_api(mocker) - mocker.patch( - "pcluster.config.cluster_config.HeadNodeNetworking.availability_zone", - new_callable=PropertyMock(return_value="us-east-1a"), - ) - mock_bucket(mocker) - mock_bucket_object_utils(mocker) - - input_yaml = load_yaml_dict(test_datadir / config_file_name) - cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml) - generated_template, _ = CDKTemplateBuilder().build_cluster_template( - cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername" - ) - - # Find the CloudWatch policy statement in ParallelClusterPoliciesHeadNode - head_node_policies = generated_template["Resources"]["ParallelClusterPoliciesHeadNode"]["Properties"][ - "PolicyDocument" - ]["Statement"] - - cloudwatch_statement = next( - (stmt for stmt in head_node_policies if stmt.get("Sid") == "CloudWatch"), - None, - ) - - if expect_cloudwatch_permission: - assert_that(cloudwatch_statement).is_not_none() - 
assert_that(cloudwatch_statement["Action"]).is_equal_to("cloudwatch:PutMetricData") - assert_that(cloudwatch_statement["Effect"]).is_equal_to("Allow") - assert_that(cloudwatch_statement["Resource"]).is_equal_to("*") - assert_that(cloudwatch_statement["Condition"]).is_equal_to( - {"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}} - ) - else: - assert_that(cloudwatch_statement).is_none() diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml b/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml deleted file mode 100644 index 719b0b4db0..0000000000 --- a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-awsbatch.yaml +++ /dev/null @@ -1,20 +0,0 @@ -Image: - Os: alinux2 -HeadNode: - InstanceType: t3.micro - Ssh: - KeyName: String - Networking: - SubnetId: subnet-12345678 -Scheduling: - Scheduler: awsbatch - AwsBatchQueues: - - Name: queue1 - Networking: - SubnetIds: - - subnet-12345678 - ComputeResources: - - Name: compute_resource1 - InstanceTypes: - - c4.xlarge - MaxvCpus: 10 diff --git a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml b/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml deleted file mode 100644 index c7a52acb66..0000000000 --- a/cli/tests/pcluster/templates/test_cdk_builder_utils/test_head_node_policy/config-slurm.yaml +++ /dev/null @@ -1,28 +0,0 @@ -Image: - Os: alinux2 -HeadNode: - InstanceType: String - Ssh: - KeyName: String - Networking: - SubnetId: subnet-12345678 - Iam: - S3Access: - - BucketName: testbucketpball - EnableWriteAccess: True -Scheduling: - Scheduler: slurm - SlurmQueues: - - Name: queue1 - ComputeResources: - - Name: compute_resource1 - InstanceType: t3.micro - MinCount: 1 - MaxCount: 5 - Networking: - SubnetIds: - - subnet-12345678 - Iam: - S3Access: - - BucketName: testbucketpball - EnableWriteAccess: 
True diff --git a/tests/iam_policies/cluster-roles.cfn.yaml b/tests/iam_policies/cluster-roles.cfn.yaml index b59ecd540f..7ae7f6f0a5 100644 --- a/tests/iam_policies/cluster-roles.cfn.yaml +++ b/tests/iam_policies/cluster-roles.cfn.yaml @@ -136,13 +136,6 @@ Resources: - route53:ChangeResourceRecordSets Resource: '*' Effect: Allow - - Action: - - cloudwatch:PutMetricData - Resource: '*' - Effect: Allow - Condition: - StringEquals: - cloudwatch:namespace: 'ParallelCluster' Version: '2012-10-17' Type: AWS::IAM::Role From ba3c5e2f17f751a26cfea35d47db777777c4e525 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 4 Feb 2026 21:08:24 -0500 Subject: [PATCH 3/3] [Observability] Use metric filter to generate the clustermgtd heartbeat metric. --- cli/src/pcluster/constants.py | 6 ++++ cli/src/pcluster/templates/cluster_stack.py | 31 +++++++++++++++++-- .../templates/cw_dashboard_builder.py | 2 +- .../slurm.logging_disabled.yaml | 22 +++++++++++++ .../pcluster/templates/test_cluster_stack.py | 3 +- .../templates/test_cw_dashboard_builder.py | 23 +++++++++----- 6 files changed, 76 insertions(+), 11 deletions(-) create mode 100644 cli/tests/pcluster/example_configs/slurm.logging_disabled.yaml diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index f684772b18..748b598a03 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -188,6 +188,12 @@ CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT = 1 DETAILED_MONITORING_ENABLED_DEFAULT = False +# CloudWatch Metrics +CW_METRICS_NAMESPACE = "ParallelCluster" +CW_METRICS_DIMENSION_CLUSTER_NAME = "ClusterName" +CW_METRICS_DIMENSION_INSTANCE_ID = "InstanceId" +CW_METRICS_CLUSTERMGTD_HEARTBEAT = "ClustermgtdHeartbeat" + STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events" PCLUSTER_IMAGE_NAME_REGEX = r"^[-_A-Za-z0-9{][-_A-Za-z0-9\s:{}\.]+[-_A-Za-z0-9}]$" diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index e9a0ed483f..c27b06e34d 100644 --- 
a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -66,6 +66,7 @@ CW_LOGS_CFN_PARAM_NAME, CW_METRICS_CLUSTERMGTD_HEARTBEAT, CW_METRICS_DIMENSION_CLUSTER_NAME, + CW_METRICS_DIMENSION_INSTANCE_ID, CW_METRICS_NAMESPACE, DEFAULT_EPHEMERAL_DIR, EFS_PORT, @@ -384,8 +385,34 @@ def _add_head_node_alarms(self): }, } - if self._condition_is_slurm(): - metrics_for_alarms["ClustermgtdHeartbeat"] = { + # These alarms require CW logging to be enabled because they are based on CW metric filters. + if self._condition_is_slurm() and self.config.is_cw_logging_enabled: + # Create metric filter to extract heartbeat metric from clustermgtd event logs + clustermgtd_heartbeat_metric_filter = logs.CfnMetricFilter( + scope=self.stack, + id=f"{CW_METRICS_CLUSTERMGTD_HEARTBEAT}Filter", + filter_pattern='{ $.event-type = "clustermgtd-heartbeat" }', + log_group_name=self.log_group_name, + metric_transformations=[ + logs.CfnMetricFilter.MetricTransformationProperty( + metric_namespace=CW_METRICS_NAMESPACE, + metric_name=CW_METRICS_CLUSTERMGTD_HEARTBEAT, + metric_value="1", + unit="Count", + dimensions=[ + logs.CfnMetricFilter.DimensionProperty( + key=CW_METRICS_DIMENSION_CLUSTER_NAME, value="$.cluster-name" + ), + logs.CfnMetricFilter.DimensionProperty( + key=CW_METRICS_DIMENSION_INSTANCE_ID, value="$.instance-id" + ), + ], + ) + ], + ) + clustermgtd_heartbeat_metric_filter.add_depends_on(self.log_group) + + metrics_for_alarms[CW_METRICS_CLUSTERMGTD_HEARTBEAT] = { + "metric": self._cw_metric_head_node( + CW_METRICS_NAMESPACE, + CW_METRICS_CLUSTERMGTD_HEARTBEAT, diff --git a/cli/src/pcluster/templates/cw_dashboard_builder.py b/cli/src/pcluster/templates/cw_dashboard_builder.py index fff10bba2c..03769b8a6e 100644 --- a/cli/src/pcluster/templates/cw_dashboard_builder.py +++ b/cli/src/pcluster/templates/cw_dashboard_builder.py @@ -574,7 +574,7 @@ def _add_head_node_instance_metrics_graphs(self): # Custom Metrics pcluster_metrics = [] - if 
self.config.scheduling.scheduler == "slurm": + if self.config.scheduling.scheduler == "slurm" and self.config.is_cw_logging_enabled: pcluster_metrics.append( new_pcluster_metric( title="Daemons Heartbeats", diff --git a/cli/tests/pcluster/example_configs/slurm.logging_disabled.yaml b/cli/tests/pcluster/example_configs/slurm.logging_disabled.yaml new file mode 100644 index 0000000000..b8d75e7dc7 --- /dev/null +++ b/cli/tests/pcluster/example_configs/slurm.logging_disabled.yaml @@ -0,0 +1,22 @@ +Image: + Os: alinux2 +HeadNode: + InstanceType: t3.micro + Networking: + SubnetId: subnet-12345678 + Ssh: + KeyName: ec2-key-name +Scheduling: + Scheduler: slurm + SlurmQueues: + - Name: queue1 + Networking: + SubnetIds: + - subnet-12345678 + ComputeResources: + - Name: compute-resource1 + InstanceType: c5.2xlarge +Monitoring: + Logs: + CloudWatch: + Enabled: false diff --git a/cli/tests/pcluster/templates/test_cluster_stack.py b/cli/tests/pcluster/templates/test_cluster_stack.py index 8a0f314318..78d824ec9d 100644 --- a/cli/tests/pcluster/templates/test_cluster_stack.py +++ b/cli/tests/pcluster/templates/test_cluster_stack.py @@ -250,6 +250,7 @@ def test_add_efs_shared_storage(mocker, test_datadir, config_file_name, expected [ "slurm.required.yaml", "slurm.full.yaml", + "slurm.logging_disabled.yaml", "awsbatch.simple.yaml", "awsbatch.full.yaml", ], @@ -315,7 +316,7 @@ def test_add_alarms(mocker, config_file_name): }, } - if cluster.scheduling.scheduler == "slurm": + if cluster.scheduling.scheduler == "slurm" and cluster.is_cw_logging_enabled: expected_alarms["Clustermgtd-Heartbeat"] = { "name": "clustername-HeadNode-ClustermgtdHeartbeat", "metric_name": "ClustermgtdHeartbeat", diff --git a/cli/tests/pcluster/templates/test_cw_dashboard_builder.py b/cli/tests/pcluster/templates/test_cw_dashboard_builder.py index f60c5c3378..e26f9a4088 100644 --- a/cli/tests/pcluster/templates/test_cw_dashboard_builder.py +++ b/cli/tests/pcluster/templates/test_cw_dashboard_builder.py @@ -58,7 
+58,9 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r if cluster_config.is_cw_dashboard_enabled: assert_that(output_yaml).contains("CloudwatchDashboard") assert_that(output_yaml).contains("Head Node EC2 Metrics") - _verify_head_node_instance_metrics_graphs(output_yaml, cluster_config.scheduling.scheduler) + _verify_head_node_instance_metrics_graphs( + output_yaml, cluster_config.scheduling.scheduler, cluster_config.is_cw_logging_enabled + ) if cluster_config.are_alarms_enabled: assert_that(output_yaml).contains("Cluster Alarms") @@ -79,7 +81,12 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r assert_that(output_yaml).does_not_contain("CloudwatchDashboard") assert_that(output_yaml).does_not_contain("Head Node EC2 Metrics") - _verify_alarms(output_yaml, cluster_config.are_alarms_enabled, cluster_config.scheduling.scheduler) + _verify_alarms( + output_yaml, + cluster_config.are_alarms_enabled, + cluster_config.scheduling.scheduler, + cluster_config.is_cw_logging_enabled, + ) if cluster_config.is_cw_logging_enabled: assert_that(output_yaml).contains("ClusterCWLogGroup") @@ -87,7 +94,7 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r assert_that(output_yaml).does_not_contain("ClusterCWLogGroup") -def _verify_alarms(output_yaml, alarms_enabled, scheduler): +def _verify_alarms(output_yaml, alarms_enabled, scheduler, is_cw_logging_enabled): if alarms_enabled: assert_that(output_yaml).contains("HeadNodeHealthAlarm") assert_that(output_yaml).contains("StatusCheckFailed") @@ -102,7 +109,7 @@ def _verify_alarms(output_yaml, alarms_enabled, scheduler): assert_that(output_yaml).contains("disk_used_percent") # ClustermgtdHeartbeat alarm is only created for Slurm scheduler - if scheduler == "slurm": + if scheduler == "slurm" and is_cw_logging_enabled: assert_that(output_yaml).contains("HeadNodeClustermgtdHeartbeatAlarm") 
assert_that(output_yaml).contains("ClustermgtdHeartbeat") else: @@ -130,13 +137,15 @@ def _verify_metric_filter_dimensions(metric_filters): ) expected_dimensions = [{"Key": "ClusterName", "Value": "$.cluster-name"}] + if name == "ClustermgtdHeartbeatFilter": + expected_dimensions.append({"Key": "InstanceId", "Value": "$.instance-id"}) assert_that(dimensions, description=f"{name} should have dimensions {expected_dimensions}").is_equal_to( expected_dimensions ) -def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler): +def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler, is_cw_logging_enabled): """Verify CloudWatch graphs within the Head Node Instance Metrics section.""" assert_that(output_yaml).contains("Head Node Instance Metrics") assert_that(output_yaml).contains("CPU Utilization") @@ -146,8 +155,8 @@ def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler): assert_that(output_yaml).contains("Disk Read/Write Ops") assert_that(output_yaml).contains("Disk Used Percent") assert_that(output_yaml).contains("Memory Used Percent") - # Daemons Heartbeats widget is only created for Slurm scheduler - if scheduler == "slurm": + # Daemons Heartbeats widget is only created for Slurm scheduler with logging enabled + if scheduler == "slurm" and is_cw_logging_enabled: assert_that(output_yaml).contains("Daemons Heartbeats") assert_that(output_yaml).contains("ClustermgtdHeartbeat") else: