Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ CHANGELOG
- Upgrade jmespath to ~=1.0 (from ~=0.10).
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
- Add support for p6-b300 instances for all OSs except AL2.
- Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics.
- Add alarm on missing clustermgtd heartbeat.

**BUG FIXES**
Expand Down
1 change: 1 addition & 0 deletions cli/src/pcluster/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@
# CloudWatch Metrics
CW_METRICS_NAMESPACE = "ParallelCluster"
CW_METRICS_DIMENSION_CLUSTER_NAME = "ClusterName"
CW_METRICS_DIMENSION_INSTANCE_ID = "InstanceId"
CW_METRICS_CLUSTERMGTD_HEARTBEAT = "ClustermgtdHeartbeat"

STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events"
Expand Down
15 changes: 0 additions & 15 deletions cli/src/pcluster/templates/cdk_builder_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,11 @@
from pcluster.constants import (
COOKBOOK_PACKAGES_VERSIONS,
CW_LOGS_RETENTION_DAYS_DEFAULT,
CW_METRICS_NAMESPACE,
IAM_ROLE_PATH,
LAMBDA_VPC_ACCESS_MANAGED_POLICY,
PCLUSTER_CLUSTER_NAME_TAG,
PCLUSTER_DYNAMODB_PREFIX,
PCLUSTER_NODE_TYPE_TAG,
SLURM,
ULTRASERVER_INSTANCE_PREFIX_LIST,
)
from pcluster.launch_template_utils import _LaunchTemplateBuilder
Expand Down Expand Up @@ -744,19 +742,6 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
),
]

if self._config.scheduling.scheduler == SLURM:
policy.extend(
[
iam.PolicyStatement(
sid="CloudWatch",
actions=["cloudwatch:PutMetricData"],
effect=iam.Effect.ALLOW,
resources=["*"],
conditions={"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}},
)
]
)

if (
self._config.scheduling.scheduler == "slurm"
and self._config.scheduling.settings
Expand Down
31 changes: 29 additions & 2 deletions cli/src/pcluster/templates/cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
CW_LOGS_CFN_PARAM_NAME,
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
CW_METRICS_DIMENSION_CLUSTER_NAME,
CW_METRICS_DIMENSION_INSTANCE_ID,
CW_METRICS_NAMESPACE,
DEFAULT_EPHEMERAL_DIR,
EFS_PORT,
Expand Down Expand Up @@ -384,8 +385,34 @@ def _add_head_node_alarms(self):
},
}

if self._condition_is_slurm():
metrics_for_alarms["ClustermgtdHeartbeat"] = {
# These alarms required Cw logging enabled because they are based on CW Metrics Filters.
if self._condition_is_slurm() and self.config.is_cw_logging_enabled:
# Create metric filter to extract heartbeat metric from clustermgtd event logs
clustermgtd_heartbeat_metric_filter = logs.CfnMetricFilter(
scope=self.stack,
id=f"{CW_METRICS_CLUSTERMGTD_HEARTBEAT}Filter",
filter_pattern='{ $.event-type = "clustermgtd-heartbeat" }',
log_group_name=self.log_group_name,
metric_transformations=[
logs.CfnMetricFilter.MetricTransformationProperty(
metric_namespace=CW_METRICS_NAMESPACE,
metric_name=CW_METRICS_CLUSTERMGTD_HEARTBEAT,
metric_value="1",
unit="Count",
dimensions=[
logs.CfnMetricFilter.DimensionProperty(
key=CW_METRICS_DIMENSION_CLUSTER_NAME, value="$.cluster-name"
),
logs.CfnMetricFilter.DimensionProperty(
key=CW_METRICS_DIMENSION_INSTANCE_ID, value="$.instance-id"
),
],
)
],
)
clustermgtd_heartbeat_metric_filter.add_depends_on(self.log_group)

metrics_for_alarms[CW_METRICS_CLUSTERMGTD_HEARTBEAT] = {
"metric": self._cw_metric_head_node(
CW_METRICS_NAMESPACE,
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
Expand Down
2 changes: 1 addition & 1 deletion cli/src/pcluster/templates/cw_dashboard_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ def _add_head_node_instance_metrics_graphs(self):

# Custom Metrics
pcluster_metrics = []
if self.config.scheduling.scheduler == "slurm":
if self.config.scheduling.scheduler == "slurm" and self.config.is_cw_logging_enabled:
pcluster_metrics.append(
new_pcluster_metric(
title="Daemons Heartbeats",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@ Image:
Os: alinux2
HeadNode:
InstanceType: t3.micro
Ssh:
KeyName: String
Networking:
SubnetId: subnet-12345678
Ssh:
KeyName: ec2-key-name
Scheduling:
Scheduler: awsbatch
AwsBatchQueues:
Scheduler: slurm
SlurmQueues:
- Name: queue1
Networking:
SubnetIds:
- subnet-12345678
ComputeResources:
- Name: compute_resource1
InstanceTypes:
- c4.xlarge
MaxvCpus: 10
- Name: compute-resource1
InstanceType: c5.2xlarge
Monitoring:
Logs:
CloudWatch:
Enabled: false
47 changes: 1 addition & 46 deletions cli/tests/pcluster/templates/test_cdk_builder_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
SlurmComputeResource,
SlurmQueue,
)
from pcluster.constants import CW_METRICS_NAMESPACE, PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG
from pcluster.constants import PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG
from pcluster.schemas.cluster_schema import ClusterSchema
from pcluster.templates.cdk_builder import CDKTemplateBuilder
from pcluster.templates.cdk_builder_utils import (
Expand Down Expand Up @@ -593,48 +593,3 @@ def _check_cleanup_role(
assert_that("/parallelcluster/" in generated_template[cleanup_resource_old]["Properties"]["Path"]).is_true()

assert_that(generated_template[cleanup_resource_old]["Properties"]).does_not_contain_key("RoleName")


@pytest.mark.parametrize(
"config_file_name, expect_cloudwatch_permission",
[
("config-slurm.yaml", True),
("config-awsbatch.yaml", False),
],
)
def test_head_node_policy(mocker, test_datadir, config_file_name, expect_cloudwatch_permission):
"""Verify that cloudwatch:PutMetricData is added for slurm scheduler but not for awsbatch."""
mock_aws_api(mocker)
mocker.patch(
"pcluster.config.cluster_config.HeadNodeNetworking.availability_zone",
new_callable=PropertyMock(return_value="us-east-1a"),
)
mock_bucket(mocker)
mock_bucket_object_utils(mocker)

input_yaml = load_yaml_dict(test_datadir / config_file_name)
cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml)
generated_template, _ = CDKTemplateBuilder().build_cluster_template(
cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
)

# Find the CloudWatch policy statement in ParallelClusterPoliciesHeadNode
head_node_policies = generated_template["Resources"]["ParallelClusterPoliciesHeadNode"]["Properties"][
"PolicyDocument"
]["Statement"]

cloudwatch_statement = next(
(stmt for stmt in head_node_policies if stmt.get("Sid") == "CloudWatch"),
None,
)

if expect_cloudwatch_permission:
assert_that(cloudwatch_statement).is_not_none()
assert_that(cloudwatch_statement["Action"]).is_equal_to("cloudwatch:PutMetricData")
assert_that(cloudwatch_statement["Effect"]).is_equal_to("Allow")
assert_that(cloudwatch_statement["Resource"]).is_equal_to("*")
assert_that(cloudwatch_statement["Condition"]).is_equal_to(
{"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}}
)
else:
assert_that(cloudwatch_statement).is_none()

This file was deleted.

3 changes: 2 additions & 1 deletion cli/tests/pcluster/templates/test_cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ def test_add_efs_shared_storage(mocker, test_datadir, config_file_name, expected
[
"slurm.required.yaml",
"slurm.full.yaml",
"slurm.logging_disabled.yaml",
"awsbatch.simple.yaml",
"awsbatch.full.yaml",
],
Expand Down Expand Up @@ -315,7 +316,7 @@ def test_add_alarms(mocker, config_file_name):
},
}

if cluster.scheduling.scheduler == "slurm":
if cluster.scheduling.scheduler == "slurm" and cluster.is_cw_logging_enabled:
expected_alarms["Clustermgtd-Heartbeat"] = {
"name": "clustername-HeadNode-ClustermgtdHeartbeat",
"metric_name": "ClustermgtdHeartbeat",
Expand Down
23 changes: 16 additions & 7 deletions cli/tests/pcluster/templates/test_cw_dashboard_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r
if cluster_config.is_cw_dashboard_enabled:
assert_that(output_yaml).contains("CloudwatchDashboard")
assert_that(output_yaml).contains("Head Node EC2 Metrics")
_verify_head_node_instance_metrics_graphs(output_yaml, cluster_config.scheduling.scheduler)
_verify_head_node_instance_metrics_graphs(
output_yaml, cluster_config.scheduling.scheduler, cluster_config.is_cw_logging_enabled
)

if cluster_config.are_alarms_enabled:
assert_that(output_yaml).contains("Cluster Alarms")
Expand All @@ -79,15 +81,20 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r
assert_that(output_yaml).does_not_contain("CloudwatchDashboard")
assert_that(output_yaml).does_not_contain("Head Node EC2 Metrics")

_verify_alarms(output_yaml, cluster_config.are_alarms_enabled, cluster_config.scheduling.scheduler)
_verify_alarms(
output_yaml,
cluster_config.are_alarms_enabled,
cluster_config.scheduling.scheduler,
cluster_config.is_cw_logging_enabled,
)

if cluster_config.is_cw_logging_enabled:
assert_that(output_yaml).contains("ClusterCWLogGroup")
else:
assert_that(output_yaml).does_not_contain("ClusterCWLogGroup")


def _verify_alarms(output_yaml, alarms_enabled, scheduler):
def _verify_alarms(output_yaml, alarms_enabled, scheduler, is_cw_logging_enabled):
if alarms_enabled:
assert_that(output_yaml).contains("HeadNodeHealthAlarm")
assert_that(output_yaml).contains("StatusCheckFailed")
Expand All @@ -102,7 +109,7 @@ def _verify_alarms(output_yaml, alarms_enabled, scheduler):
assert_that(output_yaml).contains("disk_used_percent")

# ClustermgtdHeartbeat alarm is only created for Slurm scheduler
if scheduler == "slurm":
if scheduler == "slurm" and is_cw_logging_enabled:
assert_that(output_yaml).contains("HeadNodeClustermgtdHeartbeatAlarm")
assert_that(output_yaml).contains("ClustermgtdHeartbeat")
else:
Expand Down Expand Up @@ -130,13 +137,15 @@ def _verify_metric_filter_dimensions(metric_filters):
)

expected_dimensions = [{"Key": "ClusterName", "Value": "$.cluster-name"}]
if name == "ClustermgtdHeartbeatFilter":
expected_dimensions.append({"Key": "InstanceId", "Value": "$.instance-id"})

assert_that(dimensions, description=f"{name} should have dimensions {expected_dimensions}").is_equal_to(
expected_dimensions
)


def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler):
def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler, is_cw_logging_enabled):
"""Verify CloudWatch graphs within the Head Node Instance Metrics section."""
assert_that(output_yaml).contains("Head Node Instance Metrics")
assert_that(output_yaml).contains("CPU Utilization")
Expand All @@ -146,8 +155,8 @@ def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler):
assert_that(output_yaml).contains("Disk Read/Write Ops")
assert_that(output_yaml).contains("Disk Used Percent")
assert_that(output_yaml).contains("Memory Used Percent")
# Daemons Heartbeats widget is only created for Slurm scheduler
if scheduler == "slurm":
# Daemons Heartbeats widget is only created for Slurm scheduler with logging enabled
if scheduler == "slurm" and is_cw_logging_enabled:
assert_that(output_yaml).contains("Daemons Heartbeats")
assert_that(output_yaml).contains("ClustermgtdHeartbeat")
else:
Expand Down
7 changes: 0 additions & 7 deletions tests/iam_policies/cluster-roles.cfn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,6 @@ Resources:
- route53:ChangeResourceRecordSets
Resource: '*'
Effect: Allow
- Action:
- cloudwatch:PutMetricData
Resource: '*'
Effect: Allow
Condition:
StringEquals:
cloudwatch:namespace: 'ParallelCluster'
Version: '2012-10-17'
Type: AWS::IAM::Role

Expand Down
6 changes: 0 additions & 6 deletions tests/integration-tests/network_template_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,12 +241,6 @@ def __build_vpc_endpoints(self, subnet_id, route_table_ids):
type=VPCEndpointConfig.EndpointType.INTERFACE,
enable_private_dns=True,
),
VPCEndpointConfig(
name="MonitoringEndpoint",
service_name=f"com.amazonaws.{region}.monitoring",
type=VPCEndpointConfig.EndpointType.INTERFACE,
enable_private_dns=True,
),
VPCEndpointConfig(
name="CFNEndpoint",
service_name=prefix + f"com.amazonaws.{region}.cloudformation",
Expand Down
Loading