Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ CHANGELOG
- Add validator that warns against the downsides of disabling in-place updates on compute and login nodes through DevSettings.
- Upgrade jmespath to ~=1.0 (from ~=0.10).
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
- Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics.
- Add alarm on missing clustermgtd heartbeat.

**BUG FIXES**
- Add validation to block updates that change tag order. Blocking such changes prevents update failures.
Expand Down
5 changes: 5 additions & 0 deletions cli/src/pcluster/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@
CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT = 1
DETAILED_MONITORING_ENABLED_DEFAULT = False

# CloudWatch Metrics
# Namespace under which ParallelCluster custom metrics are published.
CW_METRICS_NAMESPACE = "ParallelCluster"
# Dimension name used to scope custom metrics to a single cluster.
CW_METRICS_DIMENSION_CLUSTER_NAME = "ClusterName"
# Metric name for the clustermgtd heartbeat signal.
CW_METRICS_CLUSTERMGTD_HEARTBEAT = "ClustermgtdHeartbeat"

# Log stream name format; "{}" is presumably filled with the stack name — the format call site is not visible here.
STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events"

PCLUSTER_IMAGE_NAME_REGEX = r"^[-_A-Za-z0-9{][-_A-Za-z0-9\s:{}\.]+[-_A-Za-z0-9}]$"
Expand Down
15 changes: 15 additions & 0 deletions cli/src/pcluster/templates/cdk_builder_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@
from pcluster.constants import (
COOKBOOK_PACKAGES_VERSIONS,
CW_LOGS_RETENTION_DAYS_DEFAULT,
CW_METRICS_NAMESPACE,
IAM_ROLE_PATH,
LAMBDA_VPC_ACCESS_MANAGED_POLICY,
PCLUSTER_CLUSTER_NAME_TAG,
PCLUSTER_DYNAMODB_PREFIX,
PCLUSTER_NODE_TYPE_TAG,
SLURM,
ULTRASERVER_INSTANCE_PREFIX_LIST,
)
from pcluster.launch_template_utils import _LaunchTemplateBuilder
Expand Down Expand Up @@ -742,6 +744,19 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
),
]

if self._config.scheduling.scheduler == SLURM:
policy.extend(
[
iam.PolicyStatement(
sid="CloudWatch",
actions=["cloudwatch:PutMetricData"],
effect=iam.Effect.ALLOW,
resources=["*"],
conditions={"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}},
)
]
)

if (
self._config.scheduling.scheduler == "slurm"
and self._config.scheduling.settings
Expand Down
77 changes: 53 additions & 24 deletions cli/src/pcluster/templates/cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@
CW_ALARM_PERIOD_DEFAULT,
CW_LOG_GROUP_NAME_PREFIX,
CW_LOGS_CFN_PARAM_NAME,
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
CW_METRICS_DIMENSION_CLUSTER_NAME,
CW_METRICS_NAMESPACE,
DEFAULT_EPHEMERAL_DIR,
EFS_PORT,
FSX_PORTS,
Expand Down Expand Up @@ -364,38 +367,64 @@ def _cw_metric_head_node(
def _add_head_node_alarms(self):
self.head_node_alarms = []

# Metric-specific configurations (only specify overrides from defaults)
metrics_for_alarms = {
"Health": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"),
"Cpu": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"),
"Mem": self._cw_metric_head_node("CWAgent", "mem_used_percent"),
"Disk": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}),
"Health": {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE FOR THE REVIEWER
The refactoring of how alarms are defined here was required because the clustermgtd heartbeat introduces an alarm that has different needs than the other alarms (threshold, missing data, observation period); so the logic must be changed to accommodate more flexibility.

"metric": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"),
"threshold": 0,
},
"Cpu": {
"metric": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"),
},
"Mem": {
"metric": self._cw_metric_head_node("CWAgent", "mem_used_percent"),
},
"Disk": {
"metric": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}),
},
}

for metric_key, metric in metrics_for_alarms.items():
if self._condition_is_slurm():
metrics_for_alarms["ClustermgtdHeartbeat"] = {
"metric": self._cw_metric_head_node(
CW_METRICS_NAMESPACE,
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
),
"evaluation_periods": 10,
"datapoints_to_alarm": 10,
"comparison_operator": cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
"threshold": 1,
"treat_missing_data": cloudwatch.TreatMissingData.BREACHING,
}

for metric_key, alarm_config in metrics_for_alarms.items():
alarm_id = f"HeadNode{metric_key}Alarm"
alarm_name = f"{self.stack.stack_name}-HeadNode-{metric_key}"
threshold = 0 if metric_key == "Health" else CW_ALARM_PERCENT_THRESHOLD_DEFAULT
self.head_node_alarms.append(
cloudwatch.Alarm(
scope=self.stack,
id=alarm_id,
alarm_name=alarm_name,
metric=metric,
evaluation_periods=CW_ALARM_EVALUATION_PERIODS_DEFAULT,
threshold=threshold,
comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
datapoints_to_alarm=CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT,
)
)

self.head_node_alarms.append(
cloudwatch.CompositeAlarm(
alarm = cloudwatch.Alarm(
scope=self.stack,
id="HeadNodeAlarm",
composite_alarm_name=f"{self.stack.stack_name}-HeadNode",
alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms),
id=alarm_id,
alarm_name=alarm_name,
metric=alarm_config["metric"],
evaluation_periods=alarm_config.get("evaluation_periods", CW_ALARM_EVALUATION_PERIODS_DEFAULT),
threshold=alarm_config.get("threshold", CW_ALARM_PERCENT_THRESHOLD_DEFAULT),
comparison_operator=alarm_config.get(
"comparison_operator", cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD
),
datapoints_to_alarm=alarm_config.get("datapoints_to_alarm", CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT),
treat_missing_data=alarm_config.get("treat_missing_data", cloudwatch.TreatMissingData.MISSING),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTES FOR THE REVIEWER
Before this change we used to leave the treat_missing_data unspecified.
The default value when unspecified is MISSING. I wanted to set it here explicitly to make the expected behavior clearer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup I checked already the default

)
alarm.node.add_dependency(self.wait_condition)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTES FOR THE REVIEWER
We introduced here a dependency of head node alarms on the head node wait condition. This is needed because the clustermgtd alarm is meant to go red on missing heartbeats. However, during cluster creation it is expected not to have such a heartbeat, so it makes no sense to start alarming before the head node configuration has completed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand the Alarm dependency for the new ClustermgtdHeartbeat Alarm but why add dependency for all the alarms?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good point.

My reasoning is:

  1. value: there is no reason to alarm on the head node until the head node completes its setup.
  2. simplicity: it is easier to define a dependency that applies to all the head node alarms than having different dependency rules.

So, unless there is real value in creating head node alarms before the head node completes its setup, there is no reason to have different behaviors.

Do you agree with that?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes agreed!

self.head_node_alarms.append(alarm)

composite_alarm = cloudwatch.CompositeAlarm(
scope=self.stack,
id="HeadNodeAlarm",
composite_alarm_name=f"{self.stack.stack_name}-HeadNode",
alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms),
)
composite_alarm.node.add_dependency(self.wait_condition)
self.head_node_alarms.append(composite_alarm)

def _add_iam_resources(self):
head_node_iam_resources = HeadNodeIamResources(
Expand Down
21 changes: 19 additions & 2 deletions cli/src/pcluster/templates/cw_dashboard_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@

from pcluster.config.cluster_config import BaseClusterConfig, ExistingFileCache, SharedFsxLustre
from pcluster.config.common import SharedStorageType
from pcluster.constants import Feature
from pcluster.constants import (
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
CW_METRICS_DIMENSION_CLUSTER_NAME,
CW_METRICS_NAMESPACE,
Feature,
)
from pcluster.utils import is_feature_supported

MAX_WIDTH = 24
Expand Down Expand Up @@ -567,9 +572,21 @@ def _add_head_node_instance_metrics_graphs(self):
new_pcluster_metric(title="Memory Used Percent", metrics=["mem_used_percent"], namespace="CWAgent"),
]

# Custom Metrics
pcluster_metrics = []
if self.config.scheduling.scheduler == "slurm":
pcluster_metrics.append(
new_pcluster_metric(
title="Daemons Heartbeats",
metrics=[CW_METRICS_CLUSTERMGTD_HEARTBEAT],
namespace=CW_METRICS_NAMESPACE,
additional_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
)
)

# Create graphs for EC2 metrics and CW Agent metrics and update coordinates
widgets_list = []
for metrics_param in ec2_metrics + cwagent_metrics:
for metrics_param in ec2_metrics + cwagent_metrics + pcluster_metrics:
metrics_list = self._generate_metrics_list(metrics_param)
graph_widget = self._generate_graph_widget(metrics_param.title, metrics_list)
widgets_list.append(graph_widget)
Expand Down
47 changes: 46 additions & 1 deletion cli/tests/pcluster/templates/test_cdk_builder_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
SlurmComputeResource,
SlurmQueue,
)
from pcluster.constants import PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG
from pcluster.constants import CW_METRICS_NAMESPACE, PCLUSTER_CLUSTER_NAME_TAG, PCLUSTER_NODE_TYPE_TAG
from pcluster.schemas.cluster_schema import ClusterSchema
from pcluster.templates.cdk_builder import CDKTemplateBuilder
from pcluster.templates.cdk_builder_utils import (
Expand Down Expand Up @@ -593,3 +593,48 @@ def _check_cleanup_role(
assert_that("/parallelcluster/" in generated_template[cleanup_resource_old]["Properties"]["Path"]).is_true()

assert_that(generated_template[cleanup_resource_old]["Properties"]).does_not_contain_key("RoleName")


@pytest.mark.parametrize(
    "config_file_name, expect_cloudwatch_permission",
    [
        ("config-slurm.yaml", True),
        ("config-awsbatch.yaml", False),
    ],
)
def test_head_node_policy(mocker, test_datadir, config_file_name, expect_cloudwatch_permission):
    """Verify that cloudwatch:PutMetricData is added for slurm scheduler but not for awsbatch."""
    # Stub out every AWS interaction so the template can be generated offline.
    mock_aws_api(mocker)
    mocker.patch(
        "pcluster.config.cluster_config.HeadNodeNetworking.availability_zone",
        new_callable=PropertyMock(return_value="us-east-1a"),
    )
    mock_bucket(mocker)
    mock_bucket_object_utils(mocker)

    # Build the cluster template from the fixture configuration.
    config_dict = load_yaml_dict(test_datadir / config_file_name)
    cluster_config = ClusterSchema(cluster_name="clustername").load(config_dict)
    generated_template, _ = CDKTemplateBuilder().build_cluster_template(
        cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
    )

    # Locate the statement with Sid "CloudWatch" among the head node policy statements.
    statements = generated_template["Resources"]["ParallelClusterPoliciesHeadNode"]["Properties"]["PolicyDocument"][
        "Statement"
    ]
    matching = [statement for statement in statements if statement.get("Sid") == "CloudWatch"]
    cloudwatch_statement = matching[0] if matching else None

    if not expect_cloudwatch_permission:
        # Non-slurm schedulers must not get the PutMetricData grant.
        assert_that(cloudwatch_statement).is_none()
    else:
        assert_that(cloudwatch_statement).is_not_none()
        assert_that(cloudwatch_statement["Action"]).is_equal_to("cloudwatch:PutMetricData")
        assert_that(cloudwatch_statement["Effect"]).is_equal_to("Allow")
        assert_that(cloudwatch_statement["Resource"]).is_equal_to("*")
        assert_that(cloudwatch_statement["Condition"]).is_equal_to(
            {"StringEquals": {"cloudwatch:Namespace": CW_METRICS_NAMESPACE}}
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Minimal cluster configuration using the awsbatch scheduler.
# Fixture for test_head_node_policy: verifies that the cloudwatch:PutMetricData
# permission is NOT added to the head node policy for non-slurm schedulers.
Image:
  Os: alinux2
HeadNode:
  InstanceType: t3.micro
  Ssh:
    KeyName: String
  Networking:
    SubnetId: subnet-12345678
Scheduling:
  Scheduler: awsbatch
  AwsBatchQueues:
    - Name: queue1
      Networking:
        SubnetIds:
          - subnet-12345678
      ComputeResources:
        - Name: compute_resource1
          InstanceTypes:
            - c4.xlarge
          MaxvCpus: 10
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Minimal cluster configuration using the slurm scheduler.
# Fixture for test_head_node_policy: verifies that the cloudwatch:PutMetricData
# permission IS added to the head node policy for slurm clusters.
Image:
  Os: alinux2
HeadNode:
  InstanceType: String
  Ssh:
    KeyName: String
  Networking:
    SubnetId: subnet-12345678
  Iam:
    S3Access:
      - BucketName: testbucketpball
        EnableWriteAccess: True
Scheduling:
  Scheduler: slurm
  SlurmQueues:
    - Name: queue1
      ComputeResources:
        - Name: compute_resource1
          InstanceType: t3.micro
          MinCount: 1
          MaxCount: 5
      Networking:
        SubnetIds:
          - subnet-12345678
      Iam:
        S3Access:
          - BucketName: testbucketpball
            EnableWriteAccess: True
Loading
Loading