Skip to content

Commit 14c5fba

Browse files
committed
[Observability] Use metric filter to generate the clustermgtd heartbeat metric.
1 parent 0642b16 commit 14c5fba

4 files changed

Lines changed: 68 additions & 6 deletions

File tree

cli/src/pcluster/constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,12 @@
188188
CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT = 1
189189
DETAILED_MONITORING_ENABLED_DEFAULT = False
190190

191+
# CloudWatch Metrics
192+
CW_METRICS_NAMESPACE = "ParallelCluster"
193+
CW_METRICS_DIMENSION_CLUSTER_NAME = "ClusterName"
194+
CW_METRICS_DIMENSION_INSTANCE_ID = "InstanceId"
195+
CW_METRICS_CLUSTERMGTD_HEARTBEAT = "ClustermgtdHeartbeat"
196+
191197
STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events"
192198

193199
PCLUSTER_IMAGE_NAME_REGEX = r"^[-_A-Za-z0-9{][-_A-Za-z0-9\s:{}\.]+[-_A-Za-z0-9}]$"

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
CW_LOGS_CFN_PARAM_NAME,
6767
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
6868
CW_METRICS_DIMENSION_CLUSTER_NAME,
69+
CW_METRICS_DIMENSION_INSTANCE_ID,
6970
CW_METRICS_NAMESPACE,
7071
DEFAULT_EPHEMERAL_DIR,
7172
EFS_PORT,
@@ -351,9 +352,18 @@ def _add_internal_efs_shared_storage(self):
351352
self._add_shared_storage(internal_efs_storage_shared)
352353

353354
def _cw_metric_head_node(
354-
self, namespace, metric_name, statistic="Maximum", period_seconds=CW_ALARM_PERIOD_DEFAULT, extra_dimensions=None
355+
self,
356+
namespace,
357+
metric_name,
358+
statistic="Maximum",
359+
period_seconds=CW_ALARM_PERIOD_DEFAULT,
360+
extra_dimensions=None,
361+
override_dimensions=None,
355362
):
356-
dimensions = {"InstanceId": self.head_node_instance.ref}
363+
if override_dimensions:
364+
dimensions = override_dimensions
365+
else:
366+
dimensions = {CW_METRICS_DIMENSION_INSTANCE_ID: self.head_node_instance.ref}
357367
if extra_dimensions:
358368
dimensions.update(extra_dimensions)
359369
return cloudwatch.Metric(
@@ -384,12 +394,35 @@ def _add_head_node_alarms(self):
384394
},
385395
}
386396

387-
if self._condition_is_slurm():
388-
metrics_for_alarms["ClustermgtdHeartbeat"] = {
397+
# These alarms require CloudWatch logging to be enabled because they are based on CloudWatch Logs metric filters.
398+
if self._condition_is_slurm() and self.config.is_cw_logging_enabled:
399+
# Create metric filter to extract heartbeat metric from clustermgtd event logs
400+
clustermgtd_heartbeat_metric_filter = logs.CfnMetricFilter(
401+
scope=self.stack,
402+
id=f"{CW_METRICS_CLUSTERMGTD_HEARTBEAT}Filter",
403+
filter_pattern='{ $.event-type = "clustermgtd-heartbeat" }',
404+
log_group_name=self.log_group_name,
405+
metric_transformations=[
406+
logs.CfnMetricFilter.MetricTransformationProperty(
407+
metric_namespace=CW_METRICS_NAMESPACE,
408+
metric_name=CW_METRICS_CLUSTERMGTD_HEARTBEAT,
409+
metric_value="1",
410+
unit="Count",
411+
dimensions=[
412+
logs.CfnMetricFilter.DimensionProperty(
413+
key=CW_METRICS_DIMENSION_CLUSTER_NAME, value="$.cluster-name"
414+
)
415+
],
416+
)
417+
],
418+
)
419+
clustermgtd_heartbeat_metric_filter.add_depends_on(self.log_group)
420+
421+
metrics_for_alarms[CW_METRICS_CLUSTERMGTD_HEARTBEAT] = {
389422
"metric": self._cw_metric_head_node(
390423
CW_METRICS_NAMESPACE,
391424
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
392-
extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
425+
override_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
393426
),
394427
"evaluation_periods": 10,
395428
"datapoints_to_alarm": 10,
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Image:
2+
Os: alinux2
3+
HeadNode:
4+
InstanceType: t3.micro
5+
Networking:
6+
SubnetId: subnet-12345678
7+
Ssh:
8+
KeyName: ec2-key-name
9+
Scheduling:
10+
Scheduler: slurm
11+
SlurmQueues:
12+
- Name: queue1
13+
Networking:
14+
SubnetIds:
15+
- subnet-12345678
16+
ComputeResources:
17+
- Name: compute-resource1
18+
InstanceType: c5.2xlarge
19+
Monitoring:
20+
Logs:
21+
CloudWatch:
22+
Enabled: false

cli/tests/pcluster/templates/test_cluster_stack.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ def test_add_efs_shared_storage(mocker, test_datadir, config_file_name, expected
250250
[
251251
"slurm.required.yaml",
252252
"slurm.full.yaml",
253+
"slurm.logging_disabled.yaml",
253254
"awsbatch.simple.yaml",
254255
"awsbatch.full.yaml",
255256
],
@@ -315,7 +316,7 @@ def test_add_alarms(mocker, config_file_name):
315316
},
316317
}
317318

318-
if cluster.scheduling.scheduler == "slurm":
319+
if cluster.scheduling.scheduler == "slurm" and cluster.is_cw_logging_enabled:
319320
expected_alarms["Clustermgtd-Heartbeat"] = {
320321
"name": "clustername-HeadNode-ClustermgtdHeartbeat",
321322
"metric_name": "ClustermgtdHeartbeat",

0 commit comments

Comments
 (0)