|
66 | 66 | CW_LOGS_CFN_PARAM_NAME, |
67 | 67 | CW_METRICS_CLUSTERMGTD_HEARTBEAT, |
68 | 68 | CW_METRICS_DIMENSION_CLUSTER_NAME, |
| 69 | + CW_METRICS_DIMENSION_INSTANCE_ID, |
69 | 70 | CW_METRICS_NAMESPACE, |
70 | 71 | DEFAULT_EPHEMERAL_DIR, |
71 | 72 | EFS_PORT, |
@@ -351,9 +352,18 @@ def _add_internal_efs_shared_storage(self): |
351 | 352 | self._add_shared_storage(internal_efs_storage_shared) |
352 | 353 |
|
353 | 354 | def _cw_metric_head_node( |
354 | | - self, namespace, metric_name, statistic="Maximum", period_seconds=CW_ALARM_PERIOD_DEFAULT, extra_dimensions=None |
| 355 | + self, |
| 356 | + namespace, |
| 357 | + metric_name, |
| 358 | + statistic="Maximum", |
| 359 | + period_seconds=CW_ALARM_PERIOD_DEFAULT, |
| 360 | + extra_dimensions=None, |
| 361 | + override_dimensions=None, |
355 | 362 | ): |
356 | | - dimensions = {"InstanceId": self.head_node_instance.ref} |
| 363 | + if override_dimensions: |
| 364 | + dimensions = override_dimensions |
| 365 | + else: |
| 366 | + dimensions = {CW_METRICS_DIMENSION_INSTANCE_ID: self.head_node_instance.ref} |
357 | 367 | if extra_dimensions: |
358 | 368 | dimensions.update(extra_dimensions) |
359 | 369 | return cloudwatch.Metric( |
@@ -384,12 +394,35 @@ def _add_head_node_alarms(self): |
384 | 394 | }, |
385 | 395 | } |
386 | 396 |
|
387 | | - if self._condition_is_slurm(): |
388 | | - metrics_for_alarms["ClustermgtdHeartbeat"] = { |
| 397 | + # These alarms require CloudWatch logging to be enabled because they are based on CloudWatch metric filters. |
| 398 | + if self._condition_is_slurm() and self.config.is_cw_logging_enabled: |
| 399 | + # Create metric filter to extract heartbeat metric from clustermgtd event logs |
| 400 | + clustermgtd_heartbeat_metric_filter = logs.CfnMetricFilter( |
| 401 | + scope=self.stack, |
| 402 | + id=f"{CW_METRICS_CLUSTERMGTD_HEARTBEAT}Filter", |
| 403 | + filter_pattern='{ $.event-type = "clustermgtd-heartbeat" }', |
| 404 | + log_group_name=self.log_group_name, |
| 405 | + metric_transformations=[ |
| 406 | + logs.CfnMetricFilter.MetricTransformationProperty( |
| 407 | + metric_namespace=CW_METRICS_NAMESPACE, |
| 408 | + metric_name=CW_METRICS_CLUSTERMGTD_HEARTBEAT, |
| 409 | + metric_value="1", |
| 410 | + unit="Count", |
| 411 | + dimensions=[ |
| 412 | + logs.CfnMetricFilter.DimensionProperty( |
| 413 | + key=CW_METRICS_DIMENSION_CLUSTER_NAME, value="$.cluster-name" |
| 414 | + ) |
| 415 | + ], |
| 416 | + ) |
| 417 | + ], |
| 418 | + ) |
| 419 | + clustermgtd_heartbeat_metric_filter.add_depends_on(self.log_group) |
| 420 | + |
| 421 | + metrics_for_alarms[CW_METRICS_CLUSTERMGTD_HEARTBEAT] = { |
389 | 422 | "metric": self._cw_metric_head_node( |
390 | 423 | CW_METRICS_NAMESPACE, |
391 | 424 | CW_METRICS_CLUSTERMGTD_HEARTBEAT, |
392 | | - extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name}, |
| 425 | + override_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name}, |
393 | 426 | ), |
394 | 427 | "evaluation_periods": 10, |
395 | 428 | "datapoints_to_alarm": 10, |
|
0 commit comments