diff --git a/infrastructure/terraform/modules/eventpub/README.md b/infrastructure/terraform/modules/eventpub/README.md
index 4be7358..66d08ca 100644
--- a/infrastructure/terraform/modules/eventpub/README.md
+++ b/infrastructure/terraform/modules/eventpub/README.md
@@ -18,11 +18,15 @@
 | [data\_plane\_bus\_arn](#input\_data\_plane\_bus\_arn) | Data plane event bus arn | `string` | n/a | yes |
 | [default\_tags](#input\_default\_tags) | Default tag map for application to all taggable resources in the module | `map(string)` | `{}` | no |
 | [enable\_event\_cache](#input\_enable\_event\_cache) | Enable caching of events to an S3 bucket | `bool` | `false` | no |
+| [enable\_event\_publishing\_anomaly\_detection](#input\_enable\_event\_publishing\_anomaly\_detection) | Enable CloudWatch anomaly detection alarm for SNS message publishing. Detects abnormal drops or spikes in event publishing volume. | `bool` | `true` | no |
 | [enable\_firehose\_raw\_message\_delivery](#input\_enable\_firehose\_raw\_message\_delivery) | Enables raw message delivery on firehose subscription | `bool` | `false` | no |
 | [enable\_sns\_delivery\_logging](#input\_enable\_sns\_delivery\_logging) | Enable SNS Delivery Failure Notifications | `bool` | `false` | no |
 | [environment](#input\_environment) | The name of the terraformscaffold environment the module is called for | `string` | n/a | yes |
 | [event\_cache\_buffer\_interval](#input\_event\_cache\_buffer\_interval) | The buffer interval for data firehose | `number` | `500` | no |
 | [event\_cache\_expiry\_days](#input\_event\_cache\_expiry\_days) | s3 archiving expiry in days | `number` | `30` | no |
+| [event\_publishing\_anomaly\_band\_width](#input\_event\_publishing\_anomaly\_band\_width) | The width of the anomaly detection band, in standard deviations. Higher values (e.g. 4-6) reduce sensitivity and noise, lower values (e.g. 2-3) increase sensitivity. The default of 5 trades sensitivity for less noise. | `number` | `5` | no |
+| [event\_publishing\_anomaly\_evaluation\_periods](#input\_event\_publishing\_anomaly\_evaluation\_periods) | Number of evaluation periods for the publishing anomaly alarm. Each period is defined by event\_publishing\_anomaly\_period. | `number` | `3` | no |
+| [event\_publishing\_anomaly\_period](#input\_event\_publishing\_anomaly\_period) | The period in seconds over which the specified statistic is applied for anomaly detection. Minimum 300 seconds (5 minutes). Recommended: 300-600. | `number` | `300` | no |
 | [force\_destroy](#input\_force\_destroy) | When enabled will force destroy event-cache S3 bucket | `bool` | `false` | no |
 | [group](#input\_group) | The name of the tfscaffold group | `string` | `null` | no |
 | [iam\_permissions\_boundary\_arn](#input\_iam\_permissions\_boundary\_arn) | The ARN of the permissions boundary to use for the IAM role | `string` | `null` | no |
@@ -42,6 +46,7 @@
 
 | Name | Description |
 |------|-------------|
+| [publishing\_anomaly\_alarm](#output\_publishing\_anomaly\_alarm) | CloudWatch anomaly detection alarm details for SNS publishing |
 | [s3\_bucket\_event\_cache](#output\_s3\_bucket\_event\_cache) | S3 Bucket ARN and Name for event cache |
 | [sns\_topic](#output\_sns\_topic) | SNS Topic ARN and Name |
 
diff --git a/infrastructure/terraform/modules/eventpub/cloudwatch_metric_alarm_publishing_anomaly.tf b/infrastructure/terraform/modules/eventpub/cloudwatch_metric_alarm_publishing_anomaly.tf
new file mode 100644
index 0000000..18d81ea
--- /dev/null
+++ b/infrastructure/terraform/modules/eventpub/cloudwatch_metric_alarm_publishing_anomaly.tf
@@ -0,0 +1,54 @@
+# CloudWatch anomaly detection alarm on the number of messages published to the
+# module's SNS topic. Only created when
+# var.enable_event_publishing_anomaly_detection is true.
+resource "aws_cloudwatch_metric_alarm" "publishing_anomaly" {
+  count = var.enable_event_publishing_anomaly_detection ? 1 : 0
+
+  alarm_name          = "${local.csi}-sns-publishing-anomaly"
+  alarm_description   = "RELIABILITY: Anomaly detection alarm for abnormal SNS message publishing patterns. Detects unexpected drops or spikes in event publishing volume that may indicate service degradation or misconfiguration."
+  comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
+  evaluation_periods  = var.event_publishing_anomaly_evaluation_periods
+  threshold_metric_id = "ad1"
+
+  # NOTE(review): missing datapoints are treated as healthy. If SNS reports no
+  # NumberOfMessagesPublished datapoints while nothing is being published, a
+  # complete stop in publishing will not trip this alarm - confirm this is intended.
+  treat_missing_data = "notBreaching"
+
+  # NOTE(review): no alarm_actions / ok_actions are configured on this resource -
+  # confirm that alarm state changes are consumed elsewhere.
+  actions_enabled = true
+
+  tags = merge(
+    local.default_tags,
+    {
+      AlarmType    = "AnomalyDetection"
+      AlarmPurpose = "EventPublishingAbnormality"
+    }
+  )
+
+  # m1: messages published to this topic, summed over each period.
+  metric_query {
+    id          = "m1"
+    return_data = true
+
+    metric {
+      metric_name = "NumberOfMessagesPublished"
+      namespace   = "AWS/SNS"
+      period      = var.event_publishing_anomaly_period
+      stat        = "Sum"
+
+      dimensions = {
+        TopicName = aws_sns_topic.main.name
+      }
+    }
+  }
+
+  # ad1: expected band around m1; referenced by threshold_metric_id above.
+  metric_query {
+    id          = "ad1"
+    expression  = "ANOMALY_DETECTION_BAND(m1, ${var.event_publishing_anomaly_band_width})"
+    label       = "NumberOfMessagesPublished (expected)"
+    return_data = true
+  }
+}
diff --git a/infrastructure/terraform/modules/eventpub/outputs.tf b/infrastructure/terraform/modules/eventpub/outputs.tf
index e2ff3b3..cbba9df 100644
--- a/infrastructure/terraform/modules/eventpub/outputs.tf
+++ b/infrastructure/terraform/modules/eventpub/outputs.tf
@@ -13,3 +13,11 @@ output "s3_bucket_event_cache" {
     bucket = module.s3bucket_event_cache[0].bucket
   } : {}
 }
+
+output "publishing_anomaly_alarm" {
+  description = "CloudWatch anomaly detection alarm details for SNS publishing"
+  value = var.enable_event_publishing_anomaly_detection ? {
+    arn  = aws_cloudwatch_metric_alarm.publishing_anomaly[0].arn
+    name = aws_cloudwatch_metric_alarm.publishing_anomaly[0].alarm_name
+  } : null
+}
diff --git a/infrastructure/terraform/modules/eventpub/variables.tf b/infrastructure/terraform/modules/eventpub/variables.tf
index 41141f9..7bdaa30 100644
--- a/infrastructure/terraform/modules/eventpub/variables.tf
+++ b/infrastructure/terraform/modules/eventpub/variables.tf
@@ -129,3 +129,27 @@ variable "additional_policies_for_event_cache_bucket" {
   description = "A list of JSON policies to use to build the bucket policy"
   default     = []
 }
+
+variable "enable_event_publishing_anomaly_detection" {
+  type        = bool
+  description = "Enable CloudWatch anomaly detection alarm for SNS message publishing. Detects abnormal drops or spikes in event publishing volume."
+  default     = true
+}
+
+variable "event_publishing_anomaly_evaluation_periods" {
+  type        = number
+  description = "Number of evaluation periods for the publishing anomaly alarm. Each period is defined by event_publishing_anomaly_period."
+  default     = 3
+}
+
+variable "event_publishing_anomaly_period" {
+  type        = number
+  description = "The period in seconds over which the specified statistic is applied for anomaly detection. Minimum 300 seconds (5 minutes). Recommended: 300-600."
+  default     = 300
+}
+
+variable "event_publishing_anomaly_band_width" {
+  type        = number
+  description = "The width of the anomaly detection band, in standard deviations. Higher values (e.g. 4-6) reduce sensitivity and noise, lower values (e.g. 2-3) increase sensitivity. The default of 5 trades sensitivity for less noise."
+  default     = 5
+}