From d389452bbf5eccf6bd39fe5fb0577755f835f00b Mon Sep 17 00:00:00 2001
From: Mark Pullan
Date: Mon, 24 Mar 2025 10:22:03 +0000
Subject: [PATCH] Add watchdog for stuck items

---
Reviewer notes (this region is free text between the "---" separator and the
first diff; it is not part of the commit message or of any patched file):

  * New Athena named query "stuck_request_items", loaded from
    scripts/sql/watchdog/stuck_request_items.sql. For every
    clientid/campaignid pair (a NULL campaignid is reported as 'N/A') it sums
    the rows of request_item_status created within the last 90 days that are
      - status 'ENRICHED' with createdtime earlier than CURRENT_DATE - 6 days, or
      - status 'PENDING_ENRICHMENT' with createdtime earlier than CURRENT_DATE - 2 days.
    Pairs with nothing stuck still yield a row with a sum of 0 (ELSE 0 branch).
  * The cut-offs are built from CURRENT_DATE, so they move in whole days.
    NOTE(review): assumes createdtime is directly comparable with a DATE
    value - confirm against the request_item_status table definition.
  * The watchdog state machine gains the pair
    metric_name "StuckRequestItemsCount" / query_id of the new named query.
  * New alarm "${local.csi}-stuck-request-items": one evaluation period,
    GreaterThanOrEqualToThreshold with threshold 1, over
    MAX(StuckRequestItemsCount) in "Notify/Watchdog" filtered by environment,
    period 3600.
  * alarm_description of the three existing overdue alarms is reworded only;
    README gains the matching table row.
  * NOTE(review): column alignment inside the hunks below follows
    terraform fmt layout; verify the context lines against the target tree
    before applying.

 README.md                                          |  1 +
 .../athena_named_query_stuck_request_items.tf      | 11 +++++++++++
 ...atch_metric_alarm_overdue_request_item_plans.tf |  2 +-
 ...loudwatch_metric_alarm_overdue_request_items.tf |  2 +-
 .../cloudwatch_metric_alarm_overdue_requests.tf    |  2 +-
 .../cloudwatch_metric_alarm_stuck_request_items.tf | 14 ++++++++++++++
 .../scripts/sql/watchdog/stuck_request_items.sql   | 13 +++++++++++++
 .../reporting/sfn_state_machine_watchdog.tf        |  4 ++++
 8 files changed, 46 insertions(+), 3 deletions(-)
 create mode 100644 infrastructure/terraform/components/reporting/athena_named_query_stuck_request_items.tf
 create mode 100644 infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_stuck_request_items.tf
 create mode 100644 infrastructure/terraform/components/reporting/scripts/sql/watchdog/stuck_request_items.sql

diff --git a/README.md b/README.md
index 547ae853..cf174509 100644
--- a/README.md
+++ b/README.md
@@ -259,6 +259,7 @@ Watchdog queries and corresponding metrics and alarms are currently setup for th
 | Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans |
 | Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items |
 | Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests |
+| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items |
 
 ## Contacts
 
diff --git a/infrastructure/terraform/components/reporting/athena_named_query_stuck_request_items.tf b/infrastructure/terraform/components/reporting/athena_named_query_stuck_request_items.tf
new file mode 100644
index 00000000..d16c54ac
--- /dev/null
+++ b/infrastructure/terraform/components/reporting/athena_named_query_stuck_request_items.tf
@@ -0,0 +1,11 @@
+resource "aws_athena_named_query" "stuck_request_items" {
+  name        = "stuck_request_items"
+  description = "Query to determine any request items unexpectedly stuck in an ENRICHED or PENDING_ENRICHMENT state"
+  workgroup   = aws_athena_workgroup.user.id
+  database    = aws_glue_catalog_database.reporting.name
+  query       = file("${path.module}/scripts/sql/watchdog/stuck_request_items.sql")
+
+  depends_on = [
+    null_resource.request_item_status_table
+  ]
+}
diff --git a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_request_item_plans.tf b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_request_item_plans.tf
index 2b6ecd3c..c921a2b0 100644
--- a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_request_item_plans.tf
+++ b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_request_item_plans.tf
@@ -3,7 +3,7 @@ resource "aws_cloudwatch_metric_alarm" "overdue_request_item_plans" {
   comparison_operator = "GreaterThanOrEqualToThreshold"
   evaluation_periods  = 1
   threshold           = 1
-  alarm_description   = "This metric monitors unexpected/overdue request item plans"
+  alarm_description   = "Request item plans that did not reach a terminal state within an expected time window"
 
   metric_query {
     id          = "max_overdue_request_item_plans_count"
diff --git a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_request_items.tf b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_request_items.tf
index e7d5da3c..fc3c51a0 100644
--- a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_request_items.tf
+++ b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_request_items.tf
@@ -3,7 +3,7 @@ resource "aws_cloudwatch_metric_alarm" "overdue_request_items" {
   comparison_operator = "GreaterThanOrEqualToThreshold"
   evaluation_periods  = 1
   threshold           = 1
-  alarm_description   = "This metric monitors unexpected/overdue request items"
+  alarm_description   = "Request items that did not reach a terminal state within an expected time window"
 
   metric_query {
     id          = "max_overdue_request_items_count"
diff --git a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_requests.tf b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_requests.tf
index 0a7796d2..2c7ba57b 100644
--- a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_requests.tf
+++ b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_overdue_requests.tf
@@ -3,7 +3,7 @@ resource "aws_cloudwatch_metric_alarm" "overdue_requests" {
   comparison_operator = "GreaterThanOrEqualToThreshold"
   evaluation_periods  = 1
   threshold           = 1
-  alarm_description   = "This metric monitors unexpected/overdue requests"
+  alarm_description   = "Requests that did not reach a terminal state within an expected time window"
 
   metric_query {
     id          = "max_overdue_requests_count"
diff --git a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_stuck_request_items.tf b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_stuck_request_items.tf
new file mode 100644
index 00000000..78622a6e
--- /dev/null
+++ b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_stuck_request_items.tf
@@ -0,0 +1,14 @@
+resource "aws_cloudwatch_metric_alarm" "stuck_request_items" {
+  alarm_name          = "${local.csi}-stuck-request-items"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = 1
+  threshold           = 1
+  alarm_description   = "Request items stuck in an ENRICHED or PENDING_ENRICHMENT state for longer than an expected time window"
+
+  metric_query {
+    id          = "max_stuck_request_items_count"
+    expression  = "SELECT MAX(StuckRequestItemsCount) FROM \"Notify/Watchdog\" WHERE environment='${var.environment}'"
+    return_data = "true"
+    period      = 3600
+  }
+}
diff --git a/infrastructure/terraform/components/reporting/scripts/sql/watchdog/stuck_request_items.sql b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/stuck_request_items.sql
new file mode 100644
index 00000000..3f3f1db2
--- /dev/null
+++ b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/stuck_request_items.sql
@@ -0,0 +1,13 @@
+SELECT
+  clientid,
+  COALESCE(campaignid, 'N/A'),
+  SUM(
+    CASE
+      WHEN status='ENRICHED' AND createdtime < DATE_ADD('day', -6, CURRENT_DATE) THEN 1
+      WHEN status='PENDING_ENRICHMENT' AND createdtime < DATE_ADD('day', -2, CURRENT_DATE) THEN 1
+      ELSE 0
+    END
+  )
+FROM request_item_status
+WHERE createdtime >= DATE_ADD('day', -90, CURRENT_DATE)
+GROUP BY clientid, campaignid
diff --git a/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf b/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf
index 0a3e65c3..3441ccfe 100644
--- a/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf
+++ b/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf
@@ -15,6 +15,10 @@ resource "aws_sfn_state_machine" "watchdog" {
         {
           metric_name = "OverdueRequestsCount",
           query_id    = aws_athena_named_query.overdue_requests.id
+        },
+        {
+          metric_name = "StuckRequestItemsCount",
+          query_id    = aws_athena_named_query.stuck_request_items.id
         }
       ]
       environment = var.environment