From 0b287924212f65aa2afcdd1aa0ceaffdb4b14066 Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Mon, 2 Jun 2025 17:41:46 +0100 Subject: [PATCH 01/10] First pass --- README.md | 13 +- .../athena_named_query_degraded_latency.tf | 12 ++ ...loudwatch_metric_alarm_degraded_latency.tf | 15 +++ .../scripts/sql/views/latency_percentiles.sql | 6 +- .../scripts/sql/watchdog/degraded_latency.sql | 124 ++++++++++++++++++ .../reporting/sfn_state_machine_watchdog.tf | 4 + 6 files changed, 164 insertions(+), 10 deletions(-) create mode 100644 infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf create mode 100644 infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_degraded_latency.tf create mode 100644 infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql diff --git a/README.md b/README.md index cf174509..497c3812 100644 --- a/README.md +++ b/README.md @@ -254,12 +254,13 @@ CloudWatch alarms are setup to trigger if any of those watchdog metrics exceed a Watchdog queries and corresponding metrics and alarms are currently setup for the following: -| Condition | Named Query | Metric | Threshold | Alarm | -|--------------------------------------------|----------------------------|------------------------------|----------------------------|----------------------------| -| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans | -| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items | -| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests | -| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items | +| Condition | Named Query | Metric | Threshold | Alarm | +|---------------------------------------------|----------------------------|------------------------------|----------------------------|----------------------------| +| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans | +| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items | +| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests | +| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items | +| Degraded latency compared to historic trend | degraded_latency | DegradedLatenciesCount | Sum across all clients > 0 | degraded-latency | ## Contacts diff --git a/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf b/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf new file mode 100644 index 00000000..f3e9b7db --- /dev/null +++ b/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf @@ -0,0 +1,12 @@ +resource "aws_athena_named_query" "degraded_latency" { + name = "degraded_latency" + description = "Query to identify if today's latencies are significantly worse than historic values" + workgroup = aws_athena_workgroup.user.id + database = aws_glue_catalog_database.reporting.name + query = file("${path.module}/scripts/sql/watchdog/degraded_latency.sql") + + depends_on = [ + null_resource.request_item_status_table, + null_resource.request_item_plan_status_table + ] +} diff --git a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_degraded_latency.tf b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_degraded_latency.tf new file mode 100644 index 00000000..c086c3f8 --- /dev/null +++ b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_degraded_latency.tf @@ -0,0 +1,15 @@ +resource "aws_cloudwatch_metric_alarm" "degraded_latency" { + alarm_name = "${local.csi}-degraded-latency" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 1 + threshold = 1 + alarm_description = "Today's latencies are significantly higher than historic trends" + treat_missing_data = "notBreaching" + + metric_query { + id = "max_degraded_latency_count" + expression = "SELECT MAX(DegradedLatenciesCount) FROM \"Notify/Watchdog\" WHERE environment='${var.environment}'" + return_data = "true" + period = 3600 + } +} diff --git a/infrastructure/terraform/components/reporting/scripts/sql/views/latency_percentiles.sql b/infrastructure/terraform/components/reporting/scripts/sql/views/latency_percentiles.sql index 7302025d..b7302208 100644 --- a/infrastructure/terraform/components/reporting/scripts/sql/views/latency_percentiles.sql +++ b/infrastructure/terraform/components/reporting/scripts/sql/views/latency_percentiles.sql @@ -48,9 +48,7 @@ WHERE starttime IS NOT NULL AND endtime IS NOT NULL AND starttime > DATE('2000-01-01') AND DAY_OF_WEEK(starttime) <= 5 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) < 18 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) < 18 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) >= 8 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) >= 8 +AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 +AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17 GROUP BY clientid, campaignid, communicationtype, percentile ORDER BY clientid, campaignid, communicationtype, percentile diff --git a/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql new file mode 100644 index 00000000..4d5b9515 --- /dev/null +++ b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql @@ -0,0 +1,124 @@ +SELECT + recent.clientid, + recent.campaignid, + SUM( + CASE + WHEN recent.p50latency > historic.p99latency THEN 1 + ELSE 0 + END + ) +FROM ( + SELECT + clientid, + COALESCE(campaignid, 'N/A') AS campaignid, + sendinggroupid, + communicationtype, + approx_percentile(to_unixtime(endtime)-to_unixtime(starttime), 0.99) AS p99latency + FROM ( + --Receipt to first channel send + SELECT rip.clientid, rip.campaignid, rip.communicationtype, rip.sendinggroupid, rq.rqcreatedtime AS starttime, rip.sendtime AS endtime FROM + ( + SELECT requestid, MIN(createdtime) AS rqcreatedtime FROM request_item_status + --Query optimisation to prevent full table scan on request_item_status createdtime + WHERE createdtime >= DATE_ADD('week', -1, DATE_ADD('month', -1, CURRENT_DATE)) + GROUP BY requestid + ) AS rq + INNER JOIN request_item_plan_status rip ON rq.requestid = rip.requestid + WHERE rip.createdtime >= DATE_ADD('month', -1, CURRENT_DATE) + AND rip.ordernumber = 1 + AND rip.channeltype = 'primary' + UNION ALL + --Failure to fallback channel send + SELECT clientid, campaignid, communicationtype, sendinggroupid, + GREATEST( + COALESCE(prevfailedtime1, DATE('2000-01-01')), + COALESCE(prevfailedtime2, DATE('2000-01-01')), + COALESCE(prevfailedtime3, DATE('2000-01-01')) + ) AS starttime, + sendtime AS endtime + FROM ( + SELECT + clientid, + campaignid, + ordernumber, + sendtime, + communicationtype, + sendinggroupid, + LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1, + LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2, + LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3 + FROM request_item_plan_status + WHERE createdtime >= DATE_ADD('month', -1, CURRENT_DATE) + AND channeltype = 'primary' + ) + WHERE ordernumber > 1 + ) + WHERE starttime IS NOT NULL + AND endtime IS NOT NULL + AND starttime > DATE('2000-01-01') + AND DAY_OF_WEEK(starttime) <= 5 + AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 + AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17 + GROUP BY clientid, campaignid, communicationtype, sendinggroupid +) historic +INNER JOIN ( + SELECT + clientid, + COALESCE(campaignid, 'N/A') AS campaignid, + sendinggroupid, + communicationtype, + approx_percentile(to_unixtime(endtime)-to_unixtime(starttime), 0.5) AS p50latency + FROM ( + --Receipt to first channel send + SELECT rip.clientid, rip.campaignid, rip.communicationtype, rip.sendinggroupid, rq.rqcreatedtime AS starttime, rip.sendtime AS endtime FROM + ( + SELECT requestid, MIN(createdtime) AS rqcreatedtime FROM request_item_status + --Query optimisation to prevent full table scan on request_item_status createdtime + WHERE createdtime >= DATE_ADD('week', -1, DATE_ADD('month', -1, CURRENT_DATE)) + GROUP BY requestid + ) AS rq + INNER JOIN request_item_plan_status rip ON rq.requestid = rip.requestid + WHERE rip.createdtime >= DATE_ADD('month', -1, CURRENT_DATE) + AND rip.ordernumber = 1 + AND rip.channeltype = 'primary' + UNION ALL + --Failure to fallback channel send + SELECT clientid, campaignid, communicationtype, sendinggroupid, + GREATEST( + COALESCE(prevfailedtime1, DATE('2000-01-01')), + COALESCE(prevfailedtime2, DATE('2000-01-01')), + COALESCE(prevfailedtime3, DATE('2000-01-01')) + ) AS starttime, + sendtime AS endtime + FROM ( + SELECT + clientid, + campaignid, + ordernumber, + sendtime, + communicationtype, + sendinggroupid, + LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1, + LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2, + LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3 + FROM request_item_plan_status + WHERE createdtime >= DATE_ADD('month', -1, CURRENT_DATE) + AND channeltype = 'primary' + ) + WHERE ordernumber > 1 + ) + WHERE starttime IS NOT NULL + AND endtime IS NOT NULL + AND starttime > DATE('2000-01-01') + AND DAY_OF_WEEK(starttime) <= 5 + AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 + AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17 + AND endtime >= CURRENT_DATE + GROUP BY clientid, campaignid, communicationtype, sendinggroupid +) recent +ON +historic.clientid = recent.clientid AND +historic.campaignid = recent.campaignid AND +historic.sendinggroupid = recent.sendinggroupid AND +historic.communicationtype = recent.communicationtype +GROUP BY 1, 2 diff --git a/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf b/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf index ef1fc2d5..0174a698 100644 --- a/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf +++ b/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf @@ -19,6 +19,10 @@ resource "aws_sfn_state_machine" "watchdog" { { metric_name = "StuckRequestItemsCount", query_id = aws_athena_named_query.stuck_request_items.id + }, + { + metric_name = "DegradedLatenciesCount", + query_id = aws_athena_named_query.degraded_latency.id } ] environment = var.environment From dc33a41269ad1d1ac0c663e63a8dd17b2c92383c Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Tue, 3 Jun 2025 12:15:52 +0100 Subject: [PATCH 02/10] Factor out common latency view --- .../null_resource_raw_latency_view.tf | 18 +++++++ .../scripts/sql/views/latency_percentiles.sql | 46 +---------------- .../scripts/sql/views/raw_latency.sql | 49 +++++++++++++++++++ 3 files changed, 69 insertions(+), 44 deletions(-) create mode 100644 infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf create mode 100644 infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency.sql diff --git a/infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf b/infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf new file mode 100644 index 00000000..a224fdb8 --- /dev/null +++ b/infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf @@ -0,0 +1,18 @@ +resource "null_resource" "raw_latency_view" { + triggers = { + sql = filesha256("${path.module}/scripts/sql/views/raw_latency.sql") + } + provisioner "local-exec" { + command = <= DATE_ADD('week', -1, DATE_ADD('month', -2, CURRENT_DATE)) - GROUP BY requestid - ) AS rq - INNER JOIN request_item_plan_status rip ON rq.requestid = rip.requestid - WHERE rip.createdtime >= DATE_ADD('month', -2, CURRENT_DATE) - AND rip.ordernumber = 1 - AND rip.channeltype = 'primary' - UNION ALL - --Failure to fallback channel send - SELECT clientid, campaignid, communicationtype, - GREATEST( - COALESCE(prevfailedtime1, DATE('2000-01-01')), - COALESCE(prevfailedtime2, DATE('2000-01-01')), - COALESCE(prevfailedtime3, DATE('2000-01-01')) - ) AS starttime, - sendtime AS endtime - FROM ( - SELECT - clientid, - campaignid, - ordernumber, - sendtime, - communicationtype, - LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1, - LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2, - LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3 - FROM request_item_plan_status - WHERE createdtime >= DATE_ADD('month', -2, CURRENT_DATE) - AND channeltype = 'primary' - ) - WHERE ordernumber > 1 -) +FROM raw_latency CROSS JOIN UNNEST (ARRAY[0.001, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 0.999]) AS t(percentile) -WHERE starttime IS NOT NULL -AND endtime IS NOT NULL -AND starttime > DATE('2000-01-01') -AND DAY_OF_WEEK(starttime) <= 5 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17 +WHERE endtime >= DATE_ADD('month', -2, CURRENT_DATE) GROUP BY clientid, campaignid, communicationtype, percentile ORDER BY clientid, campaignid, communicationtype, percentile diff --git a/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency.sql b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency.sql new file mode 100644 index 00000000..82ddabde --- /dev/null +++ b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency.sql @@ -0,0 +1,49 @@ +CREATE OR REPLACE VIEW ${view_name} AS +SELECT * FROM ( + --Receipt to first channel send + SELECT + rip.clientid, rip.campaignid, rip.communicationtype, rq.rqcreatedtime AS starttime, rip.sendtime AS endtime FROM + ( + SELECT clientid, requestid, MIN(createdtime) AS rqcreatedtime FROM request_item_status + WHERE createdtime >= DATE_ADD('month', -3, CURRENT_DATE) + GROUP BY 1, 2 + ) AS rq + INNER JOIN request_item_plan_status rip + ON rq.clientid = rip.clientid AND rq.requestid = rip.requestid + WHERE rip.ordernumber = 1 + AND rip.channeltype = 'primary' + AND rip.createdtime >= DATE_ADD('month', -3, CURRENT_DATE) + UNION ALL + --Failure to fallback channel send + SELECT + clientid, + campaignid, + communicationtype, + GREATEST( + COALESCE(prevfailedtime1, DATE('2000-01-01')), + COALESCE(prevfailedtime2, DATE('2000-01-01')), + COALESCE(prevfailedtime3, DATE('2000-01-01')) + ) AS starttime, + sendtime AS endtime + FROM ( + SELECT + clientid, + campaignid, + ordernumber, + sendtime, + communicationtype, + LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1, + LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2, + LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3 + FROM request_item_plan_status + WHERE channeltype = 'primary' + AND createdtime >= DATE_ADD('month', -3, CURRENT_DATE) + ) + WHERE ordernumber > 1 +) +WHERE starttime IS NOT NULL +AND endtime IS NOT NULL +AND starttime > DATE('2000-01-01') +AND DAY_OF_WEEK(starttime) <= 5 +AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 +AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17 From 83fb73c196eea2ffa18d876722243504c7cd071d Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Tue, 3 Jun 2025 16:06:48 +0100 Subject: [PATCH 03/10] Tidy up SQL --- .../null_resource_raw_latency_view.tf | 6 +- .../scripts/sql/views/latency_percentiles.sql | 6 +- .../scripts/sql/views/raw_latency.sql | 49 ------ .../scripts/sql/views/raw_latency_3m.sql | 70 +++++++++ .../scripts/sql/watchdog/degraded_latency.sql | 140 +++--------------- 5 files changed, 93 insertions(+), 178 deletions(-) delete mode 100644 infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency.sql create mode 100644 infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql diff --git a/infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf b/infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf index a224fdb8..29c5a996 100644 --- a/infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf +++ b/infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf @@ -1,13 +1,13 @@ -resource "null_resource" "raw_latency_view" { +resource "null_resource" "raw_latency_3m_view" { triggers = { - sql = filesha256("${path.module}/scripts/sql/views/raw_latency.sql") + sql = filesha256("${path.module}/scripts/sql/views/raw_latency_3m.sql") } provisioner "local-exec" { command = <= DATE_ADD('month', -2, CURRENT_DATE) -GROUP BY clientid, campaignid, communicationtype, percentile -ORDER BY clientid, campaignid, communicationtype, percentile +GROUP BY 1, 2, 3, 4 +ORDER BY 1, 2, 3, 4 diff --git a/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency.sql b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency.sql deleted file mode 100644 index 82ddabde..00000000 --- a/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency.sql +++ /dev/null @@ -1,49 +0,0 @@ -CREATE OR REPLACE VIEW ${view_name} AS -SELECT * FROM ( - --Receipt to first channel send - SELECT - rip.clientid, rip.campaignid, rip.communicationtype, rq.rqcreatedtime AS starttime, rip.sendtime AS endtime FROM - ( - SELECT clientid, requestid, MIN(createdtime) AS rqcreatedtime FROM request_item_status - WHERE createdtime >= DATE_ADD('month', -3, CURRENT_DATE) - GROUP BY 1, 2 - ) AS rq - INNER JOIN request_item_plan_status rip - ON rq.clientid = rip.clientid AND rq.requestid = rip.requestid - WHERE rip.ordernumber = 1 - AND rip.channeltype = 'primary' - AND rip.createdtime >= DATE_ADD('month', -3, CURRENT_DATE) - UNION ALL - --Failure to fallback channel send - SELECT - clientid, - campaignid, - communicationtype, - GREATEST( - COALESCE(prevfailedtime1, DATE('2000-01-01')), - COALESCE(prevfailedtime2, DATE('2000-01-01')), - COALESCE(prevfailedtime3, DATE('2000-01-01')) - ) AS starttime, - sendtime AS endtime - FROM ( - SELECT - clientid, - campaignid, - ordernumber, - sendtime, - communicationtype, - LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1, - LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2, - LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3 - FROM request_item_plan_status - WHERE channeltype = 'primary' - AND createdtime >= DATE_ADD('month', -3, CURRENT_DATE) - ) - WHERE ordernumber > 1 -) -WHERE starttime IS NOT NULL -AND endtime IS NOT NULL -AND starttime > DATE('2000-01-01') -AND DAY_OF_WEEK(starttime) <= 5 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17 diff --git a/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql new file mode 100644 index 00000000..80b2396b --- /dev/null +++ b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql @@ -0,0 +1,70 @@ +CREATE OR REPLACE VIEW ${view_name} AS +WITH request_created_time AS ( + SELECT + clientid, + requestid, + MIN(createdtime) AS createdtime + FROM request_item_status + WHERE createdtime >= DATE_ADD('month', -3, CURRENT_DATE) + GROUP BY clientid, requestid +), +first_channel_send AS ( + SELECT + rip.clientid, + rip.campaignid, + rip.sendinggroupid, + rip.communicationtype, + rct.createdtime AS starttime, + rip.sendtime AS endtime + FROM request_created_time rct + INNER JOIN request_item_plan_status rip + ON rct.clientid = rip.clientid + AND rct.requestid = rip.requestid + WHERE rip.ordernumber = 1 + AND rip.channeltype = 'primary' + AND rip.createdtime >= DATE_ADD('month', -3, CURRENT_DATE) +), +fallback_candidates AS ( + SELECT + clientid, + campaignid, + sendinggroupid, + ordernumber, + sendtime, + communicationtype, + LAG(completedtime, 1) OVER win AS prevfailedtime1, + LAG(completedtime, 2) OVER win AS prevfailedtime2, + LAG(completedtime, 3) OVER win AS prevfailedtime3 + FROM request_item_plan_status + WHERE channeltype = 'primary' + AND createdtime >= DATE_ADD('month', -3, CURRENT_DATE) + WINDOW win AS (PARTITION BY requestitemid ORDER BY ordernumber ASC) +), +fallback_channel_send AS ( + SELECT + clientid, + campaignid, + sendinggroupid, + communicationtype, + GREATEST( + COALESCE(prevfailedtime1, DATE('2000-01-01')), + COALESCE(prevfailedtime2, DATE('2000-01-01')), + COALESCE(prevfailedtime3, DATE('2000-01-01')) + ) AS starttime, + sendtime AS endtime + FROM fallback_candidates + WHERE ordernumber > 1 +), +combined_events AS ( + SELECT * FROM first_channel_send + UNION ALL + SELECT * FROM fallback_channel_send +) +SELECT * +FROM combined_events +WHERE starttime IS NOT NULL + AND endtime IS NOT NULL + AND starttime > DATE('2000-01-01') + AND DAY_OF_WEEK(starttime) <= 5 + AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 + AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17; diff --git a/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql index 4d5b9515..694b46c4 100644 --- a/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql +++ b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql @@ -1,124 +1,18 @@ +WITH latency_stats AS ( + SELECT + clientid, + campaignid, + sendinggroupid, + communicationtype, + approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.99) FILTER (WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE)) AS monthp99latency, + approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.5) FILTER (WHERE endtime >= CURRENT_DATE) AS todayp50latency + FROM raw_latency_3m + WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE) + GROUP BY 1, 2, 3, 4 +) SELECT - recent.clientid, - recent.campaignid, - SUM( - CASE - WHEN recent.p50latency > historic.p99latency THEN 1 - ELSE 0 - END - ) -FROM ( - SELECT - clientid, - COALESCE(campaignid, 'N/A') AS campaignid, - sendinggroupid, - communicationtype, - approx_percentile(to_unixtime(endtime)-to_unixtime(starttime), 0.99) AS p99latency - FROM ( - --Receipt to first channel send - SELECT rip.clientid, rip.campaignid, rip.communicationtype, rip.sendinggroupid, rq.rqcreatedtime AS starttime, rip.sendtime AS endtime FROM - ( - SELECT requestid, MIN(createdtime) AS rqcreatedtime FROM request_item_status - --Query optimisation to prevent full table scan on request_item_status createdtime - WHERE createdtime >= DATE_ADD('week', -1, DATE_ADD('month', -1, CURRENT_DATE)) - GROUP BY requestid - ) AS rq - INNER JOIN request_item_plan_status rip ON rq.requestid = rip.requestid - WHERE rip.createdtime >= DATE_ADD('month', -1, CURRENT_DATE) - AND rip.ordernumber = 1 - AND rip.channeltype = 'primary' - UNION ALL - --Failure to fallback channel send - SELECT clientid, campaignid, communicationtype, sendinggroupid, - GREATEST( - COALESCE(prevfailedtime1, DATE('2000-01-01')), - COALESCE(prevfailedtime2, DATE('2000-01-01')), - COALESCE(prevfailedtime3, DATE('2000-01-01')) - ) AS starttime, - sendtime AS endtime - FROM ( - SELECT - clientid, - campaignid, - ordernumber, - sendtime, - communicationtype, - sendinggroupid, - LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1, - LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2, - LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3 - FROM request_item_plan_status - WHERE createdtime >= DATE_ADD('month', -1, CURRENT_DATE) - AND channeltype = 'primary' - ) - WHERE ordernumber > 1 - ) - WHERE starttime IS NOT NULL - AND endtime IS NOT NULL - AND starttime > DATE('2000-01-01') - AND DAY_OF_WEEK(starttime) <= 5 - AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 - AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17 - GROUP BY clientid, campaignid, communicationtype, sendinggroupid -) historic -INNER JOIN ( - SELECT - clientid, - COALESCE(campaignid, 'N/A') AS campaignid, - sendinggroupid, - communicationtype, - approx_percentile(to_unixtime(endtime)-to_unixtime(starttime), 0.5) AS p50latency - FROM ( - --Receipt to first channel send - SELECT rip.clientid, rip.campaignid, rip.communicationtype, rip.sendinggroupid, rq.rqcreatedtime AS starttime, rip.sendtime AS endtime FROM - ( - SELECT requestid, MIN(createdtime) AS rqcreatedtime FROM request_item_status - --Query optimisation to prevent full table scan on request_item_status createdtime - WHERE createdtime >= DATE_ADD('week', -1, DATE_ADD('month', -1, CURRENT_DATE)) - GROUP BY requestid - ) AS rq - INNER JOIN request_item_plan_status rip ON rq.requestid = rip.requestid - WHERE rip.createdtime >= DATE_ADD('month', -1, CURRENT_DATE) - AND rip.ordernumber = 1 - AND rip.channeltype = 'primary' - UNION ALL - --Failure to fallback channel send - SELECT clientid, campaignid, communicationtype, sendinggroupid, - GREATEST( - COALESCE(prevfailedtime1, DATE('2000-01-01')), - COALESCE(prevfailedtime2, DATE('2000-01-01')), - COALESCE(prevfailedtime3, DATE('2000-01-01')) - ) AS starttime, - sendtime AS endtime - FROM ( - SELECT - clientid, - campaignid, - ordernumber, - sendtime, - communicationtype, - sendinggroupid, - LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1, - LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2, - LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3 - FROM request_item_plan_status - WHERE createdtime >= DATE_ADD('month', -1, CURRENT_DATE) - AND channeltype = 'primary' - ) - WHERE ordernumber > 1 - ) - WHERE starttime IS NOT NULL - AND endtime IS NOT NULL - AND starttime > DATE('2000-01-01') - AND DAY_OF_WEEK(starttime) <= 5 - AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 - AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17 - AND endtime >= CURRENT_DATE - GROUP BY clientid, campaignid, communicationtype, sendinggroupid -) recent -ON -historic.clientid = recent.clientid AND -historic.campaignid = recent.campaignid AND -historic.sendinggroupid = recent.sendinggroupid AND -historic.communicationtype = recent.communicationtype -GROUP BY 1, 2 + clientid, + COALESCE(campaignid, 'N/A') AS campaignid, + COUNT_IF(todayp50latency > monthp99latency) +FROM latency_stats +GROUP BY 1, 2; From d0c9033d877d118fe130eb0d03a3d7eadce94dda Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Tue, 3 Jun 2025 16:22:42 +0100 Subject: [PATCH 04/10] Fix tf dependencies --- .../reporting/athena_named_query_degraded_latency.tf | 3 +-- .../reporting/null_resource_latency_percentiles_view.tf | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf b/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf index f3e9b7db..d2991aaf 100644 --- a/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf +++ b/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf @@ -6,7 +6,6 @@ resource "aws_athena_named_query" "degraded_latency" { query = file("${path.module}/scripts/sql/watchdog/degraded_latency.sql") depends_on = [ - null_resource.request_item_status_table, - null_resource.request_item_plan_status_table + null_resource.raw_latency_3m ] } diff --git a/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf b/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf index 6e9b6795..3d115f6c 100644 --- a/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf +++ b/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf @@ -12,7 +12,6 @@ resource "null_resource" "latency_percentiles_view" { } depends_on = [ - null_resource.request_item_status_table, - null_resource.request_item_plan_status_table + null_resource.raw_latency_3m ] } From b5aa16641b85cdcf565fdb7926a001999584e24b Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Wed, 4 Jun 2025 09:39:07 +0100 Subject: [PATCH 05/10] Further tweaks --- .../reporting/scripts/sql/views/raw_latency_3m.sql | 3 +++ .../reporting/scripts/sql/watchdog/degraded_latency.sql | 9 ++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql index 80b2396b..427b3125 100644 --- a/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql +++ b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql @@ -9,6 +9,7 @@ WITH request_created_time AS ( GROUP BY clientid, requestid ), first_channel_send AS ( + --Time from batch receipt to first message send SELECT rip.clientid, rip.campaignid, @@ -41,6 +42,7 @@ fallback_candidates AS ( WINDOW win AS (PARTITION BY requestitemid ORDER BY ordernumber ASC) ), fallback_channel_send AS ( + --Time from failover trigger to subsequent send SELECT clientid, campaignid, @@ -65,6 +67,7 @@ FROM combined_events WHERE starttime IS NOT NULL AND endtime IS NOT NULL AND starttime > DATE('2000-01-01') + --Exclude unsociable hours, use 2 minute tolerance to eliminate spurious values due to race conditions AND DAY_OF_WEEK(starttime) <= 5 AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17; diff --git a/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql index 694b46c4..cdc9c34f 100644 --- a/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql +++ b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql @@ -4,8 +4,10 @@ WITH latency_stats AS ( campaignid, sendinggroupid, communicationtype, - approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.99) FILTER (WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE)) AS monthp99latency, - approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.5) FILTER (WHERE endtime >= CURRENT_DATE) AS todayp50latency + approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.95) + FILTER (WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE) AND endtime < CURRENT_DATE) AS monthp95latency, + approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.5) + FILTER (WHERE endtime >= CURRENT_DATE) AS todayp50latency FROM raw_latency_3m WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE) GROUP BY 1, 2, 3, 4 @@ -13,6 +15,7 @@ WITH latency_stats AS ( SELECT clientid, COALESCE(campaignid, 'N/A') AS campaignid, - COUNT_IF(todayp50latency > monthp99latency) + --Trigger alarm if today's median latency is more than double the 95th percentile for the last month + COUNT_IF(todayp50latency > 2 * monthp95latency) FROM latency_stats GROUP BY 1, 2; From 56fbc04fb1dcd54a645e6248f344c170dcfa1442 Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Wed, 4 Jun 2025 11:48:30 +0100 Subject: [PATCH 06/10] Linting fixes --- ..._raw_latency_view.tf => null_resource_raw_latency_3m_view.tf} | 0 scripts/config/vale/styles/Vocab/words/accept.txt | 1 + 2 files changed, 1 insertion(+) rename infrastructure/terraform/components/reporting/{null_resource_raw_latency_view.tf => null_resource_raw_latency_3m_view.tf} (100%) diff --git a/infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf b/infrastructure/terraform/components/reporting/null_resource_raw_latency_3m_view.tf similarity index 100% rename from infrastructure/terraform/components/reporting/null_resource_raw_latency_view.tf rename to infrastructure/terraform/components/reporting/null_resource_raw_latency_3m_view.tf diff --git a/scripts/config/vale/styles/Vocab/words/accept.txt b/scripts/config/vale/styles/Vocab/words/accept.txt index a83c8657..0a361b5d 100644 --- a/scripts/config/vale/styles/Vocab/words/accept.txt +++ b/scripts/config/vale/styles/Vocab/words/accept.txt @@ -33,3 +33,4 @@ create_replace_view athena_named_query add_column declaratively +degraded_latency From 95762d7c75e5f0a38935509cd461034325f33a00 Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Wed, 4 Jun 2025 11:51:56 +0100 Subject: [PATCH 07/10] Linting fixes --- .../reporting/null_resource_latency_percentiles_view.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf b/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf index 3d115f6c..2654027c 100644 --- a/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf +++ b/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf @@ -12,6 +12,6 @@ resource "null_resource" "latency_percentiles_view" { } depends_on = [ - null_resource.raw_latency_3m + null_resource.raw_latency_3m_view ] } From 1f9c2e3ab32c552e892baed4a39d6f3f40ca0ca8 Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Wed, 4 Jun 2025 11:52:26 +0100 Subject: [PATCH 08/10] Linting fix --- .../components/reporting/athena_named_query_degraded_latency.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf b/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf index d2991aaf..1b3397b1 100644 --- a/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf +++ b/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf @@ -6,6 +6,6 @@ resource "aws_athena_named_query" "degraded_latency" { query = file("${path.module}/scripts/sql/watchdog/degraded_latency.sql") depends_on = [ - null_resource.raw_latency_3m + null_resource.raw_latency_3m_view ] } From 5e131c461ee68ecfc58f50ea29446b728b5e2510 Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Wed, 4 Jun 2025 12:04:49 +0100 Subject: [PATCH 09/10] IAM changes for new view --- .../acct/data_iam_policy_document_sso_read_only_table_access.tf | 1 + .../components/reporting/iam_instance_profile_powerbi_gateway.tf | 1 + 2 files changed, 2 insertions(+) diff --git a/infrastructure/terraform/components/acct/data_iam_policy_document_sso_read_only_table_access.tf b/infrastructure/terraform/components/acct/data_iam_policy_document_sso_read_only_table_access.tf index 8fa2d654..08cfd149 100644 --- a/infrastructure/terraform/components/acct/data_iam_policy_document_sso_read_only_table_access.tf +++ b/infrastructure/terraform/components/acct/data_iam_policy_document_sso_read_only_table_access.tf @@ -35,6 +35,7 @@ data "aws_iam_policy_document" "sso_read_only_table_access" { "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/dates", "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/letters_invoice_units", "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/latency_percentiles", + "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/raw_latency_3m", ], ) } diff --git a/infrastructure/terraform/components/reporting/iam_instance_profile_powerbi_gateway.tf b/infrastructure/terraform/components/reporting/iam_instance_profile_powerbi_gateway.tf index 89417ced..b5e6cdf1 100644 --- a/infrastructure/terraform/components/reporting/iam_instance_profile_powerbi_gateway.tf +++ b/infrastructure/terraform/components/reporting/iam_instance_profile_powerbi_gateway.tf @@ -186,6 +186,7 @@ data "aws_iam_policy_document" "powerbi_gateway_permissions_policy" { "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/dates", "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/letters_invoice_units", "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/latency_percentiles", + "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/raw_latency_3m", ] ) } From 3caa1909e114cde3dbad230f1c862f853d8ee049 Mon Sep 17 00:00:00 2001 From: Mark Pullan Date: Wed, 4 Jun 2025 12:12:40 +0100 Subject: [PATCH 10/10] IAM fix --- .../terraform/components/reporting/sfn_state_machine_watchdog.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf b/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf index 0174a698..ec921eb7 100644 --- a/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf +++ b/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf @@ -108,6 +108,7 @@ data "aws_iam_policy_document" "sfn_watchdog" { "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_status", "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_plan_status", "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_status_summary", + "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/raw_latency_3m", ] }