Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,13 @@ CloudWatch alarms are setup to trigger if any of those watchdog metrics exceed a

Watchdog queries and corresponding metrics and alarms are currently setup for the following:

| Condition | Named Query | Metric | Threshold | Alarm |
|--------------------------------------------|----------------------------|------------------------------|----------------------------|----------------------------|
| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans |
| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items |
| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests |
| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items |
| Condition | Named Query | Metric | Threshold | Alarm |
|---------------------------------------------|----------------------------|------------------------------|----------------------------|----------------------------|
| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans |
| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items |
| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests |
| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items |
| Degraded latency compared to historic trend | degraded_latency           | DegradedLatenciesCount       | Max across all clients >= 1 | degraded-latency           |

## Contacts

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ data "aws_iam_policy_document" "sso_read_only_table_access" {
"arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/dates",
"arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/letters_invoice_units",
"arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/latency_percentiles",
"arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/raw_latency_3m",
],
)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Athena named query executed by the watchdog to spot latency regressions
# against the historic trend.
resource "aws_athena_named_query" "degraded_latency" {
  name        = "degraded_latency"
  description = "Query to identify if today's latencies are significantly worse than historic values"

  # Runs in the user workgroup against the reporting database.
  database  = aws_glue_catalog_database.reporting.name
  workgroup = aws_athena_workgroup.user.id

  # Query text lives alongside the other watchdog SQL scripts.
  query = file("${path.module}/scripts/sql/watchdog/degraded_latency.sql")

  # The query reads from the raw_latency_3m view, so the view must be
  # created before this named query.
  depends_on = [
    null_resource.raw_latency_3m_view
  ]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Alarm that fires when the watchdog's degraded-latency query reports any
# client/campaign group with degraded latency.
resource "aws_cloudwatch_metric_alarm" "degraded_latency" {
alarm_name = "${local.csi}-degraded-latency"
# Fires on a single datapoint at or above 1 (i.e. at least one degraded group).
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = 1
threshold = 1
alarm_description = "Today's latencies are significantly higher than historic trends"
# Missing data just means the watchdog has not published recently; do not alarm.
treat_missing_data = "notBreaching"

metric_query {
id = "max_degraded_latency_count"
# CloudWatch Metrics Insights query: takes the worst (max) datapoint across
# all emitters of DegradedLatenciesCount for this environment.
# NOTE(review): the README describes this threshold as a *sum* across all
# clients — confirm MAX is the intended aggregation (equivalent for the
# "any group > 0" condition, since counts are non-negative).
expression = "SELECT MAX(DegradedLatenciesCount) FROM \"Notify/Watchdog\" WHERE environment='${var.environment}'"
return_data = "true"
# Evaluate over hourly periods.
period = 3600
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ data "aws_iam_policy_document" "powerbi_gateway_permissions_policy" {
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/dates",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/letters_invoice_units",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/latency_percentiles",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/raw_latency_3m",
]
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ resource "null_resource" "latency_percentiles_view" {
}

depends_on = [
null_resource.request_item_status_table,
null_resource.request_item_plan_status_table
null_resource.raw_latency_3m_view
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Creates (or replaces) the raw_latency_3m Athena view by running the shared
# create_replace_view.sh helper against the setup workgroup.
resource "null_resource" "raw_latency_3m_view" {
triggers = {
# Re-provision whenever the view's SQL definition changes.
sql = filesha256("${path.module}/scripts/sql/views/raw_latency_3m.sql")
}
provisioner "local-exec" {
command = <<EOT
${path.module}/scripts/create_replace_view.sh \
${aws_athena_workgroup.setup.name} \
${aws_glue_catalog_database.reporting.name} \
raw_latency_3m
EOT
}

# The view selects from these tables, so they must exist first.
depends_on = [
null_resource.request_item_status_table,
null_resource.request_item_plan_status_table
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,8 @@ SELECT
communicationtype,
percentile * 100 AS percentile,
approx_percentile(to_unixtime(endtime)-to_unixtime(starttime), percentile) AS latency
FROM (
--Receipt to first channel send
SELECT rip.clientid, rip.campaignid, rip.communicationtype, rq.rqcreatedtime AS starttime, rip.sendtime AS endtime FROM
(
SELECT requestid, MIN(createdtime) AS rqcreatedtime FROM request_item_status
--Query optimisation to prevent full table scan on request_item_status createdtime
WHERE createdtime >= DATE_ADD('week', -1, DATE_ADD('month', -2, CURRENT_DATE))
GROUP BY requestid
) AS rq
INNER JOIN request_item_plan_status rip ON rq.requestid = rip.requestid
WHERE rip.createdtime >= DATE_ADD('month', -2, CURRENT_DATE)
AND rip.ordernumber = 1
AND rip.channeltype = 'primary'
UNION ALL
--Failure to fallback channel send
SELECT clientid, campaignid, communicationtype,
GREATEST(
COALESCE(prevfailedtime1, DATE('2000-01-01')),
COALESCE(prevfailedtime2, DATE('2000-01-01')),
COALESCE(prevfailedtime3, DATE('2000-01-01'))
) AS starttime,
sendtime AS endtime
FROM (
SELECT
clientid,
campaignid,
ordernumber,
sendtime,
communicationtype,
LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1,
LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2,
LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3
FROM request_item_plan_status
WHERE createdtime >= DATE_ADD('month', -2, CURRENT_DATE)
AND channeltype = 'primary'
)
WHERE ordernumber > 1
)
FROM raw_latency_3m
CROSS JOIN UNNEST (ARRAY[0.001, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 0.999]) AS t(percentile)
WHERE starttime IS NOT NULL
AND endtime IS NOT NULL
AND starttime > DATE('2000-01-01')
AND DAY_OF_WEEK(starttime) <= 5
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) < 18
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) < 18
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) >= 8
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) >= 8
GROUP BY clientid, campaignid, communicationtype, percentile
ORDER BY clientid, campaignid, communicationtype, percentile
WHERE endtime >= DATE_ADD('month', -2, CURRENT_DATE)
GROUP BY 1, 2, 3, 4
ORDER BY 1, 2, 3, 4
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
--View of raw send-latency events (start/end timestamp pairs) over the last
--3 months, combining first-channel sends and fallback-channel sends.
--Explicit column lists are used throughout (instead of SELECT *) so the view
--schema stays stable if the underlying tables gain columns.
CREATE OR REPLACE VIEW ${view_name} AS
WITH request_created_time AS (
    --Earliest item status per request approximates the batch receipt time
    SELECT
        clientid,
        requestid,
        MIN(createdtime) AS createdtime
    FROM request_item_status
    --Restrict the scan to the reporting window
    WHERE createdtime >= DATE_ADD('month', -3, CURRENT_DATE)
    GROUP BY clientid, requestid
),
first_channel_send AS (
    --Time from batch receipt to first message send
    SELECT
        rip.clientid,
        rip.campaignid,
        rip.sendinggroupid,
        rip.communicationtype,
        rct.createdtime AS starttime,
        rip.sendtime AS endtime
    FROM request_created_time rct
    INNER JOIN request_item_plan_status rip
        ON rct.clientid = rip.clientid
        AND rct.requestid = rip.requestid
    WHERE rip.ordernumber = 1
        AND rip.channeltype = 'primary'
        AND rip.createdtime >= DATE_ADD('month', -3, CURRENT_DATE)
),
fallback_candidates AS (
    --For each plan, pull the completion times of up to three prior attempts
    --NOTE(review): the WINDOW clause requires Athena engine v3 (Trino) — confirm
    SELECT
        clientid,
        campaignid,
        sendinggroupid,
        ordernumber,
        sendtime,
        communicationtype,
        LAG(completedtime, 1) OVER win AS prevfailedtime1,
        LAG(completedtime, 2) OVER win AS prevfailedtime2,
        LAG(completedtime, 3) OVER win AS prevfailedtime3
    FROM request_item_plan_status
    WHERE channeltype = 'primary'
        AND createdtime >= DATE_ADD('month', -3, CURRENT_DATE)
    WINDOW win AS (PARTITION BY requestitemid ORDER BY ordernumber ASC)
),
fallback_channel_send AS (
    --Time from failover trigger to subsequent send
    SELECT
        clientid,
        campaignid,
        sendinggroupid,
        communicationtype,
        --Latest completion among prior attempts; 2000-01-01 is a sentinel for NULLs
        GREATEST(
            COALESCE(prevfailedtime1, DATE('2000-01-01')),
            COALESCE(prevfailedtime2, DATE('2000-01-01')),
            COALESCE(prevfailedtime3, DATE('2000-01-01'))
        ) AS starttime,
        sendtime AS endtime
    FROM fallback_candidates
    WHERE ordernumber > 1
),
combined_events AS (
    SELECT clientid, campaignid, sendinggroupid, communicationtype, starttime, endtime
    FROM first_channel_send
    UNION ALL
    SELECT clientid, campaignid, sendinggroupid, communicationtype, starttime, endtime
    FROM fallback_channel_send
)
SELECT
    clientid,
    campaignid,
    sendinggroupid,
    communicationtype,
    starttime,
    endtime
FROM combined_events
WHERE starttime IS NOT NULL
AND endtime IS NOT NULL
--Drop rows where the sentinel survived (no prior failure time recorded)
AND starttime > DATE('2000-01-01')
--Exclude unsociable hours, use 2 minute tolerance to eliminate spurious values due to race conditions
AND DAY_OF_WEEK(starttime) <= 5
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17;
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
--Watchdog query: for each client/campaign, count groups whose median latency
--today is more than double their 95th-percentile latency over the last month.
WITH latency_stats AS (
    SELECT
        clientid,
        campaignid,
        sendinggroupid,
        communicationtype,
        --Historic baseline: p95 over the previous month, excluding today
        approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.95)
            FILTER (WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE) AND endtime < CURRENT_DATE) AS monthp95latency,
        --Today's typical (median) latency
        approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.5)
            FILTER (WHERE endtime >= CURRENT_DATE) AS todayp50latency
    FROM raw_latency_3m
    --Only scan the last month; both FILTERs fall inside this range
    WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE)
    --Group by name rather than ordinal position for readability
    GROUP BY clientid, campaignid, sendinggroupid, communicationtype
)
SELECT
    clientid,
    COALESCE(campaignid, 'N/A') AS campaignid,
    --Trigger alarm if today's median latency is more than double the 95th percentile for the last month
    --Alias the count so Athena emits a named column instead of _col2
    COUNT_IF(todayp50latency > 2 * monthp95latency) AS degradedlatenciescount
FROM latency_stats
GROUP BY clientid, COALESCE(campaignid, 'N/A');
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ resource "aws_sfn_state_machine" "watchdog" {
{
metric_name = "StuckRequestItemsCount",
query_id = aws_athena_named_query.stuck_request_items.id
},
{
metric_name = "DegradedLatenciesCount",
query_id = aws_athena_named_query.degraded_latency.id
}
]
environment = var.environment
Expand Down Expand Up @@ -104,6 +108,7 @@ data "aws_iam_policy_document" "sfn_watchdog" {
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_status",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_plan_status",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_status_summary",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/raw_latency_3m",
]
}

Expand Down
1 change: 1 addition & 0 deletions scripts/config/vale/styles/Vocab/words/accept.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ create_replace_view
athena_named_query
add_column
declaratively
degraded_latency
Loading