diff --git a/README.md b/README.md index cf174509..497c3812 100644 --- a/README.md +++ b/README.md @@ -254,12 +254,13 @@ CloudWatch alarms are setup to trigger if any of those watchdog metrics exceed a Watchdog queries and corresponding metrics and alarms are currently setup for the following: -| Condition | Named Query | Metric | Threshold | Alarm | -|--------------------------------------------|----------------------------|------------------------------|----------------------------|----------------------------| -| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans | -| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items | -| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests | -| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items | +| Condition | Named Query | Metric | Threshold | Alarm | +|---------------------------------------------|----------------------------|------------------------------|----------------------------|----------------------------| +| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans | +| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items | +| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests | +| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items | +| Degraded latency compared to historic trend | degraded_latency | 
DegradedLatenciesCount | Sum across all clients > 0 | degraded-latency | ## Contacts diff --git a/infrastructure/terraform/components/acct/data_iam_policy_document_sso_read_only_table_access.tf b/infrastructure/terraform/components/acct/data_iam_policy_document_sso_read_only_table_access.tf index 8fa2d654..08cfd149 100644 --- a/infrastructure/terraform/components/acct/data_iam_policy_document_sso_read_only_table_access.tf +++ b/infrastructure/terraform/components/acct/data_iam_policy_document_sso_read_only_table_access.tf @@ -35,6 +35,7 @@ data "aws_iam_policy_document" "sso_read_only_table_access" { "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/dates", "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/letters_invoice_units", "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/latency_percentiles", + "arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/raw_latency_3m", ], ) } diff --git a/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf b/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf new file mode 100644 index 00000000..1b3397b1 --- /dev/null +++ b/infrastructure/terraform/components/reporting/athena_named_query_degraded_latency.tf @@ -0,0 +1,11 @@ +resource "aws_athena_named_query" "degraded_latency" { + name = "degraded_latency" + description = "Query to identify if today's latencies are significantly worse than historic values" + workgroup = aws_athena_workgroup.user.id + database = aws_glue_catalog_database.reporting.name + query = file("${path.module}/scripts/sql/watchdog/degraded_latency.sql") + + depends_on = [ + null_resource.raw_latency_3m_view + ] +} diff --git a/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_degraded_latency.tf 
b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_degraded_latency.tf new file mode 100644 index 00000000..c086c3f8 --- /dev/null +++ b/infrastructure/terraform/components/reporting/cloudwatch_metric_alarm_degraded_latency.tf @@ -0,0 +1,15 @@ +resource "aws_cloudwatch_metric_alarm" "degraded_latency" { + alarm_name = "${local.csi}-degraded-latency" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 1 + threshold = 1 + alarm_description = "Today's latencies are significantly higher than historic trends" + treat_missing_data = "notBreaching" + + metric_query { + id = "max_degraded_latency_count" + expression = "SELECT MAX(DegradedLatenciesCount) FROM \"Notify/Watchdog\" WHERE environment='${var.environment}'" + return_data = "true" + period = 3600 + } +} diff --git a/infrastructure/terraform/components/reporting/iam_instance_profile_powerbi_gateway.tf b/infrastructure/terraform/components/reporting/iam_instance_profile_powerbi_gateway.tf index 89417ced..b5e6cdf1 100644 --- a/infrastructure/terraform/components/reporting/iam_instance_profile_powerbi_gateway.tf +++ b/infrastructure/terraform/components/reporting/iam_instance_profile_powerbi_gateway.tf @@ -186,6 +186,7 @@ data "aws_iam_policy_document" "powerbi_gateway_permissions_policy" { "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/dates", "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/letters_invoice_units", "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/latency_percentiles", + "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/raw_latency_3m", ] ) } diff --git a/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf b/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf index 
6e9b6795..2654027c 100644 --- a/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf +++ b/infrastructure/terraform/components/reporting/null_resource_latency_percentiles_view.tf @@ -12,7 +12,6 @@ resource "null_resource" "latency_percentiles_view" { } depends_on = [ - null_resource.request_item_status_table, - null_resource.request_item_plan_status_table + null_resource.raw_latency_3m_view ] } diff --git a/infrastructure/terraform/components/reporting/null_resource_raw_latency_3m_view.tf b/infrastructure/terraform/components/reporting/null_resource_raw_latency_3m_view.tf new file mode 100644 index 00000000..29c5a996 --- /dev/null +++ b/infrastructure/terraform/components/reporting/null_resource_raw_latency_3m_view.tf @@ -0,0 +1,18 @@ +resource "null_resource" "raw_latency_3m_view" { + triggers = { + sql = filesha256("${path.module}/scripts/sql/views/raw_latency_3m.sql") + } + provisioner "local-exec" { + command = <= DATE_ADD('week', -1, DATE_ADD('month', -2, CURRENT_DATE)) - GROUP BY requestid - ) AS rq - INNER JOIN request_item_plan_status rip ON rq.requestid = rip.requestid - WHERE rip.createdtime >= DATE_ADD('month', -2, CURRENT_DATE) - AND rip.ordernumber = 1 - AND rip.channeltype = 'primary' - UNION ALL - --Failure to fallback channel send - SELECT clientid, campaignid, communicationtype, - GREATEST( - COALESCE(prevfailedtime1, DATE('2000-01-01')), - COALESCE(prevfailedtime2, DATE('2000-01-01')), - COALESCE(prevfailedtime3, DATE('2000-01-01')) - ) AS starttime, - sendtime AS endtime - FROM ( - SELECT - clientid, - campaignid, - ordernumber, - sendtime, - communicationtype, - LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1, - LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2, - LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3 - FROM request_item_plan_status - WHERE createdtime 
>= DATE_ADD('month', -2, CURRENT_DATE) - AND channeltype = 'primary' - ) - WHERE ordernumber > 1 -) +FROM raw_latency_3m CROSS JOIN UNNEST (ARRAY[0.001, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 0.999]) AS t(percentile) -WHERE starttime IS NOT NULL -AND endtime IS NOT NULL -AND starttime > DATE('2000-01-01') -AND DAY_OF_WEEK(starttime) <= 5 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) < 18 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) < 18 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) >= 8 -AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) >= 8 -GROUP BY clientid, campaignid, communicationtype, percentile -ORDER BY clientid, campaignid, communicationtype, percentile +WHERE endtime >= DATE_ADD('month', -2, CURRENT_DATE) +GROUP BY 1, 2, 3, 4 +ORDER BY 1, 2, 3, 4 diff --git a/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql new file mode 100644 index 00000000..427b3125 --- /dev/null +++ b/infrastructure/terraform/components/reporting/scripts/sql/views/raw_latency_3m.sql @@ -0,0 +1,73 @@ +CREATE OR REPLACE VIEW ${view_name} AS +WITH request_created_time AS ( + SELECT + clientid, + requestid, + MIN(createdtime) AS createdtime + FROM request_item_status + WHERE createdtime >= DATE_ADD('month', -3, CURRENT_DATE) + GROUP BY clientid, requestid +), +first_channel_send AS ( + --Time from batch receipt to first message send + SELECT + rip.clientid, + rip.campaignid, + rip.sendinggroupid, + rip.communicationtype, + rct.createdtime AS starttime, + rip.sendtime AS endtime + FROM request_created_time rct + INNER JOIN request_item_plan_status rip + ON rct.clientid = rip.clientid + AND rct.requestid = rip.requestid + WHERE rip.ordernumber = 1 + AND rip.channeltype = 
'primary' + AND rip.createdtime >= DATE_ADD('month', -3, CURRENT_DATE) +), +fallback_candidates AS ( --Primary-channel plans paired with the completedtime of up to three earlier plans (by ordernumber) for the same requestitemid + SELECT + clientid, + campaignid, + sendinggroupid, + ordernumber, + sendtime, + communicationtype, + LAG(completedtime, 1) OVER win AS prevfailedtime1, + LAG(completedtime, 2) OVER win AS prevfailedtime2, + LAG(completedtime, 3) OVER win AS prevfailedtime3 + FROM request_item_plan_status + WHERE channeltype = 'primary' + AND createdtime >= DATE_ADD('month', -3, CURRENT_DATE) + WINDOW win AS (PARTITION BY requestitemid ORDER BY ordernumber ASC) +), +fallback_channel_send AS ( + --Time from failover trigger to subsequent send + SELECT + clientid, + campaignid, + sendinggroupid, + communicationtype, + GREATEST( --Latest preceding completedtime; 2000-01-01 is a sentinel for a missing value and is removed by the final WHERE + COALESCE(prevfailedtime1, DATE('2000-01-01')), + COALESCE(prevfailedtime2, DATE('2000-01-01')), + COALESCE(prevfailedtime3, DATE('2000-01-01')) + ) AS starttime, + sendtime AS endtime + FROM fallback_candidates + WHERE ordernumber > 1 +), +combined_events AS ( --Columns are listed explicitly so both UNION ALL branches visibly match position for position + SELECT clientid, campaignid, sendinggroupid, communicationtype, starttime, endtime FROM first_channel_send + UNION ALL + SELECT clientid, campaignid, sendinggroupid, communicationtype, starttime, endtime FROM fallback_channel_send +) +SELECT clientid, campaignid, sendinggroupid, communicationtype, starttime, endtime +FROM combined_events +WHERE starttime IS NOT NULL + AND endtime IS NOT NULL + AND starttime > DATE('2000-01-01') + --Exclude unsociable hours, use 2 minute tolerance to eliminate spurious values due to race conditions + AND DAY_OF_WEEK(starttime) <= 5 + AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17 + AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17; diff --git a/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql new file mode 100644 index 00000000..cdc9c34f --- /dev/null +++ b/infrastructure/terraform/components/reporting/scripts/sql/watchdog/degraded_latency.sql @@ -0,0 +1,21 @@ +WITH latency_stats AS ( + SELECT + clientid, + campaignid, + sendinggroupid, + communicationtype, + 
approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.95) + FILTER (WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE) AND endtime < CURRENT_DATE) AS monthp95latency, --Baseline: last month excluding today + approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.5) + FILTER (WHERE endtime >= CURRENT_DATE) AS todayp50latency --Today only; NULL when nothing has been sent yet today + FROM raw_latency_3m + WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE) + GROUP BY clientid, campaignid, sendinggroupid, communicationtype +) +SELECT + clientid, + COALESCE(campaignid, 'N/A') AS campaignid, + --Trigger alarm if today's median latency is more than double the 95th percentile for the last month + COUNT_IF(todayp50latency > 2 * monthp95latency) +FROM latency_stats +GROUP BY clientid, COALESCE(campaignid, 'N/A'); diff --git a/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf b/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf index ef1fc2d5..ec921eb7 100644 --- a/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf +++ b/infrastructure/terraform/components/reporting/sfn_state_machine_watchdog.tf @@ -19,6 +19,10 @@ resource "aws_sfn_state_machine" "watchdog" { { metric_name = "StuckRequestItemsCount", query_id = aws_athena_named_query.stuck_request_items.id + }, + { + metric_name = "DegradedLatenciesCount", + query_id = aws_athena_named_query.degraded_latency.id } ] environment = var.environment @@ -104,6 +108,7 @@ data "aws_iam_policy_document" "sfn_watchdog" { "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_status", "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_plan_status", "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_status_summary", + "arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/raw_latency_3m", ] } diff --git a/scripts/config/vale/styles/Vocab/words/accept.txt 
b/scripts/config/vale/styles/Vocab/words/accept.txt index a83c8657..0a361b5d 100644 --- a/scripts/config/vale/styles/Vocab/words/accept.txt +++ b/scripts/config/vale/styles/Vocab/words/accept.txt @@ -33,3 +33,4 @@ create_replace_view athena_named_query add_column declaratively +degraded_latency