Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,13 @@ CloudWatch alarms are setup to trigger if any of those watchdog metrics exceed a

Watchdog queries and corresponding metrics and alarms are currently setup for the following:

| Condition | Named Query | Metric | Threshold | Alarm |
|--------------------------------------------|----------------------------|------------------------------|----------------------------|----------------------------|
| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans |
| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items |
| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests |
| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items |
| Condition | Named Query | Metric | Threshold | Alarm |
|---------------------------------------------|----------------------------|------------------------------|----------------------------|----------------------------|
| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans |
| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items |
| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests |
| Request items stuck before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items |
| Degraded latency compared to historic trend | degraded_latency           | DegradedLatenciesCount       | Max across all clients >= 1 | degraded-latency           |

## Contacts

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ data "aws_iam_policy_document" "sso_read_only_table_access" {
"arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/dates",
"arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/letters_invoice_units",
"arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/latency_percentiles",
"arn:aws:glue:${var.region}:${var.aws_account_id}:table/${var.project}-*-reporting-database/raw_latency_3m",
],
)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Athena named query executed by the watchdog to spot latency regressions
# against the historic trend.
resource "aws_athena_named_query" "degraded_latency" {
  name        = "degraded_latency"
  description = "Query to identify if today's latencies are significantly worse than historic values"

  # Runs in the user workgroup against the reporting database.
  database  = aws_glue_catalog_database.reporting.name
  workgroup = aws_athena_workgroup.user.id

  # Query text lives alongside the other watchdog SQL scripts.
  query = file("${path.module}/scripts/sql/watchdog/degraded_latency.sql")

  # The query reads from the raw_latency_3m view, so the view must be
  # created before this named query.
  depends_on = [
    null_resource.raw_latency_3m_view
  ]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Alarm that fires when the watchdog's degraded-latency query reports any
# client/campaign group with degraded latency.
resource "aws_cloudwatch_metric_alarm" "degraded_latency" {
alarm_name = "${local.csi}-degraded-latency"
# Fires on a single datapoint at or above 1 (i.e. at least one degraded group).
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = 1
threshold = 1
alarm_description = "Today's latencies are significantly higher than historic trends"
# Missing data just means the watchdog has not published recently; do not alarm.
treat_missing_data = "notBreaching"

metric_query {
id = "max_degraded_latency_count"
# CloudWatch Metrics Insights query: takes the worst (max) datapoint across
# all emitters of DegradedLatenciesCount for this environment.
# NOTE(review): the README describes this threshold as a *sum* across all
# clients — confirm MAX is the intended aggregation (equivalent for the
# "any group > 0" condition, since counts are non-negative).
expression = "SELECT MAX(DegradedLatenciesCount) FROM \"Notify/Watchdog\" WHERE environment='${var.environment}'"
return_data = "true"
# Evaluate over hourly periods.
period = 3600
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ data "aws_iam_policy_document" "powerbi_gateway_permissions_policy" {
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/dates",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/letters_invoice_units",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/latency_percentiles",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/raw_latency_3m",
]
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ resource "null_resource" "latency_percentiles_view" {
}

depends_on = [
null_resource.request_item_status_table,
null_resource.request_item_plan_status_table
null_resource.raw_latency_3m_view
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Creates (or replaces) the raw_latency_3m Athena view by running the shared
# create_replace_view.sh helper against the setup workgroup.
resource "null_resource" "raw_latency_3m_view" {
triggers = {
# Re-provision whenever the view's SQL definition changes.
sql = filesha256("${path.module}/scripts/sql/views/raw_latency_3m.sql")
}
provisioner "local-exec" {
command = <<EOT
${path.module}/scripts/create_replace_view.sh \
${aws_athena_workgroup.setup.name} \
${aws_glue_catalog_database.reporting.name} \
raw_latency_3m
EOT
}

# The view selects from these tables, so they must exist first.
depends_on = [
null_resource.request_item_status_table,
null_resource.request_item_plan_status_table
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,8 @@ SELECT
communicationtype,
percentile * 100 AS percentile,
approx_percentile(to_unixtime(endtime)-to_unixtime(starttime), percentile) AS latency
FROM (
--Receipt to first channel send
SELECT rip.clientid, rip.campaignid, rip.communicationtype, rq.rqcreatedtime AS starttime, rip.sendtime AS endtime FROM
(
SELECT requestid, MIN(createdtime) AS rqcreatedtime FROM request_item_status
--Query optimisation to prevent full table scan on request_item_status createdtime
WHERE createdtime >= DATE_ADD('week', -1, DATE_ADD('month', -2, CURRENT_DATE))
GROUP BY requestid
) AS rq
INNER JOIN request_item_plan_status rip ON rq.requestid = rip.requestid
WHERE rip.createdtime >= DATE_ADD('month', -2, CURRENT_DATE)
AND rip.ordernumber = 1
AND rip.channeltype = 'primary'
UNION ALL
--Failure to fallback channel send
SELECT clientid, campaignid, communicationtype,
GREATEST(
COALESCE(prevfailedtime1, DATE('2000-01-01')),
COALESCE(prevfailedtime2, DATE('2000-01-01')),
COALESCE(prevfailedtime3, DATE('2000-01-01'))
) AS starttime,
sendtime AS endtime
FROM (
SELECT
clientid,
campaignid,
ordernumber,
sendtime,
communicationtype,
LAG(completedtime,1) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime1,
LAG(completedtime,2) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime2,
LAG(completedtime,3) OVER (PARTITION BY requestitemid ORDER BY ordernumber ASC) as prevfailedtime3
FROM request_item_plan_status
WHERE createdtime >= DATE_ADD('month', -2, CURRENT_DATE)
AND channeltype = 'primary'
)
WHERE ordernumber > 1
)
FROM raw_latency_3m
CROSS JOIN UNNEST (ARRAY[0.001, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 0.999]) AS t(percentile)
WHERE starttime IS NOT NULL
AND endtime IS NOT NULL
AND starttime > DATE('2000-01-01')
AND DAY_OF_WEEK(starttime) <= 5
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) < 18
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) < 18
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) >= 8
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) >= 8
GROUP BY clientid, campaignid, communicationtype, percentile
ORDER BY clientid, campaignid, communicationtype, percentile
WHERE endtime >= DATE_ADD('month', -2, CURRENT_DATE)
GROUP BY 1, 2, 3, 4
ORDER BY 1, 2, 3, 4
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
--View of raw send-latency events (start/end timestamp pairs) over the last
--3 months, combining first-channel sends and fallback-channel sends.
--Explicit column lists are used throughout (instead of SELECT *) so the view
--schema stays stable if the underlying tables gain columns.
CREATE OR REPLACE VIEW ${view_name} AS
WITH request_created_time AS (
    --Earliest item status per request approximates the batch receipt time
    SELECT
        clientid,
        requestid,
        MIN(createdtime) AS createdtime
    FROM request_item_status
    --Restrict the scan to the reporting window
    WHERE createdtime >= DATE_ADD('month', -3, CURRENT_DATE)
    GROUP BY clientid, requestid
),
first_channel_send AS (
    --Time from batch receipt to first message send
    SELECT
        rip.clientid,
        rip.campaignid,
        rip.sendinggroupid,
        rip.communicationtype,
        rct.createdtime AS starttime,
        rip.sendtime AS endtime
    FROM request_created_time rct
    INNER JOIN request_item_plan_status rip
        ON rct.clientid = rip.clientid
        AND rct.requestid = rip.requestid
    WHERE rip.ordernumber = 1
        AND rip.channeltype = 'primary'
        AND rip.createdtime >= DATE_ADD('month', -3, CURRENT_DATE)
),
fallback_candidates AS (
    --For each plan, pull the completion times of up to three prior attempts
    --NOTE(review): the WINDOW clause requires Athena engine v3 (Trino) — confirm
    SELECT
        clientid,
        campaignid,
        sendinggroupid,
        ordernumber,
        sendtime,
        communicationtype,
        LAG(completedtime, 1) OVER win AS prevfailedtime1,
        LAG(completedtime, 2) OVER win AS prevfailedtime2,
        LAG(completedtime, 3) OVER win AS prevfailedtime3
    FROM request_item_plan_status
    WHERE channeltype = 'primary'
        AND createdtime >= DATE_ADD('month', -3, CURRENT_DATE)
    WINDOW win AS (PARTITION BY requestitemid ORDER BY ordernumber ASC)
),
fallback_channel_send AS (
    --Time from failover trigger to subsequent send
    SELECT
        clientid,
        campaignid,
        sendinggroupid,
        communicationtype,
        --Latest completion among prior attempts; 2000-01-01 is a sentinel for NULLs
        GREATEST(
            COALESCE(prevfailedtime1, DATE('2000-01-01')),
            COALESCE(prevfailedtime2, DATE('2000-01-01')),
            COALESCE(prevfailedtime3, DATE('2000-01-01'))
        ) AS starttime,
        sendtime AS endtime
    FROM fallback_candidates
    WHERE ordernumber > 1
),
combined_events AS (
    SELECT clientid, campaignid, sendinggroupid, communicationtype, starttime, endtime
    FROM first_channel_send
    UNION ALL
    SELECT clientid, campaignid, sendinggroupid, communicationtype, starttime, endtime
    FROM fallback_channel_send
)
SELECT
    clientid,
    campaignid,
    sendinggroupid,
    communicationtype,
    starttime,
    endtime
FROM combined_events
WHERE starttime IS NOT NULL
AND endtime IS NOT NULL
--Drop rows where the sentinel survived (no prior failure time recorded)
AND starttime > DATE('2000-01-01')
--Exclude unsociable hours, use 2 minute tolerance to eliminate spurious values due to race conditions
AND DAY_OF_WEEK(starttime) <= 5
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', 2, starttime), 'Europe/London')) BETWEEN 8 AND 17
AND HOUR(AT_TIMEZONE(DATE_ADD('minute', -2, starttime), 'Europe/London')) BETWEEN 8 AND 17;
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
--Watchdog query: for each client/campaign, count groups whose median latency
--today is more than double their 95th-percentile latency over the last month.
WITH latency_stats AS (
    SELECT
        clientid,
        campaignid,
        sendinggroupid,
        communicationtype,
        --Historic baseline: p95 over the previous month, excluding today
        approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.95)
            FILTER (WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE) AND endtime < CURRENT_DATE) AS monthp95latency,
        --Today's typical (median) latency
        approx_percentile(to_unixtime(endtime) - to_unixtime(starttime), 0.5)
            FILTER (WHERE endtime >= CURRENT_DATE) AS todayp50latency
    FROM raw_latency_3m
    --Only scan the last month; both FILTERs fall inside this range
    WHERE endtime >= DATE_ADD('month', -1, CURRENT_DATE)
    --Group by name rather than ordinal position for readability
    GROUP BY clientid, campaignid, sendinggroupid, communicationtype
)
SELECT
    clientid,
    COALESCE(campaignid, 'N/A') AS campaignid,
    --Trigger alarm if today's median latency is more than double the 95th percentile for the last month
    --Alias the count so Athena emits a named column instead of _col2
    COUNT_IF(todayp50latency > 2 * monthp95latency) AS degradedlatenciescount
FROM latency_stats
GROUP BY clientid, COALESCE(campaignid, 'N/A');
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ resource "aws_sfn_state_machine" "watchdog" {
{
metric_name = "StuckRequestItemsCount",
query_id = aws_athena_named_query.stuck_request_items.id
},
{
metric_name = "DegradedLatenciesCount",
query_id = aws_athena_named_query.degraded_latency.id
}
]
environment = var.environment
Expand Down Expand Up @@ -104,6 +108,7 @@ data "aws_iam_policy_document" "sfn_watchdog" {
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_status",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_plan_status",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/request_item_status_summary",
"arn:aws:glue:${var.region}:${local.this_account}:table/${aws_glue_catalog_database.reporting.name}/raw_latency_3m",
]
}

Expand Down
1 change: 1 addition & 0 deletions scripts/config/vale/styles/Vocab/words/accept.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ create_replace_view
athena_named_query
add_column
declaratively
degraded_latency
Loading