Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ Watchdog queries and corresponding metrics and alarms are currently setup for th
| Request item plans unexpectedly incomplete | overdue_request_item_plans | OverdueRequestItemPlansCount | Sum across all clients > 0 | overdue-request-item-plans |
| Request items incomplete after 2 weeks | overdue_request_items | OverdueRequestItemsCount | Sum across all clients > 0 | overdue-request-items |
| Requests incomplete after 2 weeks | overdue_requests | OverdueRequestsCount | Sum across all clients > 0 | overdue-requests |
| Request items stuck in an ENRICHED or PENDING_ENRICHMENT state before being sent | stuck_request_items | StuckRequestItemsCount | Sum across all clients > 0 | stuck-request-items |

## Contacts

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Athena named query used by the watchdog to find request items that have
# stayed in an ENRICHED or PENDING_ENRICHMENT state longer than expected.
resource "aws_athena_named_query" "stuck_request_items" {
  name        = "stuck_request_items"
  description = "Query to determine any request items unexpectedly stuck in an ENRICHED or PENDING_ENRICHMENT state"

  # Where the query is registered and which catalog database it runs against.
  database  = aws_glue_catalog_database.reporting.name
  workgroup = aws_athena_workgroup.user.id

  # The SQL text is read from the watchdog scripts directory of this module.
  query = file("${path.module}/scripts/sql/watchdog/stuck_request_items.sql")

  # Explicit ordering after the request_item_status null_resource
  # (presumably the step that creates the table the query reads - confirm).
  depends_on = [null_resource.request_item_status_table]
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ resource "aws_cloudwatch_metric_alarm" "overdue_request_item_plans" {
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = 1
threshold = 1
alarm_description = "This metric monitors unexpected/overdue request item plans"
alarm_description = "Request item plans that did not reach a terminal state within an expected time window"

metric_query {
id = "max_overdue_request_item_plans_count"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ resource "aws_cloudwatch_metric_alarm" "overdue_request_items" {
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = 1
threshold = 1
alarm_description = "This metric monitors unexpected/overdue request items"
alarm_description = "Request items that did not reach a terminal state within an expected time window"

metric_query {
id = "max_overdue_request_items_count"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ resource "aws_cloudwatch_metric_alarm" "overdue_requests" {
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = 1
threshold = 1
alarm_description = "This metric monitors unexpected/overdue requests"
alarm_description = "Requests that did not reach a terminal state within an expected time window"

metric_query {
id = "max_overdue_requests_count"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Alarm raised when the watchdog reports request items stuck in an ENRICHED or
# PENDING_ENRICHMENT state.
resource "aws_cloudwatch_metric_alarm" "stuck_request_items" {
  alarm_name        = "${local.csi}-stuck-request-items"
  alarm_description = "Request items stuck in an ENRICHED or PENDING_ENRICHMENT state for longer than an expected time window"

  # Goes into alarm as soon as a single evaluation period returns a value >= 1.
  threshold           = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1

  # Query over the Notify/Watchdog namespace: takes the maximum
  # StuckRequestItemsCount published for this environment.
  metric_query {
    id          = "max_stuck_request_items_count"
    period      = 3600
    return_data = "true"
    expression  = "SELECT MAX(StuckRequestItemsCount) FROM \"Notify/Watchdog\" WHERE environment='${var.environment}'"
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Watchdog query: for each client / campaign pair, count the request items
-- that appear stuck. A row is counted when either
--   * status is ENRICHED and createdtime is earlier than CURRENT_DATE - 6 days, or
--   * status is PENDING_ENRICHMENT and createdtime is earlier than CURRENT_DATE - 2 days.
-- Only rows with createdtime within the last 90 days are considered.
-- Output columns are intentionally left unaliased, in this order:
-- client id, campaign id ('N/A' when NULL), stuck item count.
SELECT
    clientid,
    COALESCE(campaignid, 'N/A'),
    SUM(
        CASE
            WHEN (status = 'ENRICHED' AND createdtime < DATE_ADD('day', -6, CURRENT_DATE))
              OR (status = 'PENDING_ENRICHMENT' AND createdtime < DATE_ADD('day', -2, CURRENT_DATE))
                THEN 1
            ELSE 0
        END
    )
FROM request_item_status
WHERE createdtime >= DATE_ADD('day', -90, CURRENT_DATE)
GROUP BY
    clientid,
    campaignid
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ resource "aws_sfn_state_machine" "watchdog" {
{
metric_name = "OverdueRequestsCount",
query_id = aws_athena_named_query.overdue_requests.id
},
{
metric_name = "StuckRequestItemsCount",
query_id = aws_athena_named_query.stuck_request_items.id
}
]
environment = var.environment
Expand Down
Loading