From 8a766fede8efc39fb121a2be6d9ca13c24bcddad Mon Sep 17 00:00:00 2001 From: 0xafrogeek Date: Tue, 7 Apr 2026 21:15:44 +0000 Subject: [PATCH 1/2] feat: implement Grafana and Glowroot alerting with Telegram and Slack support - Added configuration options for Glowroot alerting in `hosts.template` and `monitor.template`. - Introduced `glowroot_alerting.yml` for default alerting settings. - Created tasks for setting up Telegram forwarder and Slack notifications in `glowroot.yml`. - Added templates for Glowroot configuration and notification policies for Grafana. - Implemented Prometheus alert rules and Grafana contact points for integrated alerting. - Enhanced Munin configuration to support Telegram and Slack alerts. --- deploy/inventory/host_vars/monitor.template | 11 + deploy/inventory/hosts.template | 6 + .../defaults/main/glowroot_alerting.yml | 25 ++ .../roles/create-instance/handlers/main.yml | 6 + .../roles/create-instance/tasks/glowroot.yml | 93 +++++++ .../glowroot-telegram-forwarder.py.j2 | 97 +++++++ ...telegram-forwarder.service.j2\342\200\216" | 20 ++ .../templates/glowroot_admin.json | 42 +++ .../templates/glowroot_config.json.j2 | 80 ++++++ .../monitoring/defaults/main/alerting.yml | 21 ++ .../roles/monitoring/defaults/main/munin.yml | 23 ++ deploy/roles/monitoring/tasks/alerting.yml | 105 +++++++ deploy/roles/monitoring/tasks/main.yml | 12 + .../grafana-alert-rules.yml.j2\342\200\216" | 258 ++++++++++++++++++ ...grafana-contact-points.yml.j2\342\200\216" | 33 +++ .../grafana-datasource-provisioning.yml.j2 | 9 + .../grafana-notification-policies.yml.j2 | 11 + .../roles/monitoring/templates/munin.conf.j2 | 10 +- .../templates/prometheus-rules.yml.j2 | 88 ++++++ 19 files changed, 948 insertions(+), 2 deletions(-) create mode 100644 deploy/inventory/host_vars/monitor.template create mode 100644 deploy/roles/create-instance/defaults/main/glowroot_alerting.yml create mode 100644 deploy/roles/create-instance/templates/glowroot-telegram-forwarder.py.j2 create 
mode 100644 "deploy/roles/create-instance/templates/glowroot-telegram-forwarder.service.j2\342\200\216" create mode 100644 deploy/roles/create-instance/templates/glowroot_config.json.j2 create mode 100644 deploy/roles/monitoring/defaults/main/alerting.yml create mode 100644 deploy/roles/monitoring/tasks/alerting.yml create mode 100644 "deploy/roles/monitoring/templates/grafana-alert-rules.yml.j2\342\200\216" create mode 100644 "deploy/roles/monitoring/templates/grafana-contact-points.yml.j2\342\200\216" create mode 100644 deploy/roles/monitoring/templates/grafana-datasource-provisioning.yml.j2 create mode 100644 deploy/roles/monitoring/templates/grafana-notification-policies.yml.j2 create mode 100644 deploy/roles/monitoring/templates/prometheus-rules.yml.j2 diff --git a/deploy/inventory/host_vars/monitor.template b/deploy/inventory/host_vars/monitor.template new file mode 100644 index 0000000..cd6fa5b --- /dev/null +++ b/deploy/inventory/host_vars/monitor.template @@ -0,0 +1,11 @@ +--- +# Variables for the monitoring host. 
+# Copy to 'monitor': cp monitor.template monitor + +# alerting_enabled: true +# alerting_telegram_bot_token: "123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11" +# alerting_telegram_chat_id: "-1001234567890" + +# alerting_default_contact_point: slack +# alerting_slack_webhook_url: "https://hooks.slack.com/services/T.../B.../xxx" +# alerting_slack_channel: "#dhis2-alerts" \ No newline at end of file diff --git a/deploy/inventory/hosts.template b/deploy/inventory/hosts.template index f4d69b4..5b9bde5 100644 --- a/deploy/inventory/hosts.template +++ b/deploy/inventory/hosts.template @@ -53,6 +53,12 @@ postgresql_version=16 server_monitoring=munin app_monitoring=glowroot +# alerting (requires server_monitoring=grafana, see docs/alerting.md) +# alerting_enabled=false +# alerting_default_contact_point=telegram +# alerting_telegram_bot_token= +# alerting_telegram_chat_id= +# glowroot_alerting_enabled=false # lxd lxd_network=172.19.2.1/24 diff --git a/deploy/roles/create-instance/defaults/main/glowroot_alerting.yml b/deploy/roles/create-instance/defaults/main/glowroot_alerting.yml new file mode 100644 index 0000000..f6308d2 --- /dev/null +++ b/deploy/roles/create-instance/defaults/main/glowroot_alerting.yml @@ -0,0 +1,25 @@ +--- +glowroot_alerting_enabled: false +glowroot_telegram_forwarder_port: '9099' + +# Slack (native Glowroot) +# glowroot_slack_webhook_url: "" +# glowroot_slack_webhook_display: "DHIS2 Alerts" + +# Email / SMTP (native Glowroot) +# glowroot_smtp_host: "" +# glowroot_smtp_port: 587 +# glowroot_smtp_connection_security: "STARTTLS" +# glowroot_smtp_username: "" +# glowroot_smtp_password: "" +# glowroot_smtp_from_address: "" +# glowroot_smtp_from_name: "Glowroot DHIS2" +# glowroot_alert_email_addresses: [] + +# Thresholds +glowroot_alert_p95_threshold_ms: 10000 +glowroot_alert_p95_time_period_seconds: 600 +glowroot_alert_error_rate_threshold: 10.0 +glowroot_alert_error_rate_time_period_seconds: 300 +glowroot_alert_heartbeat_seconds: 300 
+glowroot_alert_min_transaction_count: 10 diff --git a/deploy/roles/create-instance/handlers/main.yml b/deploy/roles/create-instance/handlers/main.yml index 4ad6aae..6f0fbac 100644 --- a/deploy/roles/create-instance/handlers/main.yml +++ b/deploy/roles/create-instance/handlers/main.yml @@ -49,3 +49,9 @@ ansible.builtin.service: name: munin-node state: restarted + +- name: Restart Glowroot Telegram Forwarder + ansible.builtin.systemd: + name: glowroot-telegram-forwarder + state: restarted + daemon_reload: true diff --git a/deploy/roles/create-instance/tasks/glowroot.yml b/deploy/roles/create-instance/tasks/glowroot.yml index b467037..374c0aa 100644 --- a/deploy/roles/create-instance/tasks/glowroot.yml +++ b/deploy/roles/create-instance/tasks/glowroot.yml @@ -102,6 +102,7 @@ owner: root group: tomcat mode: "0660" + no_log: true when: not glowroot_admin_file_status.stat.exists notify: Restart Tomcat @@ -112,3 +113,95 @@ line: '\1"contextPath": "{{ "/glowroot" if dhis2_base_path | default(inventory_hostname) | to_fixed_string == "ROOT" else "/" + dhis2_base_path | default(inventory_hostname) | to_fixed_string + "-glowroot" }}",' backrefs: true notify: Restart Tomcat + +# Glowroot alerting pre-flight check +- name: Glowroot | Assert at least one notification channel is configured + ansible.builtin.assert: + that: >- + (alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined) or + (glowroot_slack_webhook_url is defined) or + (glowroot_smtp_host is defined) + fail_msg: >- + glowroot_alerting_enabled is true but no notification channel is configured. + Set alerting_telegram_bot_token + alerting_telegram_chat_id, + or glowroot_slack_webhook_url, or glowroot_smtp_host. 
+ when: glowroot_alerting_enabled | default(false) | bool + +# Telegram forwarder for Glowroot alerts +- name: Glowroot | Create forwarder system group + ansible.builtin.group: + name: glowroot_forwarder + system: true + when: + - glowroot_alerting_enabled | default(false) | bool + - alerting_telegram_bot_token is defined + - alerting_telegram_chat_id is defined + +- name: Glowroot | Create forwarder system user + ansible.builtin.user: + name: glowroot_forwarder + shell: /bin/false + create_home: false + system: true + group: glowroot_forwarder + when: + - glowroot_alerting_enabled | default(false) | bool + - alerting_telegram_bot_token is defined + - alerting_telegram_chat_id is defined + +- name: Glowroot | Deploy Telegram forwarder script + ansible.builtin.template: + src: glowroot-telegram-forwarder.py.j2 + dest: /opt/glowroot/glowroot-telegram-forwarder.py + owner: root + group: glowroot_forwarder + mode: "0640" + no_log: true + when: + - glowroot_alerting_enabled | default(false) | bool + - alerting_telegram_bot_token is defined + - alerting_telegram_chat_id is defined + notify: Restart Glowroot Telegram Forwarder + +- name: Glowroot | Deploy Telegram forwarder systemd service + ansible.builtin.template: + src: glowroot-telegram-forwarder.service.j2 + dest: /etc/systemd/system/glowroot-telegram-forwarder.service + owner: root + group: root + mode: "0644" + when: + - glowroot_alerting_enabled | default(false) | bool + - alerting_telegram_bot_token is defined + - alerting_telegram_chat_id is defined + notify: Restart Glowroot Telegram Forwarder + +- name: Glowroot | Enable and start Telegram forwarder + ansible.builtin.systemd: + name: glowroot-telegram-forwarder + state: started + enabled: true + daemon_reload: true + when: + - glowroot_alerting_enabled | default(false) | bool + - alerting_telegram_bot_token is defined + - alerting_telegram_chat_id is defined + +# Glowroot alert rules config +- name: Glowroot | Check if config.json exists + 
ansible.builtin.stat: + path: /opt/glowroot/config.json + register: glowroot_config_file_status + when: glowroot_alerting_enabled | default(false) | bool + +- name: Glowroot | Deploy config.json with alert rules + ansible.builtin.template: + src: glowroot_config.json.j2 + dest: /opt/glowroot/config.json + owner: root + group: tomcat + mode: "0660" + when: + - glowroot_alerting_enabled | default(false) | bool + - not (glowroot_config_file_status.stat.exists | default(false)) + notify: Restart Tomcat
diff --git a/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.py.j2 b/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.py.j2
new file mode 100644
index 0000000..3885f99
--- /dev/null
+++ b/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.py.j2
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# {{ ansible_managed }}
+"""Slack-to-Telegram webhook forwarder for Glowroot alerts.
+
+Listens on localhost for Slack-formatted webhook POSTs from Glowroot
+and forwards them to a Telegram chat via the Bot API.
+
+Zero external dependencies -- uses only Python 3 stdlib.
+"""
+
+import html
+import json
+import sys
+import urllib.request
+import urllib.error
+from http.server import HTTPServer, BaseHTTPRequestHandler
+
+LISTEN_HOST = "127.0.0.1"
+LISTEN_PORT = {{ glowroot_telegram_forwarder_port | default(9099) }}
+TELEGRAM_BOT_TOKEN = "{{ alerting_telegram_bot_token }}"
+TELEGRAM_CHAT_ID = "{{ alerting_telegram_chat_id }}"
+TELEGRAM_API = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
+
+STATUS_ICONS = {"danger": "\u26a0\ufe0f", "good": "\u2705", "warning": "\u26a0\ufe0f"}
+
+
+def _escape(value):
+    """Escape text for Telegram's HTML parse mode."""
+    return html.escape(str(value), quote=False)
+
+
+def slack_to_telegram_html(payload):
+    """Convert a Slack attachment payload to a Telegram HTML message.
+
+    Text taken from the payload is HTML-escaped because the message is sent
+    with parse_mode=HTML; an unescaped '<' or '&' (for example a Slack-style
+    <url|label> link) would otherwise make Telegram reject the message.
+    """
+    attachments = payload.get("attachments", [])
+    first = attachments[0] if isinstance(attachments, list) and attachments else None
+    if not isinstance(first, dict):
+        text = payload.get("text", "Unknown alert")
+        return f"Glowroot Alert\n{_escape(text)}"
+
+    icon = STATUS_ICONS.get(first.get("color", "warning"), "\u2139\ufe0f")
+    pretext = first.get("pretext", "Alert")
+    body = first.get("text", "")
+    fallback = first.get("fallback", "")
+
+    lines = [f"{icon} {_escape(pretext)}"]
+    if body:
+        lines.append(f"\n{_escape(body)}")
+    elif fallback:
+        lines.append(f"\n{_escape(fallback)}")
+    return "\n".join(lines)
+
+
+def send_telegram(message):
+    """POST a message to the Telegram Bot API.
+
+    Returns the HTTP status code, or None when the request failed.
+    """
+    data = json.dumps({
+        "chat_id": TELEGRAM_CHAT_ID,
+        "text": message[:4096],
+        "parse_mode": "HTML",
+    }).encode("utf-8")
+    req = urllib.request.Request(
+        TELEGRAM_API,
+        data=data,
+        headers={"Content-Type": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return resp.status
+    except OSError as exc:
+        # URLError and HTTPError are OSError subclasses; a timeout while
+        # reading the response is raised as a plain OSError subclass too.
+        print(f"Telegram API error: {exc}", file=sys.stderr)
+        return None
+
+
+class WebhookHandler(BaseHTTPRequestHandler):
+    """Handle incoming Slack-formatted webhook POST requests."""
+
+    def _reply(self, code):
+        """Send a header-only response with the given status code."""
+        self.send_response(code)
+        self.end_headers()
+
+    def do_POST(self):  # noqa: N802
+        """Forward one Slack-style JSON payload to Telegram."""
+        try:
+            length = int(self.headers.get("Content-Length", 0))
+        except ValueError:
+            length = -1
+        if length < 0:
+            # Unparseable or negative length: reject instead of crashing
+            # or blocking on rfile.read(-1).
+            self._reply(400)
+            return
+        try:
+            payload = json.loads(self.rfile.read(length))
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            payload = None
+        if not isinstance(payload, dict):
+            # Also covers valid JSON that is not an object (list, string, ...).
+            self._reply(400)
+            return
+
+        status = send_telegram(slack_to_telegram_html(payload))
+        self._reply(200 if status == 200 else 502)
+
+    def log_message(self, fmt, *args):
+        """Silence default stderr logging in production."""
+
+
+if __name__ == "__main__":
+    server = HTTPServer((LISTEN_HOST, LISTEN_PORT), WebhookHandler)
+    print(f"Listening on {LISTEN_HOST}:{LISTEN_PORT}")
+    try:
+        server.serve_forever()
+    except KeyboardInterrupt:
+        server.server_close()
\ No newline at end of file
diff --git a/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.service.j2 b/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.service.j2
new file mode 100644
index 0000000..b7baecf
--- /dev/null
+++ b/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.service.j2
@@ -0,0 +1,20 @@
+{{ ansible_managed | comment }}
+[Unit]
+Description=Glowroot Slack-to-Telegram webhook forwarder
+After=network.target
+
+[Service]
+Type=simple
+User=glowroot_forwarder
+Group=glowroot_forwarder
+ExecStart=/usr/bin/python3 /opt/glowroot/glowroot-telegram-forwarder.py
+Restart=on-failure
+RestartSec=5
+NoNewPrivileges=true
+ProtectSystem=strict
+ReadOnlyPaths=/opt/glowroot/glowroot-telegram-forwarder.py
+ProtectHome=true
+PrivateTmp=true
+
+[Install]
+WantedBy=multi-user.target
\ No newline at end of file
diff --git a/deploy/roles/create-instance/templates/glowroot_admin.json b/deploy/roles/create-instance/templates/glowroot_admin.json index a7e1a01..d1be654 100644 --- a/deploy/roles/create-instance/templates/glowroot_admin.json +++ b/deploy/roles/create-instance/templates/glowroot_admin.json @@ -49,4 +49,46 @@ ], "traceCappedDatabaseSizeMb": 500 } + {% if glowroot_alerting_enabled | default(false) | bool %} +{% if alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined %} + ,"slack": { + "webhooks": [ + { +
"url": "http://127.0.0.1:{{ glowroot_telegram_forwarder_port | default(9099) }}", + "display": "Telegram (via forwarder)" + } + ] + } +{% elif glowroot_slack_webhook_url is defined and glowroot_slack_webhook_url | length > 0 %} + ,"slack": { + "webhooks": [ + { + "url": "{{ glowroot_slack_webhook_url }}", + "display": "{{ glowroot_slack_webhook_display | default('DHIS2 Alerts') }}" + } + ] + } +{% endif %} +{% if glowroot_smtp_host is defined and glowroot_smtp_host | length > 0 %} + ,"smtp": { + "host": "{{ glowroot_smtp_host }}", + "port": {{ glowroot_smtp_port | default(587) }}, + "connectionSecurity": "{{ glowroot_smtp_connection_security | default('STARTTLS') }}", + "username": "{{ glowroot_smtp_username | default('') }}", + "password": "{{ glowroot_smtp_password | default('') }}", + "fromEmailAddress": "{{ glowroot_smtp_from_address | default('') }}", + "fromDisplayName": "{{ glowroot_smtp_from_name | default('Glowroot DHIS2') }}" + } +{% endif %} +{% if glowroot_pagerduty_key is defined and glowroot_pagerduty_key | length > 0 %} + ,"pagerDuty": { + "integrationKeys": [ + { + "key": "{{ glowroot_pagerduty_key }}", + "display": "{{ glowroot_pagerduty_display | default('DHIS2') }}" + } + ] + } +{% endif %} +{% endif %} } diff --git a/deploy/roles/create-instance/templates/glowroot_config.json.j2 b/deploy/roles/create-instance/templates/glowroot_config.json.j2 new file mode 100644 index 0000000..120b7f0 --- /dev/null +++ b/deploy/roles/create-instance/templates/glowroot_config.json.j2 @@ -0,0 +1,80 @@ +{ + "alerts": [ + { + "condition": { + "conditionType": "heartbeat", + "timePeriodSeconds": {{ glowroot_alert_heartbeat_seconds | default(300) }} + }, + "severity": "CRITICAL" +{% if (alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined) or (glowroot_slack_webhook_url is defined) %} + ,"slackNotification": { + "slackChannels": ["#alerts"] + } +{% endif %} +{% if glowroot_smtp_host is defined and glowroot_smtp_host | length > 0 %} + 
,"emailNotification": { + "emailAddresses": {{ glowroot_alert_email_addresses | default([]) | to_json }} + } +{% endif %} + }, + { + "condition": { + "conditionType": "metric", + "metric": "error:rate", + "transactionType": "Web", + "threshold": {{ glowroot_alert_error_rate_threshold | default(10.0) }}, + "timePeriodSeconds": {{ glowroot_alert_error_rate_time_period_seconds | default(300) }} + }, + "severity": "CRITICAL" +{% if (alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined) or (glowroot_slack_webhook_url is defined) %} + ,"slackNotification": { + "slackChannels": ["#alerts"] + } +{% endif %} +{% if glowroot_smtp_host is defined and glowroot_smtp_host | length > 0 %} + ,"emailNotification": { + "emailAddresses": {{ glowroot_alert_email_addresses | default([]) | to_json }} + } +{% endif %} + }, + { + "condition": { + "conditionType": "metric", + "metric": "transaction:x-percentile", + "transactionType": "Web", + "percentile": 95.0, + "threshold": {{ glowroot_alert_p95_threshold_ms | default(10000) }}, + "timePeriodSeconds": {{ glowroot_alert_p95_time_period_seconds | default(600) }}, + "minTransactionCount": {{ glowroot_alert_min_transaction_count | default(10) }} + }, + "severity": "HIGH" +{% if (alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined) or (glowroot_slack_webhook_url is defined) %} + ,"slackNotification": { + "slackChannels": ["#alerts"] + } +{% endif %} + }, + { + "condition": { + "conditionType": "metric", + "metric": "gauge:java.lang:type=Memory:HeapMemoryUsage.used", +{% set heap_raw = heap_memory_size | default('4G') | string | trim %} +{% if heap_raw | regex_search('[mM]$') %} +{% set heap_bytes = (heap_raw | regex_replace('[mM]$', '') | int) * 1048576 %} +{% elif heap_raw | regex_search('[gG]$') %} +{% set heap_bytes = (heap_raw | regex_replace('[gG]$', '') | int) * 1073741824 %} +{% else %} +{% set heap_bytes = (heap_raw | int) * 1073741824 %} +{% endif %} + "threshold": {{ 
(heap_bytes * 0.8) | int }}, + "timePeriodSeconds": 300 + }, + "severity": "HIGH" +{% if (alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined) or (glowroot_slack_webhook_url is defined) %} + ,"slackNotification": { + "slackChannels": ["#alerts"] + } +{% endif %} + } + ] +} \ No newline at end of file diff --git a/deploy/roles/monitoring/defaults/main/alerting.yml b/deploy/roles/monitoring/defaults/main/alerting.yml new file mode 100644 index 0000000..c2bb5ab --- /dev/null +++ b/deploy/roles/monitoring/defaults/main/alerting.yml @@ -0,0 +1,21 @@ +--- +alerting_enabled: false +alerting_default_contact_point: telegram + +# Telegram +# alerting_telegram_bot_token: "" +# alerting_telegram_chat_id: "" + +# Slack +# alerting_slack_webhook_url: "" +# alerting_slack_channel: "#dhis2-alerts" + +# Email +# alerting_email_addresses: [] + +# Thresholds +alert_cpu_threshold: 85 +alert_memory_threshold: 90 +alert_disk_warning_pct: 15 +alert_disk_critical_pct: 5 +alert_pg_connection_pct: 80 diff --git a/deploy/roles/monitoring/defaults/main/munin.yml b/deploy/roles/monitoring/defaults/main/munin.yml index 10a412c..14475a2 100644 --- a/deploy/roles/monitoring/defaults/main/munin.yml +++ b/deploy/roles/monitoring/defaults/main/munin.yml @@ -25,6 +25,29 @@ munin_hosts: extra: [use_node_name yes] munin_alerts: [] +# Email: +# munin_alerts: +# - name: admin +# type: email +# email: admin@example.com +# subject: "Munin Alert" +# level: "warning critical" +# +# Telegram: +# munin_alerts: +# - name: telegram +# type: telegram +# bot_token: "your-bot-token-from-botfather" +# chat_id: "-1001234567890" +# level: "warning critical" +# +# Slack: +# munin_alerts: +# - name: slack +# type: slack +# webhook_url: "https://hooks.slack.com/services/T.../B.../xxx" +# level: "warning critical" + # munin node defaults munin_node_bind_host: "*" munin_node_bind_port: "4949" diff --git a/deploy/roles/monitoring/tasks/alerting.yml b/deploy/roles/monitoring/tasks/alerting.yml new 
file mode 100644 index 0000000..402978b --- /dev/null +++ b/deploy/roles/monitoring/tasks/alerting.yml @@ -0,0 +1,105 @@ +--- +- name: Alerting | Assert at least one contact point is configured + ansible.builtin.assert: + that: >- + (alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined) or + (alerting_slack_webhook_url is defined) or + (alerting_email_addresses is defined and alerting_email_addresses | length > 0) + fail_msg: >- + alerting_enabled is true but no contact point is configured. + Set alerting_telegram_bot_token + alerting_telegram_chat_id, + or alerting_slack_webhook_url, or alerting_email_addresses. + tags: [always] + +# Prometheus alert rules +- name: Alerting | Ensure Prometheus rules directory exists + ansible.builtin.file: + path: /etc/prometheus/rules + state: directory + owner: prometheus + group: prometheus + mode: '0755' + +- name: Alerting | Deploy Prometheus alert rules + ansible.builtin.template: + src: prometheus-rules.yml.j2 + dest: /etc/prometheus/rules/dhis2-alerts.yml + owner: prometheus + group: prometheus + mode: '0644' + validate: 'promtool check rules %s' + notify: Reload Prometheus + +- name: Alerting | Ensure Prometheus rule_files directive exists + ansible.builtin.lineinfile: + path: /etc/prometheus/prometheus.yml + regexp: '^rule_files:' + line: 'rule_files:' + insertbefore: '^scrape_configs:' + notify: Reload Prometheus + +- name: Alerting | Wire rules directory into prometheus.yml + ansible.builtin.lineinfile: + path: /etc/prometheus/prometheus.yml + regexp: '^\s+- "/etc/prometheus/rules/\*\.yml"' + line: ' - "/etc/prometheus/rules/*.yml"' + insertafter: '^rule_files:' + notify: Reload Prometheus + +# Grafana unified alerting +- name: Alerting | Provision Prometheus datasource with fixed UID + ansible.builtin.template: + src: grafana-datasource-provisioning.yml.j2 + dest: /etc/grafana/provisioning/datasources/prometheus.yml + owner: grafana + group: grafana + mode: '0640' + notify: Restart Grafana + 
+- name: Alerting | Enable Grafana unified alerting + community.general.ini_file: + path: /etc/grafana/grafana.ini + section: unified_alerting + option: enabled + value: 'true' + mode: '0640' + notify: Restart Grafana + +- name: Alerting | Create Grafana alerting provisioning directory + ansible.builtin.file: + path: /etc/grafana/provisioning/alerting + state: directory + owner: grafana + group: grafana + mode: '0755' + +- name: Alerting | Deploy Grafana contact points + ansible.builtin.template: + src: grafana-contact-points.yml.j2 + dest: /etc/grafana/provisioning/alerting/contact-points.yml + owner: grafana + group: grafana + mode: '0640' + no_log: true + notify: Restart Grafana + +- name: Alerting | Deploy Grafana notification policies + ansible.builtin.template: + src: grafana-notification-policies.yml.j2 + dest: /etc/grafana/provisioning/alerting/notification-policies.yml + owner: grafana + group: grafana + mode: '0640' + notify: Restart Grafana + +- name: Alerting | Deploy Grafana alert rules + ansible.builtin.template: + src: grafana-alert-rules.yml.j2 + dest: /etc/grafana/provisioning/alerting/alert-rules.yml + owner: grafana + group: grafana + mode: '0640' + notify: Restart Grafana + +- name: Alerting | Flush handlers + ansible.builtin.meta: flush_handlers diff --git a/deploy/roles/monitoring/tasks/main.yml b/deploy/roles/monitoring/tasks/main.yml index b8fd7b2..107ec05 100644 --- a/deploy/roles/monitoring/tasks/main.yml +++ b/deploy/roles/monitoring/tasks/main.yml @@ -40,3 +40,15 @@ # - server_monitoring is defined and server_monitoring == 'grafana' - server_monitoring is defined and server_monitoring in ['grafana', 'prometheus', 'grafana/prometheus'] - inventory_hostname in groups['monitoring'] + +- name: Alerting | Configure alerting for Grafana/Prometheus + ansible.builtin.include_tasks: + file: alerting.yml + apply: + tags: [alerting] + tags: [alerting] + when: + - alerting_enabled | default(false) | bool + - groups['monitoring'] | length > 0 + - 
server_monitoring is defined and server_monitoring in ['grafana', 'prometheus', 'grafana/prometheus'] + - inventory_hostname in groups['monitoring'] \ No newline at end of file
diff --git a/deploy/roles/monitoring/templates/grafana-alert-rules.yml.j2 b/deploy/roles/monitoring/templates/grafana-alert-rules.yml.j2
new file mode 100644
index 0000000..e6ecdc6
--- /dev/null
+++ b/deploy/roles/monitoring/templates/grafana-alert-rules.yml.j2
@@ -0,0 +1,258 @@
+{{ ansible_managed | comment }}
+apiVersion: 1
+groups:
+  - orgId: 1
+    name: DHIS2 Infrastructure
+    folder: DHIS2 Alerts
+    interval: 60s
+    rules:
+      - uid: instance-down
+        title: Instance Down
+        condition: B
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: {% raw %}"Target {{ $labels.instance }} is down"{% endraw %}
+
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: up == bool 0
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: high-cpu-usage
+        title: High CPU Usage
+        condition: B
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CPU usage above {{ alert_cpu_threshold }}%"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > bool {{ alert_cpu_threshold }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: high-memory-usage
+        title: High Memory Usage
+        condition: B
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Memory usage above {{ alert_memory_threshold }}%"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > bool {{ alert_memory_threshold }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: disk-space-warning
+        title: Disk Space Warning
+        condition: B
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk space below {{ alert_disk_warning_pct }}%"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < bool {{ alert_disk_warning_pct }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: disk-space-critical
+        title: Disk Space Critical
+        condition: B
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Disk space below {{ alert_disk_critical_pct }}%"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < bool {{ alert_disk_critical_pct }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+  - orgId: 1
+    name: DHIS2 Database
+    folder: DHIS2 Alerts
+    interval: 60s
+    rules:
+      - uid: postgres-down
+        title: PostgreSQL Down
+        condition: B
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "PostgreSQL is unreachable"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: pg_up == bool 0
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: postgres-connections-high
+        title: PostgreSQL Connections High
+        condition: B
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Database connections exceed {{ alert_pg_connection_pct }}% of max"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: sum by(instance) (pg_stat_activity_count) / on(instance) pg_settings_max_connections * 100 > bool {{ alert_pg_connection_pct }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+  - orgId: 1
+    name: DHIS2 Application
+    folder: DHIS2 Alerts
+    interval: 60s
+    rules:
+      - uid: dhis2-endpoint-down
+        title: DHIS2 Endpoint Down
+        condition: B
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "DHIS2 metrics endpoint is unreachable"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: up{job=~".*dhis.*"} == bool 0
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
\ No newline at end of file
diff --git a/deploy/roles/monitoring/templates/grafana-contact-points.yml.j2 b/deploy/roles/monitoring/templates/grafana-contact-points.yml.j2 new file mode 100644 index 0000000..cc13163 --- /dev/null +++ b/deploy/roles/monitoring/templates/grafana-contact-points.yml.j2 @@ -0,0 +1,33 @@ +{{ ansible_managed | comment }} +apiVersion: 1 +contactPoints: +{% if alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined %} + - orgId: 1 + name: telegram +
receivers: + - uid: telegram-default + type: telegram + settings: + bottoken: "{{ alerting_telegram_bot_token }}" + chatid: "{{ alerting_telegram_chat_id }}" + parse_mode: HTML +{% endif %} +{% if alerting_slack_webhook_url is defined %} + - orgId: 1 + name: slack + receivers: + - uid: slack-default + type: slack + settings: + url: "{{ alerting_slack_webhook_url }}" + recipient: "{{ alerting_slack_channel | default('#dhis2-alerts') }}" +{% endif %} +{% if alerting_email_addresses is defined and alerting_email_addresses | length > 0 %} + - orgId: 1 + name: email + receivers: + - uid: email-default + type: email + settings: + addresses: "{{ alerting_email_addresses | join(';') }}" +{% endif %} \ No newline at end of file diff --git a/deploy/roles/monitoring/templates/grafana-datasource-provisioning.yml.j2 b/deploy/roles/monitoring/templates/grafana-datasource-provisioning.yml.j2 new file mode 100644 index 0000000..dd8044e --- /dev/null +++ b/deploy/roles/monitoring/templates/grafana-datasource-provisioning.yml.j2 @@ -0,0 +1,9 @@ +{{ ansible_managed | comment }} +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + url: http://127.0.0.1:9090 + access: proxy + isDefault: true \ No newline at end of file diff --git a/deploy/roles/monitoring/templates/grafana-notification-policies.yml.j2 b/deploy/roles/monitoring/templates/grafana-notification-policies.yml.j2 new file mode 100644 index 0000000..3f695e6 --- /dev/null +++ b/deploy/roles/monitoring/templates/grafana-notification-policies.yml.j2 @@ -0,0 +1,11 @@ +{{ ansible_managed | comment }} +apiVersion: 1 +policies: + - orgId: 1 + receiver: "{{ alerting_default_contact_point | default('telegram') }}" + group_by: + - alertname + - instance + group_wait: 30s + group_interval: 5m + repeat_interval: 4h \ No newline at end of file diff --git a/deploy/roles/monitoring/templates/munin.conf.j2 b/deploy/roles/monitoring/templates/munin.conf.j2 index be0aa9b..d86c431 100644 --- 
a/deploy/roles/monitoring/templates/munin.conf.j2 +++ b/deploy/roles/monitoring/templates/munin.conf.j2 @@ -83,8 +83,14 @@ includedir {{ munin_includedir }} {% if munin_alerts %} {% for contact in munin_alerts %} contacts {{ contact.name }} -contact.{{ contact.name }}.command mail -s "{{ contact.subject }}" {{ contact.email }} -contact.{{ contact.name }}.always_send {{ contact.level }} +{% if contact.type | default('email') == 'email' %} +contact.{{ contact.name }}.command mail -s "{{ contact.subject | default('Munin Alert') }}" {{ contact.email }} +{% elif contact.type == 'telegram' %} +contact.{{ contact.name }}.command curl -sf -X POST "https://api.telegram.org/bot{{ contact.bot_token }}/sendMessage" -d "chat_id={{ contact.chat_id }}" -d "parse_mode=HTML" -d "text=Munin Alert%0AHost: ${var:host}%0AGraph: ${var:graph_title}%0AStatus: ${var:worst}%0ACategory: ${var:graph_category}" +{% elif contact.type == 'slack' %} +contact.{{ contact.name }}.command curl -sf -X POST "{{ contact.webhook_url }}" -H "Content-Type: application/json" -d '{"text":"*Munin Alert*\n*Host:* ${var:host}\n*Graph:* ${var:graph_title}\n*Status:* ${var:worst}\n*Category:* ${var:graph_category}"}' +{% endif %} +contact.{{ contact.name }}.always_send {{ contact.level | default('warning critical') }} {% endfor %} {% endif %} diff --git a/deploy/roles/monitoring/templates/prometheus-rules.yml.j2 b/deploy/roles/monitoring/templates/prometheus-rules.yml.j2 new file mode 100644 index 0000000..5bb1231 --- /dev/null +++ b/deploy/roles/monitoring/templates/prometheus-rules.yml.j2 @@ -0,0 +1,88 @@ +{{ ansible_managed | comment }} +groups: + - name: dhis2_infrastructure + rules: + - alert: InstanceDown + expr: up == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Target {{ '{{ $labels.instance }}' }} is down" + description: "{{ '{{ $labels.instance }}' }} of job {{ '{{ $labels.job }}' }} has been down for more than 5 minutes." 
+ + - alert: HighCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > {{ alert_cpu_threshold }} + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ '{{ $labels.instance }}' }}" + description: "CPU usage is above {{ alert_cpu_threshold }}% for more than 10 minutes." + + - alert: HighMemoryUsage + expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > {{ alert_memory_threshold }} + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ '{{ $labels.instance }}' }}" + description: "Memory usage is above {{ alert_memory_threshold }}% for more than 5 minutes." + + - alert: DiskSpaceWarning + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < {{ alert_disk_warning_pct }} + for: 5m + labels: + severity: warning + annotations: + summary: "Low disk space on {{ '{{ $labels.instance }}' }}" + description: "Filesystem {{ '{{ $labels.mountpoint }}' }} has less than {{ alert_disk_warning_pct }}% free space." + + - alert: DiskSpaceCritical + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < {{ alert_disk_critical_pct }} + for: 5m + labels: + severity: critical + annotations: + summary: "Critical disk space on {{ '{{ $labels.instance }}' }}" + description: "Filesystem {{ '{{ $labels.mountpoint }}' }} has less than {{ alert_disk_critical_pct }}% free space." + + - name: dhis2_database + rules: + - alert: PostgresDown + expr: pg_up == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "PostgreSQL is down on {{ '{{ $labels.instance }}' }}" + description: "PostgreSQL exporter reports the database is unreachable." 
+ + - alert: PostgresConnectionsHigh + expr: sum by(instance) (pg_stat_activity_count) / on(instance) pg_settings_max_connections * 100 > {{ alert_pg_connection_pct }} + for: 5m + labels: + severity: warning + annotations: + summary: "High PostgreSQL connections on {{ '{{ $labels.instance }}' }}" + description: "Database connections exceed {{ alert_pg_connection_pct }}% of max_connections." + + - alert: PostgresLongRunningQuery + expr: pg_stat_activity_max_tx_duration{state="active"} > 3600 + for: 5m + labels: + severity: warning + annotations: + summary: "Long-running query on {{ '{{ $labels.instance }}' }}" + description: "A query has been running for more than 1 hour." + + - name: dhis2_application + rules: + - alert: DHIS2EndpointDown + expr: up{job=~".*dhis.*"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "DHIS2 endpoint {{ '{{ $labels.instance }}' }} is down" + description: "DHIS2 metrics endpoint has been unreachable for more than 5 minutes." \ No newline at end of file From de63dd89ce250384c0521222eeb6f9a509dc104f Mon Sep 17 00:00:00 2001 From: 0xafrogeek Date: Tue, 7 Apr 2026 21:17:01 +0000 Subject: [PATCH 2/2] docs: adding alerting documentation --- docs/alerting.md | 295 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 docs/alerting.md diff --git a/docs/alerting.md b/docs/alerting.md new file mode 100644 index 0000000..e8fcec9 --- /dev/null +++ b/docs/alerting.md @@ -0,0 +1,295 @@ +# Alerting + +DHIS2 server tools supports alerting via Telegram, Slack, and email for both infrastructure monitoring (Grafana/Prometheus) and application monitoring (Glowroot APM). + +## Quick Start: Telegram + +Requires `server_monitoring=grafana` or `server_monitoring=grafana/prometheus`. + +### 1. 
Create a Telegram Bot + +- Message [@BotFather](https://t.me/BotFather) on Telegram +- Send `/newbot`, follow the prompts +- Save the bot token (e.g., `123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11`) + +### 2. Get Your Chat ID + +- Add the bot to your Telegram group +- Message [@getmyid_bot](https://t.me/getmyid_bot) in the group, or use [@userinfobot](https://t.me/userinfobot) +- Save the chat ID (e.g., `-1001234567890` for groups) + +### 3. Configure and Deploy + +Edit `deploy/inventory/hosts` and update the `[all:vars]` section: + +```ini +server_monitoring=grafana +alerting_enabled=true +alerting_default_contact_point=telegram +alerting_telegram_bot_token=123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11 +alerting_telegram_chat_id=-1001234567890 +``` + +Then deploy: + +```bash +ansible-playbook dhis2.yml --tags monitoring,alerting +``` + +After deployment, verify alerts are working -- see [Testing Alerts](#testing-alerts) below. + +> **Note:** The `inventory/hosts` file is gitignored, so your token won't be committed to version control. For production hardening, see [Securing Credentials with Vault](#securing-credentials-with-vault) below. + +## Quick Start: Slack + +Same as Telegram, but set these variables instead: + +```ini +alerting_default_contact_point=slack +alerting_slack_webhook_url=https://hooks.slack.com/services/T.../B.../xxx +alerting_slack_channel=#dhis2-alerts +``` + +Create an incoming webhook at https://api.slack.com/messaging/webhooks. 
+ +--- + +## Alert Rules Reference + +### Infrastructure Alerts (Grafana/Prometheus) + +| Alert | Condition | Duration | Severity | Default Threshold | +| ---------------------- | --------------------- | -------- | -------- | ----------------- | +| Instance Down | Target unreachable | 5m | critical | up == 0 | +| High CPU | CPU usage | 10m | warning | > 85% | +| High Memory | Memory usage | 5m | warning | > 90% | +| Disk Space Warning | Free space low | 5m | warning | < 15% | +| Disk Space Critical | Free space very low | 5m | critical | < 5% | +| PostgreSQL Down | DB unreachable | 2m | critical | pg_up == 0 | +| PostgreSQL Connections | Connection saturation | 5m | warning | > 80% of max | +| Long Running Query | Query duration | 5m | warning | > 1 hour | +| DHIS2 Endpoint Down | Metrics unreachable | 5m | critical | up == 0 | + +### Glowroot APM Alerts (Optional) + +| Alert | Condition | Duration | Severity | Default Threshold | +| ----------------- | ----------------- | -------- | -------- | ------------------------- | +| Heartbeat | JVM/agent down | 5m | critical | No heartbeat | +| Error Rate | Web errors | 5m | critical | > 10% | +| Response Time p95 | Slow responses | 10m | high | > 10,000 ms | +| Heap Memory | JVM heap pressure | 5m | high | > 80% of heap_memory_size | + +--- + +## Customizing Thresholds + +Override these variables in `host_vars/monitor/vars.yml` or `inventory/hosts`: + +### Infrastructure Thresholds + +| Variable | Default | Description | +| ------------------------- | ------- | --------------------------------- | +| `alert_cpu_threshold` | 85 | CPU usage percentage | +| `alert_memory_threshold` | 90 | Memory usage percentage | +| `alert_disk_warning_pct` | 15 | Disk free space warning (%) | +| `alert_disk_critical_pct` | 5 | Disk free space critical (%) | +| `alert_pg_connection_pct` | 80 | PostgreSQL connections (% of max) | + +### Glowroot Thresholds + +| Variable | Default | Description | +| 
----------------------------------------------- | ------- | ----------------------------------- | +| `glowroot_alert_p95_threshold_ms` | 10000 | p95 response time (ms) | +| `glowroot_alert_p95_time_period_seconds` | 600 | Evaluation window for p95 | +| `glowroot_alert_error_rate_threshold` | 10.0 | Error rate percentage | +| `glowroot_alert_error_rate_time_period_seconds` | 300 | Evaluation window for errors | +| `glowroot_alert_heartbeat_seconds` | 300 | Heartbeat timeout | +| `glowroot_alert_min_transaction_count` | 10 | Min transactions before alert fires | + +--- + +## Glowroot APM Alerts + +Glowroot alerts on transaction times, error rates, JVM metrics, and heartbeat. These require `app_monitoring=glowroot` (the default) and Grafana/Prometheus alerting configured first (steps 1-3 above) since they share the same bot token and chat ID. + +### How It Works + +Glowroot supports Slack natively but not Telegram. A Python forwarder bridges the gap: + +1. `glowroot-telegram-forwarder.py` runs as a systemd service on each instance host +2. Glowroot's Slack webhook points at `http://127.0.0.1:9099` (the forwarder) +3. The forwarder translates the Slack payload to a Telegram Bot API call + +The forwarder uses only Python 3 stdlib, binds to localhost, auto-restarts via systemd, and is independent of Tomcat. One forwarder per host serves all instances. + +### Enabling Glowroot Alerts + +Add to `inventory/hosts`: + +```ini +glowroot_alerting_enabled=true +``` + +The forwarder uses the same `alerting_telegram_bot_token` and `alerting_telegram_chat_id` already configured for Grafana. 
+ +### Deploying + +```bash +ansible-playbook dhis2.yml --tags create-instance +``` + +--- + +## Munin Alerts + +For users running `server_monitoring=munin`, configure the `munin_alerts` variable in `host_vars` or `group_vars`: + +### Telegram + +```yaml +munin_alerts: + - name: telegram + type: telegram + bot_token: 'your-bot-token-from-botfather' + chat_id: '-1001234567890' + level: 'warning critical' +``` + +### Slack + +```yaml +munin_alerts: + - name: slack + type: slack + webhook_url: 'https://hooks.slack.com/services/T.../B.../xxx' + level: 'warning critical' +``` + +### Email (Default) + +```yaml +munin_alerts: + - name: admin + type: email + email: admin@example.com + subject: 'Munin Alert' + level: 'warning critical' +``` + +--- + +## Testing Alerts + +### Test Telegram Delivery + +```bash +curl -s -X POST "https://api.telegram.org/bot<BOT_TOKEN>/sendMessage" \ + -d "chat_id=<CHAT_ID>" -d "text=Test alert from DHIS2 monitoring" +``` + +### Test Glowroot Forwarder + +```bash +# Check service status +systemctl status glowroot-telegram-forwarder + +# Send a test alert +curl -s -X POST http://127.0.0.1:9099 \ + -H "Content-Type: application/json" \ + -d '{ + "attachments": [{ + "fallback": "[dhis2] Test alert - triggered", + "pretext": "[dhis2] Test alert triggered", + "color": "danger", + "text": "This is a test alert", + "ts": 1712500000.0 + }], + "channel": "#alerts" + }' +``` + +### Verify Grafana Alerting + +```bash +# List contact points +curl -u admin:admin http://localhost:3000/grafana/api/v1/provisioning/contact-points + +# List alert rules +curl -u admin:admin http://localhost:3000/grafana/api/v1/provisioning/alert-rules +``` + +### Verify Prometheus Rules + +```bash +promtool check rules /etc/prometheus/rules/dhis2-alerts.yml +``` + +--- + +## Troubleshooting + +**Bot not sending messages:** + +- Ensure the bot is added to the Telegram group/chat +- Verify the chat ID sign (groups use negative IDs like `-1001234567890`) +- Test with a direct curl to the Bot API + 
+**Grafana unified alerting not working:** + +- Check Grafana version is 9.0+ (`grafana-server -v`) +- Verify `/etc/grafana/grafana.ini` has `[unified_alerting] enabled = true` +- Check `/etc/grafana/provisioning/alerting/` directory exists and files are owned by `grafana` + +**Glowroot forwarder not running:** + +- Check `systemctl status glowroot-telegram-forwarder` +- Check logs: `journalctl -u glowroot-telegram-forwarder` +- Verify the script exists: `ls -la /opt/glowroot/glowroot-telegram-forwarder.py` + +**Prometheus rules not loading:** + +- Validate syntax: `promtool check rules /etc/prometheus/rules/dhis2-alerts.yml` +- Check `/etc/prometheus/prometheus.yml` has `rule_files:` directive +- Reload Prometheus: `systemctl reload prometheus` + +**No alerts firing:** + +- Alerts need the `for` duration to pass before firing (e.g., 5 minutes) +- Check Grafana UI at `/grafana/alerting/list` for alert states +- Verify Prometheus targets are being scraped at `/grafana/explore` + +--- + +## Securing Credentials with Vault + +For production deployments where you want to encrypt tokens at rest, you can optionally move credentials to an ansible-vault encrypted file. + +Create the directory structure: + +```bash +cd deploy/inventory +mkdir -p host_vars/monitor +``` + +`host_vars/monitor/vars.yml` (plaintext -- references the vault): + +```yaml +alerting_enabled: true +alerting_telegram_bot_token: '{{ vault_alerting_telegram_bot_token }}' +alerting_telegram_chat_id: '-1001234567890' +``` + +`host_vars/monitor/vault.yml` (will be encrypted): + +```yaml +vault_alerting_telegram_bot_token: '123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11' +``` + +Encrypt and deploy: + +```bash +ansible-vault encrypt host_vars/monitor/vault.yml +ansible-playbook dhis2.yml --tags monitoring,alerting --vault-id @prompt +``` + +Remove the token from `inventory/hosts` after moving it to the vault.