diff --git a/deploy/inventory/host_vars/monitor.template b/deploy/inventory/host_vars/monitor.template
new file mode 100644
index 0000000..cd6fa5b
--- /dev/null
+++ b/deploy/inventory/host_vars/monitor.template
@@ -0,0 +1,11 @@
+---
+# Variables for the monitoring host.
+# Copy to 'monitor': cp monitor.template monitor
+
+# alerting_enabled: true
+# alerting_telegram_bot_token: "123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11"
+# alerting_telegram_chat_id: "-1001234567890"
+
+# alerting_default_contact_point: slack
+# alerting_slack_webhook_url: "https://hooks.slack.com/services/T.../B.../xxx"
+# alerting_slack_channel: "#dhis2-alerts"
\ No newline at end of file
diff --git a/deploy/inventory/hosts.template b/deploy/inventory/hosts.template
index f4d69b4..5b9bde5 100644
--- a/deploy/inventory/hosts.template
+++ b/deploy/inventory/hosts.template
@@ -53,6 +53,12 @@ postgresql_version=16
server_monitoring=munin
app_monitoring=glowroot
+# alerting (requires server_monitoring=grafana, see docs/alerting.md)
+# alerting_enabled=false
+# alerting_default_contact_point=telegram
+# alerting_telegram_bot_token=
+# alerting_telegram_chat_id=
+# glowroot_alerting_enabled=false
# lxd
lxd_network=172.19.2.1/24
diff --git a/deploy/roles/create-instance/defaults/main/glowroot_alerting.yml b/deploy/roles/create-instance/defaults/main/glowroot_alerting.yml
new file mode 100644
index 0000000..f6308d2
--- /dev/null
+++ b/deploy/roles/create-instance/defaults/main/glowroot_alerting.yml
@@ -0,0 +1,25 @@
+---
+glowroot_alerting_enabled: false  # master switch for the Glowroot alerting tasks
+glowroot_telegram_forwarder_port: "9099"  # localhost port of the Telegram forwarder
+
+# Slack (native Glowroot)
+# glowroot_slack_webhook_url: ""
+# glowroot_slack_webhook_display: "DHIS2 Alerts"
+
+# Email / SMTP (native Glowroot)
+# glowroot_smtp_host: ""
+# glowroot_smtp_port: 587
+# glowroot_smtp_connection_security: "STARTTLS"
+# glowroot_smtp_username: ""
+# glowroot_smtp_password: ""
+# glowroot_smtp_from_address: ""
+# glowroot_smtp_from_name: "Glowroot DHIS2"
+# glowroot_alert_email_addresses: []
+
+# Thresholds (rendered into the alert rules of glowroot_config.json.j2)
+glowroot_alert_p95_threshold_ms: 10000  # 95th-percentile "Web" transaction alert
+glowroot_alert_p95_time_period_seconds: 600  # window of the p95 alert
+glowroot_alert_error_rate_threshold: 10.0  # "Web" error:rate alert
+glowroot_alert_error_rate_time_period_seconds: 300  # window of the error-rate alert
+glowroot_alert_heartbeat_seconds: 300  # heartbeat alert time period
+glowroot_alert_min_transaction_count: 10  # minTransactionCount of the p95 alert
diff --git a/deploy/roles/create-instance/handlers/main.yml b/deploy/roles/create-instance/handlers/main.yml
index 4ad6aae..6f0fbac 100644
--- a/deploy/roles/create-instance/handlers/main.yml
+++ b/deploy/roles/create-instance/handlers/main.yml
@@ -49,3 +49,9 @@
ansible.builtin.service:
name: munin-node
state: restarted
+
+- name: Restart Glowroot Telegram Forwarder
+  ansible.builtin.systemd:
+    daemon_reload: true  # re-read unit files as part of the restart
+    state: restarted
+    name: glowroot-telegram-forwarder
diff --git a/deploy/roles/create-instance/tasks/glowroot.yml b/deploy/roles/create-instance/tasks/glowroot.yml
index b467037..374c0aa 100644
--- a/deploy/roles/create-instance/tasks/glowroot.yml
+++ b/deploy/roles/create-instance/tasks/glowroot.yml
@@ -102,6 +102,7 @@
owner: root
group: tomcat
mode: "0660"
+ no_log: true
when: not glowroot_admin_file_status.stat.exists
notify: Restart Tomcat
@@ -112,3 +113,95 @@
line: '\1"contextPath": "{{ "/glowroot" if dhis2_base_path | default(inventory_hostname) | to_fixed_string == "ROOT" else "/" + dhis2_base_path | default(inventory_hostname) | to_fixed_string + "-glowroot" }}",'
backrefs: true
notify: Restart Tomcat
+
+# Glowroot alerting pre-flight check: a channel only counts when its settings are non-empty
+- name: Glowroot | Assert at least one notification channel is configured
+  ansible.builtin.assert:
+    that: >-
+      (alerting_telegram_bot_token | default('', true) | string | length > 0 and
+      alerting_telegram_chat_id | default('', true) | string | length > 0) or
+      (glowroot_slack_webhook_url | default('', true) | string | length > 0) or
+      (glowroot_smtp_host | default('', true) | string | length > 0)
+    fail_msg: >-
+      glowroot_alerting_enabled is true but no notification channel is configured. Set
+      alerting_telegram_bot_token + alerting_telegram_chat_id, or glowroot_slack_webhook_url, or glowroot_smtp_host.
+  when: glowroot_alerting_enabled | default(false) | bool
+
+# Telegram forwarder for Glowroot alerts (set up only when both Telegram settings are defined)
+- name: Glowroot | Create forwarder system group
+  when:
+    - glowroot_alerting_enabled | default(false) | bool
+    - alerting_telegram_bot_token is defined
+    - alerting_telegram_chat_id is defined
+  ansible.builtin.group:
+    system: true
+    name: glowroot_forwarder
+
+- name: Glowroot | Create forwarder system user
+  when:
+    - glowroot_alerting_enabled | default(false) | bool
+    - alerting_telegram_bot_token is defined
+    - alerting_telegram_chat_id is defined
+  ansible.builtin.user:
+    name: glowroot_forwarder
+    group: glowroot_forwarder
+    system: true
+    create_home: false
+    shell: /bin/false  # service account without an interactive login shell
+
+- name: Glowroot | Deploy Telegram forwarder script
+  when:
+    - glowroot_alerting_enabled | default(false) | bool
+    - alerting_telegram_bot_token is defined
+    - alerting_telegram_chat_id is defined
+  no_log: true  # the rendered script embeds the bot token
+  notify: Restart Glowroot Telegram Forwarder
+  ansible.builtin.template:
+    src: glowroot-telegram-forwarder.py.j2
+    dest: /opt/glowroot/glowroot-telegram-forwarder.py
+    mode: '0640'
+    owner: root
+    group: glowroot_forwarder
+
+- name: Glowroot | Deploy Telegram forwarder systemd service
+  when:
+    - glowroot_alerting_enabled | default(false) | bool
+    - alerting_telegram_bot_token is defined
+    - alerting_telegram_chat_id is defined
+  notify: Restart Glowroot Telegram Forwarder
+  ansible.builtin.template:
+    src: glowroot-telegram-forwarder.service.j2
+    dest: /etc/systemd/system/glowroot-telegram-forwarder.service
+    mode: '0644'
+    owner: root
+    group: root
+
+- name: Glowroot | Enable and start Telegram forwarder
+  when:
+    - glowroot_alerting_enabled | default(false) | bool
+    - alerting_telegram_bot_token is defined
+    - alerting_telegram_chat_id is defined
+  ansible.builtin.systemd:
+    daemon_reload: true  # make sure a freshly written unit file is known to systemd
+    name: glowroot-telegram-forwarder
+    enabled: true
+    state: started
+
+# Glowroot alert rules config
+- name: Glowroot | Check if config.json exists
+  when: glowroot_alerting_enabled | default(false) | bool
+  ansible.builtin.stat:
+    path: /opt/glowroot/config.json
+  register: glowroot_config_file_status
+
+- name: Glowroot | Deploy config.json with alert rules
+  when:
+    - glowroot_alerting_enabled | default(false) | bool
+    - not (glowroot_config_file_status.stat.exists | default(false))  # an existing file is never overwritten
+  notify: Restart Tomcat
+  ansible.builtin.template:
+    src: glowroot_config.json.j2
+    dest: /opt/glowroot/config.json
+    mode: '0660'
+    owner: root
+    group: tomcat
diff --git a/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.py.j2 b/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.py.j2
new file mode 100644
index 0000000..3885f99
--- /dev/null
+++ b/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.py.j2
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# {{ ansible_managed }}
+"""Slack-to-Telegram webhook forwarder for Glowroot alerts.
+
+Listens on localhost for Slack-formatted webhook POSTs from Glowroot
+and forwards them to a Telegram chat via the Bot API.
+
+Zero external dependencies -- uses only Python 3 stdlib.
+"""
+
+import json
+import sys
+import urllib.request
+import urllib.error
+from http.server import HTTPServer, BaseHTTPRequestHandler
+
+LISTEN_HOST = "127.0.0.1"
+LISTEN_PORT = {{ glowroot_telegram_forwarder_port | default(9099) }}
+TELEGRAM_BOT_TOKEN = "{{ alerting_telegram_bot_token }}"
+TELEGRAM_CHAT_ID = "{{ alerting_telegram_chat_id }}"
+TELEGRAM_API = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
+
+STATUS_ICONS = {"danger": "\u26a0\ufe0f", "good": "\u2705", "warning": "\u26a0\ufe0f"}
+
+
+def slack_to_telegram_html(payload):
+ """Convert a Slack attachment payload to Telegram HTML message."""
+ attachments = payload.get("attachments", [])
+ if not attachments:
+ text = payload.get("text", "Unknown alert")
+ return f"Glowroot Alert\n{text}"
+
+ att = attachments[0]
+ color = att.get("color", "warning")
+ icon = STATUS_ICONS.get(color, "\u2139\ufe0f")
+ pretext = att.get("pretext", "Alert")
+ body = att.get("text", "")
+ fallback = att.get("fallback", "")
+
+ lines = [f"{icon} {pretext}"]
+ if body:
+ lines.append(f"\n{body}")
+ elif fallback:
+ lines.append(f"\n{fallback}")
+ return "\n".join(lines)
+
+
+def send_telegram(message):
+ """POST a message to the Telegram Bot API."""
+ data = json.dumps({
+ "chat_id": TELEGRAM_CHAT_ID,
+ "text": message[:4096],
+ "parse_mode": "HTML",
+ }).encode("utf-8")
+ req = urllib.request.Request(
+ TELEGRAM_API,
+ data=data,
+ headers={"Content-Type": "application/json"},
+ )
+ try:
+ with urllib.request.urlopen(req, timeout=10) as resp:
+ return resp.status
+ except urllib.error.URLError as exc:
+ print(f"Telegram API error: {exc}", file=sys.stderr)
+ return None
+
+
+class WebhookHandler(BaseHTTPRequestHandler):
+ """Handle incoming Slack-formatted webhook POST requests."""
+
+ def do_POST(self): # noqa: N802
+ length = int(self.headers.get("Content-Length", 0))
+ raw = self.rfile.read(length)
+ try:
+ payload = json.loads(raw)
+ except (json.JSONDecodeError, UnicodeDecodeError):
+ self.send_response(400)
+ self.end_headers()
+ return
+
+ message = slack_to_telegram_html(payload)
+ status = send_telegram(message)
+ code = 200 if status == 200 else 502
+ self.send_response(code)
+ self.end_headers()
+
+ def log_message(self, fmt, *args):
+ """Silence default stderr logging in production."""
+
+
+if __name__ == "__main__":
+ server = HTTPServer((LISTEN_HOST, LISTEN_PORT), WebhookHandler)
+ print(f"Listening on {LISTEN_HOST}:{LISTEN_PORT}")
+ try:
+ server.serve_forever()
+ except KeyboardInterrupt:
+ server.server_close()
\ No newline at end of file
diff --git a/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.service.j2 b/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.service.j2
new file mode 100644
index 0000000..b7baecf
--- /dev/null
+++ b/deploy/roles/create-instance/templates/glowroot-telegram-forwarder.service.j2
@@ -0,0 +1,23 @@
+{{ ansible_managed | comment }}
+# Runs the Glowroot Telegram forwarder script under its own unprivileged account.
+[Unit]
+Description=Glowroot Slack-to-Telegram webhook forwarder
+After=network.target
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/python3 /opt/glowroot/glowroot-telegram-forwarder.py
+User=glowroot_forwarder
+Group=glowroot_forwarder
+# Start the forwarder again 5 seconds after an unclean exit.
+Restart=on-failure
+RestartSec=5
+# Sandboxing
+NoNewPrivileges=true
+PrivateTmp=true
+ProtectHome=true
+ProtectSystem=strict
+ReadOnlyPaths=/opt/glowroot/glowroot-telegram-forwarder.py
+
+[Install]
+WantedBy=multi-user.target
\ No newline at end of file
diff --git a/deploy/roles/create-instance/templates/glowroot_admin.json b/deploy/roles/create-instance/templates/glowroot_admin.json
index a7e1a01..d1be654 100644
--- a/deploy/roles/create-instance/templates/glowroot_admin.json
+++ b/deploy/roles/create-instance/templates/glowroot_admin.json
@@ -49,4 +49,46 @@
],
"traceCappedDatabaseSizeMb": 500
}
+{% if glowroot_alerting_enabled | default(false) | bool %}{# Values go through to_json so quotes/backslashes cannot break the JSON. NOTE(review): confirm against the Glowroot admin.json schema that webhooks need no "id" and that smtp accepts "password" rather than "encryptedPassword" -- TODO verify #}
+{% if alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined %}
+  ,"slack": {
+    "webhooks": [
+      {
+        "url": "http://127.0.0.1:{{ glowroot_telegram_forwarder_port | default(9099) }}",
+        "display": "Telegram (via forwarder)"
+      }
+    ]
+  }
+{% elif glowroot_slack_webhook_url is defined and glowroot_slack_webhook_url | length > 0 %}
+  ,"slack": {
+    "webhooks": [
+      {
+        "url": {{ glowroot_slack_webhook_url | string | to_json }},
+        "display": {{ glowroot_slack_webhook_display | default('DHIS2 Alerts') | string | to_json }}
+      }
+    ]
+  }
+{% endif %}
+{% if glowroot_smtp_host is defined and glowroot_smtp_host | length > 0 %}
+  ,"smtp": {
+    "host": {{ glowroot_smtp_host | string | to_json }},
+    "port": {{ glowroot_smtp_port | default(587) }},
+    "connectionSecurity": {{ glowroot_smtp_connection_security | default('STARTTLS') | string | to_json }},
+    "username": {{ glowroot_smtp_username | default('') | string | to_json }},
+    "password": {{ glowroot_smtp_password | default('') | string | to_json }},
+    "fromEmailAddress": {{ glowroot_smtp_from_address | default('') | string | to_json }},
+    "fromDisplayName": {{ glowroot_smtp_from_name | default('Glowroot DHIS2') | string | to_json }}
+  }
+{% endif %}
+{% if glowroot_pagerduty_key is defined and glowroot_pagerduty_key | length > 0 %}
+  ,"pagerDuty": {
+    "integrationKeys": [
+      {
+        "key": {{ glowroot_pagerduty_key | string | to_json }},
+        "display": {{ glowroot_pagerduty_display | default('DHIS2') | string | to_json }}
+      }
+    ]
+  }
+{% endif %}
+{% endif %}
}
diff --git a/deploy/roles/create-instance/templates/glowroot_config.json.j2 b/deploy/roles/create-instance/templates/glowroot_config.json.j2
new file mode 100644
index 0000000..120b7f0
--- /dev/null
+++ b/deploy/roles/create-instance/templates/glowroot_config.json.j2
@@ -0,0 +1,85 @@
+{# Glowroot config.json seed with four alert rules: heartbeat, error rate, p95 response time and heap usage. #}
+{# Slack notifications are attached under the same conditions that give admin.json a "slack" section: the Telegram forwarder, or a non-empty glowroot_slack_webhook_url. #}
+{# NOTE(review): Glowroot may require a "slackWebhookId" next to "slackChannels" -- confirm against the Glowroot config schema. #}
+{% set notify_slack = (alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined) or (glowroot_slack_webhook_url is defined and glowroot_slack_webhook_url | length > 0) %}
+{% set notify_email = glowroot_smtp_host is defined and glowroot_smtp_host | length > 0 %}
+{
+  "alerts": [
+    {
+      "condition": {
+        "conditionType": "heartbeat",
+        "timePeriodSeconds": {{ glowroot_alert_heartbeat_seconds | default(300) }}
+      },
+      "severity": "CRITICAL"
+{% if notify_slack %}
+      ,"slackNotification": {
+        "slackChannels": ["#alerts"]
+      }
+{% endif %}
+{% if notify_email %}
+      ,"emailNotification": {
+        "emailAddresses": {{ glowroot_alert_email_addresses | default([]) | to_json }}
+      }
+{% endif %}
+    },
+    {
+      "condition": {
+        "conditionType": "metric",
+        "metric": "error:rate",
+        "transactionType": "Web",
+        "threshold": {{ glowroot_alert_error_rate_threshold | default(10.0) }},
+        "timePeriodSeconds": {{ glowroot_alert_error_rate_time_period_seconds | default(300) }}
+      },
+      "severity": "CRITICAL"
+{% if notify_slack %}
+      ,"slackNotification": {
+        "slackChannels": ["#alerts"]
+      }
+{% endif %}
+{% if notify_email %}
+      ,"emailNotification": {
+        "emailAddresses": {{ glowroot_alert_email_addresses | default([]) | to_json }}
+      }
+{% endif %}
+    },
+    {
+      "condition": {
+        "conditionType": "metric",
+        "metric": "transaction:x-percentile",
+        "transactionType": "Web",
+        "percentile": 95.0,
+        "threshold": {{ glowroot_alert_p95_threshold_ms | default(10000) }},
+        "timePeriodSeconds": {{ glowroot_alert_p95_time_period_seconds | default(600) }},
+        "minTransactionCount": {{ glowroot_alert_min_transaction_count | default(10) }}
+      },
+      "severity": "HIGH"
+{% if notify_slack %}
+      ,"slackNotification": {
+        "slackChannels": ["#alerts"]
+      }
+{% endif %}
+    },
+    {
+      "condition": {
+        "conditionType": "metric",
+        "metric": "gauge:java.lang:type=Memory:HeapMemoryUsage.used",
+{% set heap_raw = heap_memory_size | default('4G') | string | trim %}
+{% if heap_raw | regex_search('[mM]$') %}
+{% set heap_bytes = (heap_raw | regex_replace('[mM]$', '') | int) * 1048576 %}
+{% elif heap_raw | regex_search('[gG]$') %}
+{% set heap_bytes = (heap_raw | regex_replace('[gG]$', '') | int) * 1073741824 %}
+{% else %}
+{% set heap_bytes = (heap_raw | int) * 1073741824 %}
+{% endif %}
+        "threshold": {{ (heap_bytes * 0.8) | int }},
+        "timePeriodSeconds": 300
+      },
+      "severity": "HIGH"
+{% if notify_slack %}
+      ,"slackNotification": {
+        "slackChannels": ["#alerts"]
+      }
+{% endif %}
+    }
+  ]
+}
\ No newline at end of file
diff --git a/deploy/roles/monitoring/defaults/main/alerting.yml b/deploy/roles/monitoring/defaults/main/alerting.yml
new file mode 100644
index 0000000..c2bb5ab
--- /dev/null
+++ b/deploy/roles/monitoring/defaults/main/alerting.yml
@@ -0,0 +1,21 @@
+---
+alerting_enabled: false  # master switch; tasks/alerting.yml is included only when true
+alerting_default_contact_point: "telegram"  # receiver of the Grafana notification policy
+
+# Telegram
+# alerting_telegram_bot_token: ""
+# alerting_telegram_chat_id: ""
+
+# Slack
+# alerting_slack_webhook_url: ""
+# alerting_slack_channel: "#dhis2-alerts"
+
+# Email
+# alerting_email_addresses: []
+
+# Thresholds (rendered into the Prometheus and Grafana alert rules)
+alert_cpu_threshold: 85  # CPU usage in percent, alert above
+alert_memory_threshold: 90  # memory usage in percent, alert above
+alert_disk_warning_pct: 15  # free disk space in percent, warning below
+alert_disk_critical_pct: 5  # free disk space in percent, critical below
+alert_pg_connection_pct: 80  # connections as percent of max_connections, alert above
diff --git a/deploy/roles/monitoring/defaults/main/munin.yml b/deploy/roles/monitoring/defaults/main/munin.yml
index 10a412c..14475a2 100644
--- a/deploy/roles/monitoring/defaults/main/munin.yml
+++ b/deploy/roles/monitoring/defaults/main/munin.yml
@@ -25,6 +25,29 @@ munin_hosts:
extra: [use_node_name yes]
munin_alerts: []
+# Email (to combine several contact types, list them all under one munin_alerts key):
+# munin_alerts:
+# - name: admin
+# type: email
+# email: admin@example.com
+# subject: "Munin Alert"
+# level: "warning critical"
+#
+# Telegram:
+# munin_alerts:
+# - name: telegram
+# type: telegram
+# bot_token: "your-bot-token-from-botfather"
+# chat_id: "-1001234567890"
+# level: "warning critical"
+#
+# Slack:
+# munin_alerts:
+# - name: slack
+# type: slack
+# webhook_url: "https://hooks.slack.com/services/T.../B.../xxx"
+# level: "warning critical"
+
# munin node defaults
munin_node_bind_host: "*"
munin_node_bind_port: "4949"
diff --git a/deploy/roles/monitoring/tasks/alerting.yml b/deploy/roles/monitoring/tasks/alerting.yml
new file mode 100644
index 0000000..402978b
--- /dev/null
+++ b/deploy/roles/monitoring/tasks/alerting.yml
@@ -0,0 +1,114 @@
+---
+# Pre-flight: the default contact point becomes the receiver of the Grafana
+# notification policy, so it must be one of the contact points that will
+# actually be provisioned (same conditions as grafana-contact-points.yml.j2).
+- name: Alerting | Assert at least one contact point is configured
+  vars:
+    _default_contact_point: "{{ alerting_default_contact_point | default('telegram') }}"
+  ansible.builtin.assert:
+    that: >-
+      (_default_contact_point == 'telegram' and
+      alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined) or
+      (_default_contact_point == 'slack' and alerting_slack_webhook_url is defined) or
+      (_default_contact_point == 'email' and
+      alerting_email_addresses is defined and alerting_email_addresses | length > 0)
+    fail_msg: >-
+      alerting_enabled is true but the default contact point
+      '{{ _default_contact_point }}' is not configured.
+      Set alerting_telegram_bot_token + alerting_telegram_chat_id,
+      or alerting_slack_webhook_url, or alerting_email_addresses,
+      and point alerting_default_contact_point (telegram, slack or email) at a configured one.
+  tags: [always]
+
+# Prometheus alert rules
+- name: Alerting | Ensure Prometheus rules directory exists
+  ansible.builtin.file:
+    state: directory
+    path: /etc/prometheus/rules
+    mode: "0755"
+    owner: prometheus
+    group: prometheus
+
+- name: Alerting | Deploy Prometheus alert rules
+  notify: Reload Prometheus
+  ansible.builtin.template:
+    src: prometheus-rules.yml.j2
+    dest: /etc/prometheus/rules/dhis2-alerts.yml
+    mode: "0644"
+    owner: prometheus
+    group: prometheus
+    validate: "promtool check rules %s"  # the rendered file must pass promtool before it is installed
+
+- name: Alerting | Ensure Prometheus rule_files directive exists
+  notify: Reload Prometheus
+  ansible.builtin.lineinfile:
+    path: /etc/prometheus/prometheus.yml
+    insertbefore: '^scrape_configs:'  # placement when no rule_files line exists yet
+    regexp: '^rule_files:'
+    line: 'rule_files:'
+
+- name: Alerting | Wire rules directory into prometheus.yml
+  notify: Reload Prometheus
+  ansible.builtin.lineinfile:
+    path: /etc/prometheus/prometheus.yml
+    insertafter: '^rule_files:'  # the glob entry goes directly under the directive
+    regexp: '^\s+- "/etc/prometheus/rules/\*\.yml"'
+    line: ' - "/etc/prometheus/rules/*.yml"'
+
+# Grafana unified alerting
+- name: Alerting | Provision Prometheus datasource with fixed UID
+  notify: Restart Grafana
+  ansible.builtin.template:
+    src: grafana-datasource-provisioning.yml.j2
+    dest: /etc/grafana/provisioning/datasources/prometheus.yml
+    mode: "0640"
+    owner: grafana
+    group: grafana
+
+- name: Alerting | Enable Grafana unified alerting
+  notify: Restart Grafana
+  community.general.ini_file:
+    path: /etc/grafana/grafana.ini
+    mode: "0640"
+    section: unified_alerting
+    option: enabled
+    value: "true"  # quoted so YAML passes the string "true", not a boolean
+
+- name: Alerting | Create Grafana alerting provisioning directory
+  ansible.builtin.file:
+    state: directory
+    path: /etc/grafana/provisioning/alerting
+    mode: "0755"
+    owner: grafana
+    group: grafana
+
+- name: Alerting | Deploy Grafana contact points
+  no_log: true  # the rendered file contains the bot token / webhook URL
+  notify: Restart Grafana
+  ansible.builtin.template:
+    src: grafana-contact-points.yml.j2
+    dest: /etc/grafana/provisioning/alerting/contact-points.yml
+    mode: "0640"
+    owner: grafana
+    group: grafana
+
+- name: Alerting | Deploy Grafana notification policies
+  notify: Restart Grafana
+  ansible.builtin.template:
+    src: grafana-notification-policies.yml.j2
+    dest: /etc/grafana/provisioning/alerting/notification-policies.yml
+    mode: "0640"
+    owner: grafana
+    group: grafana
+
+- name: Alerting | Deploy Grafana alert rules
+  notify: Restart Grafana
+  ansible.builtin.template:
+    src: grafana-alert-rules.yml.j2
+    dest: /etc/grafana/provisioning/alerting/alert-rules.yml
+    mode: "0640"
+    owner: grafana
+    group: grafana
+
+- name: Alerting | Flush handlers
+  ansible.builtin.meta: flush_handlers  # run the handlers notified by the tasks above now
diff --git a/deploy/roles/monitoring/tasks/main.yml b/deploy/roles/monitoring/tasks/main.yml
index b8fd7b2..107ec05 100644
--- a/deploy/roles/monitoring/tasks/main.yml
+++ b/deploy/roles/monitoring/tasks/main.yml
@@ -40,3 +40,15 @@
# - server_monitoring is defined and server_monitoring == 'grafana'
- server_monitoring is defined and server_monitoring in ['grafana', 'prometheus', 'grafana/prometheus']
- inventory_hostname in groups['monitoring']
+
+- name: Alerting | Configure alerting for Grafana/Prometheus
+  when:
+    - alerting_enabled | default(false) | bool
+    - groups['monitoring'] | length > 0
+    - server_monitoring is defined and server_monitoring in ['grafana', 'prometheus', 'grafana/prometheus']
+    - inventory_hostname in groups['monitoring']
+  tags: [alerting]
+  ansible.builtin.include_tasks:
+    file: alerting.yml
+    apply:
+      tags: [alerting]  # the included tasks carry the tag as well
\ No newline at end of file
diff --git a/deploy/roles/monitoring/templates/grafana-alert-rules.yml.j2 b/deploy/roles/monitoring/templates/grafana-alert-rules.yml.j2
new file mode 100644
index 0000000..e6ecdc6
--- /dev/null
+++ b/deploy/roles/monitoring/templates/grafana-alert-rules.yml.j2
@@ -0,0 +1,263 @@
+{{ ansible_managed | comment }}
+# Grafana-managed alert rules. Each rule runs a PromQL query (A) and reduces it
+# to its last value (B); B is the rule condition, which fires when non-zero.
+# The queries therefore use the "bool" modifier: they yield 1 while the
+# condition holds and 0 otherwise, instead of filtering series away (a matching
+# "== 0" sample keeps the value 0, and a healthy system would return no data).
+apiVersion: 1
+groups:
+  - orgId: 1
+    name: DHIS2 Infrastructure
+    folder: DHIS2 Alerts
+    interval: 60s
+    rules:
+      - uid: instance-down
+        title: Instance Down
+        condition: B
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: {% raw %}"Target {{ $labels.instance }} is down"{% endraw %}
+
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: up == bool 0
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: high-cpu-usage
+        title: High CPU Usage
+        condition: B
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CPU usage above {{ alert_cpu_threshold }}%"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > bool {{ alert_cpu_threshold }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: high-memory-usage
+        title: High Memory Usage
+        condition: B
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Memory usage above {{ alert_memory_threshold }}%"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > bool {{ alert_memory_threshold }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: disk-space-warning
+        title: Disk Space Warning
+        condition: B
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk space below {{ alert_disk_warning_pct }}%"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < bool {{ alert_disk_warning_pct }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: disk-space-critical
+        title: Disk Space Critical
+        condition: B
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Disk space below {{ alert_disk_critical_pct }}%"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < bool {{ alert_disk_critical_pct }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+  - orgId: 1
+    name: DHIS2 Database
+    folder: DHIS2 Alerts
+    interval: 60s
+    rules:
+      - uid: postgres-down
+        title: PostgreSQL Down
+        condition: B
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "PostgreSQL is unreachable"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: pg_up == bool 0
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+      - uid: postgres-connections-high
+        title: PostgreSQL Connections High
+        condition: B
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Database connections exceed {{ alert_pg_connection_pct }}% of max"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: sum by(instance) (pg_stat_activity_count) / on(instance) pg_settings_max_connections * 100 > bool {{ alert_pg_connection_pct }}
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+
+  - orgId: 1
+    name: DHIS2 Application
+    folder: DHIS2 Alerts
+    interval: 60s
+    rules:
+      - uid: dhis2-endpoint-down
+        title: DHIS2 Endpoint Down
+        condition: B
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "DHIS2 metrics endpoint is unreachable"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              expr: up{job=~".*dhis.*"} == bool 0
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
\ No newline at end of file
diff --git a/deploy/roles/monitoring/templates/grafana-contact-points.yml.j2 b/deploy/roles/monitoring/templates/grafana-contact-points.yml.j2
new file mode 100644
index 0000000..cc13163
--- /dev/null
+++ b/deploy/roles/monitoring/templates/grafana-contact-points.yml.j2
@@ -0,0 +1,33 @@
+{{ ansible_managed | comment }}
+apiVersion: 1
+contactPoints:
+{% if alerting_telegram_bot_token is defined and alerting_telegram_chat_id is defined %}
+  - name: telegram  # provisioned only when both Telegram settings are defined
+    orgId: 1
+    receivers:
+      - type: telegram
+        uid: telegram-default
+        settings:
+          parse_mode: HTML
+          chatid: "{{ alerting_telegram_chat_id }}"
+          bottoken: "{{ alerting_telegram_bot_token }}"
+{% endif %}
+{% if alerting_slack_webhook_url is defined %}
+  - name: slack
+    orgId: 1
+    receivers:
+      - type: slack
+        uid: slack-default
+        settings:
+          recipient: "{{ alerting_slack_channel | default('#dhis2-alerts') }}"
+          url: "{{ alerting_slack_webhook_url }}"
+{% endif %}
+{% if alerting_email_addresses is defined and alerting_email_addresses | length > 0 %}
+  - name: email
+    orgId: 1
+    receivers:
+      - type: email
+        uid: email-default
+        settings:
+          addresses: "{{ alerting_email_addresses | join(';') }}"  # list joined with ';'
+{% endif %}
\ No newline at end of file
diff --git a/deploy/roles/monitoring/templates/grafana-datasource-provisioning.yml.j2 b/deploy/roles/monitoring/templates/grafana-datasource-provisioning.yml.j2
new file mode 100644
index 0000000..dd8044e
--- /dev/null
+++ b/deploy/roles/monitoring/templates/grafana-datasource-provisioning.yml.j2
@@ -0,0 +1,9 @@
+{{ ansible_managed | comment }}
+apiVersion: 1
+datasources:
+  - uid: prometheus  # fixed UID; the Grafana alert rules reference it as datasourceUid
+    name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://127.0.0.1:9090
+    isDefault: true
\ No newline at end of file
diff --git a/deploy/roles/monitoring/templates/grafana-notification-policies.yml.j2 b/deploy/roles/monitoring/templates/grafana-notification-policies.yml.j2
new file mode 100644
index 0000000..3f695e6
--- /dev/null
+++ b/deploy/roles/monitoring/templates/grafana-notification-policies.yml.j2
@@ -0,0 +1,11 @@
+{{ ansible_managed | comment }}
+apiVersion: 1
+policies:
+  - orgId: 1
+    receiver: "{{ alerting_default_contact_point | default('telegram') }}"  # name of a contact point
+    group_wait: 30s
+    group_interval: 5m
+    repeat_interval: 4h
+    group_by:
+      - alertname
+      - instance
\ No newline at end of file
diff --git a/deploy/roles/monitoring/templates/munin.conf.j2 b/deploy/roles/monitoring/templates/munin.conf.j2
index be0aa9b..d86c431 100644
--- a/deploy/roles/monitoring/templates/munin.conf.j2
+++ b/deploy/roles/monitoring/templates/munin.conf.j2
@@ -83,8 +83,14 @@ includedir {{ munin_includedir }}
{% if munin_alerts %}
{% for contact in munin_alerts %}
contacts {{ contact.name }}
-contact.{{ contact.name }}.command mail -s "{{ contact.subject }}" {{ contact.email }}
-contact.{{ contact.name }}.always_send {{ contact.level }}
+{% if contact.type | default('email') == 'email' %}
+contact.{{ contact.name }}.command mail -s "{{ contact.subject | default('Munin Alert') }}" {{ contact.email }}
+{% elif contact.type == 'telegram' %}{# sent as plain text: the message carries no markup, and without parse_mode a "<" or "&" in a Munin value cannot make Telegram reject it #}
+contact.{{ contact.name }}.command curl -sf -X POST "https://api.telegram.org/bot{{ contact.bot_token }}/sendMessage" -d "chat_id={{ contact.chat_id }}" -d "text=Munin Alert%0AHost: ${var:host}%0AGraph: ${var:graph_title}%0AStatus: ${var:worst}%0ACategory: ${var:graph_category}"
+{% elif contact.type == 'slack' %}
+contact.{{ contact.name }}.command curl -sf -X POST "{{ contact.webhook_url }}" -H "Content-Type: application/json" -d '{"text":"*Munin Alert*\n*Host:* ${var:host}\n*Graph:* ${var:graph_title}\n*Status:* ${var:worst}\n*Category:* ${var:graph_category}"}'
+{% endif %}
+contact.{{ contact.name }}.always_send {{ contact.level | default('warning critical') }}
{% endfor %}
{% endif %}
diff --git a/deploy/roles/monitoring/templates/prometheus-rules.yml.j2 b/deploy/roles/monitoring/templates/prometheus-rules.yml.j2
new file mode 100644
index 0000000..5bb1231
--- /dev/null
+++ b/deploy/roles/monitoring/templates/prometheus-rules.yml.j2
@@ -0,0 +1,88 @@
+{{ ansible_managed | comment }}
+groups:
+  - name: dhis2_infrastructure  # host-level rules: target availability, CPU, memory, disk
+    rules:
+      - alert: InstanceDown  # any scrape target whose up metric is 0
+        expr: up == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Target {% raw %}{{ $labels.instance }}{% endraw %} is down"
+          description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }}{% endraw %} has been down for more than 5 minutes."
+
+      - alert: HighCPUUsage  # 100 minus the idle-mode CPU rate (5m window), averaged per instance
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > {{ alert_cpu_threshold }}
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage on {% raw %}{{ $labels.instance }}{% endraw %}"
+          description: "CPU usage is above {{ alert_cpu_threshold }}% for more than 10 minutes."
+
+      - alert: HighMemoryUsage  # percentage of total memory not reported as MemAvailable
+        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > {{ alert_memory_threshold }}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage on {% raw %}{{ $labels.instance }}{% endraw %}"
+          description: "Memory usage is above {{ alert_memory_threshold }}% for more than 5 minutes."
+
+      - alert: DiskSpaceWarning  # free-space percentage; tmpfs and overlay filesystems are excluded
+        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < {{ alert_disk_warning_pct }}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low disk space on {% raw %}{{ $labels.instance }}{% endraw %}"
+          description: "Filesystem {% raw %}{{ $labels.mountpoint }}{% endraw %} has less than {{ alert_disk_warning_pct }}% free space."
+
+      - alert: DiskSpaceCritical  # same expression as DiskSpaceWarning, compared against alert_disk_critical_pct
+        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < {{ alert_disk_critical_pct }}
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Critical disk space on {% raw %}{{ $labels.instance }}{% endraw %}"
+          description: "Filesystem {% raw %}{{ $labels.mountpoint }}{% endraw %} has less than {{ alert_disk_critical_pct }}% free space."
+
+  - name: dhis2_database  # rules built on the pg_* metrics
+    rules:
+      - alert: PostgresDown  # pg_up is 0
+        expr: pg_up == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "PostgreSQL is down on {% raw %}{{ $labels.instance }}{% endraw %}"
+          description: "PostgreSQL exporter reports the database is unreachable."
+
+      - alert: PostgresConnectionsHigh  # pg_stat_activity_count summed per instance, as a percentage of max_connections
+        expr: sum by(instance) (pg_stat_activity_count) / on(instance) pg_settings_max_connections * 100 > {{ alert_pg_connection_pct }}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High PostgreSQL connections on {% raw %}{{ $labels.instance }}{% endraw %}"
+          description: "Database connections exceed {{ alert_pg_connection_pct }}% of max_connections."
+
+      - alert: PostgresLongRunningQuery  # pg_stat_activity_max_tx_duration (state="active") above 3600
+        expr: pg_stat_activity_max_tx_duration{state="active"} > 3600
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Long-running query on {% raw %}{{ $labels.instance }}{% endraw %}"
+          description: "A query has been running for more than 1 hour."
+
+  - name: dhis2_application  # rules scoped to scrape jobs whose name contains "dhis"
+    rules:
+      - alert: DHIS2EndpointDown  # up == 0 restricted by the job=~".*dhis.*" matcher
+        expr: up{job=~".*dhis.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "DHIS2 endpoint {% raw %}{{ $labels.instance }}{% endraw %} is down"
+          description: "DHIS2 metrics endpoint has been unreachable for more than 5 minutes."
\ No newline at end of file
diff --git a/docs/alerting.md b/docs/alerting.md
new file mode 100644
index 0000000..e8fcec9
--- /dev/null
+++ b/docs/alerting.md
@@ -0,0 +1,295 @@
+# Alerting
+
+DHIS2 server tools supports alerting via Telegram, Slack, and email for both infrastructure monitoring (Grafana/Prometheus) and application monitoring (Glowroot APM).
+
+## Quick Start: Telegram
+
+Requires `server_monitoring=grafana` or `server_monitoring=grafana/prometheus`.
+
+### 1. Create a Telegram Bot
+
+- Message [@BotFather](https://t.me/BotFather) on Telegram
+- Send `/newbot`, follow the prompts
+- Save the bot token (e.g., `123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11`)
+
+### 2. Get Your Chat ID
+
+- Add the bot to your Telegram group
+- Message [@getmyid_bot](https://t.me/getmyid_bot) in the group, or use [@userinfobot](https://t.me/userinfobot)
+- Save the chat ID (e.g., `-1001234567890` for groups)
+
+### 3. Configure and Deploy
+
+Edit `deploy/inventory/hosts` and update the `[all:vars]` section:
+
+```ini
+server_monitoring=grafana
+alerting_enabled=true
+alerting_default_contact_point=telegram
+alerting_telegram_bot_token=123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11
+alerting_telegram_chat_id=-1001234567890
+```
+
+Then deploy:
+
+```bash
+ansible-playbook dhis2.yml --tags monitoring,alerting
+```
+
+After deployment, verify alerts are working -- see [Testing Alerts](#testing-alerts) below.
+
+> **Note:** The `inventory/hosts` file is gitignored, so your token won't be committed to version control. For production hardening, see [Securing Credentials with Vault](#securing-credentials-with-vault) below.
+
+## Quick Start: Slack
+
+Same as Telegram, but set these variables instead:
+
+```ini
+alerting_default_contact_point=slack
+alerting_slack_webhook_url=https://hooks.slack.com/services/T.../B.../xxx
+alerting_slack_channel=#dhis2-alerts
+```
+
+Create an incoming webhook at https://api.slack.com/messaging/webhooks.
+
+---
+
+## Alert Rules Reference
+
+### Infrastructure Alerts (Grafana/Prometheus)
+
+| Alert | Condition | Duration | Severity | Default Threshold |
+| ---------------------- | --------------------- | -------- | -------- | ----------------- |
+| Instance Down | Target unreachable | 5m | critical | up == 0 |
+| High CPU | CPU usage | 10m | warning | > 85% |
+| High Memory | Memory usage | 5m | warning | > 90% |
+| Disk Space Warning | Free space low | 5m | warning | < 15% |
+| Disk Space Critical | Free space very low | 5m | critical | < 5% |
+| PostgreSQL Down | DB unreachable | 2m | critical | pg_up == 0 |
+| PostgreSQL Connections | Connection saturation | 5m | warning | > 80% of max |
+| Long Running Query | Query duration | 5m | warning | > 1 hour |
+| DHIS2 Endpoint Down | Metrics unreachable | 5m | critical | up == 0 (jobs matching `dhis`) |
+
+### Glowroot APM Alerts (Optional)
+
+| Alert | Condition | Duration | Severity | Default Threshold |
+| ----------------- | ----------------- | -------- | -------- | ------------------------- |
+| Heartbeat | JVM/agent down | 5m | critical | No heartbeat |
+| Error Rate | Web errors | 5m | critical | > 10% |
+| Response Time p95 | Slow responses | 10m | high | > 10,000 ms |
+| Heap Memory | JVM heap pressure | 5m | high | > 80% of heap_memory_size |
+
+---
+
+## Customizing Thresholds
+
+Override these variables in `host_vars/monitor/vars.yml` or `inventory/hosts`:
+
+### Infrastructure Thresholds
+
+| Variable | Default | Description |
+| ------------------------- | ------- | --------------------------------- |
+| `alert_cpu_threshold` | 85 | CPU usage percentage |
+| `alert_memory_threshold` | 90 | Memory usage percentage |
+| `alert_disk_warning_pct` | 15 | Disk free space warning (%) |
+| `alert_disk_critical_pct` | 5 | Disk free space critical (%) |
+| `alert_pg_connection_pct` | 80 | PostgreSQL connections (% of max) |
+
+### Glowroot Thresholds
+
+| Variable | Default | Description |
+| ----------------------------------------------- | ------- | ----------------------------------- |
+| `glowroot_alert_p95_threshold_ms` | 10000 | p95 response time (ms) |
+| `glowroot_alert_p95_time_period_seconds` | 600 | Evaluation window for p95 |
+| `glowroot_alert_error_rate_threshold` | 10.0 | Error rate percentage |
+| `glowroot_alert_error_rate_time_period_seconds` | 300 | Evaluation window for errors |
+| `glowroot_alert_heartbeat_seconds` | 300 | Heartbeat timeout |
+| `glowroot_alert_min_transaction_count` | 10 | Min transactions before alert fires |
+
+---
+
+## Glowroot APM Alerts
+
+Glowroot alerts on transaction times, error rates, JVM metrics, and heartbeat. These require `app_monitoring=glowroot` (the default) and Grafana/Prometheus alerting configured first (steps 1-3 of [Quick Start: Telegram](#quick-start-telegram)), since they share the same bot token and chat ID.
+
+### How It Works
+
+Glowroot supports Slack natively but not Telegram. A Python forwarder bridges the gap:
+
+1. `glowroot-telegram-forwarder.py` runs as a systemd service on each instance host
+2. Glowroot's Slack webhook points at `http://127.0.0.1:9099` (the forwarder)
+3. The forwarder translates the Slack payload to a Telegram Bot API call
+
+The forwarder uses only Python 3 stdlib, binds to localhost, auto-restarts via systemd, and is independent of Tomcat. One forwarder per host serves all instances.
+
+### Enabling Glowroot Alerts
+
+Add to `inventory/hosts`:
+
+```ini
+glowroot_alerting_enabled=true
+```
+
+The forwarder uses the same `alerting_telegram_bot_token` and `alerting_telegram_chat_id` already configured for Grafana.
+
+### Deploying
+
+```bash
+ansible-playbook dhis2.yml --tags create-instance
+```
+
+---
+
+## Munin Alerts
+
+For users running `server_monitoring=munin`, configure the `munin_alerts` variable in `host_vars` or `group_vars`:
+
+### Telegram
+
+```yaml
+munin_alerts:
+  - name: telegram
+    type: telegram
+    bot_token: 'your-bot-token-from-botfather'
+    chat_id: '-1001234567890'
+    level: 'warning critical'
+```
+
+### Slack
+
+```yaml
+munin_alerts:
+  - name: slack
+    type: slack
+    webhook_url: 'https://hooks.slack.com/services/T.../B.../xxx'
+    level: 'warning critical'
+```
+
+### Email (Default)
+
+```yaml
+munin_alerts:
+  - name: admin
+    type: email
+    email: admin@example.com
+    subject: 'Munin Alert'
+    level: 'warning critical'
+```
+
+---
+
+## Testing Alerts
+
+### Test Telegram Delivery
+
+```bash
+curl -s -X POST "https://api.telegram.org/bot<BOT_TOKEN>/sendMessage" \
+  -d "chat_id=<CHAT_ID>" -d "text=Test alert from DHIS2 monitoring"
+```
+
+### Test Glowroot Forwarder
+
+```bash
+# Check service status
+systemctl status glowroot-telegram-forwarder
+
+# Send a test alert
+curl -s -X POST http://127.0.0.1:9099 \
+ -H "Content-Type: application/json" \
+ -d '{
+ "attachments": [{
+ "fallback": "[dhis2] Test alert - triggered",
+ "pretext": "[dhis2] Test alert triggered",
+ "color": "danger",
+ "text": "This is a test alert",
+ "ts": 1712500000.0
+ }],
+ "channel": "#alerts"
+ }'
+```
+
+### Verify Grafana Alerting
+
+```bash
+# List contact points (replace admin:admin with your Grafana admin credentials)
+curl -u admin:admin http://localhost:3000/grafana/api/v1/provisioning/contact-points
+
+# List alert rules
+curl -u admin:admin http://localhost:3000/grafana/api/v1/provisioning/alert-rules
+```
+
+### Verify Prometheus Rules
+
+```bash
+promtool check rules /etc/prometheus/rules/dhis2-alerts.yml
+```
+
+---
+
+## Troubleshooting
+
+**Bot not sending messages:**
+
+- Ensure the bot is added to the Telegram group/chat
+- Verify the chat ID sign (groups use negative IDs like `-1001234567890`)
+- Test with a direct curl to the Bot API
+
+**Grafana unified alerting not working:**
+
+- Check Grafana version is 9.0+ (`grafana-server -v`)
+- Verify `/etc/grafana/grafana.ini` has `[unified_alerting] enabled = true`
+- Check `/etc/grafana/provisioning/alerting/` directory exists and files are owned by `grafana`
+
+**Glowroot forwarder not running:**
+
+- Check `systemctl status glowroot-telegram-forwarder`
+- Check logs: `journalctl -u glowroot-telegram-forwarder`
+- Verify the script exists: `ls -la /opt/glowroot/glowroot-telegram-forwarder.py`
+
+**Prometheus rules not loading:**
+
+- Validate syntax: `promtool check rules /etc/prometheus/rules/dhis2-alerts.yml`
+- Check `/etc/prometheus/prometheus.yml` has `rule_files:` directive
+- Reload Prometheus: `systemctl reload prometheus`
+
+**No alerts firing:**
+
+- Alerts need the `for` duration to pass before firing (e.g., 5 minutes)
+- Check Grafana UI at `/grafana/alerting/list` for alert states
+- Verify Prometheus targets are being scraped at `/grafana/explore`
+
+---
+
+## Securing Credentials with Vault
+
+For production deployments where you want to encrypt tokens at rest, you can optionally move credentials to an ansible-vault encrypted file.
+
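+Create the directory structure (if you copied `monitor.template` to a plain file named `host_vars/monitor`, rename it first and move its contents into `host_vars/monitor/vars.yml`, since a file and a directory cannot share that name):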
+
+```bash
+cd deploy/inventory
+mkdir -p host_vars/monitor
+```
+
+`host_vars/monitor/vars.yml` (plaintext -- references the vault):
+
+```yaml
+alerting_enabled: true
+alerting_telegram_bot_token: '{{ vault_alerting_telegram_bot_token }}'
+alerting_telegram_chat_id: '-1001234567890'
+```
+
+`host_vars/monitor/vault.yml` (will be encrypted):
+
+```yaml
+vault_alerting_telegram_bot_token: '123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11'
+```
+
+Encrypt and deploy:
+
+```bash
+ansible-vault encrypt host_vars/monitor/vault.yml
+ansible-playbook dhis2.yml --tags monitoring,alerting --vault-id @prompt
+```
+
+Remove the token from `inventory/hosts` after moving it to the vault.