diff --git a/CHANGELOG.md b/CHANGELOG.md index d23799a..530cc1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Grafana dashboard for monitoring all gatekeeperd Prometheus metrics (request rates, latency, security events, relay operations, system health) +- Helm ConfigMap template for Grafana sidecar dashboard auto-provisioning (`grafana.dashboard.enabled`) +- Helm ServiceMonitor template for Prometheus Operator (`serviceMonitor.enabled`) +- Monitoring documentation (`docs/MONITORING.md`) covering dashboard setup, ServiceMonitor, and metrics reference +- Root-level `dashboards/grafana-gatekeeperd.json` for non-Kubernetes users (Docker, bare metal) + ## [0.2.7] - 2026-02-10 ### Added diff --git a/README.md b/README.md index b283299..ebed1fd 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ A webhook authentication, authorization, and validation proxy for enterprise env - [Planned Work](#planned-work) - [Installation](#installation) - [Usage](#usage) +- [Monitoring](#monitoring) - [Development](#development) - [Configuring TLS](#configuring-tls) - [Contributing](#contributing) @@ -431,6 +432,18 @@ See [docs/USAGE.md](docs/USAGE.md) for detailed command-line usage: --- +## Monitoring + +See [docs/MONITORING.md](docs/MONITORING.md) for: + +- Grafana dashboard setup (manual import or Helm sidecar provisioning) +- Prometheus ServiceMonitor configuration +- Complete metrics reference + +A pre-built dashboard is available at [`dashboards/grafana-gatekeeperd.json`](dashboards/grafana-gatekeeperd.json). 
+ +--- + ## Development See [docs/DEVELOPMENT.md](docs/DEVELOPMENT.md) for: diff --git a/charts/gatekeeperd/dashboards/gatekeeperd.json b/charts/gatekeeperd/dashboards/gatekeeperd.json new file mode 100644 index 0000000..ab1f650 --- /dev/null +++ b/charts/gatekeeperd/dashboards/gatekeeperd.json @@ -0,0 +1,1004 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Monitoring dashboard for gatekeeperd webhook proxy", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": 
null }, + { "color": "orange", "value": 90 }, + { "color": "green", "value": 99 } + ] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\", status=~\"2..\"}[$__rate_interval])) / sum(rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval])) * 100", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Success Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 0.1 }, + { "color": "red", "value": 1 } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\", status!~\"2..\"}[$__rate_interval]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + 
"fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(gatekeeper_relay_clients_connected{namespace=~\"$namespace\", instance=~\"$instance\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Relay Clients Connected", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "title": "Requests", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 5, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, + "expr": "sum by (hostname) (rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ hostname }}", + "refId": "A" + } + ], + "title": "Request Rate by Hostname", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { "id": "byRegexp", "options": "2.." }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byRegexp", "options": "4.." }, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byRegexp", "options": "5.." 
}, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 6, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (status) (rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ status }}", + "refId": "A" + } + ], + "title": "Requests by Status", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "id": 7, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(gatekeeper_request_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, 
+ { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(gatekeeper_request_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(gatekeeper_request_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Request Latency", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "scheme", "schemeName": "Oranges" }, + "custom": { + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "scaleDistribution": { "type": "log", "log": 2 } + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "id": 8, + "options": { + "calculate": false, + "cellGap": 1, + "color": { "exponent": 0.5, "fill": "dark-orange", "mode": "scheme", "scheme": "Oranges", "steps": 64 }, + "exemplars": { "color": "rgba(255,0,255,0.7)" }, + "filterValues": { "le": 1e-9 }, + "tooltip": { "mode": "single", "showColorScale": true, "yHistogram": true }, + "yAxis": { "axisPlacement": "left", "unit": "s" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (le) (rate(gatekeeper_request_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{ le }}", + "refId": "A" + } + ], + "title": "Request Latency Heatmap", + "type": "heatmap" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 102, + "title": "Security", + "type": "row" + }, + { + 
"datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 }, + "id": 9, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (verifier, reason) (rate(gatekeeper_verification_failures_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ verifier }} ({{ reason }})", + "refId": "A" + } + ], + "title": "Verification Failures", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + 
"spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 23 }, + "id": 10, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (allowlist) (rate(gatekeeper_ip_filter_denied_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ allowlist }}", + "refId": "A" + } + ], + "title": "IP Filter Denials", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }, + "id": 11, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (validator) (rate(gatekeeper_validation_failures_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": 
"{{ validator }}", + "refId": "A" + } + ], + "title": "Validation Failures", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "id": 103, + "title": "Relay", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }, + "id": 12, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_relay_webhooks_queued_total{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "Queued", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_relay_webhooks_delivered_total{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "Delivered", + "refId": "B" + } + ], + "title": "Relay Webhooks: Queued vs Delivered", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, + "id": 13, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(gatekeeper_relay_delivery_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(gatekeeper_relay_delivery_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(gatekeeper_relay_delivery_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Relay Delivery Latency", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 40 }, + "id": 14, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (reason) (rate(gatekeeper_relay_delivery_errors_total{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "{{ reason }}", + "refId": "A" + } + ], + "title": "Relay Delivery Errors", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 40 }, + "id": 15, + "options": { + 
"legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (token) (gatekeeper_relay_webhooks_pending{namespace=~\"$namespace\", instance=~\"$instance\"})", + "legendFormat": "{{ token }}", + "refId": "A" + } + ], + "title": "Pending Webhooks", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 40 }, + "id": 16, + "options": { + "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (token) (gatekeeper_relay_clients_connected{namespace=~\"$namespace\", instance=~\"$instance\"})", + "legendFormat": "{{ token }}", + "refId": "A" + } + ], + "title": "Clients per Token", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 48 }, + "id": 104, + "title": "System", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + 
"defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 49 }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "gatekeeper_ip_ranges_loaded{namespace=~\"$namespace\", instance=~\"$instance\"}", + "legendFormat": "{{ allowlist }}", + "refId": "A" + } + ], + "title": "IP Ranges Loaded", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 49 }, + "id": 18, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (allowlist) (rate(gatekeeper_ip_range_fetch_errors_total{namespace=~\"$namespace\", 
instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "{{ allowlist }}", + "refId": "A" + } + ], + "title": "IP Range Fetch Errors", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 49 }, + "id": 19, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (hostname, destination) (rate(gatekeeper_forward_errors_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ hostname }} -> {{ destination }}", + "refId": "A" + } + ], + "title": "Forward Errors", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": ["gatekeeperd", "webhook"], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": 
"label_values(gatekeeper_requests_total, namespace)", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(gatekeeper_requests_total, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(gatekeeper_requests_total{namespace=~\"$namespace\"}, instance)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "query": "label_values(gatekeeper_requests_total{namespace=~\"$namespace\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(gatekeeper_requests_total{namespace=~\"$namespace\"}, hostname)", + "hide": 0, + "includeAll": true, + "label": "Hostname", + "multi": true, + "name": "hostname", + "query": "label_values(gatekeeper_requests_total{namespace=~\"$namespace\"}, hostname)", + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Gatekeeperd", + "uid": "gatekeeperd", + "version": 1 +} diff --git a/charts/gatekeeperd/templates/grafana-dashboard-configmap.yaml b/charts/gatekeeperd/templates/grafana-dashboard-configmap.yaml new file mode 100644 index 0000000..b89c70d --- /dev/null +++ b/charts/gatekeeperd/templates/grafana-dashboard-configmap.yaml @@ -0,0 +1,22 @@ +{{- if .Values.grafana.dashboard.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gatekeeperd.fullname" . }}-grafana-dashboard + {{- if .Values.grafana.dashboard.namespace }} + namespace: {{ .Values.grafana.dashboard.namespace }} + {{- end }} + labels: + {{- include "gatekeeperd.labels" . 
| nindent 4 }} + {{ .Values.grafana.dashboard.sidecarLabel }}: "1" + {{- with .Values.grafana.dashboard.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.grafana.dashboard.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +data: + gatekeeperd.json: |- +{{ .Files.Get "dashboards/gatekeeperd.json" | indent 4 }} +{{- end }} diff --git a/charts/gatekeeperd/templates/servicemonitor.yaml b/charts/gatekeeperd/templates/servicemonitor.yaml new file mode 100644 index 0000000..31ce6e3 --- /dev/null +++ b/charts/gatekeeperd/templates/servicemonitor.yaml @@ -0,0 +1,26 @@ +{{- if .Values.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "gatekeeperd.fullname" . }} + {{- if .Values.serviceMonitor.namespace }} + namespace: {{ .Values.serviceMonitor.namespace }} + {{- end }} + labels: + {{- include "gatekeeperd.labels" . | nindent 4 }} + {{- with .Values.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "gatekeeperd.selectorLabels" . 
| nindent 6 }} + {{- if .Values.serviceMonitor.namespace }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + {{- end }} + endpoints: + - port: metrics + interval: {{ .Values.serviceMonitor.interval }} +{{- end }} diff --git a/charts/gatekeeperd/values.yaml b/charts/gatekeeperd/values.yaml index b9e303b..7ba3c36 100644 --- a/charts/gatekeeperd/values.yaml +++ b/charts/gatekeeperd/values.yaml @@ -339,3 +339,19 @@ valkey: limits: cpu: 200m memory: 256Mi + +# Grafana dashboard provisioning via sidecar +grafana: + dashboard: + enabled: false + sidecarLabel: grafana_dashboard + namespace: "" + labels: {} + annotations: {} + +# Prometheus Operator ServiceMonitor +serviceMonitor: + enabled: false + namespace: "" + interval: 30s + labels: {} diff --git a/dashboards/grafana-gatekeeperd.json b/dashboards/grafana-gatekeeperd.json new file mode 100644 index 0000000..ab1f650 --- /dev/null +++ b/dashboards/grafana-gatekeeperd.json @@ -0,0 +1,1004 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Monitoring dashboard for gatekeeperd webhook proxy", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + 
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "orange", "value": 90 }, + { "color": "green", "value": 99 } + ] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\", status=~\"2..\"}[$__rate_interval])) / sum(rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval])) * 100", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Success Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 0.1 }, + { "color": "red", "value": 1 } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + 
"id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\", status!~\"2..\"}[$__rate_interval]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(gatekeeper_relay_clients_connected{namespace=~\"$namespace\", instance=~\"$instance\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Relay Clients Connected", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "title": "Requests", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 5, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (hostname) (rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ hostname }}", + "refId": "A" + } + ], + "title": "Request Rate by Hostname", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { "id": "byRegexp", "options": "2.." }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byRegexp", "options": "4.." 
}, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byRegexp", "options": "5.." }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 6, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (status) (rate(gatekeeper_requests_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ status }}", + "refId": "A" + } + ], + "title": "Requests by Status", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "id": 7, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.50, sum by (le) 
(rate(gatekeeper_request_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(gatekeeper_request_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(gatekeeper_request_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Request Latency", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "scheme", "schemeName": "Oranges" }, + "custom": { + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "scaleDistribution": { "type": "log", "log": 2 } + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "id": 8, + "options": { + "calculate": false, + "cellGap": 1, + "color": { "exponent": 0.5, "fill": "dark-orange", "mode": "scheme", "scheme": "Oranges", "steps": 64 }, + "exemplars": { "color": "rgba(255,0,255,0.7)" }, + "filterValues": { "le": 1e-9 }, + "tooltip": { "mode": "single", "showColorScale": true, "yHistogram": true }, + "yAxis": { "axisPlacement": "left", "unit": "s" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (le) (rate(gatekeeper_request_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{ le }}", + "refId": "A" + } + ], + "title": 
"Request Latency Heatmap", + "type": "heatmap" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 102, + "title": "Security", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 }, + "id": 9, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (verifier, reason) (rate(gatekeeper_verification_failures_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ verifier }} ({{ reason }})", + "refId": "A" + } + ], + "title": "Verification Failures", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": 
false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 23 }, + "id": 10, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (allowlist) (rate(gatekeeper_ip_filter_denied_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ allowlist }}", + "refId": "A" + } + ], + "title": "IP Filter Denials", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }, + "id": 11, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + 
"expr": "sum by (validator) (rate(gatekeeper_validation_failures_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ validator }}", + "refId": "A" + } + ], + "title": "Validation Failures", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "id": 103, + "title": "Relay", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }, + "id": 12, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_relay_webhooks_queued_total{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "Queued", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(gatekeeper_relay_webhooks_delivered_total{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "Delivered", + "refId": "B" + } + ], + "title": "Relay Webhooks: Queued vs Delivered", + 
"type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, + "id": 13, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(gatekeeper_relay_delivery_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(gatekeeper_relay_delivery_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(gatekeeper_relay_delivery_duration_seconds_bucket{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Relay Delivery Latency", + "type": "timeseries" + }, + { + 
"datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 40 }, + "id": 14, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (reason) (rate(gatekeeper_relay_delivery_errors_total{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "{{ reason }}", + "refId": "A" + } + ], + "title": "Relay Delivery Errors", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": 
"none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 40 }, + "id": 15, + "options": { + "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (token) (gatekeeper_relay_webhooks_pending{namespace=~\"$namespace\", instance=~\"$instance\"})", + "legendFormat": "{{ token }}", + "refId": "A" + } + ], + "title": "Pending Webhooks", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 40 }, + "id": 16, + "options": { + "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (token) (gatekeeper_relay_clients_connected{namespace=~\"$namespace\", instance=~\"$instance\"})", + "legendFormat": "{{ token }}", + "refId": "A" + } + ], + "title": "Clients per Token", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, 
"w": 24, "x": 0, "y": 48 }, + "id": 104, + "title": "System", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 49 }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "gatekeeper_ip_ranges_loaded{namespace=~\"$namespace\", instance=~\"$instance\"}", + "legendFormat": "{{ allowlist }}", + "refId": "A" + } + ], + "title": "IP Ranges Loaded", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 49 }, + "id": 18, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": 
{ "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (allowlist) (rate(gatekeeper_ip_range_fetch_errors_total{namespace=~\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "{{ allowlist }}", + "refId": "A" + } + ], + "title": "IP Range Fetch Errors", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 49 }, + "id": 19, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum by (hostname, destination) (rate(gatekeeper_forward_errors_total{namespace=~\"$namespace\", instance=~\"$instance\", hostname=~\"$hostname\"}[$__rate_interval]))", + "legendFormat": "{{ hostname }} -> {{ destination }}", + "refId": "A" + } + ], + "title": "Forward Errors", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": ["gatekeeperd", "webhook"], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", 
+ "type": "datasource" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(gatekeeper_requests_total, namespace)", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(gatekeeper_requests_total, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(gatekeeper_requests_total{namespace=~\"$namespace\"}, instance)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "query": "label_values(gatekeeper_requests_total{namespace=~\"$namespace\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(gatekeeper_requests_total{namespace=~\"$namespace\"}, hostname)", + "hide": 0, + "includeAll": true, + "label": "Hostname", + "multi": true, + "name": "hostname", + "query": "label_values(gatekeeper_requests_total{namespace=~\"$namespace\"}, hostname)", + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Gatekeeperd", + "uid": "gatekeeperd", + "version": 1 +} diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml new file mode 100644 index 0000000..c539e34 --- /dev/null +++ b/docker-compose.monitoring.yml @@ -0,0 +1,28 @@ +# Monitoring overlay — Prometheus + Grafana +# +# Usage: +# docker-compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d +# docker-compose -f docker-compose.yml -f docker-compose.monitoring.yml down + +services: + prometheus: + image: prom/prometheus + ports: + - "9091:9090" + volumes: + - 
./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro + depends_on: + - gatekeeperd + + grafana: + image: grafana/grafana + ports: + - "3000:3000" + volumes: + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ./dashboards:/var/lib/grafana/dashboards:ro + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + depends_on: + - prometheus diff --git a/docker-compose.yml b/docker-compose.yml index 1518b3c..f8b1fed 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,9 +1,11 @@ # Docker Compose for local development and testing # # Usage: -# docker-compose up # Build and start server only -# docker-compose --profile relay up # Build and start server + relay client -# docker-compose down # Stop all services +# docker-compose up # Build and start server only +# docker-compose --profile relay up # Build and start server + relay client +# docker-compose -f docker-compose.yml \ +# -f docker-compose.monitoring.yml up # Start with Prometheus + Grafana +# docker-compose down # Stop all services # # Using pre-built images (PR builds or releases): # GATEKEEPERD_IMAGE=ghcr.io/tight-line/gatekeeperd:pr-123-abc1234 \ diff --git a/docs/MONITORING.md b/docs/MONITORING.md new file mode 100644 index 0000000..2fffa93 --- /dev/null +++ b/docs/MONITORING.md @@ -0,0 +1,95 @@ +# Monitoring + +Gatekeeper exposes Prometheus metrics on the metrics port (default `9090`) at `/metrics`. A pre-built Grafana dashboard is included for visualizing all metrics. + +## Grafana Dashboard + +The dashboard JSON is available at [`dashboards/grafana-gatekeeperd.json`](../dashboards/grafana-gatekeeperd.json). 
It covers all 14 metrics exported by gatekeeperd, organized into five sections: + +- **Overview** - Request rate, success rate, error rate, connected relay clients +- **Requests** - Rate by hostname, by status code, latency percentiles (p50/p95/p99), latency heatmap +- **Security** - Verification failures by verifier/reason, IP filter denials, validation failures +- **Relay** - Webhooks queued vs delivered, delivery latency, delivery errors, pending webhooks, clients per token +- **System** - IP ranges loaded per allowlist, IP range fetch errors, forward errors by hostname/destination + +### Template Variables + +The dashboard includes four template variables for filtering: + +| Variable | Description | +|----------|-------------| +| `datasource` | Prometheus datasource to query | +| `namespace` | Kubernetes namespace (supports "All") | +| `instance` | Instance selector (supports "All") | +| `hostname` | Route hostname filter (supports multi-select) | + +### Manual Import (Docker, Bare Metal) + +1. Open Grafana and navigate to **Dashboards > Import** +2. Upload `dashboards/grafana-gatekeeperd.json` or paste its contents +3. Select your Prometheus datasource +4. Click **Import** + +Ensure your Prometheus instance is scraping the gatekeeperd metrics endpoint (`:9090/metrics`). + +### Helm / Kubernetes + +#### Dashboard ConfigMap (Grafana Sidecar) + +If you use the [Grafana sidecar](https://github.com/grafana/helm-charts/tree/main/charts/grafana) to auto-provision dashboards, enable the ConfigMap in your Helm values: + +```yaml +grafana: + dashboard: + enabled: true +``` + +This creates a ConfigMap with the `grafana_dashboard` label, which the Grafana sidecar picks up automatically. 
You can customize the label, namespace, and annotations: + +```yaml +grafana: + dashboard: + enabled: true + sidecarLabel: grafana_dashboard # default + namespace: monitoring # deploy ConfigMap to a specific namespace + labels: {} + annotations: {} +``` + +#### ServiceMonitor (Prometheus Operator) + +If you use the [Prometheus Operator](https://prometheus-operator.dev/), enable the ServiceMonitor: + +```yaml +serviceMonitor: + enabled: true +``` + +This creates a ServiceMonitor that tells Prometheus to scrape the gatekeeperd metrics port. You can customize the scrape interval, namespace, and labels: + +```yaml +serviceMonitor: + enabled: true + interval: 30s # default + namespace: "" # deploy to a specific namespace + labels: {} # additional labels for ServiceMonitor selection +``` + +## Metrics Reference + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `gatekeeper_requests_total` | Counter | `hostname`, `status`, `namespace`, `instance` | Total HTTP requests processed | +| `gatekeeper_request_duration_seconds` | Histogram | `hostname`, `namespace`, `instance` | Request processing duration | +| `gatekeeper_verification_failures_total` | Counter | `verifier`, `reason`, `hostname`, `namespace`, `instance` | Webhook signature verification failures | +| `gatekeeper_validation_failures_total` | Counter | `validator`, `hostname`, `namespace`, `instance` | Payload schema validation failures | +| `gatekeeper_ip_filter_denied_total` | Counter | `allowlist`, `hostname`, `namespace`, `instance` | Requests denied by IP allowlist | +| `gatekeeper_ip_ranges_loaded` | Gauge | `allowlist`, `namespace`, `instance` | Number of IP ranges currently loaded per allowlist | +| `gatekeeper_ip_range_fetch_errors_total` | Counter | `allowlist`, `namespace`, `instance` | Errors fetching IP range updates | +| `gatekeeper_forward_errors_total` | Counter | `hostname`, `destination`, `namespace`, `instance` | Errors forwarding requests to backends | +| 
`gatekeeper_relay_webhooks_queued_total` | Counter | `namespace`, `instance` | Total webhooks queued for relay delivery | +| `gatekeeper_relay_webhooks_delivered_total` | Counter | `namespace`, `instance` | Total webhooks delivered via relay | +| `gatekeeper_relay_delivery_errors_total` | Counter | `reason`, `namespace`, `instance` | Relay delivery errors | +| `gatekeeper_relay_webhooks_pending` | Gauge | `token`, `namespace`, `instance` | Webhooks currently pending delivery per relay token | +| `gatekeeper_relay_clients_connected` | Gauge | `token`, `namespace`, `instance` | Relay clients currently connected per token | +| `gatekeeper_relay_delivery_duration_seconds` | Histogram | `namespace`, `instance` | Relay webhook delivery duration | diff --git a/internal/relay/redis_manager.go b/internal/relay/redis_manager.go index 7792a34..332c04f 100644 --- a/internal/relay/redis_manager.go +++ b/internal/relay/redis_manager.go @@ -351,12 +351,12 @@ func (m *RedisManager) pollNewMessage(ctx context.Context, key string) (*Webhook Block: blockTimeout, }).Result() + // coverage:ignore - timing edge case: XReadGroup timeout vs context cancellation race if err == redis.Nil { if ctx.Err() != nil { return nil, ctx.Err() } - // coverage:ignore - timing edge case: block times out before context expires (rare) - return nil, nil // Continue polling + return nil, nil // coverage:ignore - block timeout before context expires } if err != nil { if ctx.Err() != nil { diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..279b22b --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,7 @@ +apiVersion: 1 + +providers: + - name: default + type: file + options: + path: /var/lib/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml new file mode 100644 
index 0000000..86fd346 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,8 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml new file mode 100644 index 0000000..f684adf --- /dev/null +++ b/monitoring/prometheus.yml @@ -0,0 +1,7 @@ +global: + scrape_interval: 5s + +scrape_configs: + - job_name: gatekeeperd + static_configs: + - targets: ["gatekeeperd:9090"]