From 4af0f5dfe047dae6cd305905cfa77a65214a9e02 Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 10:43:05 -0600 Subject: [PATCH 01/12] Manage dashboards via IaC --- .../grafana_dashboards/alerts_dashboard.json | 2687 +++++++++++++++++ python-pulumi/src/ptd/paths.py | 4 + .../ptd/pulumi_resources/aws_eks_cluster.py | 55 +- 3 files changed, 2745 insertions(+), 1 deletion(-) create mode 100644 python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json diff --git a/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json b/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json new file mode 100644 index 0000000..f7ea756 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json @@ -0,0 +1,2687 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 42, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 14, + "panels": [], + "title": "Cloudwatch", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 325000000, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 314572800 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "aws_ec2_network_out_average{cluster=\"$Cluster\", job=\"integrations/cloudwatch\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "EC2 Network Out High", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 410000, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 400000 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "aws_ec2_network_packets_out_average{cluster=\"$Cluster\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "EC2 Network Packets Out High", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 100, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "expr": "aws_fsx_used_storage_capacity_average{cluster=\"$Cluster\"} / aws_fsx_storage_capacity_average{cluster=\"$Cluster\"} * 100", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "C" + } + ], + "title": "FSx Capacity", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 5, + "panels": [], + "title": "Pods", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(pod) (kube_pod_container_status_terminated_reason{cluster=\"$Cluster\", reason!=\"Completed\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "PodError", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(phase, pod) (kube_pod_status_phase{phase=~\"Failed|Pending|Unknown\", cluster=\"$Cluster\"}) > 0", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{pod}}: {{phase}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "PodNotHealthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "Shows the delta between expected and ready Deployments in the selected cluster. Any non-zero value satisfies the alert condition.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "kube_deployment_status_replicas{cluster=\"$Cluster\"} \n - \n kube_deployment_status_replicas_ready{cluster=\"$Cluster\"} > 0", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{deployment}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "DeploymentReplicasMismatch", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "Shows the delta between expected and ready Statefulsets in the selected cluster. Any non-zero value satisfies the alert condition.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "loki-write" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "expr": "kube_statefulset_status_replicas{cluster=\"$Cluster\"} \n - \n kube_statefulset_status_replicas_ready{cluster=\"$Cluster\"}", + "hide": false, + "instant": false, + "legendFormat": "{{statefulset}}", + "range": true, + "refId": "C" + } + ], + "title": "StatefulSetReplicasMismatch", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "increase(kube_pod_container_status_restarts_total{cluster=\"$Cluster\"}[15m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{namespace}}:{{pod}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "PodRestarts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "points", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "count by(pod) (kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", cluster=\"$Cluster\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "CrashLoopBackOff", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 6, + "panels": [], + "title": "Nodes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(node) (kube_node_status_condition{cluster=\"$Cluster\", condition=\"MemoryPressure\", status=\"true\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "NodeMemoryPressure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(node) (kube_node_status_condition{cluster=\"$Cluster\", condition=\"DiskPressure\", status=\"true\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "NodeDiskPressure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(node, condition) (kube_node_status_condition{cluster=\"$Cluster\", condition!=\"Ready\", status=\"true\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{node}}:{{condition}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Node Not Ready", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 59 + }, + "id": 12, + "panels": [], + "title": "Healthchecks", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "Displays HTTP response codes from the products. Includes both external \"fqdn\" checks and kuberenetes \"internal\" service URL checks. A value of 200 means everything is OK.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "points", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "avg by(ptd_component, check_type, ptd_site) (probe_http_status_code{cluster=\"$Cluster\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{check_type}}:{{ptd_site}}-{{ptd_component}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Healthchecks", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 68 + }, + "id": 18, + "panels": [], + "title": "Applications", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "Loki ingester has experienced WAL disk full failures. This indicates storage issues with the Loki WAL directory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 0 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 69 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "loki_ingester_wal_disk_full_failures_total{cluster=\"$Cluster\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Loki WAL Disk Full Failures", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 77 + }, + "id": 20, + "panels": [], + "title": "Mimir", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "No metrics have been received from workload cluster. This could indicate Alloy is not running, network issues between the workload and control room, or the workload cluster is down.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "points", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "up{job=\"prometheus.scrape.kube_state_metrics\", cluster=\"$Cluster\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Workload Metrics Silent", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 86 + }, + "id": 22, + "panels": [], + "title": "RDS", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "RDS instance CPU utilization is above 80% for more than 10 minutes.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 100, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 80 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 87 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "exemplar": false, + "expr": "aws_rds_cpuutilization_average{job=\"integrations/cloudwatch\", cluster=\"$Cluster\"}", + "format": "time_series", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "RDS CPU Utilization High", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "RDS instance has less than 5 GiB of free storage space remaining.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + }, + { + "color": "green", + "value": 5368709120 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "yellow" + }, + { + "color": "green", + "value": 5368709120 + } + ] + } + } + ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "{__name__=\"aws_rds_free_storage_space_average\", __tenant_id__=\"654654567442\", account_id=\"654654567442\", cluster=\"default_demo01-staging-20250423-control-plane\", dimension_DatabaseClass=\"db.t3.small\", instance=\"29b39e68f4c08b716b6a68b333b21f5f\", job=\"integrations/cloudwatch\", name=\"global\", region=\"us-east-2\"}" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 87 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "expr": "aws_rds_free_storage_space_average{job=\"integrations/cloudwatch\", cluster=\"$Cluster\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "RDS Free Storage Low", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "RDS instance has less than 512 MiB of freeable memory remaining for more than 10 minutes.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + }, + { + "color": "green", + "value": 536870912 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "yellow" + }, + { + "color": "green", + "value": 536870912 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 95 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "aws_rds_freeable_memory_average{job=\"integrations/cloudwatch\", cluster=\"$Cluster\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "RDS Freeable Memory Low", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "RDS instance has more than 80 active database connections for more than 5 minutes.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 90, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 80 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 95 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "expr": "aws_rds_database_connections_average{job=\"integrations/cloudwatch\", cluster=\"$Cluster\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "RDS Database Connections High", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 103 + }, + "id": 27, + "panels": [], + "title": "LoadBalancer", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "Application Load Balancer has more than 10 target 5XX errors for over 5 minutes, indicating backend service failures.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 10 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 104 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "expr": "aws_applicationelb_httpcode_target_5_xx_count_sum{job=\"integrations/cloudwatch\",cluster=\"$Cluster\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "ALB Target 5XX Errors High", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 12, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 104 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "expr": "aws_applicationelb_un_healthy_host_count_average{job=\"integrations/cloudwatch\",cluster=\"$Cluster\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "ALB Unhealthy Targets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 0 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 112 + }, + "id": 30, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "expr": "aws_networkelb_un_healthy_host_count_average{job=\"integrations/cloudwatch\",cluster=\"$Cluster\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "NLB Unhealthy Targets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "description": "Application Load Balancer target response time is above 2 seconds for more than 10 minutes, indicating performance degradation.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 2 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 2 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 112 + }, + "id": 31, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "editorMode": "code", + "expr": "aws_applicationelb_target_response_time_average{job=\"integrations/cloudwatch\",cluster=\"$Cluster\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "ALB Response Latency High", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 38, + "tags": [], + "templating": { + "list": [ + { + "allValue": "", + "current": { + "selected": false, + "text": "default_academy01-staging-20250415-control-plane", + "value": "default_academy01-staging-20250415-control-plane" + }, + "datasource": { + "type": "prometheus", + "uid": "mimir" + }, + "definition": "label_values(kube_pod_status_phase,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "Cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_pod_status_phase,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Alert-Based Dashboard", + "uid": "", + "version": 8, + "weekStart": "" +} \ No newline at end of file diff --git a/python-pulumi/src/ptd/paths.py b/python-pulumi/src/ptd/paths.py index f747d21..e3ee6ab 100644 --- a/python-pulumi/src/ptd/paths.py +++ b/python-pulumi/src/ptd/paths.py @@ -25,6 +25,10 @@ def alerts() -> pathlib.Path: return top() / "python-pulumi" / "src" / "ptd" / "grafana_alerts" +def dashboards() -> pathlib.Path: + return top() / "python-pulumi" / "src" / "ptd" / "grafana_dashboards" + + class Paths: @property def root(self) -> pathlib.Path: diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 7d97f0c..63db3d4 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1950,6 +1950,9 @@ def with_grafana( # Create alert configmaps for all YAML files in the alerts directory self._create_alert_configmaps(grafana_ns) + # Create dashboard configmaps for all JSON files in the dashboards directory + self._create_dashboard_configmaps(grafana_ns) + # TODO: auth.proxy should be configurable, prod grafana auth will need tighter controls than letting anyone in as an Editor k8s.helm.v3.Release( f"{self.name}-grafana", @@ -2073,7 +2076,13 @@ def with_grafana( "alerts": { "enabled": True, "searchNamespace": "grafana", - } + }, + "dashboards": { + "enabled": True, + "searchNamespace": "grafana", + "label": "grafana_dashboard", + "folder": "/tmp/dashboards", + }, }, }, ), @@ -2515,6 +2524,50 @@ def _create_alert_configmaps(self, ns: k8s.core.v1.Namespace): opts=pulumi.ResourceOptions(parent=self, provider=self.provider, depends_on=ns), ) + def _create_dashboard_configmaps(self, ns: k8s.core.v1.Namespace): + """ + Create ConfigMaps for Grafana dashboard provisioning. + + Reads all JSON files from grafana_dashboards/ directory and creates + Kubernetes ConfigMaps with the grafana_dashboard label. The Grafana + sidecar watches for this label and provisions dashboards automatically. + + Dashboard UIDs are enforced to match the filename (without .json extension) + to ensure idempotent updates and prevent duplicate dashboards. + """ + import json + + dashboards_dir = ptd.paths.dashboards() + + for dashboard_file in sorted(dashboards_dir.glob("*.json")): + dashboard_name = dashboard_file.stem + + # Read and parse JSON + try: + with open(dashboard_file) as f: + dashboard_json = json.load(f) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in {dashboard_file}: {e}") + + # Enforce UID = filename for idempotency + # Set id to null (Grafana provisioning ignores it) + dashboard_json["uid"] = dashboard_name + dashboard_json["id"] = None + + # Convert back to JSON string + dashboard_content = json.dumps(dashboard_json, indent=2) + + k8s.core.v1.ConfigMap( + f"{self.name}-grafana-{dashboard_name}-dashboard", + metadata={ + "name": f"grafana-{dashboard_name}-dashboard", + "namespace": "grafana", + "labels": {"grafana_dashboard": "1"}, + }, + data={f"{dashboard_name}.json": dashboard_content}, + opts=pulumi.ResourceOptions(parent=self, provider=self.provider, depends_on=ns), + ) + def setup_tailscale_access(self): sg_name = f"{self.sg_prefix}-tailscale" self.eks.vpc_config.apply(lambda config: self._setup_sg_access(sg_name, config.vpc_id)) From ce3463fab23023b236fa6bc9816b987367adcbfb Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 11:23:52 -0600 Subject: [PATCH 02/12] Handle roborev feedback --- .../grafana_dashboards/alerts_dashboard.json | 31 +------------------ .../ptd/pulumi_resources/aws_eks_cluster.py | 1 - 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json b/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json index f7ea756..ef2093f 100644 --- a/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json +++ b/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 42, + "id": null, "links": [], "liveNow": false, "panels": [ @@ -1869,30 +1869,6 @@ } } ] - }, - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "{__name__=\"aws_rds_free_storage_space_average\", __tenant_id__=\"654654567442\", account_id=\"654654567442\", cluster=\"default_demo01-staging-20250423-control-plane\", dimension_DatabaseClass=\"db.t3.small\", instance=\"29b39e68f4c08b716b6a68b333b21f5f\", job=\"integrations/cloudwatch\", name=\"global\", region=\"us-east-2\"}" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] } ] }, @@ -2646,11 +2622,6 @@ "list": [ { "allValue": "", - "current": { - "selected": false, - "text": "default_academy01-staging-20250415-control-plane", - "value": "default_academy01-staging-20250415-control-plane" - }, "datasource": { "type": "prometheus", "uid": "mimir" diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 63db3d4..3a2346b 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -2081,7 +2081,6 @@ def with_grafana( "enabled": True, "searchNamespace": "grafana", "label": "grafana_dashboard", - "folder": "/tmp/dashboards", }, }, }, From 10082536a292d5dfedd040a734e9410d2bb124d3 Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 12:14:07 -0600 Subject: [PATCH 03/12] fix(python-pulumi): sanitize dashboard names for Kubernetes RFC 1123 compliance Dashboard filenames with underscores (e.g., alerts_dashboard.json) were causing Kubernetes ConfigMap creation to fail because underscores violate RFC 1123 naming rules. This fix sanitizes dashboard names by replacing underscores with hyphens for Kubernetes resource names while preserving the original filename structure for Grafana UIDs and data keys. Co-authored-by: Claude Sonnet 4.5 --- python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 3a2346b..7557f16 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -2540,6 +2540,8 @@ def _create_dashboard_configmaps(self, ns: k8s.core.v1.Namespace): for dashboard_file in sorted(dashboards_dir.glob("*.json")): dashboard_name = dashboard_file.stem + # Sanitize name for Kubernetes (RFC 1123: only lowercase alphanumeric, '-', '.') + k8s_safe_name = dashboard_name.replace("_", "-") # Read and parse JSON try: @@ -2557,9 +2559,9 @@ def _create_dashboard_configmaps(self, ns: k8s.core.v1.Namespace): dashboard_content = json.dumps(dashboard_json, indent=2) k8s.core.v1.ConfigMap( - f"{self.name}-grafana-{dashboard_name}-dashboard", + f"{self.name}-grafana-{k8s_safe_name}-dashboard", metadata={ - "name": f"grafana-{dashboard_name}-dashboard", + "name": f"grafana-{k8s_safe_name}-dashboard", "namespace": "grafana", "labels": {"grafana_dashboard": "1"}, }, From 91e56c03e8a612de3937e70731c73f67f5877e06 Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 12:21:35 -0600 Subject: [PATCH 04/12] fix(python-pulumi): implement robust RFC 1123 name sanitization with validation Improves dashboard ConfigMap naming to handle all RFC 1123 edge cases: 1. **Added sanitize_k8s_name() utility function** (lib.py): - Converts to lowercase - Replaces all non-alphanumeric chars (including underscores, dots) with hyphens - Collapses consecutive hyphens into single hyphen - Strips leading/trailing hyphens - Validates result matches RFC 1123 pattern: ^[a-z0-9]([a-z0-9-]*[a-z0-9])?$ 2. **Updated dashboard provisioning** (aws_eks_cluster.py): - Uses new sanitize_k8s_name() function instead of inline replace - Validation now catches invalid names before Kubernetes API errors 3. **Comprehensive test coverage** (test_lib.py, test_dashboard_configmaps.py): - 40+ unit tests for sanitization edge cases - Tests for leading/trailing special chars, uppercase, consecutive chars - Integration tests for ConfigMap structure and naming patterns - RFC 1123 pattern validation tests Fixes: dashboard names with underscores (alerts_dashboard.json) now correctly sanitize to Kubernetes-compliant names (grafana-alerts-dashboard-dashboard). Addresses review feedback: complete RFC 1123 sanitization, validation, and tests. Co-authored-by: Claude Sonnet 4.5 --- .../ptd/pulumi_resources/aws_eks_cluster.py | 6 +- python-pulumi/src/ptd/pulumi_resources/lib.py | 41 +++++ .../tests/test_dashboard_configmaps.py | 171 ++++++++++++++++++ python-pulumi/tests/test_lib.py | 124 ++++++++++++- 4 files changed, 339 insertions(+), 3 deletions(-) create mode 100644 python-pulumi/tests/test_dashboard_configmaps.py diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 7557f16..f170497 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -19,6 +19,7 @@ import ptd.junkdrawer import ptd.oidc import ptd.paths +import ptd.pulumi_resources.lib import ptd.secrecy @@ -2540,8 +2541,9 @@ def _create_dashboard_configmaps(self, ns: k8s.core.v1.Namespace): for dashboard_file in sorted(dashboards_dir.glob("*.json")): dashboard_name = dashboard_file.stem - # Sanitize name for Kubernetes (RFC 1123: only lowercase alphanumeric, '-', '.') - k8s_safe_name = dashboard_name.replace("_", "-") + + # Sanitize name for Kubernetes RFC 1123 compliance + k8s_safe_name = ptd.pulumi_resources.lib.sanitize_k8s_name(dashboard_name) # Read and parse JSON try: diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index f292ad1..d05f332 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -1,7 +1,48 @@ +import re + _AWS_TAG_KEY_MAX_LENGTH = 128 _AWS_TAG_VALUE_MAX_LENGTH = 256 +def sanitize_k8s_name(name: str) -> str: + """Sanitize a name to be RFC 1123 compliant for Kubernetes resources. + + RFC 1123 subdomain rules: + - Must contain only lowercase alphanumeric characters and hyphens + - Must start and end with an alphanumeric character + - Maximum length is 253 characters (not enforced here) + + This function: + 1. Converts to lowercase + 2. Replaces all non-alphanumeric characters with hyphens + 3. Collapses consecutive hyphens into a single hyphen + 4. Strips leading/trailing hyphens + + Raises: + ValueError: If the sanitized name is empty or still RFC 1123 non-compliant + """ + if not name: + msg = "Name cannot be empty" + raise ValueError(msg) + + # Convert to lowercase and replace invalid chars with hyphens + sanitized = re.sub(r'[^a-z0-9-]', '-', name.lower()) + + # Collapse consecutive hyphens into single hyphen + sanitized = re.sub(r'-+', '-', sanitized) + + # Strip leading/trailing hyphens + sanitized = sanitized.strip('-') + + # Validate the result matches RFC 1123 subdomain pattern + # Pattern: must start/end with alphanumeric, can contain hyphens in between + if not sanitized or not re.match(r'^[a-z0-9]([a-z0-9-]*[a-z0-9])?$', sanitized): + msg = f"Name '{name}' cannot be sanitized to RFC 1123 format (result: '{sanitized}')" + raise ValueError(msg) + + return sanitized + + def format_lb_tags(tags: dict[str, str]) -> str: """Format tags as comma-separated key=value pairs for AWS LB Controller annotations. diff --git a/python-pulumi/tests/test_dashboard_configmaps.py b/python-pulumi/tests/test_dashboard_configmaps.py new file mode 100644 index 0000000..faabe88 --- /dev/null +++ b/python-pulumi/tests/test_dashboard_configmaps.py @@ -0,0 +1,171 @@ +"""Tests for Grafana dashboard ConfigMap provisioning in aws_eks_cluster.py""" + +import json +import tempfile +from pathlib import Path + +import pytest + +from ptd.pulumi_resources.lib import sanitize_k8s_name + + +def test_dashboard_name_sanitization_basic(): + """Test basic underscore replacement for dashboard names""" + assert sanitize_k8s_name("alerts_dashboard") == "alerts-dashboard" + + +def test_dashboard_name_sanitization_edge_cases(): + """Test edge cases in dashboard name sanitization""" + # Leading/trailing underscores + assert sanitize_k8s_name("_dashboard_") == "dashboard" + + # Mixed case and special chars + assert sanitize_k8s_name("My_Custom_Dashboard!") == "my-custom-dashboard" + + # Multiple consecutive special chars + assert sanitize_k8s_name("test___dashboard") == "test-dashboard" + + +def test_dashboard_name_sanitization_invalid(): + """Test that invalid dashboard names raise errors""" + with pytest.raises(ValueError, match="cannot be sanitized to RFC 1123 format"): + sanitize_k8s_name("___") # Only underscores + + with pytest.raises(ValueError, match="Name cannot be empty"): + sanitize_k8s_name("") # Empty string + + +def test_dashboard_configmap_structure(): + """Test the expected structure of dashboard ConfigMaps""" + # This is a documentation test showing the expected structure + dashboard_name = "alerts_dashboard" + k8s_safe_name = sanitize_k8s_name(dashboard_name) + cluster_name = "test-cluster" + + # Expected ConfigMap structure + expected_metadata = { + "name": f"grafana-{k8s_safe_name}-dashboard", + "namespace": "grafana", + "labels": {"grafana_dashboard": "1"}, + } + + # Verify sanitized name is RFC 1123 compliant + assert k8s_safe_name == "alerts-dashboard" + assert expected_metadata["name"] == "grafana-alerts-dashboard-dashboard" + + # Verify the data key uses original dashboard name (with underscore) + expected_data_key = f"{dashboard_name}.json" + assert expected_data_key == "alerts_dashboard.json" + + +def test_dashboard_uid_enforcement(): + """Test that dashboard UID is enforced to match filename""" + dashboard_name = "my_test_dashboard" + + # Simulate the dashboard JSON structure + dashboard_json = { + "title": "My Test Dashboard", + "uid": "old_uid", # This should be overwritten + "id": 123, # This should be set to None + } + + # What the code does: + dashboard_json["uid"] = dashboard_name + dashboard_json["id"] = None + + assert dashboard_json["uid"] == "my_test_dashboard" + assert dashboard_json["id"] is None + + +def test_rfc1123_compliance_validation(): + """Test RFC 1123 subdomain naming rules (simplified version without dots)""" + import re + + # Valid names (alphanumeric and hyphens, start/end with alphanumeric) + valid_names = [ + "a", + "abc", + "a-b", + "abc-123", + "alerts-dashboard", + "123-abc", + "my-dashboard", + ] + + # Simplified pattern for our use case (no dots) + rfc1123_pattern = r'^[a-z0-9]([a-z0-9-]*[a-z0-9])?$' + + for name in valid_names: + assert re.match(rfc1123_pattern, name), f"Valid name failed: {name}" + + # Invalid names (should not match) + invalid_names = [ + "-abc", # Starts with hyphen + "abc-", # Ends with hyphen + "ABC", # Uppercase + "a_b", # Underscore + "a b", # Space + "a.b", # Dot (not allowed in our simplified version) + "", # Empty + ] + + for name in invalid_names: + assert not re.match(rfc1123_pattern, name), f"Invalid name passed: {name}" + + +def test_dashboard_file_processing_simulation(): + """Simulate processing a dashboard file to ensure end-to-end behavior""" + with tempfile.TemporaryDirectory() as tmpdir: + dashboard_dir = Path(tmpdir) + + # Create a test dashboard file + dashboard_file = dashboard_dir / "alerts_dashboard.json" + dashboard_content = { + "title": "Alerts Dashboard", + "uid": "wrong_uid", + "id": 999, + "panels": [], + } + + with open(dashboard_file, "w") as f: + json.dump(dashboard_content, f) + + # Simulate what the code does + dashboard_name = dashboard_file.stem + k8s_safe_name = sanitize_k8s_name(dashboard_name) + + # Read and parse + with open(dashboard_file) as f: + dashboard_json = json.load(f) + + # Enforce UID and null id + dashboard_json["uid"] = dashboard_name + dashboard_json["id"] = None + + # Verify results + assert dashboard_name == "alerts_dashboard" + assert k8s_safe_name == "alerts-dashboard" + assert dashboard_json["uid"] == "alerts_dashboard" + assert dashboard_json["id"] is None + assert dashboard_json["title"] == "Alerts Dashboard" + + +def test_configmap_naming_pattern(): + """Test the ConfigMap naming pattern used in the code""" + cluster_name = "main01-staging" + dashboard_name = "alerts_dashboard" + k8s_safe_name = sanitize_k8s_name(dashboard_name) + + # Pulumi resource name (can contain underscores, used internally) + pulumi_resource_name = f"{cluster_name}-grafana-{k8s_safe_name}-dashboard" + + # Kubernetes metadata name (must be RFC 1123 compliant) + k8s_metadata_name = f"grafana-{k8s_safe_name}-dashboard" + + # Verify patterns + assert pulumi_resource_name == "main01-staging-grafana-alerts-dashboard-dashboard" + assert k8s_metadata_name == "grafana-alerts-dashboard-dashboard" + + # Verify metadata name is RFC 1123 compliant + import re + assert re.match(r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$', k8s_metadata_name) diff --git a/python-pulumi/tests/test_lib.py b/python-pulumi/tests/test_lib.py index 4b1f8ed..c1649b7 100644 --- a/python-pulumi/tests/test_lib.py +++ b/python-pulumi/tests/test_lib.py @@ -1,6 +1,6 @@ import pytest -from ptd.pulumi_resources.lib import format_lb_tags +from ptd.pulumi_resources.lib import format_lb_tags, sanitize_k8s_name def test_format_lb_tags_normal() -> None: @@ -127,3 +127,125 @@ def test_format_lb_tags_slash_in_key() -> None: result = format_lb_tags({"posit.team/true-name": "myapp", "posit.team/environment": "production"}) assert "posit.team/true-name=myapp" in result assert "posit.team/environment=production" in result + + +# ===== sanitize_k8s_name tests ===== + + +def test_sanitize_k8s_name_basic_underscore_replacement() -> None: + """Basic test: underscores replaced with hyphens""" + assert sanitize_k8s_name("alerts_dashboard") == "alerts-dashboard" + + +def test_sanitize_k8s_name_multiple_underscores() -> None: + """Multiple underscores are all replaced""" + assert sanitize_k8s_name("my_test_dashboard") == "my-test-dashboard" + + +def test_sanitize_k8s_name_uppercase_lowercased() -> None: + """Uppercase letters are lowercased""" + assert sanitize_k8s_name("MyDashboard") == "mydashboard" + + +def test_sanitize_k8s_name_mixed_case_and_underscores() -> None: + """Mix of uppercase and underscores""" + assert sanitize_k8s_name("My_Dashboard") == "my-dashboard" + + +def test_sanitize_k8s_name_leading_underscore_stripped() -> None: + """Leading underscores are stripped""" + assert sanitize_k8s_name("_dashboard") == "dashboard" + + +def test_sanitize_k8s_name_trailing_underscore_stripped() -> None: + """Trailing underscores are stripped""" + assert sanitize_k8s_name("dashboard_") == "dashboard" + + +def test_sanitize_k8s_name_leading_and_trailing_underscores() -> None: + """Both leading and trailing underscores are stripped""" + assert sanitize_k8s_name("_dashboard_") == "dashboard" + + +def test_sanitize_k8s_name_special_chars_replaced() -> None: + """Special characters replaced with hyphens""" + assert sanitize_k8s_name("my@dashboard!") == "my-dashboard" + + +def test_sanitize_k8s_name_spaces_replaced() -> None: + """Spaces replaced with hyphens""" + assert sanitize_k8s_name("my dashboard") == "my-dashboard" + + +def test_sanitize_k8s_name_dots_replaced() -> None: + """Dots are replaced with hyphens (simpler for dashboard names)""" + assert sanitize_k8s_name("my.dashboard") == "my-dashboard" + + +def test_sanitize_k8s_name_hyphens_preserved() -> None: + """Hyphens are preserved""" + assert sanitize_k8s_name("my-dashboard") == "my-dashboard" + + +def test_sanitize_k8s_name_alphanumeric_preserved() -> None: + """Alphanumeric characters are preserved (lowercased)""" + assert sanitize_k8s_name("Dashboard123") == "dashboard123" + + +def test_sanitize_k8s_name_consecutive_special_chars() -> None: + """Consecutive special characters are collapsed into a single hyphen""" + assert sanitize_k8s_name("my___dashboard") == "my-dashboard" + + +def test_sanitize_k8s_name_only_special_chars_fails() -> None: + """Name with only special characters cannot be sanitized""" + with pytest.raises(ValueError, match="cannot be sanitized to RFC 1123 format"): + sanitize_k8s_name("___") + + +def test_sanitize_k8s_name_empty_string_fails() -> None: + """Empty string raises ValueError""" + with pytest.raises(ValueError, match="Name cannot be empty"): + sanitize_k8s_name("") + + +def test_sanitize_k8s_name_only_leading_underscore_fails() -> None: + """Name that becomes empty after stripping fails""" + with pytest.raises(ValueError, match="cannot be sanitized to RFC 1123 format"): + sanitize_k8s_name("_") + + +def test_sanitize_k8s_name_complex_case() -> None: + """Complex real-world example""" + assert sanitize_k8s_name("_My_Dashboard_2024!") == "my-dashboard-2024" + + +def test_sanitize_k8s_name_already_valid() -> None: + """Already valid names pass through (lowercased)""" + assert sanitize_k8s_name("valid-name") == "valid-name" + + +def test_sanitize_k8s_name_single_char() -> None: + """Single character names are valid""" + assert sanitize_k8s_name("a") == "a" + assert sanitize_k8s_name("1") == "1" + + +def test_sanitize_k8s_name_two_chars() -> None: + """Two character names are valid""" + assert sanitize_k8s_name("ab") == "ab" + + +def test_sanitize_k8s_name_hyphen_in_middle() -> None: + """Hyphen in middle is valid (part of RFC 1123 pattern)""" + assert sanitize_k8s_name("a-b") == "a-b" + + +def test_sanitize_k8s_name_leading_hyphen_after_special_char() -> None: + """Special char at start becomes hyphen, then gets stripped""" + assert sanitize_k8s_name("!dashboard") == "dashboard" + + +def test_sanitize_k8s_name_trailing_hyphen_after_special_char() -> None: + """Special char at end becomes hyphen, then gets stripped""" + assert sanitize_k8s_name("dashboard!") == "dashboard" From eed98dbf06a7c7f85849eec58d7b0ab71ba8047b Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 13:55:36 -0600 Subject: [PATCH 05/12] Set default selection to All --- .../grafana_dashboards/alerts_dashboard.json | 82 +++++++++++-------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json b/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json index ef2093f..7659f91 100644 --- a/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json +++ b/python-pulumi/src/ptd/grafana_dashboards/alerts_dashboard.json @@ -18,7 +18,6 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, "links": [], "liveNow": false, "panels": [ @@ -123,7 +122,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "aws_ec2_network_out_average{cluster=\"$Cluster\", job=\"integrations/cloudwatch\"}", + "expr": "aws_ec2_network_out_average{cluster=~\"$Cluster\", job=\"integrations/cloudwatch\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -223,7 +222,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "aws_ec2_network_packets_out_average{cluster=\"$Cluster\"}", + "expr": "aws_ec2_network_packets_out_average{cluster=~\"$Cluster\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -323,7 +322,7 @@ "uid": "mimir" }, "editorMode": "code", - "expr": "aws_fsx_used_storage_capacity_average{cluster=\"$Cluster\"} / aws_fsx_storage_capacity_average{cluster=\"$Cluster\"} * 100", + "expr": "aws_fsx_used_storage_capacity_average{cluster=~\"$Cluster\"} / aws_fsx_storage_capacity_average{cluster=~\"$Cluster\"} * 100", "hide": false, "instant": false, "legendFormat": "__auto", @@ -433,7 +432,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(pod) (kube_pod_container_status_terminated_reason{cluster=\"$Cluster\", reason!=\"Completed\"})", + "expr": "sum by(pod) (kube_pod_container_status_terminated_reason{cluster=~\"$Cluster\", reason!=\"Completed\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -532,7 +531,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(phase, pod) (kube_pod_status_phase{phase=~\"Failed|Pending|Unknown\", cluster=\"$Cluster\"}) > 0", + "expr": "sum by(phase, pod) (kube_pod_status_phase{phase=~\"Failed|Pending|Unknown\", cluster=~\"$Cluster\"}) > 0", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -593,7 +592,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -631,7 +631,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "kube_deployment_status_replicas{cluster=\"$Cluster\"} \n - \n kube_deployment_status_replicas_ready{cluster=\"$Cluster\"} > 0", + "expr": "kube_deployment_status_replicas{cluster=~\"$Cluster\"} \n - \n kube_deployment_status_replicas_ready{cluster=~\"$Cluster\"} > 0", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -692,7 +692,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -754,7 +755,7 @@ "uid": "mimir" }, "editorMode": "code", - "expr": "kube_statefulset_status_replicas{cluster=\"$Cluster\"} \n - \n kube_statefulset_status_replicas_ready{cluster=\"$Cluster\"}", + "expr": "kube_statefulset_status_replicas{cluster=~\"$Cluster\"} \n - \n kube_statefulset_status_replicas_ready{cluster=~\"$Cluster\"}", "hide": false, "instant": false, "legendFormat": "{{statefulset}}", @@ -812,7 +813,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -851,7 +853,7 @@ "disableTextWrap": false, "editorMode": "builder", "exemplar": false, - "expr": "increase(kube_pod_container_status_restarts_total{cluster=\"$Cluster\"}[15m])", + "expr": "increase(kube_pod_container_status_restarts_total{cluster=~\"$Cluster\"}[15m])", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -911,7 +913,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -949,7 +952,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "count by(pod) (kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", cluster=\"$Cluster\"})", + "expr": "count by(pod) (kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", cluster=~\"$Cluster\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -1023,7 +1026,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1061,7 +1065,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(node) (kube_node_status_condition{cluster=\"$Cluster\", condition=\"MemoryPressure\", status=\"true\"})", + "expr": "sum by(node) (kube_node_status_condition{cluster=~\"$Cluster\", condition=\"MemoryPressure\", status=\"true\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -1122,7 +1126,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1160,7 +1165,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(node) (kube_node_status_condition{cluster=\"$Cluster\", condition=\"DiskPressure\", status=\"true\"})", + "expr": "sum by(node) (kube_node_status_condition{cluster=~\"$Cluster\", condition=\"DiskPressure\", status=\"true\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -1222,7 +1227,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1260,7 +1266,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(node, condition) (kube_node_status_condition{cluster=\"$Cluster\", condition!=\"Ready\", status=\"true\"})", + "expr": "sum by(node, condition) (kube_node_status_condition{cluster=~\"$Cluster\", condition!=\"Ready\", status=\"true\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -1334,7 +1340,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1374,7 +1381,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "avg by(ptd_component, check_type, ptd_site) (probe_http_status_code{cluster=\"$Cluster\"})", + "expr": "avg by(ptd_component, check_type, ptd_site) (probe_http_status_code{cluster=~\"$Cluster\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -1506,7 +1513,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "loki_ingester_wal_disk_full_failures_total{cluster=\"$Cluster\"}", + "expr": "loki_ingester_wal_disk_full_failures_total{cluster=~\"$Cluster\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -1640,7 +1647,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "up{job=\"prometheus.scrape.kube_state_metrics\", cluster=\"$Cluster\"}", + "expr": "up{job=\"prometheus.scrape.kube_state_metrics\", cluster=~\"$Cluster\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -1777,7 +1784,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "aws_rds_cpuutilization_average{job=\"integrations/cloudwatch\", cluster=\"$Cluster\"}", + "expr": "aws_rds_cpuutilization_average{job=\"integrations/cloudwatch\", cluster=~\"$Cluster\"}", "format": "time_series", "instant": false, "legendFormat": "__auto", @@ -1898,7 +1905,7 @@ "uid": "mimir" }, "editorMode": "code", - "expr": "aws_rds_free_storage_space_average{job=\"integrations/cloudwatch\", cluster=\"$Cluster\"}", + "expr": "aws_rds_free_storage_space_average{job=\"integrations/cloudwatch\", cluster=~\"$Cluster\"}", "instant": false, "legendFormat": "__auto", "range": true, @@ -2019,7 +2026,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "aws_rds_freeable_memory_average{job=\"integrations/cloudwatch\", cluster=\"$Cluster\"}", + "expr": "aws_rds_freeable_memory_average{job=\"integrations/cloudwatch\", cluster=~\"$Cluster\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -2142,7 +2149,7 @@ "uid": "mimir" }, "editorMode": "code", - "expr": "aws_rds_database_connections_average{job=\"integrations/cloudwatch\", cluster=\"$Cluster\"}", + "expr": "aws_rds_database_connections_average{job=\"integrations/cloudwatch\", cluster=~\"$Cluster\"}", "instant": false, "legendFormat": "__auto", "range": true, @@ -2274,7 +2281,7 @@ "uid": "mimir" }, "editorMode": "code", - "expr": "aws_applicationelb_httpcode_target_5_xx_count_sum{job=\"integrations/cloudwatch\",cluster=\"$Cluster\"}", + "expr": "aws_applicationelb_httpcode_target_5_xx_count_sum{job=\"integrations/cloudwatch\",cluster=~\"$Cluster\"}", "instant": false, "legendFormat": "__auto", "range": true, @@ -2370,7 +2377,7 @@ "uid": "mimir" }, "editorMode": "code", - "expr": "aws_applicationelb_un_healthy_host_count_average{job=\"integrations/cloudwatch\",cluster=\"$Cluster\"}", + "expr": "aws_applicationelb_un_healthy_host_count_average{job=\"integrations/cloudwatch\",cluster=~\"$Cluster\"}", "instant": false, "legendFormat": "__auto", "range": true, @@ -2485,7 +2492,7 @@ "uid": "mimir" }, "editorMode": "code", - "expr": "aws_networkelb_un_healthy_host_count_average{job=\"integrations/cloudwatch\",cluster=\"$Cluster\"}", + "expr": "aws_networkelb_un_healthy_host_count_average{job=\"integrations/cloudwatch\",cluster=~\"$Cluster\"}", "instant": false, "legendFormat": "__auto", "range": true, @@ -2604,7 +2611,7 @@ "uid": "mimir" }, "editorMode": "code", - "expr": "aws_applicationelb_target_response_time_average{job=\"integrations/cloudwatch\",cluster=\"$Cluster\"}", + "expr": "aws_applicationelb_target_response_time_average{job=\"integrations/cloudwatch\",cluster=~\"$Cluster\"}", "instant": false, "legendFormat": "__auto", "range": true, @@ -2622,13 +2629,18 @@ "list": [ { "allValue": "", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, "datasource": { "type": "prometheus", "uid": "mimir" }, "definition": "label_values(kube_pod_status_phase,cluster)", "hide": 0, - "includeAll": false, + "includeAll": true, "multi": false, "name": "Cluster", "options": [], @@ -2652,7 +2664,7 @@ "timepicker": {}, "timezone": "", "title": "Alert-Based Dashboard", - "uid": "", - "version": 8, + "uid": "alerts_dashboard", + "version": 1, "weekStart": "" } \ No newline at end of file From 279141d743fba065adf52459d417f059a440ed3a Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 14:47:11 -0600 Subject: [PATCH 06/12] Add Kubernetes Global View dashboard --- .../grafana_dashboards/k8s-views-global.json | 3147 +++++++++++++++++ 1 file changed, 3147 insertions(+) create mode 100644 python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json diff --git a/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json b/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json new file mode 100644 index 0000000..6ffdc40 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json @@ -0,0 +1,3147 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.3.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "terraform" + ], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "oncall" + ], + "type": "tags" + } + } + ] + }, + "description": "This is a modern 'Global View' dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and take advantage of the latest Grafana features. GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 67, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 77, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])))", + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "Real" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1)) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1)) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + } + ], + "title": "Global CPU Usage", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 78, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"})", + "hide": false, + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "Real" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1)) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1)) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + } + ], + "title": "Global RAM Usage", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 1 + }, + "id": 63, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "count(count by (node) (kube_node_info{cluster=\"$cluster\"}))", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 10, + "x": 14, + "y": 1 + }, + "id": 52, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kube_namespace_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Namespaces", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_running{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running Containers", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running Pods", + "refId": "O" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_service_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Services", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_endpoint_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Endpoints", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_ingress_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Ingresses", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_deployment_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Deployments", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_statefulset_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Statefulsets", + "refId": "G" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_daemonset_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Daemonsets", + "refId": "H" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_persistentvolumeclaim_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Persistent Volume Claims", + "refId": "I" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_hpa_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Horizontal Pod Autoscalers", + "refId": "J" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_configmap_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Configmaps", + "refId": "K" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_secret_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Secrets", + "refId": "L" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_networkpolicy_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Network Policies", + "refId": "M" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "count(count by (node) (kube_node_info{cluster=\"$cluster\"}))", + "hide": false, + "interval": "", + "legendFormat": "Nodes", + "refId": "N" + } + ], + "title": "Kubernetes Resource Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 5 + }, + "id": 59, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(kube_namespace_created{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Namespaces", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 37, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "Real" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1))", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1))", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "Total" + } + ], + "title": "CPU Usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 9 + }, + "id": 39, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"})", + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "Real" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1))", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1))", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "Total" + } + ], + "title": "RAM Usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 9 + }, + "id": 62, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Running Pods", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 71, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU %", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 0.7 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 72, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])))", + "interval": "$resolution", + "legendFormat": "CPU usage in %", + "range": true, + "refId": "A" + } + ], + "title": "Cluster CPU Utilization", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "CPU usage in %", + "mode": "reduceRow", + "reduce": { + "reducer": "mean" + }, + "replaceFields": true + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MEMORY", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 0.5 + }, + { + "color": "red", + "value": 0.7 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 55, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "hidden", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"})", + "interval": "$resolution", + "legendFormat": "Memory usage in %", + "range": true, + "refId": "A" + } + ], + "title": "Cluster Memory Utilization", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Memory usage in %", + "mode": "reduceRow", + "reduce": { + "reducer": "mean" + }, + "replaceFields": true + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU CORES", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 46, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace)", + "format": "time_series", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Utilization by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 50, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{image!=\"\", cluster=\"$cluster\"}) by (namespace)", + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory Utilization by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU %", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 54, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ node }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Utilization by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MEMORY", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 73, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory Utilization by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "SECONDS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "id": 82, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Throttled seconds by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "NB", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 83, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_cpu_core_throttles_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Throttled by instance", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 86, + "panels": [], + "title": "Kubernetes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 84, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_qos_class{cluster=\"$cluster\"}) by (qos_class)", + "interval": "", + "legendFormat": "{{ qos_class }} pods", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_info{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total pods", + "range": true, + "refId": "B" + } + ], + "title": "Kubernetes Pods QoS classes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 85, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_reason{cluster=\"$cluster\"}) by (reason)", + "interval": "", + "legendFormat": "{{ reason }}", + "range": true, + "refId": "A" + } + ], + "title": "Kubernetes Pods Status Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 87, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(container_oom_events_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "interval": "", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "OOM Events by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 88, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "interval": "", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "Container Restarts by namespace", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 69, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 66 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (device)", + "interval": "$resolution", + "legendFormat": "Received : {{ device }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (device)", + "interval": "$resolution", + "legendFormat": "Transmitted : {{ device }}", + "range": true, + "refId": "B" + } + ], + "title": "Global Network Utilization by device", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "DROPPED PACKETS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 66 + }, + "id": 53, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_drop_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Packets dropped (receive)", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_drop_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Packets dropped (transmit)", + "range": true, + "refId": "B" + } + ], + "title": "Network Saturation - Packets dropped", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 79, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace)", + "interval": "$resolution", + "legendFormat": "Received : {{ namespace }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted : {{ namespace }}", + "range": true, + "refId": "B" + } + ], + "title": "Network Received by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 74 + }, + "id": 80, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "B" + } + ], + "title": "Total Network Received (with all virtual devices) by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 82 + }, + "id": 56, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "B" + } + ], + "title": "Network Received (without loopback) by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 82 + }, + "id": 81, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device=\"lo\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{device=\"lo\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "B" + } + ], + "title": "Network Received (loopback only) by instance", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_cpu_seconds_total{cluster=\"$cluster\"},job)", + "hide": 0, + "includeAll": false, + "multi": true, + "name": "job", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(node_cpu_seconds_total{cluster=\"$cluster\"},job)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes Global View", + "uid": "k8s_views_global", + "version": 45, + "weekStart": "" +} \ No newline at end of file From 388e320e8090e6f9a4fd0eb730aeef3de54e648e Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 15:16:06 -0600 Subject: [PATCH 07/12] fix(grafana): enable multi-cluster support for Kubernetes Global View dashboard Fixed the k8s-views-global dashboard to properly support multi-cluster deployments: - Enabled "All" cluster selection (includeAll: true) - Added default selection to "All" clusters - Changed all cluster query filters from exact match (=) to regex (=~) (59 occurrences) - Changed all job query filters from exact match (=) to regex (=~) (19 occurrences) - Fixed dashboard UID to use hyphens instead of underscores (k8s-views-global) This aligns with the multi-cluster pattern used in alerts_dashboard.json and enables users to view metrics across all clusters or select specific clusters. Co-Authored-By: Claude Sonnet 4.5 --- .../grafana_dashboards/k8s-views-global.json | 127 +++++++++--------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json b/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json index 6ffdc40..f3e0c1a 100644 --- a/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json +++ b/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json @@ -188,7 +188,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])))", + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])))", "interval": "", "legendFormat": "Real", "range": true, @@ -200,7 +200,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1)) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1)) / sum(machine_cpu_cores{cluster=~\"$cluster\"})", "hide": false, "legendFormat": "Requests", "range": true, @@ -212,7 +212,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1)) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1)) / sum(machine_cpu_cores{cluster=~\"$cluster\"})", "hide": false, "legendFormat": "Limits", "range": true, @@ -288,7 +288,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"})", + "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\", job=~\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"})", "hide": false, "interval": "", "legendFormat": "Real", @@ -301,7 +301,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1)) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1)) / sum(machine_memory_bytes{cluster=~\"$cluster\"})", "hide": false, "legendFormat": "Requests", "range": true, @@ -313,7 +313,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1)) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1)) / sum(machine_memory_bytes{cluster=~\"$cluster\"})", "hide": false, "legendFormat": "Limits", "range": true, @@ -378,7 +378,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "count(count by (node) (kube_node_info{cluster=\"$cluster\"}))", + "expr": "count(count by (node) (kube_node_info{cluster=~\"$cluster\"}))", "interval": "", "legendFormat": "", "range": true, @@ -482,7 +482,7 @@ "uid": "${datasource}" }, "exemplar": true, - "expr": "sum(kube_namespace_labels{cluster=\"$cluster\"})", + "expr": "sum(kube_namespace_labels{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Namespaces", "refId": "A" @@ -492,7 +492,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_pod_container_status_running{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_status_running{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Running Containers", "refId": "B" @@ -502,7 +502,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"})", + "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Running Pods", "refId": "O" @@ -512,7 +512,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_service_info{cluster=\"$cluster\"})", + "expr": "sum(kube_service_info{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Services", "refId": "C" @@ -522,7 +522,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_endpoint_info{cluster=\"$cluster\"})", + "expr": "sum(kube_endpoint_info{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Endpoints", "refId": "D" @@ -532,7 +532,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_ingress_info{cluster=\"$cluster\"})", + "expr": "sum(kube_ingress_info{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Ingresses", "refId": "E" @@ -542,7 +542,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_deployment_labels{cluster=\"$cluster\"})", + "expr": "sum(kube_deployment_labels{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Deployments", "refId": "F" @@ -552,7 +552,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_statefulset_labels{cluster=\"$cluster\"})", + "expr": "sum(kube_statefulset_labels{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Statefulsets", "refId": "G" @@ -562,7 +562,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_daemonset_labels{cluster=\"$cluster\"})", + "expr": "sum(kube_daemonset_labels{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Daemonsets", "refId": "H" @@ -572,7 +572,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_persistentvolumeclaim_info{cluster=\"$cluster\"})", + "expr": "sum(kube_persistentvolumeclaim_info{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Persistent Volume Claims", "refId": "I" @@ -582,7 +582,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_hpa_labels{cluster=\"$cluster\"})", + "expr": "sum(kube_hpa_labels{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Horizontal Pod Autoscalers", "refId": "J" @@ -592,7 +592,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_configmap_info{cluster=\"$cluster\"})", + "expr": "sum(kube_configmap_info{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Configmaps", "refId": "K" @@ -602,7 +602,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_secret_info{cluster=\"$cluster\"})", + "expr": "sum(kube_secret_info{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Secrets", "refId": "L" @@ -612,7 +612,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_networkpolicy_labels{cluster=\"$cluster\"})", + "expr": "sum(kube_networkpolicy_labels{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Network Policies", "refId": "M" @@ -623,7 +623,7 @@ "uid": "${datasource}" }, "exemplar": true, - "expr": "count(count by (node) (kube_node_info{cluster=\"$cluster\"}))", + "expr": "count(count by (node) (kube_node_info{cluster=~\"$cluster\"}))", "hide": false, "interval": "", "legendFormat": "Nodes", @@ -686,7 +686,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(kube_namespace_created{cluster=\"$cluster\"})", + "expr": "count(kube_namespace_created{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -751,7 +751,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "expr": "sum(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval]))", "interval": "", "legendFormat": "Real", "range": true, @@ -763,7 +763,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1))", + "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1))", "hide": false, "legendFormat": "Requests", "range": true, @@ -775,7 +775,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1))", + "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1))", "hide": false, "legendFormat": "Limits", "range": true, @@ -787,7 +787,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(machine_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(machine_cpu_cores{cluster=~\"$cluster\"})", "hide": false, "legendFormat": "Total", "range": true, @@ -853,7 +853,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"})", + "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\", job=~\"$job\"})", "interval": "", "legendFormat": "Real", "range": true, @@ -865,7 +865,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1))", + "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1))", "hide": false, "legendFormat": "Requests", "range": true, @@ -877,7 +877,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"} == 1))", + "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1))", "hide": false, "legendFormat": "Limits", "range": true, @@ -889,7 +889,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(machine_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(machine_memory_bytes{cluster=~\"$cluster\"})", "hide": false, "legendFormat": "Total", "range": true, @@ -952,7 +952,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"})", + "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -1076,7 +1076,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])))", + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])))", "interval": "$resolution", "legendFormat": "CPU usage in %", "range": true, @@ -1197,7 +1197,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"})", + "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\", job=~\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"})", "interval": "$resolution", "legendFormat": "Memory usage in %", "range": true, @@ -1319,7 +1319,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace)", + "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\", cluster=~\"$cluster\"}[$__rate_interval])) by (namespace)", "format": "time_series", "hide": false, "interval": "$resolution", @@ -1426,7 +1426,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(container_memory_working_set_bytes{image!=\"\", cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_working_set_bytes{image!=\"\", cluster=~\"$cluster\"}) by (namespace)", "interval": "$resolution", "legendFormat": "{{ namespace }}", "range": true, @@ -1532,7 +1532,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))) by (instance)", + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval]))) by (instance)", "interval": "$resolution", "legendFormat": "{{ node }}", "range": true, @@ -1637,7 +1637,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) by (instance)", + "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\", job=~\"$job\"}) by (instance)", "hide": false, "interval": "$resolution", "legendFormat": "{{ instance }}", @@ -1748,7 +1748,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=~\"$cluster\"}[$__rate_interval])) by (namespace) > 0", "interval": "$resolution", "legendFormat": "{{ namespace }}", "range": true, @@ -1858,7 +1858,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_cpu_core_throttles_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(node_cpu_core_throttles_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", "interval": "$resolution", "legendFormat": "{{ instance }}", "range": true, @@ -1976,7 +1976,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(kube_pod_status_qos_class{cluster=\"$cluster\"}) by (qos_class)", + "expr": "sum(kube_pod_status_qos_class{cluster=~\"$cluster\"}) by (qos_class)", "interval": "", "legendFormat": "{{ qos_class }} pods", "range": true, @@ -1988,7 +1988,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kube_pod_info{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_info{cluster=~\"$cluster\"})", "hide": false, "legendFormat": "Total pods", "range": true, @@ -2093,7 +2093,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(kube_pod_status_reason{cluster=\"$cluster\"}) by (reason)", + "expr": "sum(kube_pod_status_reason{cluster=~\"$cluster\"}) by (reason)", "interval": "", "legendFormat": "{{ reason }}", "range": true, @@ -2199,7 +2199,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(increase(container_oom_events_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "expr": "sum(increase(container_oom_events_total{cluster=~\"$cluster\"}[$__rate_interval])) by (namespace) > 0", "interval": "", "legendFormat": "{{ namespace }}", "range": true, @@ -2305,7 +2305,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(increase(kube_pod_container_status_restarts_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "expr": "sum(increase(kube_pod_container_status_restarts_total{cluster=~\"$cluster\"}[$__rate_interval])) by (namespace) > 0", "interval": "", "legendFormat": "{{ namespace }}", "range": true, @@ -2422,7 +2422,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (device)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (device)", "interval": "$resolution", "legendFormat": "Received : {{ device }}", "range": true, @@ -2435,7 +2435,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (device)", + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (device)", "interval": "$resolution", "legendFormat": "Transmitted : {{ device }}", "range": true, @@ -2534,7 +2534,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_drop_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "expr": "sum(rate(node_network_receive_drop_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval]))", "interval": "$resolution", "legendFormat": "Packets dropped (receive)", "range": true, @@ -2547,7 +2547,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "- sum(rate(node_network_transmit_drop_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "expr": "- sum(rate(node_network_transmit_drop_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval]))", "interval": "$resolution", "legendFormat": "Packets dropped (transmit)", "range": true, @@ -2647,7 +2647,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace)", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=~\"$cluster\"}[$__rate_interval])) by (namespace)", "interval": "$resolution", "legendFormat": "Received : {{ namespace }}", "range": true, @@ -2659,7 +2659,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "- sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace)", + "expr": "- sum(rate(container_network_transmit_bytes_total{cluster=~\"$cluster\"}[$__rate_interval])) by (namespace)", "hide": false, "interval": "$resolution", "legendFormat": "Transmitted : {{ namespace }}", @@ -2759,7 +2759,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(node_network_receive_bytes_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", "interval": "$resolution", "legendFormat": "Received bytes in {{ instance }}", "range": true, @@ -2771,7 +2771,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "- sum(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "expr": "- sum(rate(node_network_transmit_bytes_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", "hide": false, "interval": "$resolution", "legendFormat": "Transmitted bytes in {{ instance }}", @@ -2872,7 +2872,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", "interval": "$resolution", "legendFormat": "Received bytes in {{ instance }}", "range": true, @@ -2884,7 +2884,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", "hide": false, "interval": "$resolution", "legendFormat": "Transmitted bytes in {{ instance }}", @@ -2985,7 +2985,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_bytes_total{device=\"lo\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(node_network_receive_bytes_total{device=\"lo\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", "interval": "$resolution", "legendFormat": "Received bytes in {{ instance }}", "range": true, @@ -2997,7 +2997,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "- sum(rate(node_network_transmit_bytes_total{device=\"lo\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "expr": "- sum(rate(node_network_transmit_bytes_total{device=\"lo\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", "hide": false, "interval": "$resolution", "legendFormat": "Transmitted bytes in {{ instance }}", @@ -3043,8 +3043,13 @@ }, "definition": "label_values(kube_node_info,cluster)", "hide": 0, - "includeAll": false, + "includeAll": true, "multi": false, + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, "name": "cluster", "options": [], "query": { @@ -3115,7 +3120,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(node_cpu_seconds_total{cluster=\"$cluster\"},job)", + "definition": "label_values(node_cpu_seconds_total{cluster=~\"$cluster\"},job)", "hide": 0, "includeAll": false, "multi": true, @@ -3123,7 +3128,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(node_cpu_seconds_total{cluster=\"$cluster\"},job)", + "query": "label_values(node_cpu_seconds_total{cluster=~\"$cluster\"},job)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -3141,7 +3146,7 @@ "timepicker": {}, "timezone": "", "title": "Kubernetes Global View", - "uid": "k8s_views_global", + "uid": "k8s-views-global", "version": 45, "weekStart": "" } \ No newline at end of file From 30f456032d69fe00459c306ebfecffca4525aa37 Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 15:32:03 -0600 Subject: [PATCH 08/12] refactor(grafana): remove unused variables from Kubernetes Global View dashboard Cleaned up unnecessary template variables and annotations that don't apply to PTD: - Removed datasource variable (now hardcoded to mimir for all queries) - Removed job variable (only had one entry, not useful) - Removed terraform and oncall annotation toggles (unused tags) - Updated all 87 datasource references to use "mimir" directly - Removed all 19 job filter references from queries This simplifies the dashboard configuration and removes controls that don't apply to PTD's infrastructure setup. Co-Authored-By: Claude Sonnet 4.5 --- .../grafana_dashboards/k8s-views-global.json | 293 +++++++----------- 1 file changed, 106 insertions(+), 187 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json b/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json index f3e0c1a..042f6b5 100644 --- a/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json +++ b/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json @@ -61,42 +61,6 @@ "type": "dashboard" }, "type": "dashboard" - }, - { - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": false, - "iconColor": "#5c4ee5", - "name": "terraform", - "target": { - "limit": 100, - "matchAny": false, - "tags": [ - "terraform" - ], - "type": "tags" - } - }, - { - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": false, - "iconColor": "red", - "name": "oncall", - "target": { - "limit": 100, - "matchAny": false, - "tags": [ - "oncall" - ], - "type": "tags" - } } ] }, @@ -111,7 +75,7 @@ "collapsed": false, "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "gridPos": { "h": 1, @@ -127,7 +91,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -184,11 +148,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])))", + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\"}[$__rate_interval])))", "interval": "", "legendFormat": "Real", "range": true, @@ -197,7 +161,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1)) / sum(machine_cpu_cores{cluster=~\"$cluster\"})", @@ -209,7 +173,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1)) / sum(machine_cpu_cores{cluster=~\"$cluster\"})", @@ -225,7 +189,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -284,11 +248,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\", job=~\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"})", + "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\"}) / sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\"})", "hide": false, "interval": "", "legendFormat": "Real", @@ -298,7 +262,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1)) / sum(machine_memory_bytes{cluster=~\"$cluster\"})", @@ -310,7 +274,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1)) / sum(machine_memory_bytes{cluster=~\"$cluster\"})", @@ -326,7 +290,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -374,7 +338,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -391,7 +355,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -479,7 +443,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "exemplar": true, "expr": "sum(kube_namespace_labels{cluster=~\"$cluster\"})", @@ -490,7 +454,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_pod_container_status_running{cluster=~\"$cluster\"})", "interval": "", @@ -500,7 +464,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"})", "interval": "", @@ -510,7 +474,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_service_info{cluster=~\"$cluster\"})", "interval": "", @@ -520,7 +484,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_endpoint_info{cluster=~\"$cluster\"})", "interval": "", @@ -530,7 +494,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_ingress_info{cluster=~\"$cluster\"})", "interval": "", @@ -540,7 +504,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_deployment_labels{cluster=~\"$cluster\"})", "interval": "", @@ -550,7 +514,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_statefulset_labels{cluster=~\"$cluster\"})", "interval": "", @@ -560,7 +524,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_daemonset_labels{cluster=~\"$cluster\"})", "interval": "", @@ -570,7 +534,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_persistentvolumeclaim_info{cluster=~\"$cluster\"})", "interval": "", @@ -580,7 +544,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_hpa_labels{cluster=~\"$cluster\"})", "interval": "", @@ -590,7 +554,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_configmap_info{cluster=~\"$cluster\"})", "interval": "", @@ -600,7 +564,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_secret_info{cluster=~\"$cluster\"})", "interval": "", @@ -610,7 +574,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_networkpolicy_labels{cluster=~\"$cluster\"})", "interval": "", @@ -620,7 +584,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "exemplar": true, "expr": "count(count by (node) (kube_node_info{cluster=~\"$cluster\"}))", @@ -636,7 +600,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -684,7 +648,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "count(kube_namespace_created{cluster=~\"$cluster\"})", "interval": "", @@ -698,7 +662,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -747,11 +711,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval]))", + "expr": "sum(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\"}[$__rate_interval]))", "interval": "", "legendFormat": "Real", "range": true, @@ -760,7 +724,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1))", @@ -772,7 +736,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1))", @@ -784,7 +748,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(machine_cpu_cores{cluster=~\"$cluster\"})", @@ -800,7 +764,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -849,11 +813,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\", job=~\"$job\"})", + "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\"})", "interval": "", "legendFormat": "Real", "range": true, @@ -862,7 +826,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1))", @@ -874,7 +838,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\"} and on(namespace, pod) max by (namespace, pod) (kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"} == 1))", @@ -886,7 +850,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(machine_memory_bytes{cluster=~\"$cluster\"})", @@ -902,7 +866,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -950,7 +914,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=~\"$cluster\"})", "interval": "", @@ -965,7 +929,7 @@ "collapsed": false, "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "gridPos": { "h": 1, @@ -981,7 +945,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -1072,11 +1036,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])))", + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\"}[$__rate_interval])))", "interval": "$resolution", "legendFormat": "CPU usage in %", "range": true, @@ -1102,7 +1066,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -1193,11 +1157,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\", job=~\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"})", + "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\"}) / sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\"})", "interval": "$resolution", "legendFormat": "Memory usage in %", "range": true, @@ -1223,7 +1187,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -1315,7 +1279,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -1334,7 +1298,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -1422,7 +1386,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -1439,7 +1403,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -1528,11 +1492,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval]))) by (instance)", + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=~\"$cluster\"}[$__rate_interval]))) by (instance)", "interval": "$resolution", "legendFormat": "{{ node }}", "range": true, @@ -1545,7 +1509,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -1633,11 +1597,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\", job=~\"$job\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\", job=~\"$job\"}) by (instance)", + "expr": "sum(node_memory_MemTotal_bytes{cluster=~\"$cluster\"} - node_memory_MemAvailable_bytes{cluster=~\"$cluster\"}) by (instance)", "hide": false, "interval": "$resolution", "legendFormat": "{{ instance }}", @@ -1651,7 +1615,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "description": "No data is generally a good thing here.", "fieldConfig": { @@ -1744,7 +1708,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -1761,7 +1725,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "description": "No data is generally a good thing here.", "fieldConfig": { @@ -1854,11 +1818,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_cpu_core_throttles_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(node_cpu_core_throttles_total{cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", "interval": "$resolution", "legendFormat": "{{ instance }}", "range": true, @@ -1884,7 +1848,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -1972,7 +1936,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -1985,7 +1949,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "sum(kube_pod_info{cluster=~\"$cluster\"})", @@ -2001,7 +1965,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -2089,7 +2053,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -2106,7 +2070,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "description": "No data is generally a good thing here.", "fieldConfig": { @@ -2195,7 +2159,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -2212,7 +2176,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "description": "No data is generally a good thing here.", "fieldConfig": { @@ -2301,7 +2265,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -2319,7 +2283,7 @@ "collapsed": false, "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "gridPos": { "h": 1, @@ -2335,7 +2299,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "description": "Dropped noisy virtual devices for readability.", "fieldConfig": { @@ -2418,11 +2382,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (device)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=~\"$cluster\"}[$__rate_interval])) by (device)", "interval": "$resolution", "legendFormat": "Received : {{ device }}", "range": true, @@ -2431,11 +2395,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (device)", + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=~\"$cluster\"}[$__rate_interval])) by (device)", "interval": "$resolution", "legendFormat": "Transmitted : {{ device }}", "range": true, @@ -2448,7 +2412,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -2530,11 +2494,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_drop_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval]))", + "expr": "sum(rate(node_network_receive_drop_total{cluster=~\"$cluster\"}[$__rate_interval]))", "interval": "$resolution", "legendFormat": "Packets dropped (receive)", "range": true, @@ -2543,11 +2507,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "- sum(rate(node_network_transmit_drop_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval]))", + "expr": "- sum(rate(node_network_transmit_drop_total{cluster=~\"$cluster\"}[$__rate_interval]))", "interval": "$resolution", "legendFormat": "Packets dropped (transmit)", "range": true, @@ -2561,7 +2525,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -2643,7 +2607,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, @@ -2656,7 +2620,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "expr": "- sum(rate(container_network_transmit_bytes_total{cluster=~\"$cluster\"}[$__rate_interval])) by (namespace)", @@ -2673,7 +2637,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "fieldConfig": { "defaults": { @@ -2755,11 +2719,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_bytes_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(node_network_receive_bytes_total{cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", "interval": "$resolution", "legendFormat": "Received bytes in {{ instance }}", "range": true, @@ -2768,10 +2732,10 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", - "expr": "- sum(rate(node_network_transmit_bytes_total{cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", + "expr": "- sum(rate(node_network_transmit_bytes_total{cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", "hide": false, "interval": "$resolution", "legendFormat": "Transmitted bytes in {{ instance }}", @@ -2785,7 +2749,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "description": "Dropped noisy virtual devices for readability.", "fieldConfig": { @@ -2868,11 +2832,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", "interval": "$resolution", "legendFormat": "Received bytes in {{ instance }}", "range": true, @@ -2881,10 +2845,10 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", - "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", "hide": false, "interval": "$resolution", "legendFormat": "Transmitted bytes in {{ instance }}", @@ -2898,7 +2862,7 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "description": "Dropped noisy virtual devices for readability.", "fieldConfig": { @@ -2981,11 +2945,11 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", "exemplar": true, - "expr": "sum(rate(node_network_receive_bytes_total{device=\"lo\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(node_network_receive_bytes_total{device=\"lo\", cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", "interval": "$resolution", "legendFormat": "Received bytes in {{ instance }}", "range": true, @@ -2994,10 +2958,10 @@ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "editorMode": "code", - "expr": "- sum(rate(node_network_transmit_bytes_total{device=\"lo\", cluster=~\"$cluster\", job=~\"$job\"}[$__rate_interval])) by (instance)", + "expr": "- sum(rate(node_network_transmit_bytes_total{device=\"lo\", cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", "hide": false, "interval": "$resolution", "legendFormat": "Transmitted bytes in {{ instance }}", @@ -3017,29 +2981,11 @@ ], "templating": { "list": [ - { - "current": { - "selected": false, - "text": "", - "value": "" - }, - "hide": 0, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "queryValue": "", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, { "current": {}, "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "mimir" }, "definition": "label_values(kube_node_info,cluster)", "hide": 0, @@ -3109,33 +3055,6 @@ "queryValue": "", "skipUrlSync": false, "type": "custom" - }, - { - "current": { - "selected": false, - "text": "", - "value": "" - }, - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "definition": "label_values(node_cpu_seconds_total{cluster=~\"$cluster\"},job)", - "hide": 0, - "includeAll": false, - "multi": true, - "name": "job", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(node_cpu_seconds_total{cluster=~\"$cluster\"},job)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "type": "query" } ] }, From 98874a2641f56f1107822a6b844083dcf239f9bb Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 15:37:25 -0600 Subject: [PATCH 09/12] chore(grafana): change default resolution to 1m for Kubernetes Global View Changed the default resolution from 30s to 1m to provide a better balance between query performance and data granularity. Co-Authored-By: Claude Sonnet 4.5 --- .../src/ptd/grafana_dashboards/k8s-views-global.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json b/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json index 042f6b5..652e079 100644 --- a/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json +++ b/python-pulumi/src/ptd/grafana_dashboards/k8s-views-global.json @@ -3012,8 +3012,8 @@ { "current": { "selected": false, - "text": "30s", - "value": "30s" + "text": "1m", + "value": "1m" }, "hide": 0, "includeAll": false, @@ -3031,12 +3031,12 @@ "value": "15s" }, { - "selected": true, + "selected": false, "text": "30s", "value": "30s" }, { - "selected": false, + "selected": true, "text": "1m", "value": "1m" }, From 9e871255e771239ae827658c5100097c6266c855 Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 16:04:34 -0600 Subject: [PATCH 10/12] File formatting --- python-pulumi/src/ptd/pulumi_resources/lib.py | 8 ++++---- python-pulumi/tests/test_dashboard_configmaps.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/lib.py b/python-pulumi/src/ptd/pulumi_resources/lib.py index d05f332..0d5506e 100644 --- a/python-pulumi/src/ptd/pulumi_resources/lib.py +++ b/python-pulumi/src/ptd/pulumi_resources/lib.py @@ -26,17 +26,17 @@ def sanitize_k8s_name(name: str) -> str: raise ValueError(msg) # Convert to lowercase and replace invalid chars with hyphens - sanitized = re.sub(r'[^a-z0-9-]', '-', name.lower()) + sanitized = re.sub(r"[^a-z0-9-]", "-", name.lower()) # Collapse consecutive hyphens into single hyphen - sanitized = re.sub(r'-+', '-', sanitized) + sanitized = re.sub(r"-+", "-", sanitized) # Strip leading/trailing hyphens - sanitized = sanitized.strip('-') + sanitized = sanitized.strip("-") # Validate the result matches RFC 1123 subdomain pattern # Pattern: must start/end with alphanumeric, can contain hyphens in between - if not sanitized or not re.match(r'^[a-z0-9]([a-z0-9-]*[a-z0-9])?$', sanitized): + if not sanitized or not re.match(r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$", sanitized): msg = f"Name '{name}' cannot be sanitized to RFC 1123 format (result: '{sanitized}')" raise ValueError(msg) diff --git a/python-pulumi/tests/test_dashboard_configmaps.py b/python-pulumi/tests/test_dashboard_configmaps.py index faabe88..20468f2 100644 --- a/python-pulumi/tests/test_dashboard_configmaps.py +++ b/python-pulumi/tests/test_dashboard_configmaps.py @@ -93,7 +93,7 @@ def test_rfc1123_compliance_validation(): ] # Simplified pattern for our use case (no dots) - rfc1123_pattern = r'^[a-z0-9]([a-z0-9-]*[a-z0-9])?$' + rfc1123_pattern = r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$" for name in valid_names: assert re.match(rfc1123_pattern, name), f"Valid name failed: {name}" @@ -168,4 +168,5 @@ def test_configmap_naming_pattern(): # Verify metadata name is RFC 1123 compliant import re - assert re.match(r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$', k8s_metadata_name) + + assert re.match(r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", k8s_metadata_name) From cef03bb400625c79d8f21934eec04435f439d10f Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 9 Mar 2026 16:13:45 -0600 Subject: [PATCH 11/12] fix(python-pulumi): resolve linter warnings in dashboard code Fixed ruff linter warnings: - B904: Use explicit exception chaining with 'from e' - TRY003/EM102: Assign error message to variable before raising - F841: Remove unused 'cluster_name' variable in test Co-Authored-By: Claude Sonnet 4.5 --- python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 3 ++- python-pulumi/tests/test_dashboard_configmaps.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index f170497..06f7f4f 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -2550,7 +2550,8 @@ def _create_dashboard_configmaps(self, ns: k8s.core.v1.Namespace): with open(dashboard_file) as f: dashboard_json = json.load(f) except json.JSONDecodeError as e: - raise ValueError(f"Invalid JSON in {dashboard_file}: {e}") + msg = f"Invalid JSON in {dashboard_file}: {e}" + raise ValueError(msg) from e # Enforce UID = filename for idempotency # Set id to null (Grafana provisioning ignores it) diff --git a/python-pulumi/tests/test_dashboard_configmaps.py b/python-pulumi/tests/test_dashboard_configmaps.py index 20468f2..57dffaf 100644 --- a/python-pulumi/tests/test_dashboard_configmaps.py +++ b/python-pulumi/tests/test_dashboard_configmaps.py @@ -40,7 +40,6 @@ def test_dashboard_configmap_structure(): # This is a documentation test showing the expected structure dashboard_name = "alerts_dashboard" k8s_safe_name = sanitize_k8s_name(dashboard_name) - cluster_name = "test-cluster" # Expected ConfigMap structure expected_metadata = { From bcc4bc6bb32c71d5f664e1bf166f76098e88e3e9 Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Tue, 10 Mar 2026 08:41:12 -0600 Subject: [PATCH 12/12] Remove duplicate import --- python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 06f7f4f..8abb4c1 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -2535,7 +2535,6 @@ def _create_dashboard_configmaps(self, ns: k8s.core.v1.Namespace): Dashboard UIDs are enforced to match the filename (without .json extension) to ensure idempotent updates and prevent duplicate dashboards. """ - import json dashboards_dir = ptd.paths.dashboards()