From 0af598d8bcba2bc5f0b3092b291890384b98d5a2 Mon Sep 17 00:00:00 2001 From: eedo_y Date: Sat, 11 Apr 2026 14:03:21 +0900 Subject: [PATCH 1/2] =?UTF-8?q?chore:=20=EB=AA=A8=EB=8B=88=ED=84=B0?= =?UTF-8?q?=EB=A7=81=20=EB=8C=80=EC=8B=9C=EB=B3=B4=EB=93=9C=20=EC=97=B0?= =?UTF-8?q?=EA=B2=B0=20(#23)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../backend/spring-boot-overview.json | 782 ++++++++++++++++++ .../dashboards/logs/loki-logs-overview.json | 583 +++++++++++++ .../templates/grafana-dashboards.yaml | 27 + .../monitoring-core/values-prod-gitops.yaml | 15 + k8s-helm/releases/monitoring-core/values.yaml | 15 + 5 files changed, 1422 insertions(+) create mode 100644 k8s-helm/releases/monitoring-core/dashboards/backend/spring-boot-overview.json create mode 100644 k8s-helm/releases/monitoring-core/dashboards/logs/loki-logs-overview.json create mode 100644 k8s-helm/releases/monitoring-core/templates/grafana-dashboards.yaml diff --git a/k8s-helm/releases/monitoring-core/dashboards/backend/spring-boot-overview.json b/k8s-helm/releases/monitoring-core/dashboards/backend/spring-boot-overview.json new file mode 100644 index 0000000..7c10282 --- /dev/null +++ b/k8s-helm/releases/monitoring-core/dashboards/backend/spring-boot-overview.json @@ -0,0 +1,782 @@ +{ + "__inputs": [], + "__requires": [ + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "1.0.0" + }, + { + "id": "timeseries", + "name": "Time series", + "type": "panel", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.8 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (job) (rate(http_server_requests_seconds_count{namespace=~\"$namespace\", job=~\"$job\"}[5m]))", + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + ], + "title": "HTTP Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "100 * sum by (job) (rate(http_server_requests_seconds_count{namespace=~\"$namespace\", job=~\"$job\", status=~\"5..\"}[5m])) / clamp_min(sum by (job) (rate(http_server_requests_seconds_count{namespace=~\"$namespace\", job=~\"$job\"}[5m])), 0.001)", + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + 
], + "title": "HTTP 5xx Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (job) (rate(http_server_requests_seconds_sum{namespace=~\"$namespace\", job=~\"$job\"}[5m])) / clamp_min(sum by (job) (rate(http_server_requests_seconds_count{namespace=~\"$namespace\", job=~\"$job\"}[5m])), 0.001)", + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + ], + "title": "HTTP Average Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1073741824 + }, + { + "color": "red", + "value": 2147483648 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": 
"sum by (job) (jvm_memory_used_bytes{namespace=~\"$namespace\", job=~\"$job\", area=\"heap\"})", + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + ], + "title": "JVM Heap Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.6 + }, + { + "color": "red", + "value": 0.8 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg by (job) (process_cpu_usage{namespace=~\"$namespace\", job=~\"$job\"})", + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + ], + "title": "Process CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 400 + }, + { + "color": "red", + "value": 800 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "sum by (job) (jvm_threads_live_threads{namespace=~\"$namespace\", job=~\"$job\"})", + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + ], + "title": "Live Threads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (job) (rate(jvm_gc_pause_seconds_sum{namespace=~\"$namespace\", job=~\"$job\"}[5m]))", + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 20 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "sum by (job, pool) (hikaricp_connections_active{namespace=~\"$namespace\", job=~\"$job\"})", + "legendFormat": "{{job}} / {{pool}}", + "range": true, + "refId": "A" + } + ], + "title": "Hikari Active Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (job) (tomcat_threads_busy_threads{namespace=~\"$namespace\", job=~\"$job\"})", + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + ], + "title": "Tomcat Busy Threads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + 
}, + "editorMode": "code", + "expr": "sum by (job, level) (rate(logback_events_total{namespace=~\"$namespace\", job=~\"$job\"}[5m]))", + "legendFormat": "{{job}} / {{level}}", + "range": true, + "refId": "A" + } + ], + "title": "Logback Events", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "pinhouse", + "spring-boot", + "micrometer" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(jvm_memory_used_bytes, namespace)", + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(jvm_memory_used_bytes, namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(jvm_memory_used_bytes{namespace=~\"$namespace\"}, job)", + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(jvm_memory_used_bytes{namespace=~\"$namespace\"}, job)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Spring Boot Overview", + "uid": "spring-boot-overview", + "version": 1, + "weekStart": "" +} diff --git a/k8s-helm/releases/monitoring-core/dashboards/logs/loki-logs-overview.json b/k8s-helm/releases/monitoring-core/dashboards/logs/loki-logs-overview.json new file mode 100644 index 0000000..259265e --- 
/dev/null +++ b/k8s-helm/releases/monitoring-core/dashboards/logs/loki-logs-overview.json @@ -0,0 +1,583 @@ +{ + "__inputs": [], + "__requires": [ + { + "id": "loki", + "name": "Loki", + "type": "datasource", + "version": "1.0.0" + }, + { + "id": "timeseries", + "name": "Time series", + "type": "panel", + "version": "" + }, + { + "id": "logs", + "name": "Logs", + "type": "panel", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "logs" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (namespace) (count_over_time({namespace=~\"$namespace\"}[$__interval]))", + "legendFormat": "{{namespace}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "로그 볼륨 (네임스페이스별)", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "red", + "value": 80 + } + ] + }, + "unit": "logs" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (service) (count_over_time({namespace=~\"$namespace\", service=~\"$service\"}[$__interval]))", + "legendFormat": "{{service}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "로그 볼륨 (서비스별)", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "logs" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*ERROR.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*WARN.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*INFO.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (level) (count_over_time({namespace=~\"$namespace\", service=~\"$service\"} | 
json | level != \"\" | __error__=\"\" [$__interval]))", + "legendFormat": "{{level}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "로그 레벨별 분포", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "logs" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (pod) (count_over_time({namespace=~\"$namespace\", service=~\"$service\", pod=~\"$pod\"}[$__interval]))", + "legendFormat": "{{pod}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "로그 볼륨 (Pod별)", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{namespace=~\"$namespace\", service=~\"$service\", pod=~\"$pod\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "로그 스트림", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 
null + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "logs" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 6, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (namespace) (count_over_time({namespace=~\"$namespace\"} |~ \"(?i)error\" [$__interval]))", + "legendFormat": "{{namespace}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "에러 로그 (네임스페이스별)", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 7, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{namespace=~\"$namespace\", service=~\"$service\"} |~ \"(?i)error|exception|fatal\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "에러/예외 로그만 보기", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "pinhouse", + "loki", + "logs" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "definition": "label_values(namespace)", + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": { + "label": "namespace", + "refId": "LokiVariableQueryEditor-VariableQuery", + "stream": "", + "type": 1 + }, + 
"refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "definition": "label_values({namespace=~\"$namespace\"}, service)", + "includeAll": true, + "label": "Service", + "multi": true, + "name": "service", + "options": [], + "query": { + "label": "service", + "refId": "LokiVariableQueryEditor-VariableQuery", + "stream": "{namespace=~\"$namespace\"}", + "type": 1 + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "definition": "label_values({namespace=~\"$namespace\", service=~\"$service\"}, pod)", + "includeAll": true, + "label": "Pod", + "multi": true, + "name": "pod", + "options": [], + "query": { + "label": "pod", + "refId": "LokiVariableQueryEditor-VariableQuery", + "stream": "{namespace=~\"$namespace\", service=~\"$service\"}", + "type": 1 + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "Asia/Seoul", + "title": "Loki Logs Overview", + "uid": "loki-logs-overview", + "version": 1, + "weekStart": "" +} diff --git a/k8s-helm/releases/monitoring-core/templates/grafana-dashboards.yaml b/k8s-helm/releases/monitoring-core/templates/grafana-dashboards.yaml new file mode 100644 index 0000000..7add798 --- /dev/null +++ b/k8s-helm/releases/monitoring-core/templates/grafana-dashboards.yaml @@ -0,0 +1,27 @@ +{{- $root := . 
-}} +{{- range $path, $_ := $root.Files.Glob "dashboards/**/*.json" }} +{{- $relativePath := trimPrefix "dashboards/" $path -}} +{{- $configMapName := trimSuffix ".json" $relativePath | replace "/" "-" | replace "_" "-" | lower -}} +{{- $folderName := base (dir $relativePath) -}} +--- +apiVersion: v1 +kind: ConfigMap + +# 대시보드 ConfigMap 생성 +metadata: + name: {{ printf "%s-%s" $root.Release.Name $configMapName | trunc 63 | trimSuffix "-" }} + labels: + grafana_dashboard: "1" + app.kubernetes.io/name: {{ $root.Chart.Name | quote }} + app.kubernetes.io/instance: {{ $root.Release.Name | quote }} + app.kubernetes.io/component: "grafana-dashboard" + {{- if ne $folderName "." }} + + # 어노테이션 + annotations: + grafana_folder: {{ $folderName | quote }} + {{- end }} +data: + {{ base $path }}: |- +{{ $root.Files.Get $path | indent 4 }} +{{- end }} diff --git a/k8s-helm/releases/monitoring-core/values-prod-gitops.yaml b/k8s-helm/releases/monitoring-core/values-prod-gitops.yaml index 4b4ea6e..4d0f5e7 100644 --- a/k8s-helm/releases/monitoring-core/values-prod-gitops.yaml +++ b/k8s-helm/releases/monitoring-core/values-prod-gitops.yaml @@ -10,6 +10,21 @@ kube-prometheus-stack: userKey: GRAFANA_ADMIN_USER passwordKey: GRAFANA_ADMIN_PASSWORD + # Grafana Sidecar를 통한 대시보드 자동 로딩 설정 + sidecar: + dashboards: + enabled: true + # ConfigMap에서 찾을 라벨 + label: grafana_dashboard + labelValue: "1" + # 모든 네임스페이스에서 대시보드 ConfigMap 검색 + searchNamespace: ALL + # 폴더 어노테이션 지원 + folderAnnotation: grafana_folder + # 대시보드 프로비저닝 설정 + provider: + foldersFromFilesStructure: true + # Grafana Pod 리소스는 임시로 주석 처리하고 차트 기본값을 사용합니다. # resources: # requests: diff --git a/k8s-helm/releases/monitoring-core/values.yaml b/k8s-helm/releases/monitoring-core/values.yaml index 1ee37c5..431c0d7 100644 --- a/k8s-helm/releases/monitoring-core/values.yaml +++ b/k8s-helm/releases/monitoring-core/values.yaml @@ -31,6 +31,21 @@ kube-prometheus-stack: # 운영 전에는 ExternalSecret 기반 existingSecret으로 전환하는 것을 권장합니다. 
adminPassword: "change-me-before-apply" + # Grafana Sidecar를 통한 대시보드 자동 로딩 설정 + sidecar: + dashboards: + enabled: true + # ConfigMap에서 찾을 라벨 + label: grafana_dashboard + labelValue: "1" + # 모든 네임스페이스에서 대시보드 ConfigMap 검색 + searchNamespace: ALL + # 폴더 어노테이션 지원 + folderAnnotation: grafana_folder + # 대시보드 프로비저닝 설정 + provider: + foldersFromFilesStructure: true + # 리소스 요청/제한은 임시로 주석 처리하고 차트 기본값을 사용합니다. # resources: # requests: From e0c8cfbd5013dba9d4e44b0cc5dc1951e816485f Mon Sep 17 00:00:00 2001 From: eedo_y Date: Sat, 11 Apr 2026 14:36:52 +0900 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20loki=20=EA=B3=B5=EC=8B=9D=EB=AC=B8?= =?UTF-8?q?=EC=84=9C=20=EB=B0=98=EC=98=81=ED=95=98=EC=97=AC=20=EB=AC=B8?= =?UTF-8?q?=EB=B2=95=20=EC=88=98=EC=A0=95=20(#49)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../releases/monitoring-alloy/values.yaml | 83 ++++++------------- 1 file changed, 27 insertions(+), 56 deletions(-) diff --git a/k8s-helm/releases/monitoring-alloy/values.yaml b/k8s-helm/releases/monitoring-alloy/values.yaml index 3604bf7..e2b512c 100644 --- a/k8s-helm/releases/monitoring-alloy/values.yaml +++ b/k8s-helm/releases/monitoring-alloy/values.yaml @@ -78,19 +78,7 @@ alloy: } } - // ServiceMonitor 리소스를 읽어 메트릭 대상을 자동으로 수집합니다. - prometheus.operator.servicemonitors "cluster" { - forward_to = [prometheus.remote_write.prometheus.receiver] - - clustering { - enabled = true - } - - scrape { - default_scrape_interval = "30s" - default_scrape_timeout = "10s" - } - } + // TODO! ServiceMonitor는 추후 추가 예정 // PodMonitor 리소스를 읽어 Pod 단위 메트릭 대상을 자동으로 수집합니다. prometheus.operator.podmonitors "cluster" { @@ -124,15 +112,21 @@ alloy: // Logs // ======================================== // 각 Alloy Pod는 자신이 올라간 노드의 Pod 로그만 읽도록 범위를 제한합니다. 
- discovery.kubernetes "pods_on_same_node" { - role = "pod" + loki.source.podlogs "pods" { + forward_to = [loki.relabel.pod_logs.receiver] - selectors { - role = "pod" - field = "spec.nodeName=" + coalesce(sys.env("HOSTNAME"), constants.hostname) + node_filter { + enabled = true } } + // CRI 로그 포맷을 파싱한 뒤 Loki로 전달합니다. + loki.process "pod_logs" { + forward_to = [loki.write.default.receiver] + + stage.cri {} + } + // Kubernetes 메타데이터를 Loki 조회용 라벨로 정리합니다. loki.relabel "pod_logs" { forward_to = [loki.process.pod_logs.receiver] @@ -161,70 +155,47 @@ alloy: target_label = "node" } - // service 기본값은 컨테이너 이름으로 두어 앱 라벨이 없는 로그도 조회 가능하게 합니다. + // service 추출 규칙: 우선순위가 낮은 것부터 높은 순서로 덮어씁니다. + + // 1. 기본값: 컨테이너 이름 rule { source_labels = ["__meta_kubernetes_pod_container_name"] - regex = "(.+)" - replacement = "$1" target_label = "service" } - // 현재 애플리케이션 매니페스트가 주로 사용하는 app 라벨이 있으면 service를 앱 이름으로 맞춥니다. + // 2. Pod controller가 ReplicaSet이면 Deployment 이름 추출 + // ReplicaSet 이름 형식: {deployment-name}-{pod-template-hash} + // pod-template-hash는 마지막 하이픈 뒤의 8-10자리 영숫자 rule { - source_labels = ["__meta_kubernetes_pod_label_app"] - regex = "(.+)" + source_labels = ["__meta_kubernetes_pod_controller_kind", "__meta_kubernetes_pod_controller_name"] + separator = "/" + regex = "ReplicaSet/(.+)-[a-z0-9]{8,10}$" replacement = "$1" target_label = "service" } - // 표준 app.kubernetes.io/name 라벨이 있으면 service를 그 값으로 정규화합니다. + // 3. app 라벨이 있으면 우선 사용 rule { - source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"] + source_labels = ["__meta_kubernetes_pod_label_app"] regex = "(.+)" - replacement = "$1" target_label = "service" } - // service_name은 최종 service 값을 그대로 복사해 Tempo 연동이나 대시보드 변수에 재사용합니다. + // 4. 표준 app.kubernetes.io/name 라벨이 있으면 최우선 rule { - source_labels = ["service"] + source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"] regex = "(.+)" - replacement = "$1" - target_label = "service_name" - } - - // job 기본값은 namespace/pod 형식으로 두어 app 라벨이 없는 로그도 식별할 수 있게 합니다. 
- rule { - source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_name"] - separator = "/" - regex = "(.+)/(.+)" - replacement = "$1/$2" - target_label = "job" + target_label = "service" } - // service가 정규화됐으면 job도 namespace/service 형식으로 맞춥니다. + // job은 namespace/service 형식으로 설정합니다. rule { source_labels = ["__meta_kubernetes_namespace", "service"] separator = "/" - regex = "(.+)/(.+)" - replacement = "$1/$2" target_label = "job" } } - // CRI 로그 포맷을 파싱한 뒤 Loki로 전달합니다. - loki.process "pod_logs" { - forward_to = [loki.write.default.receiver] - - stage.cri {} - } - - // 현재 노드의 Pod 로그를 읽어 relabel 단계로 넘깁니다. - loki.source.kubernetes "pods" { - targets = discovery.kubernetes.pods_on_same_node.targets - forward_to = [loki.relabel.pod_logs.receiver] - } - // 공통 외부 라벨을 함께 붙여 Loki로 전송합니다. loki.write "default" { external_labels = {