From 72a36f2565e6f4db31f60d0799e3b5b0f0237d30 Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Thu, 4 Dec 2025 12:00:51 +0000 Subject: [PATCH 1/2] wip: Add Infra Monitoring Stack configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Docker Compose with Prometheus, Grafana, Alertmanager, Node Exporter, Blackbox Exporter, cAdvisor - Prometheus configuration with scrape configs - Alert rules for host, container, service, and network monitoring - Alertmanager configuration with routing and receivers - Blackbox Exporter modules for HTTP, TCP, ICMP, DNS probes - Grafana datasources provisioning đŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../alertmanager/alertmanager.yml | 184 +++++++++++ 2-infra-monitoring/blackbox/blackbox.yml | 169 +++++++++++ 2-infra-monitoring/docker-compose.yml | 194 ++++++++++++ .../provisioning/datasources/datasources.yml | 64 ++++ .../prometheus/alerts/alerts.yml | 285 ++++++++++++++++++ 2-infra-monitoring/prometheus/prometheus.yml | 152 ++++++++++ 6 files changed, 1048 insertions(+) create mode 100644 2-infra-monitoring/alertmanager/alertmanager.yml create mode 100644 2-infra-monitoring/blackbox/blackbox.yml create mode 100644 2-infra-monitoring/docker-compose.yml create mode 100644 2-infra-monitoring/grafana/provisioning/datasources/datasources.yml create mode 100644 2-infra-monitoring/prometheus/alerts/alerts.yml create mode 100644 2-infra-monitoring/prometheus/prometheus.yml diff --git a/2-infra-monitoring/alertmanager/alertmanager.yml b/2-infra-monitoring/alertmanager/alertmanager.yml new file mode 100644 index 0000000..a64da5b --- /dev/null +++ b/2-infra-monitoring/alertmanager/alertmanager.yml @@ -0,0 +1,184 @@ +# Alertmanager Configuration +# ========================== +# RiasztĂĄs kezelĂ©si Ă©s Ă©rtesĂ­tĂ©si konfigurĂĄciĂł. 
+#
+# Receiver types:
+#   - Email (SMTP)
+#   - Webhook (generic HTTP)
+#   - Slack (optional)
+#
+# Usage:
+#   Alerts arrive from Prometheus and are delivered to the matching
+#   receiver according to the route rules below.
+
+global:
+  # SMTP settings for email notifications; replace the example.com values for real use.
+  # NOTE: Alertmanager does not expand ${VAR}; render the password before start or use smtp_auth_password_file.
+  smtp_smarthost: "smtp.example.com:587"
+  smtp_from: "alertmanager@example.com"
+  smtp_auth_username: "alertmanager@example.com"
+  smtp_auth_password: "${SMTP_PASSWORD}"
+  smtp_require_tls: true
+
+  # Default HTTP client settings used by HTTP-based receivers (webhooks)
+  http_config:
+    follow_redirects: true
+
+  # Resolve timeout - how long to wait before an alert is marked as resolved
+  resolve_timeout: 5m
+
+# Notification templates
+templates:
+  - "/etc/alertmanager/templates/*.tmpl"
+
+# Route configuration
+# Alerts are routed according to their labels
+route:
+  # Default receiver
+  receiver: "default-receiver"
+
+  # Grouping settings
+  # Alerts are grouped by these labels
+  group_by: ["alertname", "severity", "instance"]
+
+  # How long to wait before the first notification of a group is sent
+  group_wait: 30s
+
+  # How long to wait before the next notification of the same group
+  group_interval: 5m
+
+  # Repeat interval for alerts that are still unresolved
+  repeat_interval: 4h
+
+  # Child routes with specific rules (evaluated top to bottom)
+  routes:
+    # Infrastructure alerts; listed first with continue: true so they also reach the severity routes
+    - match_re:
+        alertname: "^(Prometheus|Grafana|Alertmanager).*"
+      receiver: "infra-receiver"
+      continue: true
+
+    # Critical alerts with near-immediate notification
+    - match:
+        severity: critical
+      receiver: "critical-receiver"
+      group_wait: 10s
+      repeat_interval: 1h
+      continue: false
+
+    # Warning alerts
+    - match:
+        severity: warning
+      receiver: "warning-receiver"
+      group_wait: 1m
+      repeat_interval: 4h
+      continue: false
+
+    # Info-level alerts (optional logging)
+    - match:
+        severity: info
+      receiver: "info-receiver"
+      group_wait: 5m
+      repeat_interval: 12h
+      continue: false
+
+# Inhibit rules: mute lower-severity alerts while a higher-severity alert is firing.
+# NOTE(review): source and target must share alertname AND instance; rule files that name
+# each severity differently (HighCPUUsage vs CriticalCPUUsage) will never match - confirm intent.
+inhibit_rules:
+  # Critical suppresses warning for the same alertname and instance
+  - source_match:
+      severity: "critical"
+    target_match:
+      severity: "warning"
+    equal: ["alertname", "instance"]
+
+  # Critical suppresses info
+  - source_match:
+      severity: "critical"
+    target_match:
+      severity: "info"
+    equal: ["alertname", "instance"]
+
+  # Warning suppresses info
+  - source_match:
+      severity: "warning"
+    target_match:
+      severity: "info"
+    equal: ["alertname", "instance"]
+
+# Receiver definitions
+receivers:
+  # Default receiver (webhook)
+  - name: "default-receiver"
+    webhook_configs:
+      - url: "http://localhost:5001/webhook"
+        send_resolved: true
+        # If no webhook endpoint is listening, delivery fails but nothing else is affected
+
+  # Receiver for critical alerts
+  - name: "critical-receiver"
+    # Email notification
+    email_configs:
+      - to: "oncall@example.com"
+        send_resolved: true
+        headers:
+          subject: "[CRITICAL] {{ .GroupLabels.alertname }}"
+        html: |
+

Critical Alert

+

Alert: {{ .GroupLabels.alertname }}

+

Severity: {{ .CommonLabels.severity }}

+ {{ range .Alerts }} +
+

Instance: {{ .Labels.instance }}

+

Summary: {{ .Annotations.summary }}

+

Description: {{ .Annotations.description }}

+ {{ end }} + # Webhook (pl. PagerDuty, OpsGenie integrĂĄciĂł) + webhook_configs: + - url: "http://localhost:5001/webhook/critical" + send_resolved: true + + # Warning riasztĂĄsok receiver-je + - name: "warning-receiver" + email_configs: + - to: "team@example.com" + send_resolved: true + headers: + subject: "[WARNING] {{ .GroupLabels.alertname }}" + webhook_configs: + - url: "http://localhost:5001/webhook/warning" + send_resolved: true + + # Info szintƱ riasztĂĄsok (csak webhook) + - name: "info-receiver" + webhook_configs: + - url: "http://localhost:5001/webhook/info" + send_resolved: false + + # InfrastruktĂșra riasztĂĄsok + - name: "infra-receiver" + email_configs: + - to: "infra-team@example.com" + send_resolved: true + headers: + subject: "[INFRA] {{ .GroupLabels.alertname }}" + webhook_configs: + - url: "http://localhost:5001/webhook/infra" + send_resolved: true + +# ========================================= +# MEGJEGYZÉS +# ========================================= +# ValĂłs környezetben: +# 1. ÁllĂ­tsd be a megfelelƑ SMTP szervert +# 2. KonfigurĂĄld a webhook URL-eket +# 3. OpcionĂĄlisan add hozzĂĄ a Slack/Teams integrĂĄciĂłt: +# +# slack_configs: +# - api_url: "${SLACK_WEBHOOK_URL}" +# channel: "#alerts" +# send_resolved: true +# +# A környezeti vĂĄltozĂłkat a docker-compose.yml-ben +# vagy .env fĂĄjlban definiĂĄld. diff --git a/2-infra-monitoring/blackbox/blackbox.yml b/2-infra-monitoring/blackbox/blackbox.yml new file mode 100644 index 0000000..e0163f5 --- /dev/null +++ b/2-infra-monitoring/blackbox/blackbox.yml @@ -0,0 +1,169 @@ +# Blackbox Exporter Configuration +# ================================ +# Endpoint monitoring konfigurĂĄciĂł kĂŒlönbözƑ protokollokhoz. 
+# +# Modulok: +# - http_2xx: HTTP GET sikeres vĂĄlasz ellenƑrzĂ©s +# - http_post_2xx: HTTP POST sikeres vĂĄlasz +# - tcp_connect: TCP kapcsolat ellenƑrzĂ©s +# - icmp: ICMP ping ellenƑrzĂ©s +# - dns: DNS lekĂ©rdezĂ©s ellenƑrzĂ©s + +modules: + # ========================================= + # HTTP MODULES + # ========================================= + + # EgyszerƱ HTTP 2xx vĂĄlasz ellenƑrzĂ©s + http_2xx: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [] # Defaults to 2xx + method: GET + no_follow_redirects: false + fail_if_ssl: false + fail_if_not_ssl: false + preferred_ip_protocol: "ip4" + ip_protocol_fallback: true + + # HTTP 2xx SSL kötelezƑ + http_2xx_ssl: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [] + method: GET + fail_if_not_ssl: true + preferred_ip_protocol: "ip4" + + # HTTP POST kĂ©rĂ©s + http_post_2xx: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + method: POST + headers: + Content-Type: application/json + body: '{}' + + # HTTP Basic Auth + http_basic_auth: + prober: http + timeout: 5s + http: + method: GET + basic_auth: + username: "${HTTP_BASIC_USER}" + password: "${HTTP_BASIC_PASS}" + + # HTTP tartalmazzon adott stringet + http_content_match: + prober: http + timeout: 10s + http: + method: GET + fail_if_body_not_matches_regexp: + - "OK|healthy|success" + + # ========================================= + # TCP MODULES + # ========================================= + + # EgyszerƱ TCP kapcsolat ellenƑrzĂ©s + tcp_connect: + prober: tcp + timeout: 5s + tcp: + preferred_ip_protocol: "ip4" + + # TCP SSL/TLS kapcsolat + tcp_connect_tls: + prober: tcp + timeout: 5s + tcp: + tls: true + preferred_ip_protocol: "ip4" + + # TCP banner ellenƑrzĂ©s (pl. 
SSH) + tcp_banner_ssh: + prober: tcp + timeout: 5s + tcp: + query_response: + - expect: "^SSH-" + + # TCP banner ellenƑrzĂ©s (SMTP) + tcp_banner_smtp: + prober: tcp + timeout: 5s + tcp: + query_response: + - expect: "^220" + + # ========================================= + # ICMP MODULES + # ========================================= + + # ICMP ping (IPv4) + icmp: + prober: icmp + timeout: 5s + icmp: + preferred_ip_protocol: "ip4" + ip_protocol_fallback: true + + # ICMP ping (IPv6) + icmp_ipv6: + prober: icmp + timeout: 5s + icmp: + preferred_ip_protocol: "ip6" + ip_protocol_fallback: false + + # ========================================= + # DNS MODULES + # ========================================= + + # DNS A rekord lekĂ©rdezĂ©s + dns_a: + prober: dns + timeout: 5s + dns: + query_name: "example.com" + query_type: "A" + valid_rcodes: + - NOERROR + preferred_ip_protocol: "ip4" + + # DNS SOA rekord lekĂ©rdezĂ©s + dns_soa: + prober: dns + timeout: 5s + dns: + query_name: "example.com" + query_type: "SOA" + valid_rcodes: + - NOERROR + + # ========================================= + # GRPC MODULE (opcionĂĄlis) + # ========================================= + + # gRPC health check + grpc: + prober: grpc + timeout: 5s + grpc: + tls: false + preferred_ip_protocol: "ip4" + + grpc_tls: + prober: grpc + timeout: 5s + grpc: + tls: true + preferred_ip_protocol: "ip4" diff --git a/2-infra-monitoring/docker-compose.yml b/2-infra-monitoring/docker-compose.yml new file mode 100644 index 0000000..052e679 --- /dev/null +++ b/2-infra-monitoring/docker-compose.yml @@ -0,0 +1,194 @@ +# Infra Monitoring Stack +# ===================== +# Docker Compose konfigurĂĄciĂł a teljes monitoring stack-hez. 
+# +# SzolgĂĄltatĂĄsok: +# - Prometheus (9090): Metrika gyƱjtĂ©s Ă©s tĂĄrolĂĄs +# - Grafana (3000): VizualizĂĄciĂł Ă©s dashboardok +# - Alertmanager (9093): RiasztĂĄs kezelĂ©s +# - Node Exporter (9100): Host metrikĂĄk +# - Blackbox Exporter (9115): Endpoint monitoring +# - cAdvisor (8080): Container metrikĂĄk +# +# HasznĂĄlat: +# docker-compose up -d +# docker-compose logs -f prometheus +# docker-compose down + +version: "3.8" + +networks: + monitoring: + driver: bridge + name: monitoring-network + +volumes: + prometheus_data: + name: prometheus-data + grafana_data: + name: grafana-data + alertmanager_data: + name: alertmanager-data + +services: + # ========================================= + # PROMETHEUS - Metrika gyƱjtĂ©s Ă©s tĂĄrolĂĄs + # ========================================= + prometheus: + image: prom/prometheus:v2.47.0 + container_name: prometheus + restart: unless-stopped + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./prometheus/alerts:/etc/prometheus/alerts:ro + - prometheus_data:/prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--storage.tsdb.retention.time=15d" + - "--web.console.libraries=/etc/prometheus/console_libraries" + - "--web.console.templates=/etc/prometheus/consoles" + - "--web.enable-lifecycle" + - "--web.enable-admin-api" + networks: + - monitoring + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + + # ========================================= + # GRAFANA - VizualizĂĄciĂł Ă©s dashboardok + # ========================================= + grafana: + image: grafana/grafana:10.1.0 + container_name: grafana + restart: unless-stopped + ports: + - "3000:3000" + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + - 
./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + environment: + # Admin felhasznĂĄlĂł beĂĄllĂ­tĂĄsok + - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} + # Server beĂĄllĂ­tĂĄsok + - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3000} + # Anonim hozzĂĄfĂ©rĂ©s (opcionĂĄlis) + - GF_AUTH_ANONYMOUS_ENABLED=false + # TelepĂ­tĂ©s beĂĄllĂ­tĂĄsok + - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-piechart-panel + # Log szint + - GF_LOG_LEVEL=warn + networks: + - monitoring + depends_on: + - prometheus + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/api/health"] + interval: 30s + timeout: 10s + retries: 3 + + # ========================================= + # ALERTMANAGER - RiasztĂĄs kezelĂ©s + # ========================================= + alertmanager: + image: prom/alertmanager:v0.26.0 + container_name: alertmanager + restart: unless-stopped + ports: + - "9093:9093" + volumes: + - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + - alertmanager_data:/alertmanager + command: + - "--config.file=/etc/alertmanager/alertmanager.yml" + - "--storage.path=/alertmanager" + - "--web.external-url=http://localhost:9093" + networks: + - monitoring + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:9093/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + + # ========================================= + # NODE EXPORTER - Host metrikĂĄk + # ========================================= + node-exporter: + image: prom/node-exporter:v1.6.1 + container_name: node-exporter + restart: unless-stopped + ports: + - "9100:9100" + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - "--path.procfs=/host/proc" + - "--path.sysfs=/host/sys" + - "--path.rootfs=/rootfs" + - 
"--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)" + networks: + - monitoring + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:9100/metrics"] + interval: 30s + timeout: 10s + retries: 3 + + # ========================================= + # BLACKBOX EXPORTER - Endpoint monitoring + # ========================================= + blackbox-exporter: + image: prom/blackbox-exporter:v0.24.0 + container_name: blackbox-exporter + restart: unless-stopped + ports: + - "9115:9115" + volumes: + - ./blackbox/blackbox.yml:/etc/blackbox_exporter/config.yml:ro + command: + - "--config.file=/etc/blackbox_exporter/config.yml" + networks: + - monitoring + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:9115/health"] + interval: 30s + timeout: 10s + retries: 3 + + # ========================================= + # cADVISOR - Container metrikĂĄk + # ========================================= + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.47.2 + container_name: cadvisor + restart: unless-stopped + ports: + - "8080:8080" + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /dev/disk/:/dev/disk:ro + privileged: true + devices: + - /dev/kmsg + networks: + - monitoring + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/healthz"] + interval: 30s + timeout: 10s + retries: 3 diff --git a/2-infra-monitoring/grafana/provisioning/datasources/datasources.yml b/2-infra-monitoring/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000..bcc2f84 --- /dev/null +++ b/2-infra-monitoring/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,64 @@ +# Grafana Datasources Provisioning +# ================================= +# Automatikus datasource konfigurĂĄciĂł Grafana indĂ­tĂĄskor. +# +# A datasource-ok a docker-compose network-ön keresztĂŒl +# Ă©rhetƑk el a service nevĂŒkkel. 
+ +apiVersion: 1 + +# TörlendƑ datasource-ok (opcionĂĄlis) +deleteDatasources: [] + +# Datasource definĂ­ciĂłk +datasources: + # ========================================= + # PROMETHEUS - FƑ metrika forrĂĄs + # ========================================= + - name: Prometheus + type: prometheus + # AlapĂ©rtelmezett datasource + isDefault: true + # HozzĂĄfĂ©rĂ©si mĂłd (proxy = Grafana szerveren keresztĂŒl) + access: proxy + # URL a docker network-ön keresztĂŒl + url: http://prometheus:9090 + # AlapĂ©rtelmezett scrape intervallum + jsonData: + timeInterval: "15s" + # HTTP method + httpMethod: POST + # Increment queries + incrementalQuerying: true + incrementalQueryOverlapWindow: 10m + # Editable-e a UI-n + editable: false + # VerziĂł (auto increment) + version: 1 + + # ========================================= + # ALERTMANAGER - RiasztĂĄs ĂĄllapotok + # ========================================= + - name: Alertmanager + type: alertmanager + access: proxy + url: http://alertmanager:9093 + jsonData: + # Implementation type + implementation: prometheus + editable: false + version: 1 + + # ========================================= + # LOKI - Log aggregĂĄciĂł (opcionĂĄlis) + # ========================================= + # Ha Loki-t hasznĂĄlsz log gyƱjtĂ©shez, add hozzĂĄ: + # + # - name: Loki + # type: loki + # access: proxy + # url: http://loki:3100 + # jsonData: + # maxLines: 1000 + # editable: false + # version: 1 diff --git a/2-infra-monitoring/prometheus/alerts/alerts.yml b/2-infra-monitoring/prometheus/alerts/alerts.yml new file mode 100644 index 0000000..61f9200 --- /dev/null +++ b/2-infra-monitoring/prometheus/alerts/alerts.yml @@ -0,0 +1,285 @@ +# Prometheus Alert Rules +# ====================== +# RiasztĂĄsi szabĂĄlyok a rendszer monitorozĂĄsĂĄhoz. 
+# +# KategĂłriĂĄk: +# - Host (CPU, memĂłria, lemez) +# - Container (Docker) +# - SzolgĂĄltatĂĄs (endpoints) +# - HĂĄlĂłzat (connectivity) + +groups: + # ========================================= + # HOST ALERTS - CPU, Memory, Disk + # ========================================= + - name: host_alerts + rules: + # Magas CPU hasznĂĄlat + - alert: HighCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ $labels.instance }}" + description: "CPU usage is above 80% (current: {{ $value | printf \"%.1f\" }}%)" + + # Kritikus CPU hasznĂĄlat + - alert: CriticalCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical CPU usage on {{ $labels.instance }}" + description: "CPU usage is above 95% (current: {{ $value | printf \"%.1f\" }}%)" + + # Magas memĂłria hasznĂĄlat + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ $labels.instance }}" + description: "Memory usage is above 80% (current: {{ $value | printf \"%.1f\" }}%)" + + # Kritikus memĂłria hasznĂĄlat + - alert: CriticalMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical memory usage on {{ $labels.instance }}" + description: "Memory usage is above 95% (current: {{ $value | printf \"%.1f\" }}%)" + + # Alacsony szabad lemezterĂŒlet + - alert: LowDiskSpace + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20 + for: 5m + labels: + severity: warning + annotations: + summary: "Low disk space on {{ $labels.instance }}" + 
description: "Disk {{ $labels.mountpoint }} has less than 20% free space (current: {{ $value | printf \"%.1f\" }}%)"
+
+      # Critically low disk space
+      - alert: CriticalDiskSpace
+        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Critical disk space on {{ $labels.instance }}"
+          description: "Disk {{ $labels.mountpoint }} has less than 10% free space (current: {{ $value | printf \"%.1f\" }}%)"
+
+      # High disk I/O wait, averaged over all cores per instance (one alert per host, like HighCPUUsage)
+      - alert: HighDiskIOWait
+        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 30
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High disk I/O wait on {{ $labels.instance }}"
+          description: "Disk I/O wait is above 30% (current: {{ $value | printf \"%.1f\" }}%)"
+
+      # Node Exporter unreachable
+      - alert: NodeExporterDown
+        expr: up{job="node-exporter"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Node Exporter is down on {{ $labels.instance }}"
+          description: "Node Exporter has been unreachable for more than 1 minute"
+
+  # =========================================
+  # CONTAINER ALERTS - Docker
+  # =========================================
+  - name: container_alerts
+    rules:
+      # Container stopped: per-container staleness check (absent() has no name label and needs all gone)
+      - alert: ContainerDown
+        expr: time() - container_last_seen{name!=""} > 60
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container {{ $labels.name }} is down"
+          description: "Container has not been seen for more than 1 minute"
+
+      # High container CPU usage
+      - alert: ContainerHighCPU
+        expr: (sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage in container {{ $labels.name }}"
+          description: "Container CPU usage is above 80% (current: {{ $value | printf \"%.1f\" }}%)"
+
+      # High container memory usage; containers without a limit (limit == 0) are skipped to avoid +Inf
+      - alert: ContainerHighMemory
+        expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage in container {{ $labels.name }}"
+          description: "Container memory usage is above 80% of limit (current: {{ $value | printf \"%.1f\" }}%)"
+
+      # Container restarted. NOTE(review): confirm the scraped exporter really exposes container_restart_count
+      - alert: ContainerRestarted
+        expr: increase(container_restart_count{name!=""}[15m]) > 0
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: "Container {{ $labels.name }} has restarted"
+          description: "Container has restarted {{ $value | printf \"%.0f\" }} time(s) in the last 15 minutes"
+
+  # =========================================
+  # SERVICE ALERTS - Endpoints
+  # =========================================
+  - name: service_alerts
+    rules:
+      # Prometheus unreachable
+      - alert: PrometheusDown
+        expr: up{job="prometheus"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus is down"
+          description: "Prometheus instance has been unreachable for more than 1 minute"
+
+      # Grafana unreachable
+      - alert: GrafanaDown
+        expr: up{job="grafana"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Grafana is down"
+          description: "Grafana has been unreachable for more than 1 minute"
+
+      # Alertmanager unreachable
+      - alert: AlertmanagerDown
+        expr: up{job="alertmanager"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Alertmanager is down"
+          description: "Alertmanager has been unreachable for more than 1 minute"
+
+      # HTTP endpoint unreachable
+      - alert: HTTPEndpointDown
+        expr: probe_success{job="blackbox-http"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "HTTP endpoint {{ $labels.instance }} is down"
+          description: "HTTP probe has been failing for more than 1 minute"
+
+      # Slow HTTP response
+      - alert: SlowHTTPResponse
+        expr: probe_duration_seconds{job="blackbox-http"} >
2 + for: 5m + labels: + severity: warning + annotations: + summary: "Slow HTTP response from {{ $labels.instance }}" + description: "HTTP response time is above 2 seconds (current: {{ $value | printf \"%.2f\" }}s)" + + # ========================================= + # NETWORK ALERTS - Connectivity + # ========================================= + - name: network_alerts + rules: + # ICMP ping sikertelen + - alert: ICMPProbeFailed + expr: probe_success{job="blackbox-icmp"} == 0 + for: 2m + labels: + severity: warning + annotations: + summary: "ICMP probe to {{ $labels.instance }} is failing" + description: "ICMP probe has been failing for more than 2 minutes" + + # TCP kapcsolat sikertelen + - alert: TCPConnectionFailed + expr: probe_success{job="blackbox-tcp"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "TCP connection to {{ $labels.instance }} is failing" + description: "TCP probe has been failing for more than 1 minute" + + # Magas hĂĄlĂłzati forgalom + - alert: HighNetworkTraffic + expr: rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) * 8 / 1000000 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "High network traffic on {{ $labels.instance }}" + description: "Network receive rate is above 100 Mbps on {{ $labels.device }} (current: {{ $value | printf \"%.1f\" }} Mbps)" + + # HĂĄlĂłzati hibĂĄk + - alert: NetworkErrors + expr: rate(node_network_receive_errs_total[5m]) > 0 or rate(node_network_transmit_errs_total[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Network errors detected on {{ $labels.instance }}" + description: "Network errors detected on interface {{ $labels.device }}" + + # ========================================= + # PROMETHEUS ALERTS - Self-monitoring + # ========================================= + - name: prometheus_alerts + rules: + # Prometheus konfigurĂĄciĂł reload sikertelen + - alert: PrometheusConfigReloadFailed + expr: 
prometheus_config_last_reload_successful == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus configuration reload failed" + description: "Configuration reload has been failing" + + # Prometheus rule evaluation lassĂș + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus rule evaluation is slow" + description: "Rule group {{ $labels.rule_group }} is taking longer than its interval to evaluate" + + # Prometheus target scrape sikertelen + - alert: PrometheusTargetScrapeFailed + expr: up == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus target {{ $labels.job }} scrape failed" + description: "Target {{ $labels.instance }} has been down for more than 5 minutes" + + # Prometheus storage magas + - alert: PrometheusStorageHigh + expr: prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024 > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus storage is high" + description: "Prometheus storage is above 10GB (current: {{ $value | printf \"%.1f\" }}GB)" diff --git a/2-infra-monitoring/prometheus/prometheus.yml b/2-infra-monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..d91c5d4 --- /dev/null +++ b/2-infra-monitoring/prometheus/prometheus.yml @@ -0,0 +1,152 @@ +# Prometheus Configuration +# ======================== +# Metrika gyƱjtĂ©si Ă©s riasztĂĄsi konfigurĂĄciĂł. 
+# +# Scrape targets: +# - Prometheus önmagĂĄt (self-monitoring) +# - Node Exporter (host metrikĂĄk) +# - cAdvisor (container metrikĂĄk) +# - Blackbox Exporter (endpoint monitoring) +# - Alertmanager (alert metrikĂĄk) + +global: + # AlapĂ©rtelmezett scrape intervallum + scrape_interval: 15s + # KiĂ©rtĂ©kelĂ©si intervallum a szabĂĄlyokhoz + evaluation_interval: 15s + # KĂŒlsƑ cĂ­mkĂ©k a riasztĂĄsokhoz + external_labels: + monitor: "homelab-monitor" + environment: "production" + +# RiasztĂĄsi szabĂĄlyok betöltĂ©se +rule_files: + - "/etc/prometheus/alerts/*.yml" + +# Alertmanager konfigurĂĄciĂł +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager:9093 + # Timeout az alert kĂŒldĂ©shez + timeout: 10s + +# Scrape konfigurĂĄciĂłk +scrape_configs: + # ========================================= + # PROMETHEUS - Self monitoring + # ========================================= + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + metrics_path: /metrics + scheme: http + + # ========================================= + # NODE EXPORTER - Host metrikĂĄk + # ========================================= + - job_name: "node-exporter" + static_configs: + - targets: ["node-exporter:9100"] + labels: + instance: "monitoring-host" + # Extra metrikĂĄk szƱrĂ©se (opcionĂĄlis) + # metric_relabel_configs: + # - source_labels: [__name__] + # regex: 'node_.*' + # action: keep + + # ========================================= + # cADVISOR - Container metrikĂĄk + # ========================================= + - job_name: "cadvisor" + static_configs: + - targets: ["cadvisor:8080"] + # Hosszabb scrape timeout containerekhez + scrape_timeout: 10s + + # ========================================= + # ALERTMANAGER - Alert metrikĂĄk + # ========================================= + - job_name: "alertmanager" + static_configs: + - targets: ["alertmanager:9093"] + + # ========================================= + # BLACKBOX EXPORTER - HTTP Probes + # 
========================================= + - job_name: "blackbox-http" + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + # Monitorozott HTTP endpointok + - http://prometheus:9090/-/healthy + - http://grafana:3000/api/health + - http://alertmanager:9093/-/healthy + labels: + probe_type: "internal" + relabel_configs: + # A cĂ©l URL-t ĂĄtĂ­rjuk target paramĂ©terrĂ© + - source_labels: [__address__] + target_label: __param_target + # Az instance cĂ­mke az eredeti target lesz + - source_labels: [__param_target] + target_label: instance + # A blackbox exporter a valĂłdi target + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ========================================= + # BLACKBOX EXPORTER - ICMP Probes (Ping) + # ========================================= + - job_name: "blackbox-icmp" + metrics_path: /probe + params: + module: [icmp] + static_configs: + - targets: + # PingelendƑ hostok (pĂ©lda) + - 8.8.8.8 + - 1.1.1.1 + labels: + probe_type: "external_dns" + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ========================================= + # BLACKBOX EXPORTER - TCP Probes + # ========================================= + - job_name: "blackbox-tcp" + metrics_path: /probe + params: + module: [tcp_connect] + static_configs: + - targets: + # Monitorozott TCP portok (pĂ©lda) + - prometheus:9090 + - grafana:3000 + labels: + probe_type: "internal_tcp" + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # ========================================= + # GRAFANA - Grafana metrikĂĄk + # ========================================= + - job_name: "grafana" + static_configs: + - targets: 
["grafana:3000"] + metrics_path: /metrics From 27a8d6c7a64d4ccdad69b33b3dbf4e640d1fcd76 Mon Sep 17 00:00:00 2001 From: "Gabe@w7dev" Date: Thu, 4 Dec 2025 12:09:32 +0000 Subject: [PATCH 2/2] feat(phase-3): Add Grafana dashboards and documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add dashboard provisioning configuration (dashboards.yml) - Add Server Health dashboard with CPU, memory, disk, network panels - Add Docker Overview dashboard with container metrics - Add Network Overview dashboard with HTTP, ICMP, TCP probes - Add comprehensive bilingual README documentation Dashboards include: - Gauge/stat panels for quick status overview - Time series graphs for trend analysis - Bar gauges for comparison views - Table summaries with multiple metrics - Template variables for filtering đŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- 2-infra-monitoring/README.md | 272 +++ .../grafana/dashboards/docker-overview.json | 1440 ++++++++++++++++ .../grafana/dashboards/network-overview.json | 1494 +++++++++++++++++ .../grafana/dashboards/server-health.json | 1283 ++++++++++++++ .../provisioning/dashboards/dashboards.yml | 51 + 5 files changed, 4540 insertions(+) create mode 100644 2-infra-monitoring/README.md create mode 100644 2-infra-monitoring/grafana/dashboards/docker-overview.json create mode 100644 2-infra-monitoring/grafana/dashboards/network-overview.json create mode 100644 2-infra-monitoring/grafana/dashboards/server-health.json create mode 100644 2-infra-monitoring/grafana/provisioning/dashboards/dashboards.yml diff --git a/2-infra-monitoring/README.md b/2-infra-monitoring/README.md new file mode 100644 index 0000000..255830c --- /dev/null +++ b/2-infra-monitoring/README.md @@ -0,0 +1,272 @@ +# Infra Monitoring Stack + +## ÁttekintĂ©s / Overview + +Teljes körƱ infrastruktĂșra monitoring megoldĂĄs Docker Compose alapokon. 
A stack tartalmaz metrika gyƱjtĂ©st, vizualizĂĄciĂłt, riasztĂĄskezelĂ©st Ă©s endpoint monitoring-ot. + +Complete infrastructure monitoring solution based on Docker Compose. The stack includes metric collection, visualization, alert management, and endpoint monitoring. + +## ArchitektĂșra / Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ MONITORING STACK │ +├────────────────────────────────────────────────────────────────── +│ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ Grafana │◄───│ Prometheus │◄───│ Exporters │ │ +│ │ (3000) │ │ (9090) │ │ │ │ +│ └─────────────┘ └──────┬──────┘ │ - Node Exporter │ │ +│ │ │ │ (9100) │ │ +│ │ â–Œ │ - cAdvisor │ │ +│ │ ┌─────────────┐ │ (8080) │ │ +│ └──────────â–ș│Alertmanager │ │ - Blackbox │ │ +│ │ (9093) │ │ (9115) │ │ +│ └─────────────┘ └─────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Komponensek / Components + +| SzolgĂĄltatĂĄs / Service | Port | LeĂ­rĂĄs / Description | +|------------------------|------|----------------------| +| Prometheus | 9090 | Metrika gyƱjtĂ©s Ă©s tĂĄrolĂĄs / Metric collection and storage | +| Grafana | 3000 | VizualizĂĄciĂł Ă©s dashboardok / Visualization and dashboards | +| Alertmanager | 9093 | RiasztĂĄs kezelĂ©s / Alert management | +| Node Exporter | 9100 | Host metrikĂĄk / Host metrics | +| Blackbox Exporter | 9115 | Endpoint monitoring (HTTP, ICMP, TCP) | +| cAdvisor | 8080 | Container metrikĂĄk / Container metrics | + +## Gyors indĂ­tĂĄs / Quick Start + +### ElƑfeltĂ©telek / Prerequisites + +- Docker Engine 20.10+ +- Docker Compose v2.0+ +- Linux host (Node Exporter Ă©s cAdvisor miatt / due to Node Exporter and cAdvisor) + +### IndĂ­tĂĄs / Start + +```bash +# A 2-infra-monitoring könyvtĂĄrban / In the 2-infra-monitoring directory +cd 2-infra-monitoring + +# Stack indĂ­tĂĄsa / Start the stack +docker-compose up -d + +# NaplĂłk megtekintĂ©se / View logs +docker-compose
logs -f + +# SzolgĂĄltatĂĄs ĂĄllapot / Service status +docker-compose ps +``` + +### ElĂ©rĂ©s / Access + +- **Grafana**: http://localhost:3000 + - AlapĂ©rtelmezett felhasznĂĄlĂł / Default user: `admin` + - AlapĂ©rtelmezett jelszĂł / Default password: `admin` +- **Prometheus**: http://localhost:9090 +- **Alertmanager**: http://localhost:9093 + +## KönyvtĂĄrstruktĂșra / Directory Structure + +``` +2-infra-monitoring/ +├── docker-compose.yml # FƑ Docker Compose konfigurĂĄciĂł +├── README.md # Ez a dokumentum / This document +├── prometheus/ +│ ├── prometheus.yml # Prometheus konfigurĂĄciĂł +│ └── alerts/ +│ └── alerts.yml # RiasztĂĄsi szabĂĄlyok / Alert rules +├── alertmanager/ +│ └── alertmanager.yml # Alertmanager konfigurĂĄciĂł +├── blackbox/ +│ └── blackbox.yml # Blackbox Exporter konfigurĂĄciĂł +└── grafana/ + ├── provisioning/ + │ ├── datasources/ + │ │ └── datasources.yml # Datasource konfigurĂĄciĂł + │ └── dashboards/ + │ └── dashboards.yml # Dashboard provider konfigurĂĄciĂł + └── dashboards/ + ├── server-health.json # Szerver egĂ©szsĂ©g dashboard + ├── docker-overview.json # Docker container dashboard + └── network-overview.json # HĂĄlĂłzati monitoring dashboard +``` + +## Dashboardok / Dashboards + +### Server Health / Szerver EgĂ©szsĂ©g +- CPU hasznĂĄlat Ă©s load average +- MemĂłria hasznĂĄlat Ă©s swap +- Lemez hasznĂĄlat Ă©s I/O +- HĂĄlĂłzati forgalom Ă©s hibĂĄk + +### Docker Overview / Docker ÁttekintĂ©s +- FutĂł containerek szĂĄma +- Container CPU Ă©s memĂłria hasznĂĄlat +- HĂĄlĂłzati forgalom containerenkĂ©nt +- Filesystem I/O + +### Network Overview / HĂĄlĂłzati ÁttekintĂ©s +- HTTP endpoint stĂĄtusz Ă©s vĂĄlaszidƑ +- ICMP ping latency +- TCP port elĂ©rhetƑsĂ©g +- SSL tanĂșsĂ­tvĂĄny lejĂĄrat + +## RiasztĂĄsok / Alerts + +### Host Alerts +| Alert | SĂșlyossĂĄg / Severity | KĂŒszöb / Threshold | +|-------|---------------------|-------------------| +| HighCPUUsage | warning | >80% (5m) | +| CriticalCPUUsage | critical | >95% (2m) | +| HighMemoryUsage |
warning | >80% (5m) | +| CriticalMemoryUsage | critical | >95% (2m) | +| LowDiskSpace | warning | <20% free | +| CriticalDiskSpace | critical | <10% free | + +### Container Alerts +| Alert | SĂșlyossĂĄg / Severity | KĂŒszöb / Threshold | +|-------|---------------------|-------------------| +| ContainerDown | warning | not seen for 1m | +| ContainerHighCPU | warning | >80% (5m) | +| ContainerHighMemory | warning | >80% of limit (5m) | + +### Service Alerts +| Alert | SĂșlyossĂĄg / Severity | KĂŒszöb / Threshold | +|-------|---------------------|-------------------| +| PrometheusDown | critical | down for 1m | +| GrafanaDown | critical | down for 1m | +| AlertmanagerDown | critical | down for 1m | +| HTTPEndpointDown | critical | probe failed for 1m | + +### Network Alerts +| Alert | SĂșlyossĂĄg / Severity | KĂŒszöb / Threshold | +|-------|---------------------|-------------------| +| ICMPProbeFailed | warning | failed for 2m | +| TCPConnectionFailed | critical | failed for 1m | +| HighNetworkTraffic | warning | >100 Mbps (5m) | + +## KonfigurĂĄciĂł / Configuration + +### Környezeti vĂĄltozĂłk / Environment Variables + +Hozz lĂ©tre egy `.env` fĂĄjlt a docker-compose.yml mellett: +Create a `.env` file next to docker-compose.yml: + +```bash +# Grafana admin beĂĄllĂ­tĂĄsok / Grafana admin settings +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=secure_password_here +GRAFANA_ROOT_URL=http://localhost:3000 + +# SMTP beĂĄllĂ­tĂĄsok az email Ă©rtesĂ­tĂ©sekhez / SMTP settings for email notifications +SMTP_PASSWORD=smtp_password_here +``` + +### Új scrape target hozzĂĄadĂĄsa / Adding New Scrape Targets + +Szerkeszd a `prometheus/prometheus.yml` fĂĄjlt: +Edit the `prometheus/prometheus.yml` file: + +```yaml +scrape_configs: + - job_name: "my-new-service" + static_configs: + - targets: ["service-host:port"] +``` + +### Új endpoint monitoring / Adding Endpoint Monitoring + +HTTP endpoint hozzĂĄadĂĄsa / Add HTTP endpoint: + +```yaml +# prometheus/prometheus.yml - blackbox-http
job +static_configs: + - targets: + - https://example.com +``` + +### Alertmanager Ă©rtesĂ­tĂ©sek / Alertmanager Notifications + +Email Ă©rtesĂ­tĂ©sekhez ĂĄllĂ­tsd be az SMTP szervert az `alertmanager/alertmanager.yml` fĂĄjlban. +For email notifications, configure the SMTP server in `alertmanager/alertmanager.yml`. + +Slack integrĂĄciĂł / Slack integration: + +```yaml +receivers: + - name: "slack-notifications" + slack_configs: + - api_url: "${SLACK_WEBHOOK_URL}" + channel: "#alerts" + send_resolved: true +``` + +## KarbantartĂĄs / Maintenance + +### Prometheus adatok törlĂ©se / Clearing Prometheus Data + +```bash +docker-compose rm -sf prometheus +docker volume rm prometheus-data +docker-compose up -d prometheus +``` + +### KonfigurĂĄciĂł ĂșjratöltĂ©se / Reloading Configuration + +```bash +# Prometheus +curl -X POST http://localhost:9090/-/reload + +# Alertmanager +curl -X POST http://localhost:9093/-/reload +``` + +### Backup + +```bash +# Grafana dashboardok Ă©s beĂĄllĂ­tĂĄsok / Grafana dashboards and settings +docker cp grafana:/var/lib/grafana ./grafana-backup + +# Prometheus adatok / Prometheus data +docker cp prometheus:/prometheus ./prometheus-backup +``` + +## HibaelhĂĄrĂ­tĂĄs / Troubleshooting + +### Container nem indul / Container Won't Start + +```bash +# EllenƑrizd a naplĂłkat / Check logs +docker-compose logs [service-name] + +# EllenƑrizd az erƑforrĂĄsokat / Check resources +docker stats +``` + +### Prometheus nem gyƱjti a metrikĂĄkat / Prometheus Not Collecting Metrics + +1. EllenƑrizd a targets stĂĄtuszĂĄt / Check targets status: http://localhost:9090/targets +2. EllenƑrizd a scrape konfigurĂĄciĂłt / Check scrape config +3. EllenƑrizd a hĂĄlĂłzati kapcsolatot / Check network connectivity + +### Grafana nem mutat adatokat / Grafana Shows No Data + +1. EllenƑrizd a datasource kapcsolatot / Check datasource connection +2. EllenƑrizd az idƑintervallumot / Check time range +3.
EllenƑrizd a Prometheus targetek ĂĄllapotĂĄt / Check Prometheus targets status + +## BƑvĂ­tĂ©si lehetƑsĂ©gek / Extension Options + +- **Loki**: Log aggregĂĄciĂł / Log aggregation +- **Tempo**: Distributed tracing +- **Pushgateway**: Batch job metrikĂĄk / Batch job metrics +- **Thanos**: Long-term storage Ă©s HA + +## Licenc / License + +MIT License diff --git a/2-infra-monitoring/grafana/dashboards/docker-overview.json b/2-infra-monitoring/grafana/dashboards/docker-overview.json new file mode 100644 index 0000000..f6b1d75 --- /dev/null +++ b/2-infra-monitoring/grafana/dashboards/docker-overview.json @@ -0,0 +1,1440 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Docker container monitoring - CPU, memĂłria, hĂĄlĂłzat, I/O", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "ÖsszefoglalĂł / Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + },
+ "expr": "count(container_last_seen{name!=\"\", name!~\"POD.*\"})", + "legendFormat": "Containers", + "refId": "A" + } + ], + "title": "FutĂł Containerek / Running Containers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(container_cpu_usage_seconds_total{name!=\"\", name!~\"POD.*\"}[5m])) * 100", + "legendFormat": "CPU", + "refId": "A" + } + ], + "title": "Összes CPU / Total CPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 8589934592 + }, + { + "color": "red", + "value": 17179869184 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(container_memory_usage_bytes{name!=\"\", name!~\"POD.*\"})", + "legendFormat": "Memory", + "refId": "A" + } + ], + "title": "Összes MemĂłria / Total Memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(container_network_receive_bytes_total{name!=\"\", name!~\"POD.*\"}[5m]))", + "legendFormat": "RX", + "refId": "A" + } + ], + "title": "HĂĄlĂłzat RX / Network RX", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"sum(rate(container_network_transmit_bytes_total{name!=\"\", name!~\"POD.*\"}[5m]))", + "legendFormat": "TX", + "refId": "A" + } + ], + "title": "HĂĄlĂłzat TX / Network TX", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(changes(container_start_time_seconds{name!=\"\", name!~\"POD.*\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "title": "ÚjraindĂ­tĂĄsok (1h) / Restarts (1h)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 8, + "panels": [], + "title": "Container CPU", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints":
"never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m])) * 100", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Container CPU HasznĂĄlat / Container CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 10, + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m])) * 100", + 
"legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "CPU ContainerenkĂ©nt / CPU per Container", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 11, + "panels": [], + "title": "Container MemĂłria / Container Memory", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "container_memory_usage_bytes{name=~\"$container\", name!=\"\", name!~\"POD.*\"}", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Container MemĂłria HasznĂĄlat / Container Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 536870912 + }, + { + "color": "red", + "value": 1073741824 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 13, + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "container_memory_usage_bytes{name=~\"$container\", name!=\"\", name!~\"POD.*\"}", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "MemĂłria ContainerenkĂ©nt / Memory per Container", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 14, + "panels": [], + "title": "Container HĂĄlĂłzat / Container Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*TX.*" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_network_receive_bytes_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m]))", + "legendFormat": "{{name}} RX", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_network_transmit_bytes_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m]))", + "legendFormat": "{{name}} TX", + "refId": "B" + } + ], + "title": "HĂĄlĂłzati Forgalom / Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + 
"unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_network_receive_errors_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m]))", + "legendFormat": "{{name}} RX errors", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_network_transmit_errors_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m]))", + "legendFormat": "{{name}} TX errors", + "refId": "B" + } + ], + "title": "HĂĄlĂłzati HibĂĄk / Network Errors", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 17, + "panels": [], + "title": "Container I/O", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" 
+ }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*write.*" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_fs_reads_bytes_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m]))", + "legendFormat": "{{name}} read", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_fs_writes_bytes_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m]))", + "legendFormat": "{{name}} write", + "refId": "B" + } + ], + "title": "Filesystem I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU %" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "max", + "value": 100 + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory" + }, + "properties": [ + { + 
"id": "unit", + "value": "bytes" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "max", + "value": 2147483648 + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1073741824 + }, + { + "color": "red", + "value": 1610612736 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Net RX" + }, + "properties": [ + { + "id": "unit", + "value": "Bps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Net TX" + }, + "properties": [ + { + "id": "unit", + "value": "Bps" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 19, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "CPU %" + } + ] + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m])) * 100", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "CPU" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "container_memory_usage_bytes{name=~\"$container\", name!=\"\", name!~\"POD.*\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "Memory" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) (rate(container_network_receive_bytes_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m]))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "NetRX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (name) 
(rate(container_network_transmit_bytes_total{name=~\"$container\", name!=\"\", name!~\"POD.*\"}[5m]))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "NetTX" + } + ], + "title": "Container ÖsszefoglalĂł / Container Summary", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "name" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true + }, + "indexByName": {}, + "renameByName": { + "Value #CPU": "CPU %", + "Value #Memory": "Memory", + "Value #NetRX": "Net RX", + "Value #NetTX": "Net TX", + "name": "Container" + } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "docker", + "containers", + "cadvisor" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(container_last_seen{name!=\"\", name!~\"POD.*\"}, name)", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": true, + "name": "container", + "options": [], + "query": { + "query": "label_values(container_last_seen{name!=\"\", name!~\"POD.*\"}, name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Docker Overview / Docker ÁttekintĂ©s", + "uid": "docker-overview", 
+ "version": 1, + "weekStart": "" +} diff --git a/2-infra-monitoring/grafana/dashboards/network-overview.json b/2-infra-monitoring/grafana/dashboards/network-overview.json new file mode 100644 index 0000000..76119de --- /dev/null +++ b/2-infra-monitoring/grafana/dashboards/network-overview.json @@ -0,0 +1,1494 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "HĂĄlĂłzati endpoint monitoring - HTTP, ICMP, TCP probes / Network endpoint monitoring", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "ÖsszefoglalĂł / Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"probe_success{job=\"blackbox-http\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "HTTP Endpoint StĂĄtusz / HTTP Endpoint Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_success{job=\"blackbox-icmp\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "ICMP Ping StĂĄtusz / ICMP Ping Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": 
"none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_success{job=\"blackbox-tcp\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "TCP Port StĂĄtusz / TCP Port Status", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 5, + "panels": [], + "title": "HTTP Probes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "expr": "probe_duration_seconds{job=\"blackbox-http\", instance=~\"$http_target\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "HTTP VĂĄlaszidƑ / HTTP Response Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_success{job=\"blackbox-http\", instance=~\"$http_target\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "HTTP ElĂ©rhetƑsĂ©g / HTTP Availability", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 90 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 8, + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "avg_over_time(probe_success{job=\"blackbox-http\", instance=~\"$http_target\"}[$__range]) * 100", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "HTTP Uptime % (időszak) / HTTP Uptime % (period)", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 7 + }, + { + "color": "green", + "value": 14 + } + ] + }, + "unit": "d" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 9, + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr":
"(probe_ssl_earliest_cert_expiry{job=\"blackbox-http\", instance=~\"$http_target\"} - time()) / 86400", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "SSL TanĂșsĂ­tvĂĄny LejĂĄrat / SSL Certificate Expiry (days)", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 10, + "panels": [], + "title": "ICMP Probes (Ping)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_duration_seconds{job=\"blackbox-icmp\", instance=~\"$icmp_target\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Ping Latency", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_success{job=\"blackbox-icmp\", instance=~\"$icmp_target\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "ICMP ElĂ©rhetƑsĂ©g / ICMP Availability", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 13, + "panels": [], + "title": "TCP Probes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + 
}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_duration_seconds{job=\"blackbox-tcp\", instance=~\"$tcp_target\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "TCP Kapcsolat IdƑ / TCP Connection Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "stepAfter", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_success{job=\"blackbox-tcp\", instance=~\"$tcp_target\"}", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "TCP ElĂ©rhetƑsĂ©g / TCP Availability", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 16, + "panels": [], + "title": "RĂ©szletes InformĂĄciĂłk / Detailed Information", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Status" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + 
"options": { + "0": { + "color": "red", + "index": 1, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Response Time" + }, + "properties": [ + { + "id": "unit", + "value": "s" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "max", + "value": 2 + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "SSL Days" + }, + "properties": [ + { + "id": "unit", + "value": "d" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "max", + "value": 90 + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 14 + }, + { + "color": "green", + "value": 30 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 43 + }, + "id": 17, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Status" + } + ] + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_success{job=\"blackbox-http\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "Status" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_duration_seconds{job=\"blackbox-http\"}", + "format": "table", + 
"instant": true, + "legendFormat": "__auto", + "refId": "Duration" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "(probe_ssl_earliest_cert_expiry{job=\"blackbox-http\"} - time()) / 86400", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "SSL" + } + ], + "title": "HTTP Probe ÖsszefoglalĂł / HTTP Probe Summary", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "instance" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "job": true, + "job 1": true, + "job 2": true, + "probe_type": true, + "probe_type 1": true, + "probe_type 2": true + }, + "indexByName": {}, + "renameByName": { + "Value #Duration": "Response Time", + "Value #SSL": "SSL Days", + "Value #Status": "Status", + "instance": "Endpoint" + } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "network", + "blackbox", + "probes" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(probe_success{job=\"blackbox-http\"}, instance)", + "hide": 0, + "includeAll": true, + "label": "HTTP Target", + "multi": true, + "name": "http_target", + "options": [], + "query": { + "query": "label_values(probe_success{job=\"blackbox-http\"}, instance)", + "refId": "StandardVariableQuery" + }, + 
"refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(probe_success{job=\"blackbox-icmp\"}, instance)", + "hide": 0, + "includeAll": true, + "label": "ICMP Target", + "multi": true, + "name": "icmp_target", + "options": [], + "query": { + "query": "label_values(probe_success{job=\"blackbox-icmp\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(probe_success{job=\"blackbox-tcp\"}, instance)", + "hide": 0, + "includeAll": true, + "label": "TCP Target", + "multi": true, + "name": "tcp_target", + "options": [], + "query": { + "query": "label_values(probe_success{job=\"blackbox-tcp\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Network Overview / HĂĄlĂłzati ÁttekintĂ©s", + "uid": "network-overview", + "version": 1, + "weekStart": "" +} diff --git a/2-infra-monitoring/grafana/dashboards/server-health.json b/2-infra-monitoring/grafana/dashboards/server-health.json new file mode 100644 index 0000000..fe3ecfa --- /dev/null +++ b/2-infra-monitoring/grafana/dashboards/server-health.json @@ -0,0 +1,1283 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": 
"dashboard" + } + ] + }, + "description": "Server egĂ©szsĂ©gi ĂĄllapot monitoring - CPU, memĂłria, lemez, hĂĄlĂłzat", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "ÖsszefoglalĂł / Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"$instance\"}[5m])) * 100)", + "legendFormat": "CPU", + "refId": "A" + } + ], + "title": "CPU HasznĂĄlat / CPU Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + 
"options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$instance\"} / node_memory_MemTotal_bytes{instance=~\"$instance\"})) * 100", + "legendFormat": "Memory", + "refId": "A" + } + ], + "title": "MemĂłria HasznĂĄlat / Memory Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "(1 - (node_filesystem_avail_bytes{instance=~\"$instance\", mountpoint=\"/\", fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{instance=~\"$instance\", mountpoint=\"/\", fstype!~\"tmpfs|overlay\"})) * 100", + "legendFormat": "Disk", + "refId": "A" + } + ], + "title": "Lemez HasznĂĄlat / Disk Usage (/)", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_time_seconds{instance=~\"$instance\"} - node_boot_time_seconds{instance=~\"$instance\"}", + "legendFormat": "Uptime", + "refId": "A" + } + ], + "title": "ÜzemidƑ / Uptime", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 6, + "panels": [], + "title": "CPU RĂ©szletek / CPU Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (mode) (rate(node_cpu_seconds_total{instance=~\"$instance\", mode!=\"idle\"}[5m])) * 100 / scalar(count(node_cpu_seconds_total{instance=~\"$instance\", mode=\"idle\"}))", + "legendFormat": "{{mode}}", + "refId": "A" + } + ], + "title": "CPU Módok / CPU Modes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_load1{instance=~\"$instance\"}", + "legendFormat": "1m avg", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, +
"expr": "node_load5{instance=~\"$instance\"}", + "legendFormat": "5m avg", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_load15{instance=~\"$instance\"}", + "legendFormat": "15m avg", + "refId": "C" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 9, + "panels": [], + "title": "MemĂłria RĂ©szletek / Memory Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Available" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + 
"tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"} - node_memory_MemAvailable_bytes{instance=~\"$instance\"}", + "legendFormat": "Used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Buffers_bytes{instance=~\"$instance\"}", + "legendFormat": "Buffers", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_Cached_bytes{instance=~\"$instance\"}", + "legendFormat": "Cached", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemAvailable_bytes{instance=~\"$instance\"}", + "legendFormat": "Available", + "refId": "D" + } + ], + "title": "MemĂłria RĂ©szletezĂ©s / Memory Breakdown", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 11, + "options": { + 
"legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_SwapTotal_bytes{instance=~\"$instance\"} - node_memory_SwapFree_bytes{instance=~\"$instance\"}", + "legendFormat": "Swap Used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_SwapFree_bytes{instance=~\"$instance\"}", + "legendFormat": "Swap Free", + "refId": "B" + } + ], + "title": "Swap HasznĂĄlat / Swap Usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 12, + "panels": [], + "title": "Lemez RĂ©szletek / Disk Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 13, + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "(1 - (node_filesystem_avail_bytes{instance=~\"$instance\", fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{instance=~\"$instance\", 
fstype!~\"tmpfs|overlay\"})) * 100", + "legendFormat": "{{mountpoint}}", + "refId": "A" + } + ], + "title": "Lemez HasznĂĄlat MountpontonkĂ©nt / Disk Usage by Mountpoint", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_disk_read_bytes_total{instance=~\"$instance\", device!~\"loop.*\"}[5m])", + "legendFormat": "{{device}} read", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_disk_written_bytes_total{instance=~\"$instance\", device!~\"loop.*\"}[5m])", + "legendFormat": "{{device}} write", + "refId": "B" + } + ], + "title": "Lemez I/O / Disk I/O", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + 
"w": 24, + "x": 0, + "y": 34 + }, + "id": 15, + "panels": [], + "title": "HĂĄlĂłzat / Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*transmit.*" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_network_receive_bytes_total{instance=~\"$instance\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m]) * 8", + "legendFormat": "{{device}} receive", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_network_transmit_bytes_total{instance=~\"$instance\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m]) * 8", + "legendFormat": "{{device}} transmit", + 
"refId": "B" + } + ], + "title": "HĂĄlĂłzati Forgalom / Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_network_receive_errs_total{instance=~\"$instance\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m])", + "legendFormat": "{{device}} rx errors", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_network_transmit_errs_total{instance=~\"$instance\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m])", + "legendFormat": "{{device}} tx errors", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_network_receive_drop_total{instance=~\"$instance\", 
device!~\"lo|veth.*|docker.*|br-.*\"}[5m])", + "legendFormat": "{{device}} rx dropped", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_network_transmit_drop_total{instance=~\"$instance\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m])", + "legendFormat": "{{device}} tx dropped", + "refId": "D" + } + ], + "title": "HĂĄlĂłzati HibĂĄk / Network Errors", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "server", + "node-exporter", + "infrastructure" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_uname_info, instance)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "options": [], + "query": { + "query": "label_values(node_uname_info, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Server Health / Szerver EgĂ©szsĂ©g", + "uid": "server-health", + "version": 1, + "weekStart": "" +} diff --git a/2-infra-monitoring/grafana/provisioning/dashboards/dashboards.yml b/2-infra-monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..d4a0275 --- /dev/null +++ b/2-infra-monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,51 @@ +# Grafana 
Dashboard Provisioning +# =============================== +# Automatically loads dashboards when Grafana starts. +# +# Dashboards are read as JSON files from the +# /var/lib/grafana/dashboards directory. + +apiVersion: 1 + +# Dashboard provider definitions +providers: + # ========================================= + # DEFAULT PROVIDER - main dashboard directory + # ========================================= + - name: "default" + # ID of the Grafana organization the dashboards belong to + orgId: 1 + # Folder name shown in the Grafana UI + folder: "" + # Folder UID (empty here, matching the empty folder name above) + folderUid: "" + # Provider type (file = read from the filesystem) + type: file + # Whether deleting provisioned dashboards is blocked (false = deletion allowed) + disableDeletion: false + # Allow editing + editable: true + # How often to rescan for changed dashboard files (seconds) + updateIntervalSeconds: 30 + # File provider options + options: + # Location of the dashboard JSON files + path: /var/lib/grafana/dashboards + # Whether to create Grafana folders from the on-disk directory structure + foldersFromFilesStructure: false + + # ========================================= + # INFRASTRUCTURE - infrastructure dashboards + # ========================================= + # To organize them into a separate folder, uncomment: + # + # - name: "infrastructure" + # orgId: 1 + # folder: "Infrastructure" + # folderUid: "infra" + # type: file + # disableDeletion: false + # editable: true + # updateIntervalSeconds: 30 + # options: + # path: /var/lib/grafana/dashboards/infrastructure