From 04586bef4c0df7f5c740b03c5176b648ee14752e Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Mon, 11 May 2026 14:39:05 -0500
Subject: [PATCH 1/4] Add CheckMK local check for Velero failed backups

---
 lakehouse/velero_failed_backups | 94 +++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 lakehouse/velero_failed_backups

diff --git a/lakehouse/velero_failed_backups b/lakehouse/velero_failed_backups
new file mode 100644
index 0000000..b9ebe73
--- /dev/null
+++ b/lakehouse/velero_failed_backups
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+CheckMK local check: failed Velero backups
+
+Deploy to: /usr/lib/check_mk_agent/local/velero_failed_backups
+Make executable: chmod +x /usr/lib/check_mk_agent/local/velero_failed_backups
+
+Status mapping:
+  0 (OK) – all backups completed successfully
+  2 (CRIT) – one or more backups are Failed or PartiallyFailed
+  3 (UNKNOWN) – kubectl/parse error
+"""
+
+import json
+import subprocess
+import sys
+
+KUBECTL = "microk8s kubectl"
+FAILED_PHASES = {"Failed", "PartiallyFailed"}
+
+
+def get_backups() -> list[dict]:
+    try:
+        result = subprocess.run(
+            f"{KUBECTL} get backups -A -o json",
+            shell=True,
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+    except subprocess.TimeoutExpired:
+        print("3 Velero_Backups - UNKNOWN: kubectl timed out")
+        sys.exit(0)
+
+    if result.returncode != 0:
+        stderr = result.stderr.strip().replace("\n", " ")
+        print(f"3 Velero_Backups - UNKNOWN: kubectl error – {stderr}")
+        sys.exit(0)
+
+    try:
+        return json.loads(result.stdout).get("items", [])
+    except json.JSONDecodeError as exc:
+        print(f"3 Velero_Backups - UNKNOWN: JSON parse error – {exc}")
+        sys.exit(0)
+
+
+def service_name(key: str) -> str:
+    return "Velero_Backup_" + key.replace("/", "_").replace("-", "_")
+
+
+def main() -> None:
+    backups = get_backups()
+
+    if not backups:
+        print("0 Velero_Backups count=0 OK: no backups found (is Velero installed?)")
+        return
+
+    bad = []
+
+    for backup in backups:
+        meta = backup.get("metadata", {})
+        status = backup.get("status", {})
+        phase = status.get("phase", "Unknown")
+
+        if phase not in FAILED_PHASES:
+            continue
+
+        namespace = meta.get("namespace", "unknown")
+        name = meta.get("name", "unknown")
+        key = f"{namespace}/{name}"
+        errors = status.get("errors", 0)
+        warnings = status.get("warnings", 0)
+        start = status.get("startTimestamp", "unknown")
+        expiry = status.get("expiration", "unknown")
+
+        perf = f"errors={errors};0;0|warnings={warnings};0;0"
+        detail = (
+            f"{phase} | started: {start} | "
+            f"errors: {errors}, warnings: {warnings} | "
+            f"expires: {expiry}"
+        )
+
+        bad.append(f"2 {service_name(key)} {perf} CRIT: {key} – {detail}")
+
+    if not bad:
+        print(f"0 Velero_Backups count={len(backups)} OK: all {len(backups)} backup(s) completed successfully")
+        return
+
+    for line in bad:
+        print(line)
+
+
+if __name__ == "__main__":
+    main()

From c7b543e57e4f4c913b5673345f7ba407d419bdbb Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Mon, 11 May 2026 14:40:21 -0500
Subject: [PATCH 2/4] Add script to monitor Kubernetes failed jobs

This script checks for Kubernetes jobs that failed with BackoffLimitExceeded
or DeadlineExceeded events and reports their status to CheckMK, one service
per failed job.
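
Example of the lines the check emits (illustrative only: the namespace,
job name, and event message below are invented; real service names are
derived from the failing job's namespace and name):

  0 K8s_Failed_Jobs count=0 OK: no failed jobs found
  2 K8s_Job_Failed_batch_nightly_etl event_count=3 CRIT: batch/nightly-etl – BackoffLimitExceeded (events: 3) | Job has reached the specified backoff limit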
---
 lakehouse/k8s_failed_jobs | 103 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 lakehouse/k8s_failed_jobs

diff --git a/lakehouse/k8s_failed_jobs b/lakehouse/k8s_failed_jobs
new file mode 100644
index 0000000..a076e91
--- /dev/null
+++ b/lakehouse/k8s_failed_jobs
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+CheckMK local check: Kubernetes failed jobs (BackoffLimitExceeded / DeadlineExceeded)
+
+Deploy to: /usr/lib/check_mk_agent/local/k8s_failed_jobs
+Make executable: chmod +x /usr/lib/check_mk_agent/local/k8s_failed_jobs
+
+Each failed job surfaces as its own CheckMK service:
+  "K8s_Job_Failed_<namespace>_<job-name>"
+
+Status mapping:
+  0 (OK) – no failed jobs found (per-job services go stale once their events expire)
+  2 (CRIT) – BackoffLimitExceeded or DeadlineExceeded event exists
+  3 (UNKNOWN) – kubectl/parse error
+"""
+
+import json
+import subprocess
+import sys
+from collections import defaultdict
+
+KUBECTL = ["microk8s", "kubectl"]
+FAILURE_REASONS = {"BackoffLimitExceeded", "DeadlineExceeded"}
+
+
+def get_events() -> list[dict]:
+    try:
+        result = subprocess.run(
+            KUBECTL + ["get", "events", "-A", "-o", "json"],
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+    except subprocess.TimeoutExpired:
+        print("3 K8s_Failed_Jobs - UNKNOWN: kubectl timed out")
+        sys.exit(0)
+
+    if result.returncode != 0:
+        stderr = result.stderr.strip().replace("\n", " ")
+        print(f"3 K8s_Failed_Jobs - UNKNOWN: kubectl error – {stderr}")
+        sys.exit(0)
+
+    try:
+        return json.loads(result.stdout).get("items", [])
+    except json.JSONDecodeError as exc:
+        print(f"3 K8s_Failed_Jobs - UNKNOWN: JSON parse error – {exc}")
+        sys.exit(0)
+
+
+def collect_failures(events: list[dict]) -> dict[str, dict]:
+    """
+    Returns a dict keyed by "namespace/job-name".
+    Each value holds the last seen failure reason, accumulated event count, and message.
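+
+    Example (job name invented for illustration):
+        {"batch/nightly-etl": {"reason": "BackoffLimitExceeded",
+                               "count": 3, "message": "back-off limit reached"}}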
+ """ + failures: dict[str, dict] = defaultdict(lambda: {"reason": "", "count": 0, "message": ""}) + + for event in events: + obj = event.get("involvedObject", {}) + reason = event.get("reason", "") + + if obj.get("kind") != "Job" or reason not in FAILURE_REASONS: + continue + + namespace = event.get("metadata", {}).get("namespace", "unknown") + job_name = obj.get("name", "unknown") + key = f"{namespace}/{job_name}" + + failures[key]["reason"] = reason + failures[key]["count"] += event.get("count", 1) + failures[key]["message"] = event.get("message", "").replace("\n", " ") + + return failures + + +def service_name(key: str) -> str: + """Produce a stable, space-free CheckMK service identifier.""" + return "K8s_Job_Failed_" + key.replace("/", "_").replace("-", "_") + + +def main() -> None: + events = get_events() + failures = collect_failures(events) + + if not failures: + # Emit a single OK roll-up so the service stays visible when all is well + print("0 K8s_Failed_Jobs count=0 OK: no failed jobs found") + return + + for key, info in sorted(failures.items()): + svc = service_name(key) + reason = info["reason"] + count = info["count"] + msg = info["message"] or "no details" + perf = f"event_count={count}" + print(f"2 {svc} {perf} CRIT: {key} – {reason} (events: {count}) | {msg}") + + +if __name__ == "__main__": + main() From bb6ddc9d7cf382ad67b11e6799f4c4dc673995c2 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 11 May 2026 15:07:25 -0500 Subject: [PATCH 3/4] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- lakehouse/velero_failed_backups | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lakehouse/velero_failed_backups b/lakehouse/velero_failed_backups index b9ebe73..876a68d 100644 --- a/lakehouse/velero_failed_backups +++ b/lakehouse/velero_failed_backups @@ -15,15 +15,14 @@ import json import subprocess import sys -KUBECTL = "microk8s kubectl" +KUBECTL = ["microk8s", "kubectl"] FAILED_PHASES = {"Failed", "PartiallyFailed"} def get_backups() -> list[dict]: try: result = subprocess.run( - f"{KUBECTL} get backups -A -o json", - shell=True, + KUBECTL + ["get", "backups", "-A", "-o", "json"], capture_output=True, text=True, timeout=30, From 4852be7af6f6296f924a4dd7c0be3bebdd435f79 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 11 May 2026 15:08:02 -0500 Subject: [PATCH 4/4] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- lakehouse/velero_failed_backups | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lakehouse/velero_failed_backups b/lakehouse/velero_failed_backups index 876a68d..8196037 100644 --- a/lakehouse/velero_failed_backups +++ b/lakehouse/velero_failed_backups @@ -85,6 +85,11 @@ def main() -> None: print(f"0 Velero_Backups count={len(backups)} OK: all {len(backups)} backup(s) completed successfully") return + print( + f"2 Velero_Backups count={len(backups)} failed={len(bad)} " + f"CRIT: {len(bad)} of {len(backups)} backup(s) failed or partially failed" + ) + for line in bad: print(line)