# ==========================================================================
# Reconstructed from a git diff that added two NEW executable files.
# Split this file at the "===== file:" markers when deploying:
#   file 1: lakehouse/k8s_failed_jobs
#   file 2: lakehouse/velero_failed_backups
# ==========================================================================

# ===== file: lakehouse/k8s_failed_jobs =====
#!/usr/bin/env python3
"""
CheckMK local check: Kubernetes failed jobs (BackoffLimitExceeded / DeadlineExceeded)

Deploy to: /usr/lib/check_mk_agent/local/k8s_failed_jobs
Make executable: chmod +x /usr/lib/check_mk_agent/local/k8s_failed_jobs

Each failed job surfaces as its own CheckMK service named
"K8s_Job_Failed_<namespace>_<job_name>" (see service_name()).

Status mapping:
    0 (OK)      - single roll-up service when no failure events exist
    2 (CRIT)    - a BackoffLimitExceeded or DeadlineExceeded event exists
    3 (UNKNOWN) - kubectl/parse error
"""

import json
import subprocess
import sys
from collections import defaultdict

# Run kubectl as an argv list (shell=False) -- consistent with the Velero
# check below and immune to shell quoting/injection issues.
KUBECTL = ["microk8s", "kubectl"]
FAILURE_REASONS = {"BackoffLimitExceeded", "DeadlineExceeded"}


def get_events() -> list[dict]:
    """Return all cluster events (every namespace) as a list of dicts.

    On any failure (timeout, missing binary, non-zero exit, bad JSON) this
    prints a single UNKNOWN line in CheckMK local-check format and exits 0
    so the agent still reports the service instead of a traceback.
    """
    try:
        result = subprocess.run(
            KUBECTL + ["get", "events", "-A", "-o", "json"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        print("3 K8s_Failed_Jobs - UNKNOWN: kubectl timed out")
        sys.exit(0)
    except FileNotFoundError as exc:
        # With shell=False a missing microk8s binary raises here instead of
        # returning exit code 127 from a shell.
        print(f"3 K8s_Failed_Jobs - UNKNOWN: {exc}")
        sys.exit(0)

    if result.returncode != 0:
        stderr = result.stderr.strip().replace("\n", " ")
        print(f"3 K8s_Failed_Jobs - UNKNOWN: kubectl error – {stderr}")
        sys.exit(0)

    try:
        return json.loads(result.stdout).get("items", [])
    except json.JSONDecodeError as exc:
        print(f"3 K8s_Failed_Jobs - UNKNOWN: JSON parse error – {exc}")
        sys.exit(0)


def collect_failures(events: list[dict]) -> dict[str, dict]:
    """Group Job failure events by "namespace/job-name".

    Each value holds the reason/message of the last matching event in API
    order (events are not sorted by timestamp here, so "last" is not
    guaranteed to be the most recent) and the summed event count.
    """
    failures: dict[str, dict] = defaultdict(lambda: {"reason": "", "count": 0, "message": ""})

    for event in events:
        obj = event.get("involvedObject", {})
        reason = event.get("reason", "")

        # Only Job-scoped events with one of the terminal failure reasons.
        if obj.get("kind") != "Job" or reason not in FAILURE_REASONS:
            continue

        namespace = event.get("metadata", {}).get("namespace", "unknown")
        job_name = obj.get("name", "unknown")
        key = f"{namespace}/{job_name}"

        failures[key]["reason"] = reason
        failures[key]["count"] += event.get("count", 1)
        # Newlines would break the one-line-per-service agent output.
        failures[key]["message"] = event.get("message", "").replace("\n", " ")

    return failures


def service_name(key: str) -> str:
    """Produce a stable, space-free CheckMK service identifier."""
    return "K8s_Job_Failed_" + key.replace("/", "_").replace("-", "_")


def main() -> None:
    """Emit one CRIT line per failed job, or a single OK roll-up line."""
    events = get_events()
    failures = collect_failures(events)

    if not failures:
        # Emit a single OK roll-up so the service stays visible when all is well
        print("0 K8s_Failed_Jobs count=0 OK: no failed jobs found")
        return

    for key, info in sorted(failures.items()):
        svc = service_name(key)
        reason = info["reason"]
        count = info["count"]
        msg = info["message"] or "no details"
        perf = f"event_count={count}"
        print(f"2 {svc} {perf} CRIT: {key} – {reason} (events: {count}) | {msg}")


if __name__ == "__main__":
    main()


# ===== file: lakehouse/velero_failed_backups =====
#!/usr/bin/env python3
"""
CheckMK local check: failed Velero backups

Deploy to: /usr/lib/check_mk_agent/local/velero_failed_backups
Make executable: chmod +x /usr/lib/check_mk_agent/local/velero_failed_backups

Status mapping:
    0 (OK)      - all backups completed successfully
    2 (CRIT)    - one or more backups are Failed or PartiallyFailed
    3 (UNKNOWN) - kubectl/parse error
"""

import json
import subprocess
import sys

KUBECTL = ["microk8s", "kubectl"]
FAILED_PHASES = {"Failed", "PartiallyFailed"}


def get_backups() -> list[dict]:
    """Return all Velero Backup objects (every namespace) as dicts.

    Mirrors get_events() in the jobs check: any failure prints one
    UNKNOWN line in local-check format and exits 0.
    """
    try:
        result = subprocess.run(
            KUBECTL + ["get", "backups", "-A", "-o", "json"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        print("3 Velero_Backups - UNKNOWN: kubectl timed out")
        sys.exit(0)
    except FileNotFoundError as exc:
        # Missing microk8s binary would otherwise crash with a traceback.
        print(f"3 Velero_Backups - UNKNOWN: {exc}")
        sys.exit(0)

    if result.returncode != 0:
        stderr = result.stderr.strip().replace("\n", " ")
        print(f"3 Velero_Backups - UNKNOWN: kubectl error – {stderr}")
        sys.exit(0)

    try:
        return json.loads(result.stdout).get("items", [])
    except json.JSONDecodeError as exc:
        print(f"3 Velero_Backups - UNKNOWN: JSON parse error – {exc}")
        sys.exit(0)


def service_name(key: str) -> str:
    """Produce a stable, space-free CheckMK service identifier."""
    return "Velero_Backup_" + key.replace("/", "_").replace("-", "_")


def main() -> None:
    """Emit a roll-up line plus one CRIT line per failed backup."""
    backups = get_backups()

    if not backups:
        print("0 Velero_Backups count=0 OK: no backups found (is Velero installed?)")
        return

    bad = []

    for backup in backups:
        meta = backup.get("metadata", {})
        status = backup.get("status", {})
        phase = status.get("phase", "Unknown")

        if phase not in FAILED_PHASES:
            continue

        namespace = meta.get("namespace", "unknown")
        name = meta.get("name", "unknown")
        key = f"{namespace}/{name}"
        errors = status.get("errors", 0)
        warnings = status.get("warnings", 0)
        start = status.get("startTimestamp", "unknown")
        expiry = status.get("expiration", "unknown")

        # warn/crit thresholds are pinned at 0 so graphs flag any count;
        # the service state itself is forced CRIT on this line anyway.
        perf = f"errors={errors};0;0 warnings={warnings};0;0"
        detail = (
            f"{phase} | started: {start} | "
            f"errors: {errors}, warnings: {warnings} | "
            f"expires: {expiry}"
        )

        bad.append(f"2 {service_name(key)} {perf} CRIT: {key} – {detail}")

    if not bad:
        print(f"0 Velero_Backups count={len(backups)} OK: all {len(backups)} backup(s) completed successfully")
        return

    # Roll-up service first, then one CRIT service per failed backup.
    print(
        f"2 Velero_Backups count={len(backups)} failed={len(bad)} "
        f"CRIT: {len(bad)} of {len(backups)} backup(s) failed or partially failed"
    )

    for line in bad:
        print(line)


if __name__ == "__main__":
    main()