Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions lakehouse/k8s_failed_jobs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""
CheckMK local check: Kubernetes failed jobs (BackoffLimitExceeded / DeadlineExceeded)

Deploy to: /usr/lib/check_mk_agent/local/k8s_failed_jobs
Make executable: chmod +x /usr/lib/check_mk_agent/local/k8s_failed_jobs

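Each failed job surfaces as its own CheckMK service:
"K8s_Job_Failed_<namespace>_<job_name>" ("/" and "-" are replaced by "_")

Status mapping:
0 (OK) – no failure events found (reported on the "K8s_Failed_Jobs" service)
2 (CRIT) – BackoffLimitExceeded or DeadlineExceeded event exists for the job
3 (UNKNOWN) – kubectl/parse error (reported on the "K8s_Failed_Jobs" service)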
"""

import json
import subprocess
import sys
from collections import defaultdict

# Command prefix used to invoke kubectl through MicroK8s.
KUBECTL = "microk8s kubectl"
# Event reasons that make this check treat a Job as failed.
FAILURE_REASONS = {"BackoffLimitExceeded", "DeadlineExceeded"}


def get_events() -> list[dict]:
    """
    Fetch the events of all namespaces via `kubectl get events -A -o json`.

    Returns the "items" list of the JSON document kubectl prints, or an
    empty list when that key is missing or null.

    On any failure (timeout, kubectl cannot be started, non-zero exit code,
    unparsable or unexpected output) a status-3 (UNKNOWN) line for the
    "K8s_Failed_Jobs" service is printed and the process exits with code 0.
    """
    # KUBECTL is a command prefix; turn it into an argv list so that no shell
    # is involved (same invocation style as the Velero check).
    prefix = KUBECTL.split() if isinstance(KUBECTL, str) else list(KUBECTL)
    command = prefix + ["get", "events", "-A", "-o", "json"]

    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        print("3 K8s_Failed_Jobs - UNKNOWN: kubectl timed out")
        sys.exit(0)
    except OSError as exc:
        # Without a shell, a missing or non-executable binary raises here
        # instead of producing a non-zero return code.
        print(f"3 K8s_Failed_Jobs - UNKNOWN: kubectl error – {exc}")
        sys.exit(0)

    if result.returncode != 0:
        # Keep the report on a single line.
        stderr = result.stderr.strip().replace("\n", " ")
        print(f"3 K8s_Failed_Jobs - UNKNOWN: kubectl error – {stderr}")
        sys.exit(0)

    try:
        payload = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        print(f"3 K8s_Failed_Jobs - UNKNOWN: JSON parse error – {exc}")
        sys.exit(0)

    if not isinstance(payload, dict):
        # Valid JSON but not an object: report it rather than crash on .get().
        print("3 K8s_Failed_Jobs - UNKNOWN: unexpected kubectl output")
        sys.exit(0)

    # "items" may be absent or null; both mean "no events".
    return payload.get("items") or []


def collect_failures(
    events: list[dict],
    *,
    failure_reasons: "set[str] | None" = None,
) -> dict[str, dict]:
    """
    Group Job failure events by "namespace/job-name".

    Parameters:
        events: event dicts as found in the "items" list of
            `kubectl get events -o json`.
        failure_reasons: event reasons that count as a failure; defaults to
            the module-level FAILURE_REASONS when omitted.

    Returns a plain dict keyed by "namespace/job-name". Each value holds:
        "reason"  – reason of the most recent matching event,
        "count"   – sum of the "count" fields of all matching events
                    (an event without an integer count contributes 1),
        "message" – message of the most recent matching event, newlines
                    replaced by spaces.

    "Most recent" is decided by lastTimestamp, falling back to eventTime and
    then metadata.creationTimestamp. Events without any timestamp, or with
    equal timestamps, are resolved in list order (the later one wins).
    """
    reasons = FAILURE_REASONS if failure_reasons is None else failure_reasons

    failures: dict[str, dict] = {}
    # Timestamp of the event that currently supplies reason/message per key.
    newest_seen: dict[str, str] = {}

    for event in events:
        obj = event.get("involvedObject") or {}
        reason = event.get("reason") or ""

        if obj.get("kind") != "Job" or reason not in reasons:
            continue

        metadata = event.get("metadata") or {}
        namespace = metadata.get("namespace", "unknown")
        job_name = obj.get("name", "unknown")
        key = f"{namespace}/{job_name}"

        entry = failures.setdefault(key, {"reason": "", "count": 0, "message": ""})

        # "count" can be missing or null; treat both as a single occurrence.
        count = event.get("count")
        entry["count"] += count if isinstance(count, int) else 1

        # The timestamps are compared as strings, so this assumes they share
        # one sortable format.
        stamp = (
            event.get("lastTimestamp")
            or event.get("eventTime")
            or metadata.get("creationTimestamp")
            or ""
        )
        if stamp >= newest_seen.get(key, ""):
            newest_seen[key] = stamp
            entry["reason"] = reason
            entry["message"] = (event.get("message") or "").replace("\n", " ")

    return failures


def service_name(key: str) -> str:
    """Build the CheckMK service identifier for a "namespace/job-name" key."""
    # Map both "/" and "-" to "_" in a single pass.
    underscored = key.translate(str.maketrans("/-", "__"))
    return f"K8s_Job_Failed_{underscored}"


def main() -> None:
    """
    Print the CheckMK local-check lines for failed Kubernetes jobs.

    A "K8s_Failed_Jobs" roll-up line is always printed: status 0 when no
    failure events were found, status 2 with the number of affected jobs
    otherwise. In the failure case one additional status-2 line per job
    follows, sorted by "namespace/job-name".
    """
    failures = collect_failures(get_events())

    if not failures:
        # Emit a single OK roll-up so the service stays visible when all is well
        print("0 K8s_Failed_Jobs count=0 OK: no failed jobs found")
        return

    # Keep the roll-up service in the output while jobs are failing too;
    # otherwise it would only ever be reported in the OK and UNKNOWN states.
    total = len(failures)
    print(f"2 K8s_Failed_Jobs count={total} CRIT: {total} failed job(s) found")

    for key, info in sorted(failures.items()):
        svc = service_name(key)
        reason = info["reason"]
        count = info["count"]
        msg = info["message"] or "no details"
        perf = f"event_count={count}"
        print(f"2 {svc} {perf} CRIT: {key} – {reason} (events: {count}) | {msg}")


# Run the check only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
98 changes: 98 additions & 0 deletions lakehouse/velero_failed_backups
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""
CheckMK local check: failed Velero backups

Deploy to: /usr/lib/check_mk_agent/local/velero_failed_backups
Make executable: chmod +x /usr/lib/check_mk_agent/local/velero_failed_backups

Status mapping:
0 (OK) – all backups completed successfully
2 (CRIT) – one or more backups are Failed or PartiallyFailed
3 (UNKNOWN) – kubectl/parse error
"""

import json
import subprocess
import sys

# argv prefix used to invoke kubectl through MicroK8s.
KUBECTL = ["microk8s", "kubectl"]
# Backup status phases that this check reports as failures.
FAILED_PHASES = {"Failed", "PartiallyFailed"}


def get_backups() -> list[dict]:
    """
    Fetch the backups of all namespaces via `kubectl get backups -A -o json`.

    Returns the "items" list of the JSON document kubectl prints, or an
    empty list when that key is missing or null.

    On any failure (timeout, kubectl cannot be started, non-zero exit code,
    unparsable or unexpected output) a status-3 (UNKNOWN) line for the
    "Velero_Backups" service is printed and the process exits with code 0.
    """
    # NOTE(review): "backups" is an unqualified resource name; if another CRD
    # on the cluster also defines "backups", consider "backups.velero.io".
    try:
        result = subprocess.run(
            KUBECTL + ["get", "backups", "-A", "-o", "json"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        print("3 Velero_Backups - UNKNOWN: kubectl timed out")
        sys.exit(0)
    except OSError as exc:
        # Raised when the executable is missing or cannot be started; report
        # it as UNKNOWN instead of dying with a traceback.
        print(f"3 Velero_Backups - UNKNOWN: kubectl error – {exc}")
        sys.exit(0)

    if result.returncode != 0:
        # Keep the report on a single line.
        stderr = result.stderr.strip().replace("\n", " ")
        print(f"3 Velero_Backups - UNKNOWN: kubectl error – {stderr}")
        sys.exit(0)

    try:
        payload = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        print(f"3 Velero_Backups - UNKNOWN: JSON parse error – {exc}")
        sys.exit(0)

    if not isinstance(payload, dict):
        # Valid JSON but not an object: report it rather than crash on .get().
        print("3 Velero_Backups - UNKNOWN: unexpected kubectl output")
        sys.exit(0)

    # "items" may be absent or null; both mean "no backups".
    return payload.get("items") or []


def service_name(key: str) -> str:
    """Build the CheckMK service identifier for a "namespace/backup-name" key."""
    # Every "/" and "-" becomes "_"; all other characters are kept.
    suffix = "".join("_" if char in "/-" else char for char in key)
    return f"Velero_Backup_{suffix}"


def main() -> None:
    """
    Print the CheckMK local-check lines for Velero backups.

    A "Velero_Backups" roll-up line is always printed: status 0 when no
    backups exist or none is in a failed phase, status 2 otherwise. In the
    failure case one additional status-2 line per failed backup follows,
    in the order the backups were returned.
    """
    backups = get_backups()

    if not backups:
        print("0 Velero_Backups count=0 OK: no backups found (is Velero installed?)")
        return

    bad = []

    for backup in backups:
        # Tolerate explicit nulls as well as missing sections.
        meta = backup.get("metadata") or {}
        status = backup.get("status") or {}
        phase = status.get("phase", "Unknown")

        if phase not in FAILED_PHASES:
            continue

        namespace = meta.get("namespace", "unknown")
        name = meta.get("name", "unknown")
        key = f"{namespace}/{name}"
        errors = status.get("errors", 0)
        warnings = status.get("warnings", 0)
        start = status.get("startTimestamp", "unknown")
        expiry = status.get("expiration", "unknown")

        # Multiple metrics of a local check are separated by "|"; with a
        # space the second metric would become part of the status text.
        perf = f"errors={errors};0;0|warnings={warnings};0;0"
        detail = (
            f"{phase} | started: {start} | "
            f"errors: {errors}, warnings: {warnings} | "
            f"expires: {expiry}"
        )

        bad.append(f"2 {service_name(key)} {perf} CRIT: {key} – {detail}")

    total = len(backups)

    if not bad:
        print(f"0 Velero_Backups count={total} OK: all {total} backup(s) completed successfully")
        return

    # Roll-up first (metrics again "|"-separated), then one line per backup.
    print(
        f"2 Velero_Backups count={total}|failed={len(bad)} "
        f"CRIT: {len(bad)} of {total} backup(s) failed or partially failed"
    )

    for line in bad:
        print(line)


# Run the check only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()