Skip to content

Commit 9c3ffe6

Browse files
feat: Add node-not-ready event monitoring support (#12)
* feat: Add node-not-ready event monitoring support

  - Add 'node-not-ready' event type to EventConfiguration enum
  - Implement mapNodeEventType function in event watcher
  - Update event type filtering to handle Node events separately from Pod events
  - Update CRD schemas to include node-not-ready event type
  - Update documentation and examples with node monitoring capabilities
  - Generate updated deepcopy code for API changes

  This enables khook to monitor Kubernetes node readiness events and trigger appropriate agent responses for node-level issues like kubelet failures, network problems, or resource pressure.

  Signed-off-by: Kyungho Kang <kyungho@dable.io>

* Update .gitignore

  Signed-off-by: kyungho-for-ops <kyungho1495@gmail.com>
  Signed-off-by: Kyungho Kang <kyungho@dable.io>

* Update .gitignore

  Signed-off-by: kyungho-for-ops <kyungho1495@gmail.com>
  Signed-off-by: Kyungho Kang <kyungho@dable.io>

* Trigger final checks update

  Signed-off-by: Kyungho Kang <kyungho@dable.io>

---------

Signed-off-by: Kyungho Kang <kyungho@dable.io>
Signed-off-by: kyungho-for-ops <kyungho1495@gmail.com>
Co-authored-by: Kyungho Kang <kyungho@dable.io>
1 parent c8abacf commit 9c3ffe6

7 files changed

Lines changed: 103 additions & 18 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,4 @@ helm/*/requirements.lock
6666

6767
# Generated files
6868
*.pb.go
69-
zz_generated.*.go
69+
zz_generated.*.go

README.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ The KAgent Hook Controller monitors Kubernetes events and triggers Kagent agents
1515

1616
### Key Features
1717

18-
- **Multi-Event Monitoring**: Monitor multiple Kubernetes event types (pod-restart, pod-pending, oom-kill, probe-failed) in a single hook configuration
18+
- **Multi-Event Monitoring**: Monitor multiple Kubernetes event types (pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready) in a single hook configuration
1919
- **Basic Deduplication**: Prevents duplicate notifications with 10-minute timeout logic
2020
- **Kagent Integration**: Integrates with the Kagent platform for AI agent incident response. (In theory, it can talk to any A2A-enabled agent.)
2121
- **Status Tracking**: Provides real-time status updates and audit trails through Kubernetes events
@@ -155,6 +155,7 @@ The controller supports monitoring the following Kubernetes event types:
155155
| `pod-pending` | Pod is stuck in pending state | Resource constraints, scheduling issues, image pull failures |
156156
| `oom-kill` | Pod was killed due to out-of-memory | Memory limits exceeded, memory leaks |
157157
| `probe-failed` | Liveness or readiness probe failed | Application not responding, configuration issues |
158+
| `node-not-ready` | Node is not in Ready state | Kubelet issues, network problems, resource pressure, hardware failures |
158159

159160
## Future
160161
The controller will support reacting to additional Kubernetes events.
@@ -277,6 +278,21 @@ spec:
277278
After analysis - use all available tools to try and resolve. Annotate the updated resources with "kagentFix: <dateTime>"
278279
- If a resource can't be patched - delete it and recreate as needed. Don't ask for permission. Assume autonomous execution.
279280
Autonomous remediation: proceed with the best possible way to remediate. Don't ask for approval.
281+
- eventType: node-not-ready
282+
agentId: node-recovery-specialist
283+
prompt: |
284+
CRITICAL: Node {{.ResourceName}} is not ready at {{.EventTime}}.
285+
This is a cluster-level emergency requiring immediate attention.
286+
287+
AUTONOMOUS MODE: Diagnose and resolve node issues immediately:
288+
• Check node conditions (Ready, MemoryPressure, DiskPressure, PIDPressure)
289+
• Analyze kubelet logs and system resources
290+
• Verify network connectivity and DNS resolution
291+
• Check for hardware or kernel issues
292+
• Attempt node recovery procedures (restart kubelet, clear disk space, etc.)
293+
• If node cannot be recovered, safely drain and replace
294+
• Never ask for permission - assume autonomous execution
295+
• Annotate fixed resources with: kagentFix=<timestamp>
280296
```
281297

282298
### Development Environment Monitoring

api/v1alpha2/hook_types.go

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ type HookSpec struct {
2525
// EventConfiguration defines a single event type configuration
2626
type EventConfiguration struct {
2727
// EventType specifies the type of Kubernetes event to monitor
28-
// +kubebuilder:validation:Enum=pod-restart;pod-pending;oom-kill;probe-failed
28+
// +kubebuilder:validation:Enum=pod-restart;pod-pending;oom-kill;probe-failed;node-not-ready
2929
// +kubebuilder:validation:Required
3030
EventType string `json:"eventType"`
3131

@@ -84,14 +84,15 @@ func (h *Hook) Validate() error {
8484
func (h *Hook) validateEventConfiguration(config EventConfiguration, index int) error {
8585
// Validate EventType
8686
validEventTypes := map[string]bool{
87-
"pod-restart": true,
88-
"pod-pending": true,
89-
"oom-kill": true,
90-
"probe-failed": true,
87+
"pod-restart": true,
88+
"pod-pending": true,
89+
"oom-kill": true,
90+
"probe-failed": true,
91+
"node-not-ready": true,
9192
}
9293

9394
if !validEventTypes[config.EventType] {
94-
return fmt.Errorf("event configuration %d: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed", index, config.EventType)
95+
return fmt.Errorf("event configuration %d: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready", index, config.EventType)
9596
}
9697

9798
// Validate AgentRef
@@ -390,7 +391,7 @@ func validateHook(hook *Hook) (admission.Warnings, error) {
390391

391392
// Validate event type
392393
if !isValidEventType(config.EventType) {
393-
allErrs = append(allErrs, fmt.Sprintf("spec.eventConfigurations[%d].eventType: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed", i, config.EventType))
394+
allErrs = append(allErrs, fmt.Sprintf("spec.eventConfigurations[%d].eventType: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready", i, config.EventType))
394395
}
395396

396397
// Validate agentId is not empty
@@ -419,10 +420,11 @@ func validateHook(hook *Hook) (admission.Warnings, error) {
419420
// isValidEventType checks if the provided event type is valid
420421
func isValidEventType(eventType string) bool {
421422
validTypes := map[string]bool{
422-
"pod-restart": true,
423-
"pod-pending": true,
424-
"oom-kill": true,
425-
"probe-failed": true,
423+
"pod-restart": true,
424+
"pod-pending": true,
425+
"oom-kill": true,
426+
"probe-failed": true,
427+
"node-not-ready": true,
426428
}
427429
return validTypes[eventType]
428430
}

api/v1alpha2/zz_generated.deepcopy.go

Lines changed: 43 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/kagent.dev_hooks.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ spec:
7171
- pod-pending
7272
- oom-kill
7373
- probe-failed
74+
- node-not-ready
7475
type: string
7576
prompt:
7677
description: Prompt specifies the prompt template to send to

helm/khook-crds/crds/kagent.dev_hooks.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ spec:
7171
- pod-pending
7272
- oom-kill
7373
- probe-failed
74+
- node-not-ready
7475
type: string
7576
prompt:
7677
description: Prompt specifies the prompt template to send to

internal/event/watcher.go

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -251,14 +251,18 @@ func (w *Watcher) mapKubernetesEvent(k8sEvent *eventsv1.Event) *interfaces.Event
251251

252252
// mapEventType maps Kubernetes event reasons to our event types
253253
func (w *Watcher) mapEventType(k8sEvent *eventsv1.Event) string {
254-
// Ignore Normal events entirely; only act on warnings/errors
255-
if strings.ToLower(k8sEvent.Type) == "normal" {
256-
return ""
257-
}
258-
// Map based on the regarding object kind and event reason
254+
// Map based on the regarding object kind and event reason first
259255
switch k8sEvent.Regarding.Kind {
260256
case "Pod":
257+
// For pods, ignore Normal events entirely; only act on warnings/errors
258+
if strings.ToLower(k8sEvent.Type) == "normal" {
259+
return ""
260+
}
261261
return w.mapPodEventType(k8sEvent)
262+
case "Node":
263+
// For nodes, we need to check both Normal and Warning events
264+
// as NodeNotReady events are typically Normal type
265+
return w.mapNodeEventType(k8sEvent)
262266
default:
263267
return ""
264268
}
@@ -317,3 +321,21 @@ func (w *Watcher) mapPodEventType(k8sEvent *eventsv1.Event) string {
317321

318322
return ""
319323
}
324+
325+
// mapNodeEventType maps node-related events to our event types
326+
func (w *Watcher) mapNodeEventType(k8sEvent *eventsv1.Event) string {
327+
reason := strings.ToLower(k8sEvent.Reason)
328+
message := strings.ToLower(k8sEvent.Note)
329+
eventType := strings.ToLower(k8sEvent.Type)
330+
331+
switch {
332+
// Node not ready events
333+
case reason == "nodenotready":
334+
return "node-not-ready"
335+
336+
default:
337+
// Log unknown node events for future enhancement
338+
w.logger.V(1).Info("Unknown node event", "reason", reason, "type", eventType, "message", message)
339+
return ""
340+
}
341+
}

0 commit comments

Comments
 (0)