From c598b4a185448bb73f44f1f957e178fe9e99cc82 Mon Sep 17 00:00:00 2001 From: matthew-pilot Date: Fri, 29 May 2026 23:50:15 +0000 Subject: [PATCH] fix(policy): gate eval error fails-closed by default (PILOT-262) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Policy gate events (connect, dial, datagram) returned ALLOW when expression evaluation failed — fail-open behaviour that a corrupted policy file or unexpected context drift could silently widen into "allow everything." Add a FailClosed field to PolicyDocument (fail_closed in JSON), defaulting to true. When fail_closed: - EvaluateGate returns DENY on eval error instead of ALLOW - EvaluateActions / evaluatePerPeerCycle publish policy.eval_error events and log at ERROR instead of silently returning - Operators who need legacy fail-open set fail_closed: false Pin the new default in audit tests; add explicit fail_open pathway test so the legacy behaviour stays covered. Closes PILOT-262 --- policylang/engine.go | 11 ++++++ policylang/policy.go | 6 ++++ runner.go | 27 +++++++++++++-- zz_audit_defensive_test.go | 70 ++++++++++++++++++++++++++------------ 4 files changed, 90 insertions(+), 24 deletions(-) diff --git a/policylang/engine.go b/policylang/engine.go index b9f3d79..b0cd36c 100644 --- a/policylang/engine.go +++ b/policylang/engine.go @@ -175,6 +175,17 @@ func (cp *CompiledPolicy) EvaluatePeerExpr(ruleName string, actionIdx int, peerC return runProgram(prog, peerCtx) } +// FailClosed reports whether the engine should deny/reject on expression +// evaluation errors. When nil in PolicyDocument, defaults to true +// (fail-closed — more secure). Set fail_closed: false to restore legacy +// fail-open behaviour. +func (cp *CompiledPolicy) FailClosed() bool { + if cp.Doc.FailClosed == nil { + return true // default: fail-closed + } + return *cp.Doc.FailClosed +} + // HasRulesFor returns true if the policy has any rules for the given event type. func (cp *CompiledPolicy) HasRulesFor(eventType EventType) bool { for _, cr := range cp.rules { diff --git a/policylang/policy.go b/policylang/policy.go index 41d8cde..533c5d9 100644 --- a/policylang/policy.go +++ b/policylang/policy.go @@ -81,9 +81,15 @@ type Rule struct { // DefaultVerdict controls gate-event behavior when no rule produces a // verdict. Valid values: "allow" (default, backwards-compatible) or "deny" // (default-deny; write explicit allow rules). +// +// FailClosed controls behavior when an expression evaluation error occurs. +// When nil (default), the engine fails-closed (DENY on gate errors, skip +// actions on cycle/join/leave errors). Set fail_closed: false in the JSON +// to restore legacy fail-open semantics. type PolicyDocument struct { Version int `json:"version"` DefaultVerdict string `json:"default_verdict,omitempty"` + FailClosed *bool `json:"fail_closed,omitempty"` Config map[string]interface{} `json:"config,omitempty"` Rules []Rule `json:"rules"` } diff --git a/runner.go b/runner.go index 1b89c62..0fcb8c0 100644 --- a/runner.go +++ b/runner.go @@ -215,8 +215,17 @@ func mergeTags(nodeInfoTags, policyTags []string) []string { func (pr *PolicyRunner) EvaluateGate(eventType EventType, ctx map[string]interface{}) bool { dirs, err := pr.compiled.Evaluate(eventType, ctx) if err != nil { - slog.Warn("policy: gate eval error", "network_id", pr.netID, "event", eventType, "err", err) - return true // fail open on error + if pr.compiled.FailClosed() { + slog.Error("policy: gate eval error — denying (fail_closed)", "network_id", pr.netID, "event", eventType, "err", err) + pr.runtime.PublishEvent("policy.eval_error", map[string]interface{}{ + "network_id": pr.netID, + "event": eventType, + "error": err.Error(), + }) + return false + } + slog.Warn("policy: gate eval error — allowing (fail_open)", "network_id", pr.netID, "event", eventType, "err", err) + return true } // Execute side effects (tag, etc.) before the verdict. @@ -258,6 +267,9 @@ func (pr *PolicyRunner) EvaluateGate(eventType EventType, ctx map[string]interfa func (pr *PolicyRunner) evaluatePerPeerCycle(ctx map[string]interface{}) { dirs, err := pr.compiled.Evaluate(EventCycle, ctx) if err != nil { + if pr.compiled.FailClosed() { + slog.Error("policy: per-peer cycle eval error — skipping (fail_closed)", "network_id", pr.netID, "err", err) + } return } for _, d := range dirs { @@ -272,7 +284,16 @@ func (pr *PolicyRunner) evaluatePerPeerCycle(ctx map[string]interface{}) { func (pr *PolicyRunner) EvaluateActions(eventType EventType, ctx map[string]interface{}) { dirs, err := pr.compiled.Evaluate(eventType, ctx) if err != nil { - slog.Warn("policy: action eval error", "network_id", pr.netID, "event", eventType, "err", err) + if pr.compiled.FailClosed() { + slog.Error("policy: action eval error — skipping directives (fail_closed)", "network_id", pr.netID, "event", eventType, "err", err) + pr.runtime.PublishEvent("policy.eval_error", map[string]interface{}{ + "network_id": pr.netID, + "event": eventType, + "error": err.Error(), + }) + } else { + slog.Warn("policy: action eval error — skipping directives (fail_open)", "network_id", pr.netID, "event", eventType, "err", err) + } return } diff --git a/zz_audit_defensive_test.go b/zz_audit_defensive_test.go index dc61eba..9125aa9 100644 --- a/zz_audit_defensive_test.go +++ b/zz_audit_defensive_test.go @@ -113,17 +113,12 @@ func TestPin_DefaultVerdictUnrecognized_RejectedByValidate(t *testing.T) { } } -// TestPin_EvaluateGate_FailOpenOnEvalError pins the documented fail-open -// behaviour when the expression engine returns an error mid-evaluation. -// The reasoning is in runner.go: "// fail open on error" — flipping this to -// fail-closed without a deliberate review would brick live connections. -// Pinning the current behaviour so the trade-off is an explicit choice next -// time, not an accident. -func TestPin_EvaluateGate_FailOpenOnEvalError(t *testing.T) { +// TestPin_EvaluateGate_FailClosedOnEvalError pins the fail-closed default +// behaviour (PILOT-262): when the expression engine returns an error +// mid-evaluation and fail_closed is not explicitly set, the gate must +// DENY (fail-closed) — the more secure default. +func TestPin_EvaluateGate_FailClosedOnEvalError(t *testing.T) { t.Parallel() - // Use a rule whose match references a function that exists at - // compile but blows up at runtime — duration("nope") returns an - // error inside expr, which surfaces from runProgram. doc := &PolicyDocument{ Version: 1, Rules: []Rule{ @@ -136,14 +131,46 @@ func TestPin_EvaluateGate_FailOpenOnEvalError(t *testing.T) { if err != nil { t.Fatalf("Compile: %v", err) } - pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}} + // Default (no fail_closed field) → fail-closed → deny on error. + if cp.FailClosed() != true { + t.Fatal("default FailClosed must be true") + } + pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}, runtime: &fakeRuntime{}} + if pr.EvaluateGate(EventConnect, map[string]interface{}{ + "port": 80, "peer_id": 1, "network_id": 1, + "peer_tags": []string{}, "peer_age_s": 0.0, "members": 0, + }) { + t.Fatal("fail-closed default: eval error must deny, got allow") + } +} - // Eval error → EvaluateGate returns true (fail-open). +// TestPin_EvaluateGate_FailOpenExplicit preserves the legacy fail-open +// pathway when fail_closed: false is explicitly set in the policy document. +func TestPin_EvaluateGate_FailOpenExplicit(t *testing.T) { + t.Parallel() + f := false + doc := &PolicyDocument{ + Version: 1, + FailClosed: &f, + Rules: []Rule{ + {Name: "boom", On: EventConnect, Match: `duration("nope") > 0`, Actions: []Action{ + {Type: ActionDeny}, + }}, + }, + } + cp, err := Compile(doc) + if err != nil { + t.Fatalf("Compile: %v", err) + } + if cp.FailClosed() != false { + t.Fatal("explicit fail_closed: false must report false") + } + pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}, runtime: &fakeRuntime{}} if !pr.EvaluateGate(EventConnect, map[string]interface{}{ "port": 80, "peer_id": 1, "network_id": 1, "peer_tags": []string{}, "peer_age_s": 0.0, "members": 0, }) { - t.Fatal("fail-open contract: eval error must allow, got deny") + t.Fatal("explicit fail_open: eval error must allow, got deny") } } @@ -205,14 +232,15 @@ func TestPin_ExprTimeout_GateBoundedLatency(t *testing.T) { // than tearing down the goroutine. The pkg-level test in policylang already // covers this; this is the integration-level guard from the runner side so // the *whole stack* is exercised, not just the helper. +// +// Uses fail_closed: false so the legacy allow-on-error verdict path stays +// tested. The no-panic invariant is what matters here. func TestPin_ExprTimeout_PanicSurfacesAsError(t *testing.T) { t.Parallel() - // Compile a rule whose expression accesses a field absent from ctx — under - // expr.AllowUndefinedVariables this becomes nil and dereference is the - // usual panic candidate. If it doesn't panic in this expr version, the - // test still passes (no crash is the invariant). + failOpen := false doc := &PolicyDocument{ - Version: 1, + Version: 1, + FailClosed: &failOpen, Rules: []Rule{ {Name: "boom", On: EventConnect, Match: `peer_tags[10] == "x"`, // OOB on empty peer_tags @@ -223,19 +251,19 @@ func TestPin_ExprTimeout_PanicSurfacesAsError(t *testing.T) { if err != nil { t.Fatalf("Compile: %v", err) } - pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}} + pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}, runtime: &fakeRuntime{}} // Must not panic. defer func() { if r := recover(); r != nil { - t.Fatalf("BUG: gate panicked instead of fail-open: %v", r) + t.Fatalf("BUG: gate panicked: %v", r) } }() if !pr.EvaluateGate(EventConnect, map[string]interface{}{ "port": 80, "peer_id": 1, "network_id": 1, "peer_tags": []string{}, "peer_age_s": 0.0, "members": 0, }) { - t.Fatal("OOB index → expected fail-open allow") + t.Fatal("explicit fail_open: OOB → must allow") } }