From c598b4a185448bb73f44f1f957e178fe9e99cc82 Mon Sep 17 00:00:00 2001
From: matthew-pilot <matthew@vulturelabs.io>
Date: Fri, 29 May 2026 23:50:15 +0000
Subject: [PATCH] fix(policy): gate eval error fails-closed by default
 (PILOT-262)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Policy gate events (connect, dial, datagram) returned ALLOW when
expression evaluation failed — fail-open behaviour that a corrupted
policy file or unexpected context drift could silently widen into
"allow everything."

Add a FailClosed field to PolicyDocument (fail_closed in JSON),
defaulting to true. When fail_closed:
- EvaluateGate returns DENY on eval error instead of ALLOW
- EvaluateActions / evaluatePerPeerCycle publish policy.eval_error
  events and log at ERROR instead of silently returning
- Operators who need legacy fail-open set fail_closed: false

Pin the new default in audit tests; add explicit fail_open pathway
test so the legacy behaviour stays covered.

Closes PILOT-262
---
 policylang/engine.go       | 11 ++++++
 policylang/policy.go       |  6 ++++
 runner.go                  | 27 +++++++++++++--
 zz_audit_defensive_test.go | 70 ++++++++++++++++++++++++++------------
 4 files changed, 90 insertions(+), 24 deletions(-)

diff --git a/policylang/engine.go b/policylang/engine.go
index b9f3d79..b0cd36c 100644
--- a/policylang/engine.go
+++ b/policylang/engine.go
@@ -175,6 +175,17 @@ func (cp *CompiledPolicy) EvaluatePeerExpr(ruleName string, actionIdx int, peerC
 	return runProgram(prog, peerCtx)
 }
 
+// FailClosed reports whether the engine should deny/reject on expression
+// evaluation errors. When nil in PolicyDocument, defaults to true
+// (fail-closed — more secure). Set fail_closed: false to restore legacy
+// fail-open behaviour.
+func (cp *CompiledPolicy) FailClosed() bool {
+	if cp.Doc.FailClosed == nil {
+		return true // default: fail-closed
+	}
+	return *cp.Doc.FailClosed
+}
+
 // HasRulesFor returns true if the policy has any rules for the given event type.
 func (cp *CompiledPolicy) HasRulesFor(eventType EventType) bool {
 	for _, cr := range cp.rules {
diff --git a/policylang/policy.go b/policylang/policy.go
index 41d8cde..533c5d9 100644
--- a/policylang/policy.go
+++ b/policylang/policy.go
@@ -81,9 +81,15 @@ type Rule struct {
 // DefaultVerdict controls gate-event behavior when no rule produces a
 // verdict. Valid values: "allow" (default, backwards-compatible) or "deny"
 // (default-deny; write explicit allow rules).
+//
+// FailClosed controls behavior when an expression evaluation error occurs.
+// When nil (default), the engine fails-closed (DENY on gate errors, skip
+// actions on cycle/join/leave errors). Set fail_closed: false in the JSON
+// to restore legacy fail-open semantics.
 type PolicyDocument struct {
 	Version        int                    `json:"version"`
 	DefaultVerdict string                 `json:"default_verdict,omitempty"`
+	FailClosed     *bool                  `json:"fail_closed,omitempty"`
 	Config         map[string]interface{} `json:"config,omitempty"`
 	Rules          []Rule                 `json:"rules"`
 }
diff --git a/runner.go b/runner.go
index 1b89c62..0fcb8c0 100644
--- a/runner.go
+++ b/runner.go
@@ -215,8 +215,17 @@ func mergeTags(nodeInfoTags, policyTags []string) []string {
 func (pr *PolicyRunner) EvaluateGate(eventType EventType, ctx map[string]interface{}) bool {
 	dirs, err := pr.compiled.Evaluate(eventType, ctx)
 	if err != nil {
-		slog.Warn("policy: gate eval error", "network_id", pr.netID, "event", eventType, "err", err)
-		return true // fail open on error
+		if pr.compiled.FailClosed() {
+			slog.Error("policy: gate eval error — denying (fail_closed)", "network_id", pr.netID, "event", eventType, "err", err)
+			pr.runtime.PublishEvent("policy.eval_error", map[string]interface{}{
+				"network_id": pr.netID,
+				"event":      eventType,
+				"error":      err.Error(),
+			})
+			return false
+		}
+		slog.Warn("policy: gate eval error — allowing (fail_open)", "network_id", pr.netID, "event", eventType, "err", err)
+		return true
 	}
 
 	// Execute side effects (tag, etc.) before the verdict.
@@ -258,6 +267,9 @@ func (pr *PolicyRunner) EvaluateGate(eventType EventType, ctx map[string]interfa
 func (pr *PolicyRunner) evaluatePerPeerCycle(ctx map[string]interface{}) {
 	dirs, err := pr.compiled.Evaluate(EventCycle, ctx)
 	if err != nil {
+		if pr.compiled.FailClosed() {
+			slog.Error("policy: per-peer cycle eval error — skipping (fail_closed)", "network_id", pr.netID, "err", err)
+		}
 		return
 	}
 	for _, d := range dirs {
@@ -272,7 +284,16 @@ func (pr *PolicyRunner) evaluatePerPeerCycle(ctx map[string]interface{}) {
 func (pr *PolicyRunner) EvaluateActions(eventType EventType, ctx map[string]interface{}) {
 	dirs, err := pr.compiled.Evaluate(eventType, ctx)
 	if err != nil {
-		slog.Warn("policy: action eval error", "network_id", pr.netID, "event", eventType, "err", err)
+		if pr.compiled.FailClosed() {
+			slog.Error("policy: action eval error — skipping directives (fail_closed)", "network_id", pr.netID, "event", eventType, "err", err)
+			pr.runtime.PublishEvent("policy.eval_error", map[string]interface{}{
+				"network_id": pr.netID,
+				"event":      eventType,
+				"error":      err.Error(),
+			})
+		} else {
+			slog.Warn("policy: action eval error — skipping directives (fail_open)", "network_id", pr.netID, "event", eventType, "err", err)
+		}
 		return
 	}
 
diff --git a/zz_audit_defensive_test.go b/zz_audit_defensive_test.go
index dc61eba..9125aa9 100644
--- a/zz_audit_defensive_test.go
+++ b/zz_audit_defensive_test.go
@@ -113,17 +113,12 @@ func TestPin_DefaultVerdictUnrecognized_RejectedByValidate(t *testing.T) {
 	}
 }
 
-// TestPin_EvaluateGate_FailOpenOnEvalError pins the documented fail-open
-// behaviour when the expression engine returns an error mid-evaluation.
-// The reasoning is in runner.go: "// fail open on error" — flipping this to
-// fail-closed without a deliberate review would brick live connections.
-// Pinning the current behaviour so the trade-off is an explicit choice next
-// time, not an accident.
-func TestPin_EvaluateGate_FailOpenOnEvalError(t *testing.T) {
+// TestPin_EvaluateGate_FailClosedOnEvalError pins the fail-closed default
+// behaviour (PILOT-262): when the expression engine returns an error
+// mid-evaluation and fail_closed is not explicitly set, the gate must
+// DENY (fail-closed) — the more secure default.
+func TestPin_EvaluateGate_FailClosedOnEvalError(t *testing.T) {
 	t.Parallel()
-	// Use a rule whose match references a function that exists at
-	// compile but blows up at runtime — duration("nope") returns an
-	// error inside expr, which surfaces from runProgram.
 	doc := &PolicyDocument{
 		Version: 1,
 		Rules: []Rule{
@@ -136,14 +131,46 @@ func TestPin_EvaluateGate_FailOpenOnEvalError(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Compile: %v", err)
 	}
-	pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}}
+	// Default (no fail_closed field) → fail-closed → deny on error.
+	if cp.FailClosed() != true {
+		t.Fatal("default FailClosed must be true")
+	}
+	pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}, runtime: &fakeRuntime{}}
+	if pr.EvaluateGate(EventConnect, map[string]interface{}{
+		"port": 80, "peer_id": 1, "network_id": 1,
+		"peer_tags": []string{}, "peer_age_s": 0.0, "members": 0,
+	}) {
+		t.Fatal("fail-closed default: eval error must deny, got allow")
+	}
+}
 
-	// Eval error → EvaluateGate returns true (fail-open).
+// TestPin_EvaluateGate_FailOpenExplicit preserves the legacy fail-open
+// pathway when fail_closed: false is explicitly set in the policy document.
+func TestPin_EvaluateGate_FailOpenExplicit(t *testing.T) {
+	t.Parallel()
+	f := false
+	doc := &PolicyDocument{
+		Version:    1,
+		FailClosed: &f,
+		Rules: []Rule{
+			{Name: "boom", On: EventConnect, Match: `duration("nope") > 0`, Actions: []Action{
+				{Type: ActionDeny},
+			}},
+		},
+	}
+	cp, err := Compile(doc)
+	if err != nil {
+		t.Fatalf("Compile: %v", err)
+	}
+	if cp.FailClosed() != false {
+		t.Fatal("explicit fail_closed: false must report false")
+	}
+	pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}, runtime: &fakeRuntime{}}
 	if !pr.EvaluateGate(EventConnect, map[string]interface{}{
 		"port": 80, "peer_id": 1, "network_id": 1,
 		"peer_tags": []string{}, "peer_age_s": 0.0, "members": 0,
 	}) {
-		t.Fatal("fail-open contract: eval error must allow, got deny")
+		t.Fatal("explicit fail_open: eval error must allow, got deny")
 	}
 }
 
@@ -205,14 +232,15 @@ func TestPin_ExprTimeout_GateBoundedLatency(t *testing.T) {
 // than tearing down the goroutine. The pkg-level test in policylang already
 // covers this; this is the integration-level guard from the runner side so
 // the *whole stack* is exercised, not just the helper.
+//
+// Uses fail_closed: false so the legacy allow-on-error verdict path stays
+// tested. The no-panic invariant is what matters here.
 func TestPin_ExprTimeout_PanicSurfacesAsError(t *testing.T) {
 	t.Parallel()
-	// Compile a rule whose expression accesses a field absent from ctx — under
-	// expr.AllowUndefinedVariables this becomes nil and dereference is the
-	// usual panic candidate. If it doesn't panic in this expr version, the
-	// test still passes (no crash is the invariant).
+	failOpen := false
 	doc := &PolicyDocument{
-		Version: 1,
+		Version:    1,
+		FailClosed: &failOpen,
 		Rules: []Rule{
 			{Name: "boom", On: EventConnect,
 				Match:   `peer_tags[10] == "x"`, // OOB on empty peer_tags
@@ -223,19 +251,19 @@ func TestPin_ExprTimeout_PanicSurfacesAsError(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Compile: %v", err)
 	}
-	pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}}
+	pr := &PolicyRunner{netID: 1, compiled: cp, peers: map[uint32]*managedPeer{}, runtime: &fakeRuntime{}}
 
 	// Must not panic.
 	defer func() {
 		if r := recover(); r != nil {
-			t.Fatalf("BUG: gate panicked instead of fail-open: %v", r)
+			t.Fatalf("BUG: gate panicked: %v", r)
 		}
 	}()
 	if !pr.EvaluateGate(EventConnect, map[string]interface{}{
 		"port": 80, "peer_id": 1, "network_id": 1,
 		"peer_tags": []string{}, "peer_age_s": 0.0, "members": 0,
 	}) {
-		t.Fatal("OOB index → expected fail-open allow")
+		t.Fatal("explicit fail_open: OOB → must allow")
 	}
 }