From 6547a6cdcb9eec4eea71f80f8a53be43420cf6d0 Mon Sep 17 00:00:00 2001
From: chetnapadhi <padhichetna@gmail.com>
Date: Fri, 17 Apr 2026 09:58:51 +0530
Subject: [PATCH 01/30] feat: support negative matching in queries

---
 README.md                            |  28 ++++
 internal/engine/benchmark_test.go    |  45 +++++-
 internal/engine/combined.go          |  20 ++-
 internal/engine/combined_test.go     | 113 ++++++++++++++
 internal/engine/embedding.go         |  84 ++++++++++-
 internal/engine/embedding_test.go    |  64 ++++++++
 internal/engine/lexical.go           |  85 ++++++++++-
 internal/engine/lexical_test.go      |  57 +++++++
 internal/engine/query_parser.go      |  47 ++++++
 internal/engine/query_parser_test.go | 101 +++++++++++++
 internal/types/types.go              |   7 +
 semantic_test.go                     | 216 +++++++++++++++++++++++++++
 12 files changed, 849 insertions(+), 18 deletions(-)
 create mode 100644 internal/engine/query_parser.go
 create mode 100644 internal/engine/query_parser_test.go

diff --git a/README.md b/README.md
index dfaba84..8cc06a6 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,34 @@ result, err := matcher.Find(ctx, "log in button", elements, semantic.FindOptions
 // result.BestScore = 0.82
 ```
 
+## Negative Queries
+
+Queries can include exclusion intent using:
+`not`, `without`, `exclude`, `excluding`, `except`, `no`, `ignore`.
+
+Examples:
+
+```text
+button not submit
+link without logout
+textbox excluding email
+```
+
+CLI examples:
+
+```bash
+semantic find "button not submit" --snapshot page.json
+semantic find "link without logout" --snapshot page.json
+semantic find "textbox not email" --snapshot page.json --strategy combined
+```
+
+Behavior:
+
+- Positive tokens contribute to the base match score.
+- Negative tokens apply a penalty when they match an element.
+- Strong negative hits can fully exclude an element from results.
+- Negative matching is synonym-aware (for example, `not login` can penalize `Sign In`).
+
 ## Package Layout
 
 ```
diff --git a/internal/engine/benchmark_test.go b/internal/engine/benchmark_test.go
index 92bbea6..c37528c 100644
--- a/internal/engine/benchmark_test.go
+++ b/internal/engine/benchmark_test.go
@@ -3,6 +3,7 @@ package engine
 import (
 	"context"
 	"github.com/pinchtab/semantic/internal/types"
+	"strconv"
 	"testing"
 )
 
@@ -169,8 +170,9 @@ func BenchmarkCombinedFind_100Elements(b *testing.B) {
 	elements := make([]types.ElementDescriptor, 0, 100)
 	for len(elements) < 100 {
 		for _, e := range base {
-			e.Ref = "e" + string(rune('0'+len(elements)))
-			elements = append(elements, e)
+			clone := e
+			clone.Ref = "e" + strconv.Itoa(len(elements))
+			elements = append(elements, clone)
 			if len(elements) >= 100 {
 				break
 			}
@@ -203,3 +205,42 @@ func BenchmarkCalibrateConfidence(b *testing.B) {
 		types.CalibrateConfidence(0.75)
 	}
 }
+
+func benchElementsSized(n int) []types.ElementDescriptor {
+	base := benchElements()
+	out := make([]types.ElementDescriptor, 0, n)
+	for len(out) < n {
+		for _, e := range base {
+			clone := e
+			clone.Ref = "e" + strconv.Itoa(len(out))
+			out = append(out, clone)
+			if len(out) >= n {
+				break
+			}
+		}
+	}
+	return out
+}
+
+func BenchmarkCombinedFind_Issue24_100Elements(b *testing.B) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := benchElementsSized(100)
+	ctx := context.Background()
+	opts := types.FindOptions{Threshold: 0.3, TopK: 3}
+
+	queries := []string{
+		"sign in button",
+		"button not submit",
+		"textbox not email",
+	}
+
+	for _, q := range queries {
+		b.Run(q, func(b *testing.B) {
+			b.ReportAllocs()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				_, _ = m.Find(ctx, q, elements, opts)
+			}
+		})
+	}
+}
diff --git a/internal/engine/combined.go b/internal/engine/combined.go
index bf1f0dd..5eabf02 100644
--- a/internal/engine/combined.go
+++ b/internal/engine/combined.go
@@ -44,9 +44,11 @@ func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []typ
 		opts.TopK = 3
 	}
 
+	parsed := ParseQuery(query)
+
 	lexW, embW := c.weights(opts)
 
-	lexResult, embResult, err := c.runBoth(ctx, query, elements, opts)
+	lexResult, embResult, err := c.runBothParsed(ctx, parsed, elements, opts)
 	if err != nil {
 		return types.FindResult{}, err
 	}
@@ -66,8 +68,10 @@ type matcherResult struct {
 	err    error
 }
 
-func (c *CombinedMatcher) runBoth(ctx context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, types.FindResult, error) {
+func (c *CombinedMatcher) runBothParsed(ctx context.Context, parsed types.ParsedQuery, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, types.FindResult, error) {
 	internalOpts := types.FindOptions{
+		// Lower threshold allows both strategies to contribute to fusion
+		// before final filtering at the caller's requested threshold.
 		Threshold: opts.Threshold * 0.5,
 		TopK:      len(elements),
 	}
@@ -81,8 +85,8 @@ func (c *CombinedMatcher) runBoth(ctx context.Context, query string, elements []
 				lexCh <- matcherResult{err: fmt.Errorf("lexical matcher panic: %v", p)}
 			}
 		}()
-		r, err := c.lexical.Find(ctx, query, elements, internalOpts)
-		lexCh <- matcherResult{r, err}
+		r := c.lexical.findWithParsed(parsed, elements, internalOpts)
+		lexCh <- matcherResult{result: r}
 	}()
 	go func() {
 		defer func() {
@@ -90,7 +94,7 @@ func (c *CombinedMatcher) runBoth(ctx context.Context, query string, elements []
 				embCh <- matcherResult{err: fmt.Errorf("embedding matcher panic: %v", p)}
 			}
 		}()
-		r, err := c.embedding.Find(ctx, query, elements, internalOpts)
+		r, err := c.embedding.findWithParsed(parsed, elements, internalOpts)
 		embCh <- matcherResult{r, err}
 	}()
 
@@ -135,6 +139,12 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 	candidates := make([]scored, 0, len(allRefs))
 	for ref := range allRefs {
 		combined := lexW*lexScores[ref] + embW*embScores[ref]
+		if combined < 0 {
+			combined = 0
+		}
+		if combined > 1 {
+			combined = 1
+		}
 		if combined >= opts.Threshold {
 			s := scored{ref: ref, score: combined, el: refToElem[ref]}
 			if opts.Explain {
diff --git a/internal/engine/combined_test.go b/internal/engine/combined_test.go
index 4411eb3..c882591 100644
--- a/internal/engine/combined_test.go
+++ b/internal/engine/combined_test.go
@@ -161,6 +161,95 @@ func TestCombinedMatcher_FusesBothStrategies(t *testing.T) {
 	}
 }
 
+func TestCombinedMatcher_NegativePenalization(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "submit", Role: "button", Name: "Submit"},
+		{Ref: "cancel", Role: "button", Name: "Cancel"},
+	}
+
+	res, err := m.Find(context.Background(), "button not cancel", elements, types.FindOptions{Threshold: 0, TopK: 2})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(res.Matches) < 2 {
+		t.Fatalf("expected two matches, got %d", len(res.Matches))
+	}
+	if res.BestRef != "submit" {
+		t.Fatalf("expected cancel to be penalized, got best=%s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_NegativeSynonymExpansion(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "password", Role: "textbox", Name: "Password"},
+		{Ref: "email", Role: "textbox", Name: "Email"},
+	}
+
+	res, err := m.Find(context.Background(), "input no pwd", elements, types.FindOptions{Threshold: 0, TopK: 2})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(res.Matches) < 2 {
+		t.Fatalf("expected two matches, got %d", len(res.Matches))
+	}
+	if res.BestRef != "email" {
+		t.Fatalf("expected password to be demoted by negative synonym, got best=%s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_NegativeOnlyQuery(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "submit", Role: "button", Name: "Submit"},
+		{Ref: "cancel", Role: "button", Name: "Cancel"},
+		{Ref: "email", Role: "textbox", Name: "Email"},
+	}
+
+	res, err := m.Find(context.Background(), "not submit", elements, types.FindOptions{Threshold: 0.3, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(res.Matches) == 0 {
+		t.Fatalf("expected non-submit matches to remain")
+	}
+	for _, match := range res.Matches {
+		if match.Ref == "submit" {
+			t.Fatalf("expected submit to be filtered out for negative-only query")
+		}
+	}
+}
+
+func TestCombinedMatcher_PositiveQueryRegression(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "submit", Role: "button", Name: "Submit"},
+		{Ref: "cancel", Role: "button", Name: "Cancel"},
+	}
+
+	res, err := m.Find(context.Background(), "submit button", elements, types.FindOptions{Threshold: 0.05, TopK: 2})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if res.BestRef != "submit" {
+		t.Fatalf("expected positive query behavior unchanged, got best=%s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_EmptyQueryReturnsNoResults(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{{Ref: "submit", Role: "button", Name: "Submit"}}
+
+	res, err := m.Find(context.Background(), "   ", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(res.Matches) != 0 {
+		t.Fatalf("expected no matches for empty query, got %d", len(res.Matches))
+	}
+}
+
 func TestCombinedMatcher_NoElements(t *testing.T) {
 	m := NewCombinedMatcher(NewHashingEmbedder(128))
 
@@ -466,3 +555,27 @@ func TestCombinedMatcher_WeightsApplied(t *testing.T) {
 		t.Errorf("expected BestRef=e0, got %s", result.BestRef)
 	}
 }
+
+func TestCombinedMatcher_ClampsScoreWithCustomWeights(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "e0", Role: "button", Name: "Sign In"},
+		{Ref: "e1", Role: "link", Name: "Help"},
+	}
+
+	result, err := m.Find(context.Background(), "sign in button", elements, types.FindOptions{
+		Threshold:       0,
+		TopK:            2,
+		LexicalWeight:   2.0,
+		EmbeddingWeight: 2.0,
+	})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+
+	for _, match := range result.Matches {
+		if match.Score < 0 || match.Score > 1 {
+			t.Fatalf("expected clamped score in [0,1], got %f for ref=%s", match.Score, match.Ref)
+		}
+	}
+}
diff --git a/internal/engine/embedding.go b/internal/engine/embedding.go
index 6723255..778d107 100644
--- a/internal/engine/embedding.go
+++ b/internal/engine/embedding.go
@@ -5,6 +5,7 @@ import (
 	"github.com/pinchtab/semantic/internal/types"
 	"math"
 	"sort"
+	"strings"
 )
 
 // Embedder converts text into dense vectors. See NewHashingEmbedder.
@@ -48,25 +49,60 @@ func (m *EmbeddingMatcher) Strategy() string {
 }
 
 func (m *EmbeddingMatcher) Find(_ context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
+	parsed := ParseQuery(query)
+	return m.findWithParsed(parsed, elements, opts)
+}
+
+func (m *EmbeddingMatcher) findWithParsed(parsed types.ParsedQuery, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
 	if opts.TopK <= 0 {
 		opts.TopK = 3
 	}
 
+	if len(parsed.Positive) == 0 && len(parsed.Negative) == 0 {
+		return types.FindResult{
+			Strategy:     m.Strategy(),
+			ElementCount: len(elements),
+		}, nil
+	}
+
+	positiveQuery := strings.Join(parsed.Positive, " ")
+	negativeQuery := strings.Join(parsed.Negative, " ")
+	negativeOnly := len(parsed.Positive) == 0 && len(parsed.Negative) > 0
+
 	// Build composite descriptions.
 	descs := make([]string, len(elements))
 	for i, el := range elements {
 		descs[i] = el.Composite()
 	}
 
-	// Embed query + all descriptions in a single batch.
-	texts := append([]string{query}, descs...)
+	// Embed positive/negative query components and all descriptions in one batch.
+	texts := make([]string, 0, len(descs)+2)
+	if len(parsed.Positive) > 0 {
+		texts = append(texts, positiveQuery)
+	}
+	if len(parsed.Negative) > 0 {
+		texts = append(texts, negativeQuery)
+	}
+	texts = append(texts, descs...)
 	vectors, err := m.embedder.Embed(texts)
 	if err != nil {
 		return types.FindResult{}, err
 	}
 
-	queryVec := vectors[0]
-	elemVecs := vectors[1:]
+	idx := 0
+	var posVec []float32
+	if len(parsed.Positive) > 0 {
+		posVec = vectors[idx]
+		idx++
+	}
+
+	var negVec []float32
+	if len(parsed.Negative) > 0 {
+		negVec = vectors[idx]
+		idx++
+	}
+
+	elemVecs := vectors[idx:]
 	contextVecs := elemVecs
 	if m.neighborWeight > 0 && len(elemVecs) > 1 {
 		contextVecs = m.withNeighborContext(elemVecs)
@@ -79,14 +115,46 @@ func (m *EmbeddingMatcher) Find(_ context.Context, query string, elements []type
 
 	var candidates []scored
 	for i, el := range elements {
-		sim := CosineSimilarity(queryVec, contextVecs[i])
-		if sim >= opts.Threshold {
-			candidates = append(candidates, scored{desc: el, score: sim})
+		score := 1.0
+		if len(parsed.Positive) > 0 {
+			score = CosineSimilarity(posVec, contextVecs[i])
+		}
+
+		if len(parsed.Negative) > 0 {
+			// negSim measures how similar the negative terms are to this element.
+			// It is computed against the element's own vector (elemVecs), not the
+			// contextVecs entry that the positive score above is compared with.
+			negSim := CosineSimilarity(negVec, elemVecs[i])
+			if len(parsed.Positive) == 0 {
+				if negSim > 0.5 {
+					score = 0
+				}
+			} else if negSim > 0.5 {
+				score *= 1 - (negSim * 0.8)
+			}
+		}
+
+		if score < 0 {
+			score = 0
+		}
+		if score > 1 {
+			score = 1
+		}
+		if negativeOnly && score == 0 {
+			continue
+		}
+
+		if score >= opts.Threshold {
+			candidates = append(candidates, scored{desc: el, score: score})
 		}
 	}
 
 	sort.Slice(candidates, func(i, j int) bool {
-		return candidates[i].score > candidates[j].score
+		scoreDiff := candidates[i].score - candidates[j].score
+		if math.Abs(scoreDiff) > 1e-9 {
+			return scoreDiff > 0
+		}
+		return candidates[i].desc.Ref < candidates[j].desc.Ref
 	})
 
 	if len(candidates) > opts.TopK {
diff --git a/internal/engine/embedding_test.go b/internal/engine/embedding_test.go
index fde31e0..adb09dd 100644
--- a/internal/engine/embedding_test.go
+++ b/internal/engine/embedding_test.go
@@ -187,6 +187,70 @@ func TestEmbeddingMatcher_ThresholdFiltering(t *testing.T) {
 	}
 }
 
+func TestEmbeddingMatcher_NegativePenalty(t *testing.T) {
+	e := newScriptedEmbedder(map[string][]float32{
+		"button":         {1, 0},
+		"cancel":         {0, 1},
+		"button: Submit": {1, 0},
+		"button: Cancel": {1, 1},
+	})
+	m := NewEmbeddingMatcherWithNeighborWeight(e, 0)
+
+	elements := []types.ElementDescriptor{
+		{Ref: "submit", Role: "button", Name: "Submit"},
+		{Ref: "cancel", Role: "button", Name: "Cancel"},
+	}
+
+	res, err := m.Find(context.Background(), "button not cancel", elements, types.FindOptions{Threshold: 0, TopK: 2})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(res.Matches) < 2 {
+		t.Fatalf("expected two matches, got %d", len(res.Matches))
+	}
+	if res.BestRef != "submit" {
+		t.Fatalf("expected negative term to demote cancel, got %s", res.BestRef)
+	}
+}
+
+func TestEmbeddingMatcher_NegativeOnlyQuery(t *testing.T) {
+	e := newScriptedEmbedder(map[string][]float32{
+		"submit":         {1, 0},
+		"button: Submit": {1, 0},
+		"button: Cancel": {0, 1},
+	})
+	m := NewEmbeddingMatcherWithNeighborWeight(e, 0)
+
+	elements := []types.ElementDescriptor{
+		{Ref: "submit", Role: "button", Name: "Submit"},
+		{Ref: "cancel", Role: "button", Name: "Cancel"},
+	}
+
+	res, err := m.Find(context.Background(), "not submit", elements, types.FindOptions{Threshold: 0.3, TopK: 2})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(res.Matches) != 1 {
+		t.Fatalf("expected only non-submit element to remain, got %d matches", len(res.Matches))
+	}
+	if res.BestRef != "cancel" {
+		t.Fatalf("expected cancel to remain after negative-only query, got %s", res.BestRef)
+	}
+}
+
+func TestEmbeddingMatcher_EmptyQueryReturnsNoResults(t *testing.T) {
+	m := NewEmbeddingMatcher(newDummyEmbedder(64))
+	res, err := m.Find(context.Background(), "   ", []types.ElementDescriptor{
+		{Ref: "e1", Role: "button", Name: "Submit"},
+	}, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(res.Matches) != 0 {
+		t.Fatalf("expected no matches for empty query, got %d", len(res.Matches))
+	}
+}
+
 func TestEmbeddingMatcher_NeighborContextDisambiguatesRealWorldButtons(t *testing.T) {
 	e := newScriptedEmbedder(map[string][]float32{
 		"laptop add to cart":         {1, 1, 0},
diff --git a/internal/engine/lexical.go b/internal/engine/lexical.go
index ee53a59..d9cc658 100644
--- a/internal/engine/lexical.go
+++ b/internal/engine/lexical.go
@@ -48,10 +48,25 @@ func NewLexicalMatcher() *LexicalMatcher {
 func (m *LexicalMatcher) Strategy() string { return "lexical" }
 
 func (m *LexicalMatcher) Find(_ context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
+	parsed := ParseQuery(query)
+	return m.findWithParsed(parsed, elements, opts), nil
+}
+
+func (m *LexicalMatcher) findWithParsed(parsed types.ParsedQuery, elements []types.ElementDescriptor, opts types.FindOptions) types.FindResult {
 	if opts.TopK <= 0 {
 		opts.TopK = 3
 	}
 
+	if len(parsed.Positive) == 0 && len(parsed.Negative) == 0 {
+		return types.FindResult{
+			Strategy:     "lexical",
+			ElementCount: len(elements),
+		}
+	}
+
+	negativeOnly := len(parsed.Positive) == 0 && len(parsed.Negative) > 0
+	positiveQuery := strings.Join(parsed.Positive, " ")
+
 	ef := BuildElementFrequency(elements)
 
 	type scored struct {
@@ -62,11 +77,39 @@ func (m *LexicalMatcher) Find(_ context.Context, query string, elements []types.
 	var candidates []scored
 	for _, el := range elements {
 		composite := el.Composite()
-		score := lexicalScore(query, composite, el.Interactive, ef)
-		score += positionalBoost(query, el.Positional)
+		descTokens := tokenize(composite)
+		score := 0.0
+		if len(parsed.Positive) == 0 {
+			// Negative-only query means "everything except negatives".
+			score = 1.0
+		} else {
+			score = lexicalScoreTokens(parsed.Positive, descTokens, el.Interactive, ef)
+			score += positionalBoost(positiveQuery, el.Positional)
+		}
+
+		if len(parsed.Negative) > 0 {
+			// negativeScore reflects how strongly the negative tokens match this
+			// element; hasStrongNegativeHit reports an exact or synonym token hit.
+			negativeScore := lexicalScoreTokens(parsed.Negative, descTokens, el.Interactive, ef)
+			switch {
+			case hasStrongNegativeHit(parsed.Negative, descTokens) || negativeScore > 0.7:
+				// Applied penalty: full exclusion.
+				score = 0
+			case negativeScore > 0.4:
+				// Applied penalty: multiplicative down-weight.
+				score *= 1 - negativeScore
+			}
+		}
+
+		if score < 0 {
+			score = 0
+		}
 		if score > 1.0 {
 			score = 1.0
 		}
+		if negativeOnly && score == 0 {
+			continue
+		}
 		if score >= opts.Threshold {
 			candidates = append(candidates, scored{desc: el, score: score})
 		}
@@ -116,7 +159,7 @@ func (m *LexicalMatcher) Find(_ context.Context, query string, elements []types.
 		result.BestScore = result.Matches[0].Score
 	}
 
-	return result, nil
+	return result
 }
 
 func tokenize(s string) []string {
@@ -226,7 +269,10 @@ func LexicalScoreWithFrequency(query, desc string, ef *ElementFrequency) float64
 func lexicalScore(query, desc string, interactive bool, ef *ElementFrequency) float64 {
 	rawQTokens := tokenize(query)
 	rawDTokens := tokenize(desc)
+	return lexicalScoreTokens(rawQTokens, rawDTokens, interactive, ef)
+}
 
+func lexicalScoreTokens(rawQTokens, rawDTokens []string, interactive bool, ef *ElementFrequency) float64 {
 	qTokens := removeStopwordsContextAware(rawQTokens, rawDTokens)
 	dTokens := removeStopwordsContextAware(rawDTokens, rawQTokens)
 
@@ -561,3 +607,36 @@ func tokenPrefixScore(qTokens, dTokens []string) float64 {
 
 	return total / float64(len(qTokens))
 }
+
+func hasStrongNegativeHit(negativeTokens, descTokens []string) bool {
+	if len(negativeTokens) == 0 || len(descTokens) == 0 {
+		return false
+	}
+
+	dSet := tokenSet(descTokens)
+	for _, nt := range negativeTokens {
+		if dSet[nt] {
+			return true
+		}
+		if syns, ok := synonymIndex[nt]; ok {
+			for syn := range syns {
+				synTokens := strings.Fields(syn)
+				if len(synTokens) == 0 {
+					continue
+				}
+				allPresent := true
+				for _, st := range synTokens {
+					if !dSet[st] {
+						allPresent = false
+						break
+					}
+				}
+				if allPresent {
+					return true
+				}
+			}
+		}
+	}
+
+	return false
+}
diff --git a/internal/engine/lexical_test.go b/internal/engine/lexical_test.go
index fbf47ac..109e05b 100644
--- a/internal/engine/lexical_test.go
+++ b/internal/engine/lexical_test.go
@@ -586,4 +586,61 @@ func TestLexicalMatcher_ThresholdFiltering(t *testing.T) {
 	}
 }
 
+func TestLexicalMatcher_NegativePenalization(t *testing.T) {
+	m := NewLexicalMatcher()
+	elements := []types.ElementDescriptor{
+		{Ref: "submit", Role: "button", Name: "Submit"},
+		{Ref: "cancel", Role: "button", Name: "Cancel"},
+	}
+
+	result, err := m.Find(context.Background(), "button not cancel", elements, types.FindOptions{
+		Threshold: 0,
+		TopK:      2,
+	})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(result.Matches) < 2 {
+		t.Fatalf("expected two matches, got %d", len(result.Matches))
+	}
+	if result.BestRef != "submit" {
+		t.Fatalf("expected submit to rank above canceled element, got %s", result.BestRef)
+	}
+}
+
+func TestLexicalMatcher_NegativeSynonymExpansion(t *testing.T) {
+	m := NewLexicalMatcher()
+	elements := []types.ElementDescriptor{
+		{Ref: "password", Role: "textbox", Name: "Password"},
+		{Ref: "email", Role: "textbox", Name: "Email"},
+	}
+
+	result, err := m.Find(context.Background(), "input no pwd", elements, types.FindOptions{
+		Threshold: 0,
+		TopK:      2,
+	})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(result.Matches) < 2 {
+		t.Fatalf("expected two matches, got %d", len(result.Matches))
+	}
+	if result.BestRef != "email" {
+		t.Fatalf("expected password synonym to be penalized, got best=%s", result.BestRef)
+	}
+}
+
+func TestLexicalMatcher_EmptyQueryReturnsNoResults(t *testing.T) {
+	m := NewLexicalMatcher()
+	elements := []types.ElementDescriptor{{Ref: "e1", Role: "button", Name: "Submit"}}
+
+	result, err := m.Find(context.Background(), "   ", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if len(result.Matches) != 0 {
+		t.Fatalf("expected no matches for empty query, got %d", len(result.Matches))
+	}
+}
+
 // dummyEmbedder tests
diff --git a/internal/engine/query_parser.go b/internal/engine/query_parser.go
new file mode 100644
index 0000000..2f0c584
--- /dev/null
+++ b/internal/engine/query_parser.go
@@ -0,0 +1,47 @@
+package engine
+
+import "github.com/pinchtab/semantic/internal/types"
+
+// ParsedQuery aliases types.ParsedQuery. ParseQuery accepts this grammar:
+//
+//	[<positive token>...] (NEGATIVE_TRIGGER [<negative token>...])*
+//
+// A NEGATIVE_TRIGGER is one of:
+// not, without, exclude, excluding, except, no, ignore.
+// Once a trigger is seen, every later non-trigger token is classified as
+// negative through the end of the query; a later trigger does not reset this.
+type ParsedQuery = types.ParsedQuery
+
+var negativeTriggers = map[string]bool{
+	"not":       true,
+	"without":   true,
+	"exclude":   true,
+	"excluding": true,
+	"except":    true,
+	"no":        true,
+	"ignore":    true,
+}
+
+// ParseQuery tokenizes and classifies tokens into positive and negative terms.
+func ParseQuery(raw string) ParsedQuery {
+	tokens := tokenize(raw)
+	parsed := types.ParsedQuery{
+		Positive: make([]string, 0, len(tokens)),
+		Negative: make([]string, 0, len(tokens)),
+	}
+
+	inNegative := false
+	for _, tok := range tokens {
+		if negativeTriggers[tok] {
+			inNegative = true
+			continue
+		}
+		if inNegative {
+			parsed.Negative = append(parsed.Negative, tok)
+			continue
+		}
+		parsed.Positive = append(parsed.Positive, tok)
+	}
+
+	return parsed
+}
diff --git a/internal/engine/query_parser_test.go b/internal/engine/query_parser_test.go
new file mode 100644
index 0000000..0b80656
--- /dev/null
+++ b/internal/engine/query_parser_test.go
@@ -0,0 +1,101 @@
+package engine
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestParseQuery_TableDriven(t *testing.T) {
+	tests := []struct {
+		name     string
+		raw      string
+		positive []string
+		negative []string
+	}{
+		{
+			name:     "button not submit",
+			raw:      "button not submit",
+			positive: []string{"button"},
+			negative: []string{"submit"},
+		},
+		{
+			name:     "button not sign in",
+			raw:      "button not sign in",
+			positive: []string{"button"},
+			negative: []string{"sign", "in"},
+		},
+		{
+			name:     "link without logout",
+			raw:      "link without logout",
+			positive: []string{"link"},
+			negative: []string{"logout"},
+		},
+		{
+			name:     "input excluding email",
+			raw:      "input excluding email",
+			positive: []string{"input"},
+			negative: []string{"email"},
+		},
+		{
+			name:     "button except close",
+			raw:      "button except close",
+			positive: []string{"button"},
+			negative: []string{"close"},
+		},
+		{
+			name:     "sign in button",
+			raw:      "sign in button",
+			positive: []string{"sign", "in", "button"},
+			negative: nil,
+		},
+		{
+			name:     "not button",
+			raw:      "not button",
+			positive: nil,
+			negative: []string{"button"},
+		},
+		{
+			name:     "input no password no username",
+			raw:      "input no password no username",
+			positive: []string{"input"},
+			negative: []string{"password", "username"},
+		},
+		{
+			name:     "negative segment break by trigger",
+			raw:      "button not sign in except submit",
+			positive: []string{"button"},
+			negative: []string{"sign", "in", "submit"},
+		},
+		{
+			name:     "trailing trigger behaves as positive query",
+			raw:      "button not",
+			positive: []string{"button"},
+			negative: nil,
+		},
+		{
+			name:     "repeated triggers",
+			raw:      "button not submit not cancel",
+			positive: []string{"button"},
+			negative: []string{"submit", "cancel"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := ParseQuery(tt.raw)
+			if !reflect.DeepEqual(normalizeTokens(got.Positive), normalizeTokens(tt.positive)) {
+				t.Fatalf("positive mismatch: got=%v want=%v", got.Positive, tt.positive)
+			}
+			if !reflect.DeepEqual(normalizeTokens(got.Negative), normalizeTokens(tt.negative)) {
+				t.Fatalf("negative mismatch: got=%v want=%v", got.Negative, tt.negative)
+			}
+		})
+	}
+}
+
+func normalizeTokens(tokens []string) []string {
+	if len(tokens) == 0 {
+		return []string{}
+	}
+	return tokens
+}
diff --git a/internal/types/types.go b/internal/types/types.go
index 20af53b..6acde4a 100644
--- a/internal/types/types.go
+++ b/internal/types/types.go
@@ -53,6 +53,13 @@ type FindResult struct {
 	ElementCount int // total elements evaluated
 }
 
+// ParsedQuery splits a raw query into positive and negative token groups.
+// Negative tokens are interpreted as terms that should be penalized or excluded.
+type ParsedQuery struct {
+	Positive []string
+	Negative []string
+}
+
 // ConfidenceLabel returns "high", "medium", or "low" for the best match.
 func (r *FindResult) ConfidenceLabel() string {
 	return CalibrateConfidence(r.BestScore)
diff --git a/semantic_test.go b/semantic_test.go
index a2a32ee..34ea0b0 100644
--- a/semantic_test.go
+++ b/semantic_test.go
@@ -7,6 +7,222 @@ import (
 	"github.com/pinchtab/semantic"
 )
 
+func negativeMatchingFixture() []semantic.ElementDescriptor {
+	return []semantic.ElementDescriptor{
+		{Ref: "e0", Role: "button", Name: "Submit"},
+		{Ref: "e1", Role: "button", Name: "Cancel"},
+		{Ref: "e2", Role: "button", Name: "Sign In"},
+		{Ref: "e3", Role: "link", Name: "Logout"},
+		{Ref: "e4", Role: "textbox", Name: "Email"},
+		{Ref: "e5", Role: "textbox", Name: "Password"},
+	}
+}
+
+func findScore(matches []semantic.ElementMatch, ref string) (float64, bool) {
+	for _, m := range matches {
+		if m.Ref == ref {
+			return m.Score, true
+		}
+	}
+	return 0, false
+}
+
+func TestLexicalMatcher_NegativeMatching_Issue24Cases(t *testing.T) {
+	m := semantic.NewLexicalMatcher()
+	elements := negativeMatchingFixture()
+
+	tests := []struct {
+		name  string
+		query string
+		check func(t *testing.T, result semantic.FindResult)
+	}{
+		{
+			name:  "button not submit penalizes submit",
+			query: "button not submit",
+			check: func(t *testing.T, result semantic.FindResult) {
+				e0, ok := findScore(result.Matches, "e0")
+				if !ok {
+					t.Fatalf("expected e0 in matches")
+				}
+				e1, ok := findScore(result.Matches, "e1")
+				if !ok {
+					t.Fatalf("expected e1 in matches")
+				}
+				e2, ok := findScore(result.Matches, "e2")
+				if !ok {
+					t.Fatalf("expected e2 in matches")
+				}
+				if !(e1 > e0 || e2 > e0) {
+					t.Fatalf("expected e1 or e2 to rank above penalized e0, got e0=%.4f e1=%.4f e2=%.4f", e0, e1, e2)
+				}
+			},
+		},
+		{
+			name:  "button not behaves as button",
+			query: "button not",
+			check: func(t *testing.T, result semantic.FindResult) {
+				if result.BestRef == "e4" || result.BestRef == "e5" {
+					t.Fatalf("expected a button-like result, got %s", result.BestRef)
+				}
+			},
+		},
+		{
+			name:  "button not cancel penalizes cancel",
+			query: "button not cancel",
+			check: func(t *testing.T, result semantic.FindResult) {
+				e0, ok := findScore(result.Matches, "e0")
+				if !ok {
+					t.Fatalf("expected e0 in matches")
+				}
+				e1, ok := findScore(result.Matches, "e1")
+				if !ok {
+					t.Fatalf("expected e1 in matches")
+				}
+				if e0 <= e1 {
+					t.Fatalf("expected e0 above penalized e1, got e0=%.4f e1=%.4f", e0, e1)
+				}
+			},
+		},
+		{
+			name:  "textbox not email prefers password",
+			query: "textbox not email",
+			check: func(t *testing.T, result semantic.FindResult) {
+				e4, ok := findScore(result.Matches, "e4")
+				if !ok {
+					t.Fatalf("expected e4 in matches")
+				}
+				e5, ok := findScore(result.Matches, "e5")
+				if !ok {
+					t.Fatalf("expected e5 in matches")
+				}
+				if e5 <= e4 {
+					t.Fatalf("expected e5 above penalized e4, got e4=%.4f e5=%.4f", e4, e5)
+				}
+			},
+		},
+		{
+			name:  "button not login penalizes sign in by synonym",
+			query: "button not login",
+			check: func(t *testing.T, result semantic.FindResult) {
+				e0, ok := findScore(result.Matches, "e0")
+				if !ok {
+					t.Fatalf("expected e0 in matches")
+				}
+				e1, ok := findScore(result.Matches, "e1")
+				if !ok {
+					t.Fatalf("expected e1 in matches")
+				}
+				e2, ok := findScore(result.Matches, "e2")
+				if !ok {
+					t.Fatalf("expected e2 in matches")
+				}
+				if !(e0 > e2 && e1 > e2) {
+					t.Fatalf("expected e2 to be penalized by login/sign in synonym, got e0=%.4f e1=%.4f e2=%.4f", e0, e1, e2)
+				}
+			},
+		},
+		{
+			name:  "button not sign in penalizes sign in",
+			query: "button not sign in",
+			check: func(t *testing.T, result semantic.FindResult) {
+				e0, ok := findScore(result.Matches, "e0")
+				if !ok {
+					t.Fatalf("expected e0 in matches")
+				}
+				e1, ok := findScore(result.Matches, "e1")
+				if !ok {
+					t.Fatalf("expected e1 in matches")
+				}
+				e2, ok := findScore(result.Matches, "e2")
+				if !ok {
+					t.Fatalf("expected e2 in matches")
+				}
+				if !(e0 > e2 && e1 > e2) {
+					t.Fatalf("expected e2 to be penalized by multi-token negative, got e0=%.4f e1=%.4f e2=%.4f", e0, e1, e2)
+				}
+			},
+		},
+		{
+			name:  "button not submit not cancel penalizes both",
+			query: "button not submit not cancel",
+			check: func(t *testing.T, result semantic.FindResult) {
+				e0, ok := findScore(result.Matches, "e0")
+				if !ok {
+					t.Fatalf("expected e0 in matches")
+				}
+				e1, ok := findScore(result.Matches, "e1")
+				if !ok {
+					t.Fatalf("expected e1 in matches")
+				}
+				e2, ok := findScore(result.Matches, "e2")
+				if !ok {
+					t.Fatalf("expected e2 in matches")
+				}
+				if !(e2 > e0 && e2 > e1) {
+					t.Fatalf("expected both submit/cancel to be penalized, got e0=%.4f e1=%.4f e2=%.4f", e0, e1, e2)
+				}
+			},
+		},
+		{
+			name:  "sign in button regression",
+			query: "sign in button",
+			check: func(t *testing.T, result semantic.FindResult) {
+				if result.BestRef != "e2" {
+					t.Fatalf("expected e2 as best result, got %s", result.BestRef)
+				}
+			},
+		},
+		{
+			name:  "link without logout drives logout near zero",
+			query: "link without logout",
+			check: func(t *testing.T, result semantic.FindResult) {
+				e3, ok := findScore(result.Matches, "e3")
+				if !ok {
+					t.Fatalf("expected e3 in matches")
+				}
+				if e3 > 0.1 {
+					t.Fatalf("expected e3 near-zero score, got %.4f", e3)
+				}
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := m.Find(context.Background(), tt.query, elements, semantic.FindOptions{
+				Threshold: 0,
+				TopK:      len(elements),
+			})
+			if err != nil {
+				t.Fatalf("Find returned error: %v", err)
+			}
+			tt.check(t, result)
+		})
+	}
+}
+
+func TestLexicalMatcher_NegativeOnlyQuery(t *testing.T) {
+	m := semantic.NewLexicalMatcher()
+	elements := negativeMatchingFixture()
+
+	result, err := m.Find(context.Background(), "not submit", elements, semantic.FindOptions{
+		Threshold: 0,
+		TopK:      len(elements),
+	})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+
+	if len(result.Matches) == 0 {
+		t.Fatalf("expected non-empty matches for negative-only query")
+	}
+	for _, m := range result.Matches {
+		if m.Ref == "e0" {
+			t.Fatalf("expected submit element to be excluded for negative-only query")
+		}
+	}
+}
+
 func TestNewCombinedMatcher_Find(t *testing.T) {
 	m := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128))
 

From 6bff209cd2c6beb88e21d721f0e24efd971b7faa Mon Sep 17 00:00:00 2001
From: chetnapadhi <padhichetna@gmail.com>
Date: Fri, 17 Apr 2026 11:14:39 +0530
Subject: [PATCH 02/30] fix: address review feedback (stopwords, parser, test
 cleanup)

---
 internal/engine/combined_test.go     |  8 +++-----
 internal/engine/embedding_test.go    |  9 +++++----
 internal/engine/lexical.go           | 10 +++++++++-
 internal/engine/query_parser.go      |  2 +-
 internal/engine/query_parser_test.go |  4 ++--
 semantic_test.go                     |  8 +++-----
 6 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/internal/engine/combined_test.go b/internal/engine/combined_test.go
index c882591..346ad1f 100644
--- a/internal/engine/combined_test.go
+++ b/internal/engine/combined_test.go
@@ -212,12 +212,10 @@ func TestCombinedMatcher_NegativeOnlyQuery(t *testing.T) {
 		t.Fatalf("Find returned error: %v", err)
 	}
 	if len(res.Matches) == 0 {
-		t.Fatalf("expected non-submit matches to remain")
+		t.Fatalf("expected non-empty matches for leading-not query")
 	}
-	for _, match := range res.Matches {
-		if match.Ref == "submit" {
-			t.Fatalf("expected submit to be filtered out for negative-only query")
-		}
+	if res.BestRef != "submit" {
+		t.Fatalf("expected leading-not query to behave as positive text, got best=%s", res.BestRef)
 	}
 }
 
diff --git a/internal/engine/embedding_test.go b/internal/engine/embedding_test.go
index adb09dd..08d0e1a 100644
--- a/internal/engine/embedding_test.go
+++ b/internal/engine/embedding_test.go
@@ -215,6 +215,7 @@ func TestEmbeddingMatcher_NegativePenalty(t *testing.T) {
 
 func TestEmbeddingMatcher_NegativeOnlyQuery(t *testing.T) {
 	e := newScriptedEmbedder(map[string][]float32{
+		"not submit":     {1, 0},
 		"submit":         {1, 0},
 		"button: Submit": {1, 0},
 		"button: Cancel": {0, 1},
@@ -230,11 +231,11 @@ func TestEmbeddingMatcher_NegativeOnlyQuery(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Find returned error: %v", err)
 	}
-	if len(res.Matches) != 1 {
-		t.Fatalf("expected only non-submit element to remain, got %d matches", len(res.Matches))
+	if len(res.Matches) == 0 {
+		t.Fatalf("expected non-empty matches for leading-not query")
 	}
-	if res.BestRef != "cancel" {
-		t.Fatalf("expected cancel to remain after negative-only query, got %s", res.BestRef)
+	if res.BestRef != "submit" {
+		t.Fatalf("expected leading-not query to behave as positive text, got %s", res.BestRef)
 	}
 }
 
diff --git a/internal/engine/lexical.go b/internal/engine/lexical.go
index d9cc658..c8348d1 100644
--- a/internal/engine/lexical.go
+++ b/internal/engine/lexical.go
@@ -615,6 +615,9 @@ func hasStrongNegativeHit(negativeTokens, descTokens []string) bool {
 
 	dSet := tokenSet(descTokens)
 	for _, nt := range negativeTokens {
+		if isStopword(nt) || isSemanticStopword(nt) {
+			continue
+		}
 		if dSet[nt] {
 			return true
 		}
@@ -625,13 +628,18 @@ func hasStrongNegativeHit(negativeTokens, descTokens []string) bool {
 					continue
 				}
 				allPresent := true
+				hasMeaningfulToken := false
 				for _, st := range synTokens {
+					if isStopword(st) || isSemanticStopword(st) {
+						continue
+					}
+					hasMeaningfulToken = true
 					if !dSet[st] {
 						allPresent = false
 						break
 					}
 				}
-				if allPresent {
+				if hasMeaningfulToken && allPresent {
 					return true
 				}
 			}
diff --git a/internal/engine/query_parser.go b/internal/engine/query_parser.go
index 2f0c584..a16bbc7 100644
--- a/internal/engine/query_parser.go
+++ b/internal/engine/query_parser.go
@@ -32,7 +32,7 @@ func ParseQuery(raw string) ParsedQuery {
 
 	inNegative := false
 	for _, tok := range tokens {
-		if negativeTriggers[tok] {
+		if negativeTriggers[tok] && len(parsed.Positive) > 0 {
 			inNegative = true
 			continue
 		}
diff --git a/internal/engine/query_parser_test.go b/internal/engine/query_parser_test.go
index 0b80656..be0b35d 100644
--- a/internal/engine/query_parser_test.go
+++ b/internal/engine/query_parser_test.go
@@ -51,8 +51,8 @@ func TestParseQuery_TableDriven(t *testing.T) {
 		{
 			name:     "not button",
 			raw:      "not button",
-			positive: nil,
-			negative: []string{"button"},
+			positive: []string{"not", "button"},
+			negative: nil,
 		},
 		{
 			name:     "input no password no username",
diff --git a/semantic_test.go b/semantic_test.go
index 34ea0b0..e41120b 100644
--- a/semantic_test.go
+++ b/semantic_test.go
@@ -214,12 +214,10 @@ func TestLexicalMatcher_NegativeOnlyQuery(t *testing.T) {
 	}
 
 	if len(result.Matches) == 0 {
-		t.Fatalf("expected non-empty matches for negative-only query")
+		t.Fatalf("expected non-empty matches for leading-not query")
 	}
-	for _, m := range result.Matches {
-		if m.Ref == "e0" {
-			t.Fatalf("expected submit element to be excluded for negative-only query")
-		}
+	if result.BestRef != "e0" {
+		t.Fatalf("expected leading-not query to behave as positive text, got best=%s", result.BestRef)
 	}
 }
 

From 87e38070c9116d727da60276b460c3b2630668b9 Mon Sep 17 00:00:00 2001
From: Bosch <basch@giagolab.com>
Date: Wed, 22 Apr 2026 11:57:48 +0100
Subject: [PATCH 03/30] feat: add context-aware negative matching

---
 README.md                             |   6 +
 internal/engine/combined.go           |   4 +-
 internal/engine/embedding.go          | 141 ++++++++++++---------
 internal/engine/lexical.go            |  11 +-
 internal/engine/query_context.go      | 168 ++++++++++++++++++++++++++
 internal/engine/query_context_test.go | 164 +++++++++++++++++++++++++
 6 files changed, 431 insertions(+), 63 deletions(-)
 create mode 100644 internal/engine/query_context.go
 create mode 100644 internal/engine/query_context_test.go

diff --git a/README.md b/README.md
index 8cc06a6..86a2449 100644
--- a/README.md
+++ b/README.md
@@ -58,12 +58,18 @@ result, err := matcher.Find(ctx, "log in button", elements, semantic.FindOptions
 Queries can include exclusion intent using:
 `not`, `without`, `exclude`, `excluding`, `except`, `no`, `ignore`.
 
+There are two supported patterns:
+- token exclusion, for example `button not submit`
+- context exclusion, for example `submit button not in header`
+
 Examples:
 
 ```text
 button not submit
 link without logout
 textbox excluding email
+submit button not in header
+login link, not the footer one
 ```
 
 CLI examples:
diff --git a/internal/engine/combined.go b/internal/engine/combined.go
index 5eabf02..be538e1 100644
--- a/internal/engine/combined.go
+++ b/internal/engine/combined.go
@@ -44,7 +44,7 @@ func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []typ
 		opts.TopK = 3
 	}
 
-	parsed := ParseQuery(query)
+	parsed := ParseQueryContext(query)
 
 	lexW, embW := c.weights(opts)
 
@@ -68,7 +68,7 @@ type matcherResult struct {
 	err    error
 }
 
-func (c *CombinedMatcher) runBothParsed(ctx context.Context, parsed types.ParsedQuery, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, types.FindResult, error) {
+func (c *CombinedMatcher) runBothParsed(ctx context.Context, parsed QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, types.FindResult, error) {
 	internalOpts := types.FindOptions{
 		// Lower threshold allows both strategies to contribute to fusion
 		// before final filtering at the caller's requested threshold.
diff --git a/internal/engine/embedding.go b/internal/engine/embedding.go
index 778d107..e36ebc0 100644
--- a/internal/engine/embedding.go
+++ b/internal/engine/embedding.go
@@ -49,11 +49,12 @@ func (m *EmbeddingMatcher) Strategy() string {
 }
 
 func (m *EmbeddingMatcher) Find(_ context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
-	parsed := ParseQuery(query)
-	return m.findWithParsed(parsed, elements, opts)
+	ctx := ParseQueryContext(query)
+	return m.findWithParsed(ctx, elements, opts)
 }
 
-func (m *EmbeddingMatcher) findWithParsed(parsed types.ParsedQuery, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
+func (m *EmbeddingMatcher) findWithParsed(ctx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
+	parsed := ctx.Base
 	if opts.TopK <= 0 {
 		opts.TopK = 3
 	}
@@ -65,17 +66,52 @@ func (m *EmbeddingMatcher) findWithParsed(parsed types.ParsedQuery, elements []t
 		}, nil
 	}
 
+	filtered := filterContextExcludedElements(elements, ctx)
+	if len(filtered) == 0 {
+		return types.FindResult{Strategy: m.Strategy(), ElementCount: len(elements)}, nil
+	}
+
+	vectors, err := m.embedQueryAndElements(parsed, filtered)
+	if err != nil {
+		return types.FindResult{}, err
+	}
+
+	candidates := m.scoreCandidates(parsed, filtered, vectors, opts.Threshold)
+	sort.Slice(candidates, func(i, j int) bool {
+		scoreDiff := candidates[i].score - candidates[j].score
+		if math.Abs(scoreDiff) > 1e-9 {
+			return scoreDiff > 0
+		}
+		return candidates[i].desc.Ref < candidates[j].desc.Ref
+	})
+
+	if len(candidates) > opts.TopK {
+		candidates = candidates[:opts.TopK]
+	}
+
+	return buildEmbeddingResult(m.Strategy(), len(elements), candidates), nil
+}
+
+func filterContextExcludedElements(elements []types.ElementDescriptor, ctx QueryContext) []types.ElementDescriptor {
+	filtered := make([]types.ElementDescriptor, 0, len(elements))
+	for _, el := range elements {
+		if ctx.HasScope && matchesExcludedContext(el, ctx.Exclude) {
+			continue
+		}
+		filtered = append(filtered, el)
+	}
+	return filtered
+}
+
+func (m *EmbeddingMatcher) embedQueryAndElements(parsed types.ParsedQuery, elements []types.ElementDescriptor) ([][]float32, error) {
 	positiveQuery := strings.Join(parsed.Positive, " ")
 	negativeQuery := strings.Join(parsed.Negative, " ")
-	negativeOnly := len(parsed.Positive) == 0 && len(parsed.Negative) > 0
 
-	// Build composite descriptions.
 	descs := make([]string, len(elements))
 	for i, el := range elements {
 		descs[i] = el.Composite()
 	}
 
-	// Embed positive/negative query components and all descriptions in one batch.
 	texts := make([]string, 0, len(descs)+2)
 	if len(parsed.Positive) > 0 {
 		texts = append(texts, positiveQuery)
@@ -84,11 +120,16 @@ func (m *EmbeddingMatcher) findWithParsed(parsed types.ParsedQuery, elements []t
 		texts = append(texts, negativeQuery)
 	}
 	texts = append(texts, descs...)
-	vectors, err := m.embedder.Embed(texts)
-	if err != nil {
-		return types.FindResult{}, err
-	}
+	return m.embedder.Embed(texts)
+}
+
+type embeddingScored struct {
+	desc  types.ElementDescriptor
+	score float64
+}
 
+func (m *EmbeddingMatcher) scoreCandidates(parsed types.ParsedQuery, elements []types.ElementDescriptor, vectors [][]float32, threshold float64) []embeddingScored {
+	negativeOnly := len(parsed.Positive) == 0 && len(parsed.Negative) > 0
 	idx := 0
 	var posVec []float32
 	if len(parsed.Positive) > 0 {
@@ -108,64 +149,50 @@ func (m *EmbeddingMatcher) findWithParsed(parsed types.ParsedQuery, elements []t
 		contextVecs = m.withNeighborContext(elemVecs)
 	}
 
-	type scored struct {
-		desc  types.ElementDescriptor
-		score float64
-	}
-
-	var candidates []scored
+	var candidates []embeddingScored
 	for i, el := range elements {
-		score := 1.0
-		if len(parsed.Positive) > 0 {
-			score = CosineSimilarity(posVec, contextVecs[i])
-		}
-
-		if len(parsed.Negative) > 0 {
-			// Debug note: negSim is the negative-token similarity used to apply
-			// exclusion/down-weight penalties for this candidate.
-			// Negatives should compare against the element vector itself.
-			negSim := CosineSimilarity(negVec, elemVecs[i])
-			if len(parsed.Positive) == 0 {
-				if negSim > 0.5 {
-					score = 0
-				}
-			} else if negSim > 0.5 {
-				score *= 1 - (negSim * 0.8)
-			}
-		}
-
-		if score < 0 {
-			score = 0
-		}
-		if score > 1 {
-			score = 1
-		}
+		score := scoreEmbeddingCandidate(parsed, posVec, negVec, contextVecs[i], elemVecs[i])
 		if negativeOnly && score == 0 {
 			continue
 		}
-
-		if score >= opts.Threshold {
-			candidates = append(candidates, scored{desc: el, score: score})
+		if score >= threshold {
+			candidates = append(candidates, embeddingScored{desc: el, score: score})
 		}
 	}
+	return candidates
+}
 
-	sort.Slice(candidates, func(i, j int) bool {
-		scoreDiff := candidates[i].score - candidates[j].score
-		if math.Abs(scoreDiff) > 1e-9 {
-			return scoreDiff > 0
+func scoreEmbeddingCandidate(parsed types.ParsedQuery, posVec, negVec, contextVec, elemVec []float32) float64 {
+	score := 1.0
+	if len(parsed.Positive) > 0 {
+		score = CosineSimilarity(posVec, contextVec)
+	}
+
+	if len(parsed.Negative) > 0 {
+		negSim := CosineSimilarity(negVec, elemVec)
+		if len(parsed.Positive) == 0 {
+			if negSim > 0.5 {
+				score = 0
+			}
+		} else if negSim > 0.5 {
+			score *= 1 - (negSim * 0.8)
 		}
-		return candidates[i].desc.Ref < candidates[j].desc.Ref
-	})
+	}
 
-	if len(candidates) > opts.TopK {
-		candidates = candidates[:opts.TopK]
+	if score < 0 {
+		return 0
+	}
+	if score > 1 {
+		return 1
 	}
+	return score
+}
 
+func buildEmbeddingResult(strategy string, elementCount int, candidates []embeddingScored) types.FindResult {
 	result := types.FindResult{
-		Strategy:     m.Strategy(),
-		ElementCount: len(elements),
+		Strategy:     strategy,
+		ElementCount: elementCount,
 	}
-
 	for _, c := range candidates {
 		result.Matches = append(result.Matches, types.ElementMatch{
 			Ref:   c.desc.Ref,
@@ -174,13 +201,11 @@ func (m *EmbeddingMatcher) findWithParsed(parsed types.ParsedQuery, elements []t
 			Name:  c.desc.Name,
 		})
 	}
-
 	if len(result.Matches) > 0 {
 		result.BestRef = result.Matches[0].Ref
 		result.BestScore = result.Matches[0].Score
 	}
-
-	return result, nil
+	return result
 }
 
 func (m *EmbeddingMatcher) withNeighborContext(base [][]float32) [][]float32 {
diff --git a/internal/engine/lexical.go b/internal/engine/lexical.go
index c8348d1..95dd3a2 100644
--- a/internal/engine/lexical.go
+++ b/internal/engine/lexical.go
@@ -48,11 +48,12 @@ func NewLexicalMatcher() *LexicalMatcher {
 func (m *LexicalMatcher) Strategy() string { return "lexical" }
 
 func (m *LexicalMatcher) Find(_ context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
-	parsed := ParseQuery(query)
-	return m.findWithParsed(parsed, elements, opts), nil
+	ctx := ParseQueryContext(query)
+	return m.findWithParsed(ctx, elements, opts), nil
 }
 
-func (m *LexicalMatcher) findWithParsed(parsed types.ParsedQuery, elements []types.ElementDescriptor, opts types.FindOptions) types.FindResult {
+func (m *LexicalMatcher) findWithParsed(ctx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) types.FindResult {
+	parsed := ctx.Base
 	if opts.TopK <= 0 {
 		opts.TopK = 3
 	}
@@ -76,6 +77,10 @@ func (m *LexicalMatcher) findWithParsed(parsed types.ParsedQuery, elements []typ
 
 	var candidates []scored
 	for _, el := range elements {
+		if ctx.HasScope && matchesExcludedContext(el, ctx.Exclude) {
+			continue
+		}
+
 		composite := el.Composite()
 		descTokens := tokenize(composite)
 		score := 0.0
diff --git a/internal/engine/query_context.go b/internal/engine/query_context.go
new file mode 100644
index 0000000..677d16b
--- /dev/null
+++ b/internal/engine/query_context.go
@@ -0,0 +1,168 @@
+package engine
+
+import (
+	"regexp"
+	"strings"
+
+	"github.com/pinchtab/semantic/internal/types"
+)
+
+var negativeContextPattern = regexp.MustCompile(`(?i)\b(not|without|exclude|excluding|except|ignore)\b`)
+
+type QueryContext struct {
+	Base     ParsedQuery
+	Exclude  []string
+	HasScope bool
+}
+
+func ParseQueryContext(raw string) QueryContext {
+	parsed := ParseQuery(raw)
+	cleaned := strings.TrimSpace(raw)
+	if cleaned == "" {
+		return QueryContext{Base: parsed}
+	}
+
+	loc := negativeContextPattern.FindStringIndex(cleaned)
+	if loc == nil {
+		return QueryContext{Base: parsed}
+	}
+
+	baseRaw := strings.TrimSpace(cleaned[:loc[0]])
+	remainder := strings.TrimSpace(cleaned[loc[1]:])
+	if baseRaw == "" || remainder == "" {
+		return QueryContext{Base: parsed}
+	}
+
+	baseParsed := ParseQuery(baseRaw)
+	if len(baseParsed.Positive) == 0 {
+		return QueryContext{Base: parsed}
+	}
+	if len(parsed.Negative) == 0 {
+		return QueryContext{Base: parsed}
+	}
+
+	exclude := normalizeContextPhrase(remainder)
+	if len(exclude) == 0 {
+		return QueryContext{Base: parsed}
+	}
+
+	if !looksLikeContextPhrase(exclude) {
+		return QueryContext{Base: parsed}
+	}
+
+	return QueryContext{
+		Base:     baseParsed,
+		Exclude:  exclude,
+		HasScope: true,
+	}
+}
+
+func normalizeContextPhrase(raw string) []string {
+	words := tokenize(strings.Trim(raw, ",.;:- "))
+	if len(words) == 0 {
+		return nil
+	}
+
+	for len(words) > 0 && contextLeadingFillers[words[0]] {
+		words = words[1:]
+	}
+	for len(words) > 0 && contextTrailingFillers[words[len(words)-1]] {
+		words = words[:len(words)-1]
+	}
+	if len(words) == 0 {
+		return nil
+	}
+	return words
+}
+
+func matchesExcludedContext(el types.ElementDescriptor, excludeTokens []string) bool {
+	if len(excludeTokens) == 0 {
+		return false
+	}
+
+	ctxTokens := tokenize(strings.Join([]string{
+		el.Parent,
+		el.Section,
+		el.Positional.LabelledBy,
+		el.Role,
+		el.Name,
+		el.Value,
+	}, " "))
+	if len(ctxTokens) == 0 {
+		return false
+	}
+
+	ctxSet := tokenSet(ctxTokens)
+	matched := 0
+	meaningful := 0
+	for _, tok := range excludeTokens {
+		if isStopword(tok) || isSemanticStopword(tok) {
+			continue
+		}
+		meaningful++
+		if ctxSet[tok] {
+			matched++
+		}
+	}
+	if meaningful == 0 {
+		return false
+	}
+	if matched == meaningful {
+		return true
+	}
+	if meaningful > 1 && float64(matched)/float64(meaningful) >= 0.7 {
+		return true
+	}
+	return false
+}
+
+var contextLeadingFillers = map[string]bool{
+	"in":     true,
+	"on":     true,
+	"at":     true,
+	"of":     true,
+	"to":     true,
+	"from":   true,
+	"inside": true,
+	"within": true,
+	"the":    true,
+	"a":      true,
+	"an":     true,
+}
+
+var contextTrailingFillers = map[string]bool{
+	"one": true,
+}
+
+var contextHintTokens = map[string]bool{
+	"header":     true,
+	"footer":     true,
+	"sidebar":    true,
+	"nav":        true,
+	"navigation": true,
+	"menu":       true,
+	"toolbar":    true,
+	"dialog":     true,
+	"modal":      true,
+	"form":       true,
+	"panel":      true,
+	"section":    true,
+	"content":    true,
+	"main":       true,
+	"top":        true,
+	"bottom":     true,
+	"left":       true,
+	"right":      true,
+	"sticky":     true,
+	"primary":    true,
+	"secondary":  true,
+}
+
+func looksLikeContextPhrase(tokens []string) bool {
+	for _, tok := range tokens {
+		if contextHintTokens[tok] {
+			return true
+		}
+	}
+	return false
+}
diff --git a/internal/engine/query_context_test.go b/internal/engine/query_context_test.go
new file mode 100644
index 0000000..4314d64
--- /dev/null
+++ b/internal/engine/query_context_test.go
@@ -0,0 +1,164 @@
+package engine
+
+import (
+	"context"
+	"testing"
+
+	"github.com/pinchtab/semantic/internal/types"
+)
+
+func TestParseQueryContext_BasicPatterns(t *testing.T) {
+	tests := []struct {
+		name         string
+		query        string
+		wantPositive []string
+		wantNegative []string
+		wantExclude  []string
+		wantHasScope bool
+	}{
+		{
+			name:         "plain negative tokens",
+			query:        "button not submit",
+			wantPositive: []string{"button"},
+			wantNegative: []string{"submit"},
+		},
+		{
+			name:         "context exclusion in header",
+			query:        "submit button not in header",
+			wantPositive: []string{"submit", "button"},
+			wantExclude:  []string{"header"},
+			wantHasScope: true,
+		},
+		{
+			name:         "context exclusion with filler tail",
+			query:        "login link, not the footer one",
+			wantPositive: []string{"login", "link"},
+			wantExclude:  []string{"footer"},
+			wantHasScope: true,
+		},
+		{
+			name:         "excluding sidebar",
+			query:        "search box excluding sidebar",
+			wantPositive: []string{"search", "box"},
+			wantExclude:  []string{"sidebar"},
+			wantHasScope: true,
+		},
+		{
+			name:         "leading not stays literal",
+			query:        "not now button",
+			wantPositive: []string{"not", "now", "button"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := ParseQueryContext(tt.query)
+			assertTokens(t, got.Base.Positive, tt.wantPositive, "positive")
+			assertTokens(t, got.Base.Negative, tt.wantNegative, "negative")
+			assertTokens(t, got.Exclude, tt.wantExclude, "exclude")
+			if got.HasScope != tt.wantHasScope {
+				t.Fatalf("HasScope mismatch: got=%v want=%v", got.HasScope, tt.wantHasScope)
+			}
+		})
+	}
+}
+
+func TestMatchesExcludedContext(t *testing.T) {
+	el := types.ElementDescriptor{
+		Ref:     "e1",
+		Role:    "button",
+		Name:    "Submit",
+		Parent:  "Account Header",
+		Section: "Top Header Actions",
+		Positional: types.PositionalHints{
+			LabelledBy: "Header controls",
+		},
+	}
+
+	if !matchesExcludedContext(el, []string{"header"}) {
+		t.Fatalf("expected header exclusion to match")
+	}
+	if !matchesExcludedContext(el, []string{"top", "header"}) {
+		t.Fatalf("expected multi-token exclusion to match")
+	}
+	if matchesExcludedContext(el, []string{"sidebar"}) {
+		t.Fatalf("did not expect unrelated exclusion to match")
+	}
+}
+
+func TestNegativeContextAcrossMatchers(t *testing.T) {
+	elements := []types.ElementDescriptor{
+		{Ref: "header-submit", Role: "button", Name: "Submit", Section: "Header"},
+		{Ref: "main-submit", Role: "button", Name: "Submit", Section: "Checkout content"},
+		{Ref: "footer-submit", Role: "button", Name: "Submit", Section: "Footer"},
+	}
+
+	queries := []string{
+		"submit button not in header",
+		"submit button except footer",
+	}
+
+	matchers := []types.ElementMatcher{
+		NewLexicalMatcher(),
+		NewEmbeddingMatcher(NewHashingEmbedder(128)),
+		NewCombinedMatcher(NewHashingEmbedder(128)),
+	}
+
+	for _, matcher := range matchers {
+		for _, query := range queries {
+			res, err := matcher.Find(context.Background(), query, elements, types.FindOptions{Threshold: 0, TopK: 3})
+			if err != nil {
+				t.Fatalf("%s Find failed for %q: %v", matcher.Strategy(), query, err)
+			}
+			for _, match := range res.Matches {
+				if query == "submit button not in header" && match.Ref == "header-submit" {
+					t.Fatalf("%s should exclude header match for %q", matcher.Strategy(), query)
+				}
+				if query == "submit button except footer" && match.Ref == "footer-submit" {
+					t.Fatalf("%s should exclude footer match for %q", matcher.Strategy(), query)
+				}
+			}
+		}
+	}
+}
+
+func TestDuplicateRegionDisambiguation(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "login-header", Role: "link", Name: "Log in", Section: "Header"},
+		{Ref: "login-footer", Role: "link", Name: "Log in", Section: "Footer"},
+		{Ref: "login-sidebar", Role: "link", Name: "Log in", Section: "Sticky Sidebar Quick Actions"},
+	}
+
+	res, err := m.Find(context.Background(), "login link, not the footer one", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	for _, match := range res.Matches {
+		if match.Ref == "login-footer" {
+			t.Fatalf("expected footer variant to be excluded")
+		}
+	}
+
+	res2, err := m.Find(context.Background(), "login link except sticky sidebar quick actions", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	for _, match := range res2.Matches {
+		if match.Ref == "login-sidebar" {
+			t.Fatalf("expected sidebar variant to be excluded")
+		}
+	}
+}
+
+func assertTokens(t *testing.T, got, want []string, label string) {
+	t.Helper()
+	if len(got) != len(want) {
+		t.Fatalf("%s length mismatch: got=%v want=%v", label, got, want)
+	}
+	for i := range got {
+		if got[i] != want[i] {
+			t.Fatalf("%s mismatch: got=%v want=%v", label, got, want)
+		}
+	}
+}

From f885acb9d3e5b4a1034bb8d1bbb140064fd73757 Mon Sep 17 00:00:00 2001
From: Bosch <basch@giagolab.com>
Date: Wed, 22 Apr 2026 18:19:09 +0100
Subject: [PATCH 04/30] feat: add composable ordinal query support

---
 README.md                             |  12 +-
 docs/reference/cli.md                 |   9 ++
 internal/engine/combined.go           |  21 ++-
 internal/engine/query_context.go      |  27 ++--
 internal/engine/query_ordinal.go      | 184 ++++++++++++++++++++++++++
 internal/engine/query_ordinal_test.go | 177 +++++++++++++++++++++++++
 6 files changed, 415 insertions(+), 15 deletions(-)
 create mode 100644 internal/engine/query_ordinal.go
 create mode 100644 internal/engine/query_ordinal_test.go

diff --git a/README.md b/README.md
index 86a2449..d638ced 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ result, err := matcher.Find(ctx, "log in button", elements, semantic.FindOptions
 // result.BestScore = 0.82
 ```
 
-## Negative Queries
+## Negative and Ordinal Queries
 
 Queries can include exclusion intent using:
 `not`, `without`, `exclude`, `excluding`, `except`, `no`, `ignore`.
@@ -80,12 +80,22 @@ semantic find "link without logout" --snapshot page.json
 semantic find "textbox not email" --snapshot page.json --strategy combined
 ```
 
+Ordinal queries are also supported for position-based selection:
+
+```text
+second button
+third menu item
+last input field
+```
+
 Behavior:
 
 - Positive tokens contribute to base match score.
 - Negative tokens apply penalty when they match an element.
 - Strong negative hits can fully exclude an element from results.
 - Negative matching is synonym-aware (for example, `not login` can penalize `Sign In`).
+- Ordinals select from the final matching candidates in document order.
+- Ordinals compose with context exclusion, for example `second button not in header`.
 
 ## Package Layout
 
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
index 33055c6..bb1ec77 100644
--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -33,6 +33,15 @@ semantic find "login" --snapshot page.json --format json
 
 # Just refs (for piping)
 semantic find "submit" --snapshot page.json --format refs
+
+# Exclude contexts for duplicate labels
+semantic find "submit button not in header" --snapshot page.json
+semantic find "login link, not the footer one" --snapshot page.json
+
+# Select by ordinal position
+semantic find "second button" --snapshot page.json
+semantic find "last input field" --snapshot page.json
+semantic find "second button not in header" --snapshot page.json
 ```
 
 ### `semantic match`
diff --git a/internal/engine/combined.go b/internal/engine/combined.go
index be538e1..a95099d 100644
--- a/internal/engine/combined.go
+++ b/internal/engine/combined.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"github.com/pinchtab/semantic/internal/types"
+	"math"
 	"sort"
 )
 
@@ -45,6 +46,10 @@ func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []typ
 	}
 
 	parsed := ParseQueryContext(query)
+	mergeOpts := opts
+	if parsed.Ordinal.HasOrdinal {
+		mergeOpts.TopK = len(elements)
+	}
 
 	lexW, embW := c.weights(opts)
 
@@ -53,7 +58,8 @@ func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []typ
 		return types.FindResult{}, err
 	}
 
-	return c.mergeResults(lexResult, embResult, elements, opts, lexW, embW), nil
+	merged := c.mergeResults(lexResult, embResult, elements, mergeOpts, lexW, embW)
+	return selectOrdinalMatchInOrder(merged, parsed.Ordinal, elements), nil
 }
 
 func (c *CombinedMatcher) weights(opts types.FindOptions) (float64, float64) {
@@ -156,7 +162,18 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 	}
 
 	sort.Slice(candidates, func(i, j int) bool {
-		return candidates[i].score > candidates[j].score
+		scoreDiff := candidates[i].score - candidates[j].score
+		if math.Abs(scoreDiff) > 1e-9 {
+			return scoreDiff > 0
+		}
+
+		idxI := candidates[i].el.Positional.SiblingIndex
+		idxJ := candidates[j].el.Positional.SiblingIndex
+		if idxI != idxJ {
+			return idxI < idxJ
+		}
+
+		return candidates[i].ref < candidates[j].ref
 	})
 	if len(candidates) > opts.TopK {
 		candidates = candidates[:opts.TopK]
diff --git a/internal/engine/query_context.go b/internal/engine/query_context.go
index 677d16b..6d0fc24 100644
--- a/internal/engine/query_context.go
+++ b/internal/engine/query_context.go
@@ -13,47 +13,50 @@ type QueryContext struct {
 	Base     ParsedQuery
 	Exclude  []string
 	HasScope bool
+	Ordinal  OrdinalConstraint
 }
 
 func ParseQueryContext(raw string) QueryContext {
-	parsed := ParseQuery(raw)
-	cleaned := strings.TrimSpace(raw)
+	ordinal, baseRaw := parseOrdinalConstraint(raw)
+	parsed := ParseQuery(baseRaw)
+	cleaned := strings.TrimSpace(baseRaw)
 	if cleaned == "" {
-		return QueryContext{Base: parsed}
+		return QueryContext{Base: parsed, Ordinal: ordinal}
 	}
 
 	loc := negativeContextPattern.FindStringIndex(cleaned)
 	if loc == nil {
-		return QueryContext{Base: parsed}
+		return QueryContext{Base: parsed, Ordinal: ordinal}
 	}
 
-	baseRaw := strings.TrimSpace(cleaned[:loc[0]])
+	contextBaseRaw := strings.TrimSpace(cleaned[:loc[0]])
 	remainder := strings.TrimSpace(cleaned[loc[1]:])
-	if baseRaw == "" || remainder == "" {
-		return QueryContext{Base: parsed}
+	if contextBaseRaw == "" || remainder == "" {
+		return QueryContext{Base: parsed, Ordinal: ordinal}
 	}
 
-	baseParsed := ParseQuery(baseRaw)
+	baseParsed := ParseQuery(contextBaseRaw)
 	if len(baseParsed.Positive) == 0 {
-		return QueryContext{Base: parsed}
+		return QueryContext{Base: parsed, Ordinal: ordinal}
 	}
 	if len(parsed.Negative) == 0 {
-		return QueryContext{Base: parsed}
+		return QueryContext{Base: parsed, Ordinal: ordinal}
 	}
 
 	exclude := normalizeContextPhrase(remainder)
 	if len(exclude) == 0 {
-		return QueryContext{Base: parsed}
+		return QueryContext{Base: parsed, Ordinal: ordinal}
 	}
 
 	if !looksLikeContextPhrase(exclude) {
-		return QueryContext{Base: parsed}
+		return QueryContext{Base: parsed, Ordinal: ordinal}
 	}
 
 	return QueryContext{
 		Base:     baseParsed,
 		Exclude:  exclude,
 		HasScope: true,
+		Ordinal:  ordinal,
 	}
 }
 
diff --git a/internal/engine/query_ordinal.go b/internal/engine/query_ordinal.go
new file mode 100644
index 0000000..e3681ce
--- /dev/null
+++ b/internal/engine/query_ordinal.go
@@ -0,0 +1,184 @@
+package engine
+
+import (
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/pinchtab/semantic/internal/types"
+)
+
+type OrdinalConstraint struct {
+	HasOrdinal bool
+	Last       bool
+	Position   int
+}
+
+var numericOrdinalPattern = regexp.MustCompile(`^(\d+)(st|nd|rd|th)$`)
+
+var ordinalWords = map[string]int{
+	"first":   1,
+	"second":  2,
+	"third":   3,
+	"fourth":  4,
+	"fifth":   5,
+	"sixth":   6,
+	"seventh": 7,
+	"eighth":  8,
+	"ninth":   9,
+	"tenth":   10,
+}
+
+var ordinalTargetWords = map[string]bool{
+	"button":    true,
+	"link":      true,
+	"input":     true,
+	"field":     true,
+	"textbox":   true,
+	"searchbox": true,
+	"item":      true,
+	"menu":      true,
+	"option":    true,
+	"tab":       true,
+	"result":    true,
+	"row":       true,
+	"column":    true,
+	"card":      true,
+	"entry":     true,
+	"element":   true,
+}
+
+func parseNumericOrdinal(token string) (int, bool) {
+	m := numericOrdinalPattern.FindStringSubmatch(token)
+	if len(m) != 3 {
+		return 0, false
+	}
+	n, err := strconv.Atoi(m[1])
+	if err != nil || n <= 0 {
+		return 0, false
+	}
+	return n, true
+}
+
+func normalizeQueryToken(token string) string {
+	return strings.Trim(strings.ToLower(token), ",.;:-")
+}
+
+func containsOrdinalTarget(words []string, ordIdx int) bool {
+	for i, w := range words {
+		if i == ordIdx {
+			continue
+		}
+		if ordinalTargetWords[normalizeQueryToken(w)] {
+			return true
+		}
+	}
+	return false
+}
+
+func parseOrdinalConstraint(query string) (OrdinalConstraint, string) {
+	cleaned := strings.TrimSpace(query)
+	if cleaned == "" {
+		return OrdinalConstraint{}, cleaned
+	}
+
+	words := strings.Fields(cleaned)
+	if len(words) == 0 {
+		return OrdinalConstraint{}, cleaned
+	}
+
+	ordIdx := -1
+	ordPos := 0
+	ordLast := false
+
+	for i, w := range words {
+		norm := normalizeQueryToken(w)
+		if norm == "" {
+			continue
+		}
+		if norm == "last" || norm == "final" {
+			ordIdx = i
+			ordLast = true
+			break
+		}
+		if pos, ok := ordinalWords[norm]; ok {
+			ordIdx = i
+			ordPos = pos
+			break
+		}
+		if pos, ok := parseNumericOrdinal(norm); ok {
+			ordIdx = i
+			ordPos = pos
+			break
+		}
+	}
+
+	if ordIdx == -1 || !containsOrdinalTarget(words, ordIdx) {
+		return OrdinalConstraint{}, cleaned
+	}
+
+	filtered := make([]string, 0, len(words)-1)
+	for i, w := range words {
+		if i == ordIdx {
+			continue
+		}
+		filtered = append(filtered, w)
+	}
+
+	base := strings.Trim(strings.TrimSpace(strings.Join(filtered, " ")), ",.;:-")
+	if base == "" {
+		base = cleaned
+	}
+
+	return OrdinalConstraint{
+		HasOrdinal: true,
+		Last:       ordLast,
+		Position:   ordPos,
+	}, base
+}
+
+func selectOrdinalMatchInOrder(result types.FindResult, constraint OrdinalConstraint, elements []types.ElementDescriptor) types.FindResult {
+	if !constraint.HasOrdinal || len(result.Matches) == 0 {
+		return result
+	}
+
+	refOrder := make(map[string]int, len(elements))
+	for idx, el := range elements {
+		refOrder[el.Ref] = idx
+	}
+
+	ordered := make([]types.ElementMatch, len(result.Matches))
+	copy(ordered, result.Matches)
+	sort.SliceStable(ordered, func(i, j int) bool {
+		idxI, okI := refOrder[ordered[i].Ref]
+		idxJ, okJ := refOrder[ordered[j].Ref]
+		if okI && okJ {
+			return idxI < idxJ
+		}
+		if okI != okJ {
+			return okI
+		}
+		return ordered[i].Ref < ordered[j].Ref
+	})
+
+	idx := -1
+	if constraint.Last {
+		idx = len(ordered) - 1
+	} else if constraint.Position > 0 {
+		idx = constraint.Position - 1
+	}
+
+	if idx < 0 || idx >= len(ordered) {
+		result.Matches = nil
+		result.BestRef = ""
+		result.BestScore = 0
+		return result
+	}
+
+	chosen := ordered[idx]
+	result.Matches = []types.ElementMatch{chosen}
+	result.BestRef = chosen.Ref
+	result.BestScore = chosen.Score
+	return result
+}
diff --git a/internal/engine/query_ordinal_test.go b/internal/engine/query_ordinal_test.go
new file mode 100644
index 0000000..0580b7f
--- /dev/null
+++ b/internal/engine/query_ordinal_test.go
@@ -0,0 +1,177 @@
+package engine
+
+import (
+	"context"
+	"testing"
+
+	"github.com/pinchtab/semantic/internal/types"
+)
+
+func TestParseOrdinalConstraint_BasicPatterns(t *testing.T) {
+	tests := []struct {
+		name       string
+		query      string
+		wantBase   string
+		wantHasOrd bool
+		wantPos    int
+		wantIsLast bool
+	}{
+		{
+			name:       "second button",
+			query:      "second button",
+			wantBase:   "button",
+			wantHasOrd: true,
+			wantPos:    2,
+		},
+		{
+			name:       "numeric ordinal",
+			query:      "3rd menu item",
+			wantBase:   "menu item",
+			wantHasOrd: true,
+			wantPos:    3,
+		},
+		{
+			name:       "last input field",
+			query:      "last input field",
+			wantBase:   "input field",
+			wantHasOrd: true,
+			wantIsLast: true,
+		},
+		{
+			name:       "non ordinal content query",
+			query:      "first name",
+			wantBase:   "first name",
+			wantHasOrd: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, base := parseOrdinalConstraint(tt.query)
+			if base != tt.wantBase {
+				t.Fatalf("base query mismatch: want=%q got=%q", tt.wantBase, base)
+			}
+			if got.HasOrdinal != tt.wantHasOrd {
+				t.Fatalf("HasOrdinal mismatch: want=%v got=%v", tt.wantHasOrd, got.HasOrdinal)
+			}
+			if got.Position != tt.wantPos {
+				t.Fatalf("position mismatch: want=%d got=%d", tt.wantPos, got.Position)
+			}
+			if got.Last != tt.wantIsLast {
+				t.Fatalf("last mismatch: want=%v got=%v", tt.wantIsLast, got.Last)
+			}
+		})
+	}
+}
+
+func TestParseQueryContext_WithOrdinalAndNegativeScope(t *testing.T) {
+	ctx := ParseQueryContext("second button not in header")
+	if !ctx.Ordinal.HasOrdinal || ctx.Ordinal.Position != 2 {
+		t.Fatalf("expected second ordinal, got %+v", ctx.Ordinal)
+	}
+	assertTokens(t, ctx.Base.Positive, []string{"button"}, "positive")
+	assertTokens(t, ctx.Base.Negative, []string{}, "negative")
+	assertTokens(t, ctx.Exclude, []string{"header"}, "exclude")
+	if !ctx.HasScope {
+		t.Fatalf("expected scope exclusion to be detected")
+	}
+}
+
+func TestCombinedMatcher_OrdinalQuery_SecondButton(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "btn-1", Role: "button", Name: "Action", Positional: types.PositionalHints{SiblingIndex: 0}},
+		{Ref: "btn-2", Role: "button", Name: "Action", Positional: types.PositionalHints{SiblingIndex: 1}},
+		{Ref: "btn-3", Role: "button", Name: "Action", Positional: types.PositionalHints{SiblingIndex: 2}},
+	}
+
+	res, err := m.Find(context.Background(), "second button", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if len(res.Matches) != 1 {
+		t.Fatalf("expected one ordinal-selected match, got %d", len(res.Matches))
+	}
+	if res.BestRef != "btn-2" {
+		t.Fatalf("expected second button btn-2, got %s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_OrdinalQuery_LastInputField(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "input-1", Role: "textbox", Name: "Email", Positional: types.PositionalHints{SiblingIndex: 0}},
+		{Ref: "input-2", Role: "textbox", Name: "Email", Positional: types.PositionalHints{SiblingIndex: 1}},
+		{Ref: "input-3", Role: "textbox", Name: "Email", Positional: types.PositionalHints{SiblingIndex: 2}},
+	}
+
+	res, err := m.Find(context.Background(), "last input field", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if len(res.Matches) != 1 {
+		t.Fatalf("expected one ordinal-selected match, got %d", len(res.Matches))
+	}
+	if res.BestRef != "input-3" {
+		t.Fatalf("expected last input field input-3, got %s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_OrdinalQuery_OutOfRangeReturnsNoMatch(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "b1", Role: "button", Name: "Continue", Positional: types.PositionalHints{SiblingIndex: 0}},
+		{Ref: "b2", Role: "button", Name: "Continue", Positional: types.PositionalHints{SiblingIndex: 1}},
+		{Ref: "b3", Role: "button", Name: "Continue", Positional: types.PositionalHints{SiblingIndex: 2}},
+	}
+
+	res, err := m.Find(context.Background(), "fifth button", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if len(res.Matches) != 0 {
+		t.Fatalf("expected no matches for out-of-range ordinal, got %d", len(res.Matches))
+	}
+	if res.BestRef != "" || res.BestScore != 0 {
+		t.Fatalf("expected empty best match for out-of-range ordinal, got ref=%q score=%f", res.BestRef, res.BestScore)
+	}
+}
+
+func TestCombinedMatcher_OrdinalGuard_DoesNotTreatFirstNameAsOrdinal(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "first-name", Role: "textbox", Name: "First Name", Positional: types.PositionalHints{SiblingIndex: 3}},
+		{Ref: "last-name", Role: "textbox", Name: "Last Name", Positional: types.PositionalHints{SiblingIndex: 0}},
+	}
+
+	res, err := m.Find(context.Background(), "first name", elements, types.FindOptions{Threshold: 0, TopK: 2})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if len(res.Matches) == 0 {
+		t.Fatalf("expected at least one match")
+	}
+	if res.BestRef != "first-name" {
+		t.Fatalf("expected semantic match for 'first name', got %s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_OrdinalWithContextExclusion(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "header-btn", Role: "button", Name: "Submit", Section: "Header", Positional: types.PositionalHints{SiblingIndex: 0}},
+		{Ref: "main-btn-1", Role: "button", Name: "Submit", Section: "Main", Positional: types.PositionalHints{SiblingIndex: 1}},
+		{Ref: "main-btn-2", Role: "button", Name: "Submit", Section: "Main", Positional: types.PositionalHints{SiblingIndex: 2}},
+	}
+
+	res, err := m.Find(context.Background(), "second button not in header", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if len(res.Matches) != 1 {
+		t.Fatalf("expected one ordinal-selected match, got %d", len(res.Matches))
+	}
+	if res.BestRef != "main-btn-2" {
+		t.Fatalf("expected second non-header button, got %s", res.BestRef)
+	}
+}

From a6e6a8ed6be9146785057ab1f2a1fae2cda007e7 Mon Sep 17 00:00:00 2001
From: Bosch <basch@giagolab.com>
Date: Thu, 23 Apr 2026 10:57:17 +0100
Subject: [PATCH 05/30] test: harden ordinal query selection

---
 internal/engine/query_ordinal.go              | 38 +++++++++++++++-
 internal/engine/query_ordinal_test.go         | 33 ++++++++++++++
 .../corpus/ordinal-context/queries.json       | 44 +++++++++++++++++++
 .../corpus/ordinal-context/snapshot.json      | 13 ++++++
 tests/e2e/cases/14-find-ordinal.sh            | 22 ++++++++++
 5 files changed, 148 insertions(+), 2 deletions(-)
 create mode 100644 tests/benchmark/corpus/ordinal-context/queries.json
 create mode 100644 tests/benchmark/corpus/ordinal-context/snapshot.json
 create mode 100755 tests/e2e/cases/14-find-ordinal.sh

diff --git a/internal/engine/query_ordinal.go b/internal/engine/query_ordinal.go
index e3681ce..dacb349 100644
--- a/internal/engine/query_ordinal.go
+++ b/internal/engine/query_ordinal.go
@@ -143,13 +143,21 @@ func selectOrdinalMatchInOrder(result types.FindResult, constraint OrdinalConstr
 		return result
 	}
 
+	filtered := filterOrdinalCandidates(result.Matches)
+	if len(filtered) == 0 {
+		result.Matches = nil
+		result.BestRef = ""
+		result.BestScore = 0
+		return result
+	}
+
 	refOrder := make(map[string]int, len(elements))
 	for idx, el := range elements {
 		refOrder[el.Ref] = idx
 	}
 
-	ordered := make([]types.ElementMatch, len(result.Matches))
-	copy(ordered, result.Matches)
+	ordered := make([]types.ElementMatch, len(filtered))
+	copy(ordered, filtered)
 	sort.SliceStable(ordered, func(i, j int) bool {
 		idxI, okI := refOrder[ordered[i].Ref]
 		idxJ, okJ := refOrder[ordered[j].Ref]
@@ -182,3 +190,29 @@ func selectOrdinalMatchInOrder(result types.FindResult, constraint OrdinalConstr
 	result.BestScore = chosen.Score
 	return result
 }
+
+func filterOrdinalCandidates(matches []types.ElementMatch) []types.ElementMatch {
+	if len(matches) == 0 {
+		return nil
+	}
+
+	bestScore := matches[0].Score
+	for _, match := range matches[1:] {
+		if match.Score > bestScore {
+			bestScore = match.Score
+		}
+	}
+
+	floor := bestScore * 0.75
+	if floor < 0.2 {
+		floor = 0.2
+	}
+
+	filtered := make([]types.ElementMatch, 0, len(matches))
+	for _, match := range matches {
+		if match.Score >= floor {
+			filtered = append(filtered, match)
+		}
+	}
+	return filtered
+}
diff --git a/internal/engine/query_ordinal_test.go b/internal/engine/query_ordinal_test.go
index 0580b7f..01cef49 100644
--- a/internal/engine/query_ordinal_test.go
+++ b/internal/engine/query_ordinal_test.go
@@ -175,3 +175,36 @@ func TestCombinedMatcher_OrdinalWithContextExclusion(t *testing.T) {
 		t.Fatalf("expected second non-header button, got %s", res.BestRef)
 	}
 }
+
+func TestFilterOrdinalCandidates_DropsWeakSemanticTail(t *testing.T) {
+	matches := []types.ElementMatch{
+		{Ref: "e1", Score: 0.92},
+		{Ref: "e2", Score: 0.81},
+		{Ref: "e3", Score: 0.18},
+	}
+
+	filtered := filterOrdinalCandidates(matches)
+	if len(filtered) != 2 {
+		t.Fatalf("expected 2 strong candidates, got %d", len(filtered))
+	}
+	if filtered[0].Ref != "e1" || filtered[1].Ref != "e2" {
+		t.Fatalf("unexpected filtered refs: %+v", filtered)
+	}
+}
+
+func TestCombinedMatcher_OrdinalQuery_IgnoresWeakNonButtonTail(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "btn-1", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 0}},
+		{Ref: "btn-2", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 1}},
+		{Ref: "note", Role: "note", Name: "Submission tips", Positional: types.PositionalHints{SiblingIndex: 2}},
+	}
+
+	res, err := m.Find(context.Background(), "second submit button", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if res.BestRef != "btn-2" {
+		t.Fatalf("expected second submit button btn-2, got %s", res.BestRef)
+	}
+}
diff --git a/tests/benchmark/corpus/ordinal-context/queries.json b/tests/benchmark/corpus/ordinal-context/queries.json
new file mode 100644
index 0000000..a8b4f3c
--- /dev/null
+++ b/tests/benchmark/corpus/ordinal-context/queries.json
@@ -0,0 +1,44 @@
+[
+  {
+    "id": "ordinal-001",
+    "query": "second submit button",
+    "relevant_refs": ["e2"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["ordinal", "button", "duplicate-labels"]
+  },
+  {
+    "id": "ordinal-002",
+    "query": "last submit button",
+    "relevant_refs": ["e4"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["ordinal", "button", "duplicate-labels"]
+  },
+  {
+    "id": "ordinal-003",
+    "query": "second submit button not in header",
+    "relevant_refs": ["e3"],
+    "partially_relevant_refs": ["e4"],
+    "difficulty": "hard",
+    "tags": ["ordinal", "context-exclusion", "button", "duplicate-labels"],
+    "notes": "Header submit should be excluded before ordinal selection"
+  },
+  {
+    "id": "ordinal-004",
+    "query": "last login link except footer",
+    "relevant_refs": ["e6"],
+    "partially_relevant_refs": ["e5"],
+    "difficulty": "hard",
+    "tags": ["ordinal", "context-exclusion", "link", "duplicate-labels"]
+  },
+  {
+    "id": "ordinal-005",
+    "query": "first name",
+    "relevant_refs": [],
+    "partially_relevant_refs": ["e8", "e9", "e10"],
+    "difficulty": "medium",
+    "tags": ["guard", "literal-text", "textbox"],
+    "notes": "Guard case, should not trigger ordinal parsing just because of the word first"
+  }
+]
diff --git a/tests/benchmark/corpus/ordinal-context/snapshot.json b/tests/benchmark/corpus/ordinal-context/snapshot.json
new file mode 100644
index 0000000..ceb60df
--- /dev/null
+++ b/tests/benchmark/corpus/ordinal-context/snapshot.json
@@ -0,0 +1,13 @@
+[
+  {"ref": "e0", "role": "heading", "name": "Checkout", "interactive": false, "section": "Header"},
+  {"ref": "e1", "role": "button", "name": "Submit", "interactive": true, "section": "Header", "parent": "Header actions", "positional": {"siblingIndex": 0}},
+  {"ref": "e2", "role": "button", "name": "Submit", "interactive": true, "section": "Login", "parent": "Login form", "positional": {"siblingIndex": 1}},
+  {"ref": "e3", "role": "button", "name": "Submit", "interactive": true, "section": "Payment", "parent": "Payment form", "positional": {"siblingIndex": 2}},
+  {"ref": "e4", "role": "button", "name": "Submit", "interactive": true, "section": "Footer", "parent": "Footer actions", "positional": {"siblingIndex": 3}},
+  {"ref": "e5", "role": "link", "name": "Log in", "interactive": true, "section": "Header", "parent": "Header nav", "positional": {"siblingIndex": 4}},
+  {"ref": "e6", "role": "link", "name": "Log in", "interactive": true, "section": "Sidebar", "parent": "Quick actions", "positional": {"siblingIndex": 5}},
+  {"ref": "e7", "role": "link", "name": "Log in", "interactive": true, "section": "Footer", "parent": "Footer nav", "positional": {"siblingIndex": 6}},
+  {"ref": "e8", "role": "textbox", "name": "Email", "interactive": true, "section": "Billing", "parent": "Billing form", "positional": {"siblingIndex": 7}},
+  {"ref": "e9", "role": "textbox", "name": "Email", "interactive": true, "section": "Shipping", "parent": "Shipping form", "positional": {"siblingIndex": 8}},
+  {"ref": "e10", "role": "textbox", "name": "Email", "interactive": true, "section": "Profile", "parent": "Profile form", "positional": {"siblingIndex": 9}}
+]
diff --git a/tests/e2e/cases/14-find-ordinal.sh b/tests/e2e/cases/14-find-ordinal.sh
new file mode 100755
index 0000000..05b3e60
--- /dev/null
+++ b/tests/e2e/cases/14-find-ordinal.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+CASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${CASE_DIR}/../lib.sh"
+
+echo "  -- Find: Ordinal Queries --"
+
+MULTI="${ASSETS_DIR}/snapshots/multi-form.json"
+
+result=$(./semantic find "second submit button" --snapshot "$MULTI" --format json)
+assert_json_field "$result" ".best_ref" "e7" "ordinal: second submit button → e7"
+
+result=$(./semantic find "last submit button" --snapshot "$MULTI" --format json)
+assert_json_field "$result" ".best_ref" "e11" "ordinal: last submit button → e11"
+
+result=$(./semantic find "second submit button not in header" --snapshot "$MULTI" --format json)
+assert_json_field "$result" ".best_ref" "e7" "ordinal+context: second submit button not in header → e7"
+
+LOGIN="${ASSETS_DIR}/snapshots/login-page.json"
+result=$(./semantic find "email address" --snapshot "$LOGIN" --format json)
+assert_json_field "$result" ".best_ref" "e1" "guard: literal query still resolves email address → e1"
+
+summary "find-ordinal"

From ec91f060b05e965c671394d7e7311e81fd3297ca Mon Sep 17 00:00:00 2001
From: Bosch <basch@giagolab.com>
Date: Thu, 23 Apr 2026 11:11:30 +0100
Subject: [PATCH 06/30] fix: preserve document order for ordinal queries

---
 cmd/semantic/main.go               |  1 +
 internal/engine/combined.go        | 18 +++++++++++++++---
 internal/engine/query_ordinal.go   |  8 ++++++--
 internal/types/types.go            |  1 +
 tests/e2e/cases/14-find-ordinal.sh | 10 +++++-----
 5 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/cmd/semantic/main.go b/cmd/semantic/main.go
index ae8bc89..7c96773 100644
--- a/cmd/semantic/main.go
+++ b/cmd/semantic/main.go
@@ -135,6 +135,7 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) {
 			Interactive: e.Interactive,
 			Parent:      e.Parent,
 			Section:     e.Section,
+			DocumentIdx: i,
 			Positional: semantic.PositionalHints{
 				Depth:        depth,
 				SiblingIndex: siblingIdx,
diff --git a/internal/engine/combined.go b/internal/engine/combined.go
index a95099d..9bc6e99 100644
--- a/internal/engine/combined.go
+++ b/internal/engine/combined.go
@@ -47,13 +47,15 @@ func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []typ
 
 	parsed := ParseQueryContext(query)
 	mergeOpts := opts
+	internalOpts := opts
 	if parsed.Ordinal.HasOrdinal {
 		mergeOpts.TopK = len(elements)
+		internalOpts.TopK = len(elements)
 	}
 
 	lexW, embW := c.weights(opts)
 
-	lexResult, embResult, err := c.runBothParsed(ctx, parsed, elements, opts)
+	lexResult, embResult, err := c.runBothParsed(ctx, parsed, elements, internalOpts)
 	if err != nil {
 		return types.FindResult{}, err
 	}
@@ -167,8 +169,8 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 			return scoreDiff > 0
 		}
 
-		idxI := candidates[i].el.Positional.SiblingIndex
-		idxJ := candidates[j].el.Positional.SiblingIndex
+		idxI := documentOrderIndex(candidates[i].el, i)
+		idxJ := documentOrderIndex(candidates[j].el, j)
 		if idxI != idxJ {
 			return idxI < idxJ
 		}
@@ -206,6 +208,16 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 	return result
 }
 
+func documentOrderIndex(el types.ElementDescriptor, fallback int) int {
+	if el.DocumentIdx > 0 || (el.DocumentIdx == 0 && el.Ref != "") {
+		return el.DocumentIdx
+	}
+	if el.Positional.SiblingIndex > 0 || (el.Positional.SiblingIndex == 0 && el.Ref != "") {
+		return el.Positional.SiblingIndex
+	}
+	return fallback
+}
+
 func scoreMap(matches []types.ElementMatch) map[string]float64 {
 	m := make(map[string]float64, len(matches))
 	for _, match := range matches {
diff --git a/internal/engine/query_ordinal.go b/internal/engine/query_ordinal.go
index dacb349..c9645e3 100644
--- a/internal/engine/query_ordinal.go
+++ b/internal/engine/query_ordinal.go
@@ -153,7 +153,11 @@ func selectOrdinalMatchInOrder(result types.FindResult, constraint OrdinalConstr
 
 	refOrder := make(map[string]int, len(elements))
 	for idx, el := range elements {
-		refOrder[el.Ref] = idx
+		order := el.DocumentIdx
+		if order < 0 {
+			order = idx
+		}
+		refOrder[el.Ref] = order
 	}
 
 	ordered := make([]types.ElementMatch, len(filtered))
@@ -203,7 +207,7 @@ func filterOrdinalCandidates(matches []types.ElementMatch) []types.ElementMatch
 		}
 	}
 
-	floor := bestScore * 0.75
+	floor := bestScore - 0.15
 	if floor < 0.2 {
 		floor = 0.2
 	}
diff --git a/internal/types/types.go b/internal/types/types.go
index 6acde4a..1b58eb4 100644
--- a/internal/types/types.go
+++ b/internal/types/types.go
@@ -100,6 +100,7 @@ type ElementDescriptor struct {
 	Interactive bool
 	Parent      string
 	Section     string
+	DocumentIdx int
 	Positional  PositionalHints
 }
 
diff --git a/tests/e2e/cases/14-find-ordinal.sh b/tests/e2e/cases/14-find-ordinal.sh
index 05b3e60..c0f1819 100755
--- a/tests/e2e/cases/14-find-ordinal.sh
+++ b/tests/e2e/cases/14-find-ordinal.sh
@@ -6,17 +6,17 @@ echo "  -- Find: Ordinal Queries --"
 
 MULTI="${ASSETS_DIR}/snapshots/multi-form.json"
 
-result=$(./semantic find "second submit button" --snapshot "$MULTI" --format json)
+result=$(semantic find "second submit button" --snapshot "$MULTI" --format json)
 assert_json_field "$result" ".best_ref" "e7" "ordinal: second submit button → e7"
 
-result=$(./semantic find "last submit button" --snapshot "$MULTI" --format json)
+result=$(semantic find "last submit button" --snapshot "$MULTI" --format json)
 assert_json_field "$result" ".best_ref" "e11" "ordinal: last submit button → e11"
 
-result=$(./semantic find "second submit button not in header" --snapshot "$MULTI" --format json)
-assert_json_field "$result" ".best_ref" "e7" "ordinal+context: second submit button not in header → e7"
+result=$(semantic find "second submit button not in login" --snapshot "$MULTI" --format json)
+assert_json_field "$result" ".best_ref" "e11" "ordinal+context: second submit button not in login → e11"
 
 LOGIN="${ASSETS_DIR}/snapshots/login-page.json"
-result=$(./semantic find "email address" --snapshot "$LOGIN" --format json)
+result=$(semantic find "email address" --snapshot "$LOGIN" --format json)
 assert_json_field "$result" ".best_ref" "e1" "guard: literal query still resolves email address → e1"
 
 summary "find-ordinal"

From 981e9b1ce15d36399bb801caaa556c94d801fcb6 Mon Sep 17 00:00:00 2001
From: Bosch <basch@giagolab.com>
Date: Thu, 23 Apr 2026 12:58:32 +0100
Subject: [PATCH 07/30] chore: rename benchmark dev command to bench

---
 dev                          | 10 +++++-----
 docs/guides/contributing.md  |  2 +-
 skills/semantic-dev/SKILL.md |  4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/dev b/dev
index ec35d82..dc15e75 100755
--- a/dev
+++ b/dev
@@ -21,7 +21,7 @@ commands=(
   "vet:🔬:Run go vet"
   "check:✅:Run all checks (fmt + vet + lint + test)"
   "build:📦:Build CLI binary"
-  "benchmark:🏋:Run benchmark study"
+  "bench:🏋:Run corpus benchmark suite"
   "e2e:🐳:Run E2E tests (Docker)"
 )
 
@@ -114,9 +114,9 @@ run_build() {
   echo "  ${SUCCESS}✓${NC} Built: ./semantic"
 }
 
-run_benchmark() {
-  echo "  ${ACCENT}${BOLD}⏱️  Running benchmark study${NC}"
-  go test -run TestBenchmarkStudy -v -count=1
+run_bench() {
+  echo "  ${ACCENT}${BOLD}⏱️  Running corpus benchmark suite${NC}"
+  bash tests/benchmark/scripts/run-corpus-benchmark.sh
 }
 
 run_e2e() {
@@ -143,7 +143,7 @@ case "${1:-help}" in
   vet)       run_vet ;;
   check)     run_check ;;
   build)     run_build ;;
-  benchmark) run_benchmark ;;
+  bench|benchmark) run_bench ;;
   e2e)       run_e2e ;;
   help|*)    show_help ;;
 esac
diff --git a/docs/guides/contributing.md b/docs/guides/contributing.md
index 81f9736..695a7d6 100644
--- a/docs/guides/contributing.md
+++ b/docs/guides/contributing.md
@@ -19,7 +19,7 @@ Doctor checks Go version, golangci-lint, dependencies, build, tests, and git hoo
 ./dev lint          # golangci-lint
 ./dev check         # all checks (fmt + vet + lint + test)
 ./dev build         # build CLI binary
-./dev benchmark     # run benchmark study
+./dev bench         # run corpus benchmark suite
 ```
 
 ## Project Structure
diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index cee7454..84ade33 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -29,7 +29,7 @@ All development commands run via `./dev`:
 | `./dev vet` | Run go vet |
 | `./dev check` | All checks (fmt + vet + lint + test) |
 | `./dev build` | Build CLI binary |
-| `./dev benchmark` | Run benchmark study |
+| `./dev bench` | Run corpus benchmark suite |
 | `./dev e2e` | Run E2E tests (Docker) |
 
 ## Architecture
@@ -112,7 +112,7 @@ recovery.ClassifyFailure, recovery.DefaultRecoveryConfig
 ## Testing
 
 - **167 tests** across 3 packages (root, engine, recovery)
-- `internal/engine/` has unit tests for all matchers + benchmark study
+- `internal/engine/` has unit tests for all matchers + benchmark suite
 - Root has API-level smoke tests
 - `recovery/` has scenario tests (SPA re-render, checkout, login, etc.)
 

From 852aaca47f9153e006336b4d8edb3c7334eba054 Mon Sep 17 00:00:00 2001
From: Bosch <basch@giagolab.com>
Date: Thu, 23 Apr 2026 13:48:35 +0100
Subject: [PATCH 08/30] fix: stabilize ordinal document ordering

---
 internal/engine/combined.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/engine/combined.go b/internal/engine/combined.go
index 9bc6e99..fdd4124 100644
--- a/internal/engine/combined.go
+++ b/internal/engine/combined.go
@@ -209,10 +209,10 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 }
 
 func documentOrderIndex(el types.ElementDescriptor, fallback int) int {
-	if el.DocumentIdx > 0 || (el.DocumentIdx == 0 && el.Ref != "") {
+	if el.DocumentIdx > 0 {
 		return el.DocumentIdx
 	}
-	if el.Positional.SiblingIndex > 0 || (el.Positional.SiblingIndex == 0 && el.Ref != "") {
+	if el.Positional.SiblingIndex > 0 {
 		return el.Positional.SiblingIndex
 	}
 	return fallback

From 18cbd15d0ff1c5ad8ad6e4b90e9995e251280e98 Mon Sep 17 00:00:00 2001
From: Bosch <basch@giagolab.com>
Date: Thu, 23 Apr 2026 13:56:31 +0100
Subject: [PATCH 09/30] test: stabilize ordinal input ordering coverage test

---
 internal/engine/query_ordinal_test.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/internal/engine/query_ordinal_test.go b/internal/engine/query_ordinal_test.go
index 01cef49..8cf3462 100644
--- a/internal/engine/query_ordinal_test.go
+++ b/internal/engine/query_ordinal_test.go
@@ -100,9 +100,9 @@ func TestCombinedMatcher_OrdinalQuery_SecondButton(t *testing.T) {
 func TestCombinedMatcher_OrdinalQuery_LastInputField(t *testing.T) {
 	m := NewCombinedMatcher(NewHashingEmbedder(128))
 	elements := []types.ElementDescriptor{
-		{Ref: "input-1", Role: "textbox", Name: "Email", Positional: types.PositionalHints{SiblingIndex: 0}},
-		{Ref: "input-2", Role: "textbox", Name: "Email", Positional: types.PositionalHints{SiblingIndex: 1}},
-		{Ref: "input-3", Role: "textbox", Name: "Email", Positional: types.PositionalHints{SiblingIndex: 2}},
+		{Ref: "input-1", Role: "textbox", Name: "Email", DocumentIdx: 0, Positional: types.PositionalHints{SiblingIndex: 1}},
+		{Ref: "input-2", Role: "textbox", Name: "Email", DocumentIdx: 1, Positional: types.PositionalHints{SiblingIndex: 2}},
+		{Ref: "input-3", Role: "textbox", Name: "Email", DocumentIdx: 2, Positional: types.PositionalHints{SiblingIndex: 3}},
 	}
 
 	res, err := m.Find(context.Background(), "last input field", elements, types.FindOptions{Threshold: 0, TopK: 3})

From cccada975cef80a525d6ea596b8043d6381dc2a7 Mon Sep 17 00:00:00 2001
From: Bosch <basch@giagolab.com>
Date: Thu, 23 Apr 2026 14:40:47 +0100
Subject: [PATCH 10/30] feat: support visual position hints in queries

---
 README.md                            |  50 +---
 cmd/semantic/main.go                 |  52 +++-
 cmd/semantic/main_test.go            |  16 +-
 docs/reference/cli.md                |  44 +++-
 internal/engine/combined.go          |   4 +-
 internal/engine/query_visual.go      | 374 +++++++++++++++++++++++++++
 internal/engine/query_visual_test.go | 137 ++++++++++
 internal/types/types.go              |   4 +
 8 files changed, 618 insertions(+), 63 deletions(-)
 create mode 100644 internal/engine/query_visual.go
 create mode 100644 internal/engine/query_visual_test.go

diff --git a/README.md b/README.md
index d638ced..57e3053 100644
--- a/README.md
+++ b/README.md
@@ -53,50 +53,6 @@ result, err := matcher.Find(ctx, "log in button", elements, semantic.FindOptions
 // result.BestScore = 0.82
 ```
 
-## Negative and Ordinal Queries
-
-Queries can include exclusion intent using:
-`not`, `without`, `exclude`, `excluding`, `except`, `no`, `ignore`.
-
-There are two supported patterns:
-- token exclusion, for example `button not submit`
-- context exclusion, for example `submit button not in header`
-
-Examples:
-
-```text
-button not submit
-link without logout
-textbox excluding email
-submit button not in header
-login link, not the footer one
-```
-
-CLI examples:
-
-```bash
-semantic find "button not submit" --snapshot page.json
-semantic find "link without logout" --snapshot page.json
-semantic find "textbox not email" --snapshot page.json --strategy combined
-```
-
-Ordinal queries are also supported for position-based selection:
-
-```text
-second button
-third menu item
-last input field
-```
-
-Behavior:
-
-- Positive tokens contribute to base match score.
-- Negative tokens apply penalty when they match an element.
-- Strong negative hits can fully exclude an element from results.
-- Negative matching is synonym-aware (for example, `not login` can penalize `Sign In`).
-- Ordinals select from the final matching candidates in document order.
-- Ordinals compose with context exclusion, for example `second button not in header`.
-
 ## Package Layout
 
 ```
@@ -147,6 +103,7 @@ Implementations are internal — consumers use the `ElementMatcher` interface an
 ## Features
 
 - **Synonym expansion** — 54 UI synonym groups ("sign in" ↔ "log in", "cart" ↔ "basket", "preferences" ↔ "settings", etc.)
+- **Visual position hints** — Understand layout cues like `top`, `bottom`, `left`, `right`, and `above`/`below` anchors
 - **Confidence calibration** — Scores mapped to high (≥ 0.8) / medium (≥ 0.6) / low labels
 - **Error classification** — Classify browser errors (CDP, chromedp) as recoverable or not
 - **Self-healing recovery** — Re-locate stale elements after DOM changes via callback interfaces
@@ -228,6 +185,11 @@ semantic find "login" --snapshot page.json --format json    # machine-readable
 semantic find "login" --snapshot page.json --format table   # human-readable
 semantic find "login" --snapshot page.json --format refs    # just refs
 
+# Visual position hints
+semantic find "button in top right corner" --snapshot page.json
+semantic find "link below the search box" --snapshot page.json
+semantic find "sidebar on the left" --snapshot page.json
+
 # Score a specific element
 semantic match "login" e4 --snapshot page.json
 
diff --git a/cmd/semantic/main.go b/cmd/semantic/main.go
index 7c96773..fe7ebdf 100644
--- a/cmd/semantic/main.go
+++ b/cmd/semantic/main.go
@@ -62,10 +62,16 @@ Flags (find/match):
 
 // snapshotElement is the JSON shape from pinchtab's /snapshot endpoint.
 type snapshotPositional struct {
-	Depth        int    `json:"depth"`
-	SiblingIndex int    `json:"sibling_index"`
-	SiblingCount int    `json:"sibling_count"`
-	LabelledBy   string `json:"labelled_by"`
+	Depth        int     `json:"depth"`
+	SiblingIndex int     `json:"sibling_index"`
+	SiblingCount int     `json:"sibling_count"`
+	LabelledBy   string  `json:"labelled_by"`
+	X            float64 `json:"x"`
+	Y            float64 `json:"y"`
+	Top          float64 `json:"top"`
+	Left         float64 `json:"left"`
+	Width        float64 `json:"width"`
+	Height       float64 `json:"height"`
 }
 
 type snapshotElement struct {
@@ -80,6 +86,12 @@ type snapshotElement struct {
 	SiblingIdx  int                 `json:"sibling_index"`
 	SiblingCnt  int                 `json:"sibling_count"`
 	LabelledBy  string              `json:"labelled_by"`
+	X           float64             `json:"x"`
+	Y           float64             `json:"y"`
+	Top         float64             `json:"top"`
+	Left        float64             `json:"left"`
+	Width       float64             `json:"width"`
+	Height      float64             `json:"height"`
 	Positional  *snapshotPositional `json:"positional"`
 }
 
@@ -112,6 +124,16 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) {
 		depth := e.Depth
 		siblingIdx := e.SiblingIdx
 		siblingCnt := e.SiblingCnt
+		x := e.X
+		y := e.Y
+		if x == 0 && e.Left != 0 {
+			x = e.Left
+		}
+		if y == 0 && e.Top != 0 {
+			y = e.Top
+		}
+		width := e.Width
+		height := e.Height
 		if e.Positional != nil {
 			if e.Positional.Depth != 0 {
 				depth = e.Positional.Depth
@@ -125,6 +147,23 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) {
 			if e.Positional.LabelledBy != "" {
 				labelledBy = e.Positional.LabelledBy
 			}
+
+			hasHorizontal := e.Positional.X != 0 || e.Positional.Left != 0 || e.Positional.Width > 0
+			hasVertical := e.Positional.Y != 0 || e.Positional.Top != 0 || e.Positional.Height > 0
+			if hasHorizontal {
+				x = e.Positional.X
+				if x == 0 && e.Positional.Left != 0 {
+					x = e.Positional.Left
+				}
+				width = e.Positional.Width
+			}
+			if hasVertical {
+				y = e.Positional.Y
+				if y == 0 && e.Positional.Top != 0 {
+					y = e.Positional.Top
+				}
+				height = e.Positional.Height
+			}
 		}
 
 		descs[i] = semantic.ElementDescriptor{
@@ -135,12 +174,15 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) {
 			Interactive: e.Interactive,
 			Parent:      e.Parent,
 			Section:     e.Section,
-			DocumentIdx: i,
 			Positional: semantic.PositionalHints{
 				Depth:        depth,
 				SiblingIndex: siblingIdx,
 				SiblingCount: siblingCnt,
 				LabelledBy:   labelledBy,
+				X:            x,
+				Y:            y,
+				Width:        width,
+				Height:       height,
 			},
 		}
 	}
diff --git a/cmd/semantic/main_test.go b/cmd/semantic/main_test.go
index d423841..0a7d117 100644
--- a/cmd/semantic/main_test.go
+++ b/cmd/semantic/main_test.go
@@ -12,8 +12,8 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) {
 	}
 
 	json := `[
-		{"ref":"e1","role":"button","name":"Submit","interactive":true,"parent":"Login form","section":"Authentication","depth":3,"sibling_index":1,"sibling_count":2,"labelled_by":"Primary Action"},
-		{"ref":"e2","role":"text","name":"Submit","interactive":false,"parent":"Payment form","section":"Checkout","positional":{"depth":2,"sibling_index":0,"sibling_count":1,"labelled_by":"Secondary Action"}}
+		{"ref":"e1","role":"button","name":"Submit","interactive":true,"parent":"Login form","section":"Authentication","depth":3,"sibling_index":1,"sibling_count":2,"labelled_by":"Primary Action","x":20,"y":40,"width":120,"height":30},
+		{"ref":"e2","role":"text","name":"Submit","interactive":false,"parent":"Payment form","section":"Checkout","positional":{"depth":2,"sibling_index":0,"sibling_count":1,"labelled_by":"Secondary Action","left":300,"top":640,"width":200,"height":44}}
 	]`
 	if _, err := f.WriteString(json); err != nil {
 		t.Fatalf("WriteString failed: %v", err)
@@ -50,6 +50,12 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) {
 	if descs[0].Positional.LabelledBy != "Primary Action" {
 		t.Fatalf("expected first descriptor labelled_by=Primary Action, got %q", descs[0].Positional.LabelledBy)
 	}
+	if descs[0].Positional.X != 20 || descs[0].Positional.Y != 40 {
+		t.Fatalf("expected first descriptor x/y=20/40, got %f/%f", descs[0].Positional.X, descs[0].Positional.Y)
+	}
+	if descs[0].Positional.Width != 120 || descs[0].Positional.Height != 30 {
+		t.Fatalf("expected first descriptor width/height=120/30, got %f/%f", descs[0].Positional.Width, descs[0].Positional.Height)
+	}
 	if descs[1].Interactive {
 		t.Fatalf("expected second descriptor interactive=false")
 	}
@@ -71,4 +77,10 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) {
 	if descs[1].Positional.LabelledBy != "Secondary Action" {
 		t.Fatalf("expected second descriptor labelled_by=Secondary Action, got %q", descs[1].Positional.LabelledBy)
 	}
+	if descs[1].Positional.X != 300 || descs[1].Positional.Y != 640 {
+		t.Fatalf("expected second descriptor x/y=300/640, got %f/%f", descs[1].Positional.X, descs[1].Positional.Y)
+	}
+	if descs[1].Positional.Width != 200 || descs[1].Positional.Height != 44 {
+		t.Fatalf("expected second descriptor width/height=200/44, got %f/%f", descs[1].Positional.Width, descs[1].Positional.Height)
+	}
 }
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
index bb1ec77..195496e 100644
--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -34,14 +34,10 @@ semantic find "login" --snapshot page.json --format json
 # Just refs (for piping)
 semantic find "submit" --snapshot page.json --format refs
 
-# Exclude contexts for duplicate labels
-semantic find "submit button not in header" --snapshot page.json
-semantic find "login link, not the footer one" --snapshot page.json
-
-# Select by ordinal position
-semantic find "second button" --snapshot page.json
-semantic find "last input field" --snapshot page.json
-semantic find "second button not in header" --snapshot page.json
+# Visual layout hints
+semantic find "button in top right corner" --snapshot page.json
+semantic find "link below the search box" --snapshot page.json
+semantic find "sidebar on the left" --snapshot page.json
 ```
 
 ### `semantic match`
@@ -90,8 +86,34 @@ The CLI expects a JSON array of element descriptors:
 
 ```json
 [
-  {"ref": "e0", "role": "button", "name": "Sign In"},
-  {"ref": "e1", "role": "textbox", "name": "Email"},
-  {"ref": "e2", "role": "link", "name": "Forgot Password"}
+  {
+    "ref": "e0",
+    "role": "button",
+    "name": "Sign In",
+    "interactive": true,
+    "parent": "Auth card",
+    "section": "Header",
+    "x": 920,
+    "y": 16,
+    "width": 96,
+    "height": 32
+  },
+  {
+    "ref": "e1",
+    "role": "textbox",
+    "name": "Email",
+    "positional": {
+      "depth": 3,
+      "sibling_index": 1,
+      "sibling_count": 2,
+      "labelled_by": "Email",
+      "left": 120,
+      "top": 240,
+      "width": 320,
+      "height": 36
+    }
+  }
 ]
 ```
+
+Top-level geometry (`x`, `y`, `top`, `left`, `width`, `height`) and nested `positional` fields are both supported. Supplying coordinates improves results for visual hints such as `top right`, `below`, and `left`.
diff --git a/internal/engine/combined.go b/internal/engine/combined.go
index fdd4124..411af4d 100644
--- a/internal/engine/combined.go
+++ b/internal/engine/combined.go
@@ -46,9 +46,10 @@ func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []typ
 	}
 
 	parsed := ParseQueryContext(query)
+	visualHints := parseVisualQueryHints(query)
 	mergeOpts := opts
 	internalOpts := opts
-	if parsed.Ordinal.HasOrdinal {
+	if parsed.Ordinal.HasOrdinal || visualHints.hasHints {
 		mergeOpts.TopK = len(elements)
 		internalOpts.TopK = len(elements)
 	}
@@ -61,6 +62,7 @@ func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []typ
 	}
 
 	merged := c.mergeResults(lexResult, embResult, elements, mergeOpts, lexW, embW)
+	merged = applyVisualHintBoost(merged, visualHints, elements, mergeOpts.TopK)
 	return selectOrdinalMatchInOrder(merged, parsed.Ordinal, elements), nil
 }
 
diff --git a/internal/engine/query_visual.go b/internal/engine/query_visual.go
new file mode 100644
index 0000000..deb11c5
--- /dev/null
+++ b/internal/engine/query_visual.go
@@ -0,0 +1,374 @@
+package engine
+
+import (
+	"sort"
+	"strings"
+
+	"github.com/pinchtab/semantic/internal/types"
+)
+
+const (
+	visualDirectionalBoost = 0.12 // largest boost a single top/bottom/left/right hint can add; scaled by the element's position ratio
+	visualRelativeBoost    = 0.16 // added when an element lies on the requested side of the above/below anchor
+	visualRelativePenalty  = 0.05 // subtracted when an element lies on the other side of (or level with) the anchor
+	visualBoostCap         = 0.30 // applyVisualHintBoost clamps each element's total adjustment to [-cap, +cap]
+)
+
+type visualQueryHints struct {
+	hasHints    bool   // true when a directional word ("corner" included) or an above/below anchor was found
+	baseQuery   string // query with visual keywords stripped (text before the marker for anchor queries); the trimmed query if that leaves nothing
+	top         bool   // the tokenized query contains "top"
+	bottom      bool   // the tokenized query contains "bottom"
+	left        bool   // the tokenized query contains "left"
+	right       bool   // the tokenized query contains "right"
+	aboveAnchor string // normalised text following " above " or " over "; empty when absent
+	belowAnchor string // normalised text following " below " or " under "; empty when absent
+}
+
+var visualKeywordSet = map[string]bool{
+	"top":    true,
+	"bottom": true,
+	"left":   true,
+	"right":  true,
+	"corner": true,
+	"above":  true,
+	"below":  true,
+	"under":  true,
+	"over":   true,
+	"in":     true,
+	"on":     true,
+	"at":     true,
+	"the":    true,
+	"a":      true,
+	"an":     true,
+	"of":     true,
+	"page":   true,
+	"side":   true,
+}
+
+// parseVisualQueryHints extracts layout hints from query: directional words
+// (top, bottom, left, right, corner) and " above "/" over "/" below "/" under "
+// anchors. An empty or whitespace-only query yields the zero value.
+// Markers are found in and sliced from the same lower-cased copy, because
+// strings.ToLower can change a string's byte length and offsets taken from the
+// lower-cased text are not valid indexes into the original.
+func parseVisualQueryHints(query string) visualQueryHints {
+	cleaned := strings.TrimSpace(query)
+	if cleaned == "" {
+		return visualQueryHints{}
+	}
+
+	words := tokenSet(tokenize(cleaned))
+	hints := visualQueryHints{
+		top:       words["top"],
+		bottom:    words["bottom"],
+		left:      words["left"],
+		right:     words["right"],
+		baseQuery: cleaned,
+	}
+	hasDirectional := hints.top || hints.bottom || hints.left || hints.right || words["corner"]
+	hasRelative := false
+
+	lower := strings.ToLower(cleaned)
+	// Checked in this fixed order: a later marker overrides the anchor and the
+	// base query set by an earlier one.
+	relations := []struct {
+		marker string
+		anchor *string
+	}{
+		{" below ", &hints.belowAnchor},
+		{" under ", &hints.belowAnchor},
+		{" above ", &hints.aboveAnchor},
+		{" over ", &hints.aboveAnchor},
+	}
+	for _, rel := range relations {
+		idx := strings.Index(lower, rel.marker)
+		if idx < 0 {
+			continue
+		}
+		*rel.anchor = normalizeVisualAnchor(lower[idx+len(rel.marker):])
+		hasRelative = true
+		if base := strings.TrimSpace(lower[:idx]); base != "" {
+			hints.baseQuery = stripVisualKeywords(base)
+		}
+	}
+
+	if !hasRelative && hasDirectional {
+		hints.baseQuery = stripVisualKeywords(cleaned)
+	}
+	if strings.TrimSpace(hints.baseQuery) == "" {
+		hints.baseQuery = cleaned
+	}
+
+	hints.hasHints = hasDirectional || hints.aboveAnchor != "" || hints.belowAnchor != ""
+	return hints
+}
+
+func stripVisualKeywords(query string) string {
+	parts := tokenize(query)
+	filtered := make([]string, 0, len(parts))
+	for _, p := range parts {
+		if !visualKeywordSet[p] {
+			filtered = append(filtered, p)
+		}
+	}
+	return strings.TrimSpace(strings.Join(filtered, " "))
+}
+
+func normalizeVisualAnchor(s string) string {
+	anchor := stripVisualKeywords(s)
+	if anchor == "" {
+		anchor = strings.TrimSpace(strings.ToLower(s))
+	}
+	return anchor
+}
+
+type spatialStats struct {
+	hasX bool
+	hasY bool
+	minX float64
+	maxX float64
+	minY float64
+	maxY float64
+}
+
+func buildSpatialStats(elements []types.ElementDescriptor) spatialStats {
+	stats := spatialStats{}
+	for _, el := range elements {
+		h := el.Positional
+		if hasHorizontalPosition(h) {
+			x := horizontalPosition(h)
+			if !stats.hasX {
+				stats.hasX = true
+				stats.minX, stats.maxX = x, x
+			} else {
+				if x < stats.minX {
+					stats.minX = x
+				}
+				if x > stats.maxX {
+					stats.maxX = x
+				}
+			}
+		}
+		if hasVerticalPosition(h) {
+			y := verticalPosition(h)
+			if !stats.hasY {
+				stats.hasY = true
+				stats.minY, stats.maxY = y, y
+			} else {
+				if y < stats.minY {
+					stats.minY = y
+				}
+				if y > stats.maxY {
+					stats.maxY = y
+				}
+			}
+		}
+	}
+	return stats
+}
+
+func applyVisualHintBoost(result types.FindResult, hints visualQueryHints, elements []types.ElementDescriptor, topK int) types.FindResult {
+	if !hints.hasHints || len(result.Matches) == 0 {
+		return result
+	}
+
+	refToElem := make(map[string]types.ElementDescriptor, len(elements))
+	refOrder := make(map[string]int, len(elements))
+	for i, el := range elements {
+		refToElem[el.Ref] = el
+		refOrder[el.Ref] = i
+	}
+
+	stats := buildSpatialStats(elements)
+	anchorRef := ""
+	if hints.aboveAnchor != "" {
+		anchorRef = findVisualAnchorRef(hints.aboveAnchor, elements)
+	} else if hints.belowAnchor != "" {
+		anchorRef = findVisualAnchorRef(hints.belowAnchor, elements)
+	}
+
+	type boostedMatch struct {
+		match types.ElementMatch
+		order int
+	}
+
+	boosted := make([]boostedMatch, 0, len(result.Matches))
+	for _, match := range result.Matches {
+		el, ok := refToElem[match.Ref]
+		if !ok {
+			continue
+		}
+		order := refOrder[match.Ref]
+		boost := computeVisualBoost(el, order, len(elements), hints, stats, anchorRef, refToElem, refOrder)
+		if boost > visualBoostCap {
+			boost = visualBoostCap
+		}
+		if boost < -visualBoostCap {
+			boost = -visualBoostCap
+		}
+		match.Score += boost
+		if match.Score > 1.0 {
+			match.Score = 1.0
+		}
+		if match.Score < 0 {
+			match.Score = 0
+		}
+		boosted = append(boosted, boostedMatch{match: match, order: order})
+	}
+
+	sort.SliceStable(boosted, func(i, j int) bool {
+		diff := boosted[i].match.Score - boosted[j].match.Score
+		if diff > 1e-9 || diff < -1e-9 {
+			return diff > 0
+		}
+		if boosted[i].order != boosted[j].order {
+			return boosted[i].order < boosted[j].order
+		}
+		return boosted[i].match.Ref < boosted[j].match.Ref
+	})
+
+	if topK > 0 && len(boosted) > topK {
+		boosted = boosted[:topK]
+	}
+
+	result.Matches = result.Matches[:0]
+	for _, bm := range boosted {
+		result.Matches = append(result.Matches, bm.match)
+	}
+	if len(result.Matches) > 0 {
+		result.BestRef = result.Matches[0].Ref
+		result.BestScore = result.Matches[0].Score
+	} else {
+		result.BestRef = ""
+		result.BestScore = 0
+	}
+	return result
+}
+
+// computeVisualBoost returns the uncapped score adjustment for el under hints.
+// Each requested direction adds up to visualDirectionalBoost, scaled by el's
+// position ratio (a smaller vertical ratio is nearer the top). anchorRef is the
+// above-anchor when one was parsed, otherwise the below-anchor, so exactly one
+// relation is scored against it; the anchor element itself is left unadjusted.
+func computeVisualBoost(
+	el types.ElementDescriptor,
+	order int,
+	total int,
+	hints visualQueryHints,
+	stats spatialStats,
+	anchorRef string,
+	refToElem map[string]types.ElementDescriptor,
+	refOrder map[string]int,
+) float64 {
+	xRatio := horizontalRatio(el.Positional, stats, order, total)
+	yRatio := verticalRatio(el.Positional, stats, order, total)
+
+	boost := 0.0
+	if hints.top {
+		boost += visualDirectionalBoost * (1 - yRatio)
+	}
+	if hints.bottom {
+		boost += visualDirectionalBoost * yRatio
+	}
+	if hints.left {
+		boost += visualDirectionalBoost * (1 - xRatio)
+	}
+	if hints.right {
+		boost += visualDirectionalBoost * xRatio
+	}
+
+	anchorEl, ok := refToElem[anchorRef]
+	if !ok || anchorRef == "" || anchorRef == el.Ref {
+		return boost
+	}
+	anchorY := verticalRatio(anchorEl.Positional, stats, refOrder[anchorRef], total)
+	var onRequestedSide bool
+	switch {
+	case hints.aboveAnchor != "":
+		onRequestedSide = yRatio < anchorY
+	case hints.belowAnchor != "":
+		onRequestedSide = yRatio > anchorY
+	default:
+		return boost
+	}
+	if onRequestedSide {
+		return boost + visualRelativeBoost
+	}
+	return boost - visualRelativePenalty
+}
+
+func findVisualAnchorRef(anchorQuery string, elements []types.ElementDescriptor) string {
+	if strings.TrimSpace(anchorQuery) == "" {
+		return ""
+	}
+	bestRef := ""
+	bestScore := 0.0
+	for _, el := range elements {
+		anchorContext := strings.TrimSpace(el.Composite() + " " + el.Parent + " " + el.Section)
+		score := lexicalScore(anchorQuery, anchorContext, false, nil)
+		if score > bestScore {
+			bestScore = score
+			bestRef = el.Ref
+		}
+	}
+	if bestScore < 0.2 {
+		return ""
+	}
+	return bestRef
+}
+
+func horizontalRatio(h types.PositionalHints, stats spatialStats, order, total int) float64 {
+	if stats.hasX && hasHorizontalPosition(h) {
+		x := horizontalPosition(h)
+		if stats.maxX > stats.minX {
+			return (x - stats.minX) / (stats.maxX - stats.minX)
+		}
+		return 0.5
+	}
+	return fallbackOrderRatio(h, order, total)
+}
+
+func verticalRatio(h types.PositionalHints, stats spatialStats, order, total int) float64 {
+	if stats.hasY && hasVerticalPosition(h) {
+		y := verticalPosition(h)
+		if stats.maxY > stats.minY {
+			return (y - stats.minY) / (stats.maxY - stats.minY)
+		}
+		return 0.5
+	}
+	return fallbackOrderRatio(h, order, total)
+}
+
+func fallbackOrderRatio(h types.PositionalHints, order, total int) float64 {
+	if h.SiblingCount > 1 {
+		idx := h.SiblingIndex
+		if idx < 0 {
+			idx = 0
+		}
+		if idx > h.SiblingCount-1 {
+			idx = h.SiblingCount - 1
+		}
+		return float64(idx) / float64(h.SiblingCount-1)
+	}
+	if total > 1 {
+		return float64(order) / float64(total-1)
+	}
+	return 0.5
+}
+
+func hasHorizontalPosition(h types.PositionalHints) bool {
+	return h.Width > 0 || h.X != 0
+}
+
+func hasVerticalPosition(h types.PositionalHints) bool {
+	return h.Height > 0 || h.Y != 0
+}
+
+func horizontalPosition(h types.PositionalHints) float64 {
+	return h.X + (h.Width / 2)
+}
+
+func verticalPosition(h types.PositionalHints) float64 {
+	return h.Y + (h.Height / 2)
+}
diff --git a/internal/engine/query_visual_test.go b/internal/engine/query_visual_test.go
new file mode 100644
index 0000000..64b3be2
--- /dev/null
+++ b/internal/engine/query_visual_test.go
@@ -0,0 +1,137 @@
+package engine
+
+import (
+	"context"
+	"testing"
+
+	"github.com/pinchtab/semantic/internal/types"
+)
+
+func TestParseVisualQueryHints_BasicPatterns(t *testing.T) {
+	tests := []struct {
+		name        string
+		query       string
+		wantBase    string
+		wantTop     bool
+		wantBottom  bool
+		wantLeft    bool
+		wantRight   bool
+		wantAbove   string
+		wantBelow   string
+		wantHasHint bool
+	}{
+		{
+			name:        "top right corner",
+			query:       "button in top right corner",
+			wantBase:    "button",
+			wantTop:     true,
+			wantRight:   true,
+			wantHasHint: true,
+		},
+		{
+			name:        "below anchor",
+			query:       "link below the search box",
+			wantBase:    "link",
+			wantBelow:   "search box",
+			wantHasHint: true,
+		},
+		{
+			name:        "left side",
+			query:       "sidebar on the left",
+			wantBase:    "sidebar",
+			wantLeft:    true,
+			wantHasHint: true,
+		},
+		{
+			name:        "plain query",
+			query:       "submit button",
+			wantBase:    "submit button",
+			wantHasHint: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := parseVisualQueryHints(tt.query)
+			if got.baseQuery != tt.wantBase {
+				t.Fatalf("base query mismatch: want=%q got=%q", tt.wantBase, got.baseQuery)
+			}
+			if got.top != tt.wantTop || got.bottom != tt.wantBottom || got.left != tt.wantLeft || got.right != tt.wantRight {
+				t.Fatalf("directional hints mismatch: got top=%v bottom=%v left=%v right=%v", got.top, got.bottom, got.left, got.right)
+			}
+			if got.aboveAnchor != tt.wantAbove || got.belowAnchor != tt.wantBelow {
+				t.Fatalf("anchor mismatch: got above=%q below=%q", got.aboveAnchor, got.belowAnchor)
+			}
+			if got.hasHints != tt.wantHasHint {
+				t.Fatalf("hasHints mismatch: want=%v got=%v", tt.wantHasHint, got.hasHints)
+			}
+		})
+	}
+}
+
+func TestCombinedMatcher_VisualHint_TopRightCorner(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "btn-left-top", Role: "button", Name: "Open", Positional: types.PositionalHints{X: 20, Y: 20, Width: 80, Height: 24}},
+		{Ref: "btn-right-top", Role: "button", Name: "Open", Positional: types.PositionalHints{X: 880, Y: 30, Width: 80, Height: 24}},
+		{Ref: "btn-right-bottom", Role: "button", Name: "Open", Positional: types.PositionalHints{X: 860, Y: 620, Width: 80, Height: 24}},
+	}
+
+	res, err := m.Find(context.Background(), "button in top right corner", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if res.BestRef != "btn-right-top" {
+		t.Fatalf("expected top-right button, got %s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_VisualHint_BelowAnchor(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "search", Role: "searchbox", Name: "Search", Positional: types.PositionalHints{X: 120, Y: 40, Width: 320, Height: 32}},
+		{Ref: "link-top", Role: "link", Name: "Help", Positional: types.PositionalHints{X: 140, Y: 10, Width: 70, Height: 20}},
+		{Ref: "link-bottom", Role: "link", Name: "Help", Positional: types.PositionalHints{X: 140, Y: 160, Width: 70, Height: 20}},
+	}
+
+	res, err := m.Find(context.Background(), "link below the search box", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if res.BestRef != "link-bottom" {
+		t.Fatalf("expected link below anchor, got %s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_VisualHint_LeftSidebar(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "sidebar-left", Role: "navigation", Name: "Sidebar", Positional: types.PositionalHints{X: 10, Y: 120, Width: 200, Height: 600}},
+		{Ref: "sidebar-right", Role: "navigation", Name: "Sidebar", Positional: types.PositionalHints{X: 980, Y: 120, Width: 200, Height: 600}},
+	}
+
+	res, err := m.Find(context.Background(), "sidebar on the left", elements, types.FindOptions{Threshold: 0, TopK: 2})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if res.BestRef != "sidebar-left" {
+		t.Fatalf("expected left sidebar, got %s", res.BestRef)
+	}
+}
+
+func TestCombinedMatcher_VisualHint_BottomFallbackWithoutCoordinates(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "button-1", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 0, SiblingCount: 3}},
+		{Ref: "button-2", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 1, SiblingCount: 3}},
+		{Ref: "button-3", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 2, SiblingCount: 3}},
+	}
+
+	res, err := m.Find(context.Background(), "button at bottom of page", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if res.BestRef != "button-3" {
+		t.Fatalf("expected bottom-most fallback button, got %s", res.BestRef)
+	}
+}
diff --git a/internal/types/types.go b/internal/types/types.go
index 1b58eb4..192cce8 100644
--- a/internal/types/types.go
+++ b/internal/types/types.go
@@ -89,6 +89,10 @@ type PositionalHints struct {
 	SiblingIndex int
 	SiblingCount int
 	LabelledBy   string
+	X            float64
+	Y            float64
+	Width        float64
+	Height       float64
 }
 
 // ElementDescriptor describes a single accessibility tree node.

From 16066d470355800b89086f0014132ed73a45ef78 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Thu, 23 Apr 2026 18:10:23 +0100
Subject: [PATCH 11/30] fix: e2e tests

---
 internal/engine/query_ordinal.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/internal/engine/query_ordinal.go b/internal/engine/query_ordinal.go
index c9645e3..fe65420 100644
--- a/internal/engine/query_ordinal.go
+++ b/internal/engine/query_ordinal.go
@@ -153,9 +153,9 @@ func selectOrdinalMatchInOrder(result types.FindResult, constraint OrdinalConstr
 
 	refOrder := make(map[string]int, len(elements))
 	for idx, el := range elements {
-		order := el.DocumentIdx
-		if order < 0 {
-			order = idx
+		order := idx
+		if el.DocumentIdx > 0 {
+			order = el.DocumentIdx
 		}
 		refOrder[el.Ref] = order
 	}

From 7cacd78f59b36c5d93db49c53c092bc0303b668d Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Thu, 23 Apr 2026 19:33:20 +0100
Subject: [PATCH 12/30] feat: add deterministic ranking and expand benchmark
 coverage

---
 internal/engine/combined.go                   |  37 ++-----
 internal/engine/combined_test.go              |  18 ++++
 internal/engine/embedding.go                  |  12 +--
 internal/engine/embedding_test.go             |  22 ++++
 internal/engine/query_visual_test.go          |  10 ++
 internal/engine/ranking.go                    |  36 +++++++
 tests/benchmark/cases/complex.json            | 102 ++++++++++++++++++
 tests/benchmark/cases/visual.json             |  50 +++++++++
 .../corpus/visual-layout/queries.json         |  50 +++++++++
 .../corpus/visual-layout/snapshot.json        |  10 ++
 tests/e2e/assets/snapshots/visual-layout.json |  10 ++
 tests/e2e/cases/15-find-visual.sh             |  21 ++++
 12 files changed, 346 insertions(+), 32 deletions(-)
 create mode 100644 internal/engine/ranking.go
 create mode 100644 tests/benchmark/cases/complex.json
 create mode 100644 tests/benchmark/cases/visual.json
 create mode 100644 tests/benchmark/corpus/visual-layout/queries.json
 create mode 100644 tests/benchmark/corpus/visual-layout/snapshot.json
 create mode 100644 tests/e2e/assets/snapshots/visual-layout.json
 create mode 100755 tests/e2e/cases/15-find-visual.sh

diff --git a/internal/engine/combined.go b/internal/engine/combined.go
index 411af4d..d7dd814 100644
--- a/internal/engine/combined.go
+++ b/internal/engine/combined.go
@@ -3,9 +3,9 @@ package engine
 import (
 	"context"
 	"fmt"
-	"github.com/pinchtab/semantic/internal/types"
-	"math"
 	"sort"
+
+	"github.com/pinchtab/semantic/internal/types"
 )
 
 // combinedMatcher fuses lexical and embedding scores:
@@ -124,6 +124,7 @@ type scored struct {
 	ref      string
 	score    float64
 	el       types.ElementDescriptor
+	order    int
 	lexScore float64
 	embScore float64
 }
@@ -133,8 +134,10 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 	embScores := scoreMap(embResult.Matches)
 
 	refToElem := make(map[string]types.ElementDescriptor, len(elements))
-	for _, el := range elements {
+	refToOrder := make(map[string]int, len(elements))
+	for i, el := range elements {
 		refToElem[el.Ref] = el
+		refToOrder[el.Ref] = i
 	}
 
 	// Collect all refs from either matcher.
@@ -156,7 +159,7 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 			combined = 1
 		}
 		if combined >= opts.Threshold {
-			s := scored{ref: ref, score: combined, el: refToElem[ref]}
+			s := scored{ref: ref, score: combined, el: refToElem[ref], order: refToOrder[ref]}
 			if opts.Explain {
 				s.lexScore = lexW * lexScores[ref]
 				s.embScore = embW * embScores[ref]
@@ -166,18 +169,10 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 	}
 
 	sort.Slice(candidates, func(i, j int) bool {
-		scoreDiff := candidates[i].score - candidates[j].score
-		if math.Abs(scoreDiff) > 1e-9 {
-			return scoreDiff > 0
-		}
-
-		idxI := documentOrderIndex(candidates[i].el, i)
-		idxJ := documentOrderIndex(candidates[j].el, j)
-		if idxI != idxJ {
-			return idxI < idxJ
-		}
-
-		return candidates[i].ref < candidates[j].ref
+		return rankedMatchLess(
+			candidates[i].score, candidates[i].el, candidates[i].order,
+			candidates[j].score, candidates[j].el, candidates[j].order,
+		)
 	})
 	if len(candidates) > opts.TopK {
 		candidates = candidates[:opts.TopK]
@@ -210,16 +205,6 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el
 	return result
 }
 
-func documentOrderIndex(el types.ElementDescriptor, fallback int) int {
-	if el.DocumentIdx > 0 {
-		return el.DocumentIdx
-	}
-	if el.Positional.SiblingIndex > 0 {
-		return el.Positional.SiblingIndex
-	}
-	return fallback
-}
-
 func scoreMap(matches []types.ElementMatch) map[string]float64 {
 	m := make(map[string]float64, len(matches))
 	for _, match := range matches {
diff --git a/internal/engine/combined_test.go b/internal/engine/combined_test.go
index 346ad1f..c1356cd 100644
--- a/internal/engine/combined_test.go
+++ b/internal/engine/combined_test.go
@@ -577,3 +577,21 @@ func TestCombinedMatcher_ClampsScoreWithCustomWeights(t *testing.T) {
 		}
 	}
 }
+
+func TestCombinedMatcher_DeterministicTieBreak(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	elements := []types.ElementDescriptor{
+		{Ref: "first", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 2, SiblingIndex: 0}},
+		{Ref: "second", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 2, SiblingIndex: 0}},
+	}
+
+	for i := 0; i < 100; i++ {
+		result, err := m.Find(context.Background(), "open button", elements, types.FindOptions{Threshold: 0, TopK: 2})
+		if err != nil {
+			t.Fatalf("Find returned error: %v", err)
+		}
+		if result.BestRef != "first" {
+			t.Fatalf("run %d: expected BestRef=first, got %s", i, result.BestRef)
+		}
+	}
+}
diff --git a/internal/engine/embedding.go b/internal/engine/embedding.go
index e36ebc0..e0bdf24 100644
--- a/internal/engine/embedding.go
+++ b/internal/engine/embedding.go
@@ -78,11 +78,10 @@ func (m *EmbeddingMatcher) findWithParsed(ctx QueryContext, elements []types.Ele
 
 	candidates := m.scoreCandidates(parsed, filtered, vectors, opts.Threshold)
 	sort.Slice(candidates, func(i, j int) bool {
-		scoreDiff := candidates[i].score - candidates[j].score
-		if math.Abs(scoreDiff) > 1e-9 {
-			return scoreDiff > 0
-		}
-		return candidates[i].desc.Ref < candidates[j].desc.Ref
+		return rankedMatchLess(
+			candidates[i].score, candidates[i].desc, candidates[i].order,
+			candidates[j].score, candidates[j].desc, candidates[j].order,
+		)
 	})
 
 	if len(candidates) > opts.TopK {
@@ -126,6 +125,7 @@ func (m *EmbeddingMatcher) embedQueryAndElements(parsed types.ParsedQuery, eleme
 type embeddingScored struct {
 	desc  types.ElementDescriptor
 	score float64
+	order int
 }
 
 func (m *EmbeddingMatcher) scoreCandidates(parsed types.ParsedQuery, elements []types.ElementDescriptor, vectors [][]float32, threshold float64) []embeddingScored {
@@ -156,7 +156,7 @@ func (m *EmbeddingMatcher) scoreCandidates(parsed types.ParsedQuery, elements []
 			continue
 		}
 		if score >= threshold {
-			candidates = append(candidates, embeddingScored{desc: el, score: score})
+			candidates = append(candidates, embeddingScored{desc: el, score: score, order: i})
 		}
 	}
 	return candidates
diff --git a/internal/engine/embedding_test.go b/internal/engine/embedding_test.go
index 08d0e1a..4926855 100644
--- a/internal/engine/embedding_test.go
+++ b/internal/engine/embedding_test.go
@@ -323,6 +323,28 @@ func TestEmbeddingMatcher_SingleElement_WithNeighborWeight(t *testing.T) {
 	}
 }
 
+func TestEmbeddingMatcher_TieBreaksByPositionalHints(t *testing.T) {
+	e := newScriptedEmbedder(map[string][]float32{
+		"open button":  {1, 0, 0},
+		"button: Open": {1, 0, 0},
+	})
+	m := NewEmbeddingMatcherWithNeighborWeight(e, 0)
+
+	elements := []types.ElementDescriptor{
+		{Ref: "shallow", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 1, SiblingIndex: 1}},
+		{Ref: "deep-left", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 3, SiblingIndex: 0}},
+		{Ref: "deep-right", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 3, SiblingIndex: 2}},
+	}
+
+	res, err := m.Find(context.Background(), "open button", elements, types.FindOptions{Threshold: 0, TopK: 3})
+	if err != nil {
+		t.Fatalf("Find failed: %v", err)
+	}
+	if res.BestRef != "deep-left" {
+		t.Fatalf("expected deep-left to win tie-break, got %s", res.BestRef)
+	}
+}
+
 type scriptedEmbedder struct {
 	vectors map[string][]float32
 }
diff --git a/internal/engine/query_visual_test.go b/internal/engine/query_visual_test.go
index 64b3be2..ae31a23 100644
--- a/internal/engine/query_visual_test.go
+++ b/internal/engine/query_visual_test.go
@@ -69,6 +69,16 @@ func TestParseVisualQueryHints_BasicPatterns(t *testing.T) {
 	}
 }
 
+func TestParseVisualQueryHints_DoesNotTreatSignInAsVisualHint(t *testing.T) {
+	got := parseVisualQueryHints("sign in button")
+	if got.hasHints {
+		t.Fatalf("expected hasHints=false for non-visual query, got true")
+	}
+	if got.baseQuery != "sign in button" {
+		t.Fatalf("expected base query to stay unchanged, got %q", got.baseQuery)
+	}
+}
+
 func TestCombinedMatcher_VisualHint_TopRightCorner(t *testing.T) {
 	m := NewCombinedMatcher(NewHashingEmbedder(128))
 	elements := []types.ElementDescriptor{
diff --git a/internal/engine/ranking.go b/internal/engine/ranking.go
new file mode 100644
index 0000000..00c88fa
--- /dev/null
+++ b/internal/engine/ranking.go
@@ -0,0 +1,36 @@
+package engine
+
+import (
+	"math"
+
+	"github.com/pinchtab/semantic/internal/types"
+)
+
+// rankedMatchLess reports whether match a sorts before match b: higher score first, then greater Depth, lower SiblingIndex, lower order, and finally smaller Ref.
+func rankedMatchLess(
+	aScore float64,
+	aDesc types.ElementDescriptor,
+	aOrder int,
+	bScore float64,
+	bDesc types.ElementDescriptor,
+	bOrder int,
+) bool {
+	scoreDiff := aScore - bScore
+	if math.Abs(scoreDiff) > 1e-9 { // scores within 1e-9 of each other count as tied
+		return scoreDiff > 0
+	}
+
+	if aDesc.Positional.Depth != bDesc.Positional.Depth { // tie-break 1: the larger Depth wins
+		return aDesc.Positional.Depth > bDesc.Positional.Depth
+	}
+
+	if aDesc.Positional.SiblingIndex != bDesc.Positional.SiblingIndex { // tie-break 2: the smaller SiblingIndex wins
+		return aDesc.Positional.SiblingIndex < bDesc.Positional.SiblingIndex
+	}
+
+	if aOrder != bOrder { // tie-break 3: the smaller caller-supplied order wins
+		return aOrder < bOrder
+	}
+
+	return aDesc.Ref < bDesc.Ref // last resort: string comparison of Ref
+}
diff --git a/tests/benchmark/cases/complex.json b/tests/benchmark/cases/complex.json
new file mode 100644
index 0000000..d0d4d0a
--- /dev/null
+++ b/tests/benchmark/cases/complex.json
@@ -0,0 +1,102 @@
+[
+  {
+    "id": "complex-001",
+    "query": "second submit button not in login",
+    "snapshot": "multi-form.json",
+    "expect_ref": "e11",
+    "min_score": 0.5,
+    "tags": ["ordinal", "negative-context", "compound"]
+  },
+  {
+    "id": "complex-002",
+    "query": "last text field in payment",
+    "snapshot": "multi-form.json",
+    "expect_ref": "e6",
+    "min_score": 0.4,
+    "tags": ["ordinal", "section-context", "compound"]
+  },
+  {
+    "id": "complex-003",
+    "query": "click the submit button in the shipping form",
+    "snapshot": "multi-form.json",
+    "expect_ref": "e11",
+    "min_score": 0.5,
+    "tags": ["natural-language", "section-context", "action-verb"]
+  },
+  {
+    "id": "complex-004",
+    "query": "first input field except login section",
+    "snapshot": "multi-form.json",
+    "expect_ref": "e4",
+    "expect_ref_alt": ["e1"],
+    "min_score": 0.4,
+    "tags": ["ordinal", "negative-context", "compound"],
+    "notes": "Known gap: ordinal applied before negative filter"
+  },
+  {
+    "id": "complex-005",
+    "query": "I want to click the sign in button",
+    "snapshot": "login-page.json",
+    "expect_ref": "e4",
+    "min_score": 0.4,
+    "tags": ["natural-language", "conversational"]
+  },
+  {
+    "id": "complex-006",
+    "query": "the button that says submit",
+    "snapshot": "multi-form.json",
+    "expect_ref": "e7",
+    "expect_ref_alt": ["e3", "e11"],
+    "min_score": 0.4,
+    "tags": ["natural-language", "descriptive"],
+    "notes": "Any Submit button is valid without ordinal"
+  },
+  {
+    "id": "complex-007",
+    "query": "where can I type my password",
+    "snapshot": "login-page.json",
+    "expect_ref": "e2",
+    "min_score": 0.3,
+    "tags": ["natural-language", "question-form"]
+  },
+  {
+    "id": "complex-008",
+    "query": "press enter to login",
+    "snapshot": "login-page.json",
+    "expect_no_crash": true,
+    "tags": ["natural-language", "action-synonym"],
+    "notes": "Known gap: 'press enter' not recognized as submit action"
+  },
+  {
+    "id": "complex-009",
+    "query": "add item to my shopping bag",
+    "snapshot": "ecommerce-product.json",
+    "expect_ref": "e10",
+    "min_score": 0.4,
+    "tags": ["synonym-chain", "ecommerce"]
+  },
+  {
+    "id": "complex-010",
+    "query": "go to my account settings",
+    "snapshot": "dashboard.json",
+    "expect_ref": "e3",
+    "min_score": 0.4,
+    "tags": ["natural-language", "navigation"]
+  },
+  {
+    "id": "complex-011",
+    "query": "sign up for a new account",
+    "snapshot": "login-page.json",
+    "expect_ref": "e6",
+    "min_score": 0.4,
+    "tags": ["synonym", "registration"]
+  },
+  {
+    "id": "complex-012",
+    "query": "search for products",
+    "snapshot": "dashboard.json",
+    "expect_ref": "e6",
+    "min_score": 0.4,
+    "tags": ["natural-language", "search"]
+  }
+]
diff --git a/tests/benchmark/cases/visual.json b/tests/benchmark/cases/visual.json
new file mode 100644
index 0000000..81add09
--- /dev/null
+++ b/tests/benchmark/cases/visual.json
@@ -0,0 +1,50 @@
+[
+  {
+    "id": "visual-001",
+    "query": "button in top right",
+    "snapshot": "visual-layout.json",
+    "expect_ref": "e1",
+    "min_score": 0.5,
+    "tags": ["visual", "position", "directional"]
+  },
+  {
+    "id": "visual-002",
+    "query": "button on the left",
+    "snapshot": "visual-layout.json",
+    "expect_ref": "e0",
+    "min_score": 0.4,
+    "tags": ["visual", "position", "directional"]
+  },
+  {
+    "id": "visual-003",
+    "query": "button at bottom",
+    "snapshot": "visual-layout.json",
+    "expect_ref": "e7",
+    "min_score": 0.4,
+    "tags": ["visual", "position", "directional"]
+  },
+  {
+    "id": "visual-004",
+    "query": "link on left side",
+    "snapshot": "visual-layout.json",
+    "expect_ref": "e3",
+    "min_score": 0.4,
+    "tags": ["visual", "position", "directional"]
+  },
+  {
+    "id": "visual-005",
+    "query": "top left menu button",
+    "snapshot": "visual-layout.json",
+    "expect_ref": "e0",
+    "min_score": 0.5,
+    "tags": ["visual", "position", "compound"]
+  },
+  {
+    "id": "visual-006",
+    "query": "settings in upper right corner",
+    "snapshot": "visual-layout.json",
+    "expect_ref": "e1",
+    "min_score": 0.5,
+    "tags": ["visual", "position", "name-match"]
+  }
+]
diff --git a/tests/benchmark/corpus/visual-layout/queries.json b/tests/benchmark/corpus/visual-layout/queries.json
new file mode 100644
index 0000000..6cb0ca2
--- /dev/null
+++ b/tests/benchmark/corpus/visual-layout/queries.json
@@ -0,0 +1,50 @@
+[
+  {
+    "id": "visual-001",
+    "query": "button in top right corner",
+    "relevant_refs": ["e1"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["visual", "position", "directional"]
+  },
+  {
+    "id": "visual-002",
+    "query": "button on the left side",
+    "relevant_refs": ["e0"],
+    "partially_relevant_refs": ["e3", "e4"],
+    "difficulty": "medium",
+    "tags": ["visual", "position", "directional"]
+  },
+  {
+    "id": "visual-003",
+    "query": "button at the bottom of the page",
+    "relevant_refs": ["e6", "e7"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["visual", "position", "directional"]
+  },
+  {
+    "id": "visual-004",
+    "query": "link on the left",
+    "relevant_refs": ["e3", "e4"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["visual", "position", "link"]
+  },
+  {
+    "id": "visual-005",
+    "query": "search box in the header",
+    "relevant_refs": ["e2"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["visual", "section", "search"]
+  },
+  {
+    "id": "visual-006",
+    "query": "settings button top right",
+    "relevant_refs": ["e1"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["visual", "position", "name-match"]
+  }
+]
diff --git a/tests/benchmark/corpus/visual-layout/snapshot.json b/tests/benchmark/corpus/visual-layout/snapshot.json
new file mode 100644
index 0000000..5ee983e
--- /dev/null
+++ b/tests/benchmark/corpus/visual-layout/snapshot.json
@@ -0,0 +1,10 @@
+[
+  {"ref": "e0", "role": "button", "name": "Menu", "interactive": true, "section": "Header", "positional": {"x": 20, "y": 20, "width": 80, "height": 32}},
+  {"ref": "e1", "role": "button", "name": "Settings", "interactive": true, "section": "Header", "positional": {"x": 900, "y": 20, "width": 80, "height": 32}},
+  {"ref": "e2", "role": "searchbox", "name": "Search", "interactive": true, "section": "Header", "positional": {"x": 400, "y": 20, "width": 200, "height": 32}},
+  {"ref": "e3", "role": "link", "name": "Help", "interactive": true, "section": "Sidebar", "positional": {"x": 20, "y": 300, "width": 100, "height": 24}},
+  {"ref": "e4", "role": "link", "name": "Contact", "interactive": true, "section": "Sidebar", "positional": {"x": 20, "y": 340, "width": 100, "height": 24}},
+  {"ref": "e5", "role": "button", "name": "Submit", "interactive": true, "section": "Main", "positional": {"x": 500, "y": 400, "width": 120, "height": 40}},
+  {"ref": "e6", "role": "button", "name": "Cancel", "interactive": true, "section": "Footer", "positional": {"x": 400, "y": 700, "width": 80, "height": 32}},
+  {"ref": "e7", "role": "button", "name": "Save", "interactive": true, "section": "Footer", "positional": {"x": 500, "y": 700, "width": 80, "height": 32}}
+]
diff --git a/tests/e2e/assets/snapshots/visual-layout.json b/tests/e2e/assets/snapshots/visual-layout.json
new file mode 100644
index 0000000..5ee983e
--- /dev/null
+++ b/tests/e2e/assets/snapshots/visual-layout.json
@@ -0,0 +1,10 @@
+[
+  {"ref": "e0", "role": "button", "name": "Menu", "interactive": true, "section": "Header", "positional": {"x": 20, "y": 20, "width": 80, "height": 32}},
+  {"ref": "e1", "role": "button", "name": "Settings", "interactive": true, "section": "Header", "positional": {"x": 900, "y": 20, "width": 80, "height": 32}},
+  {"ref": "e2", "role": "searchbox", "name": "Search", "interactive": true, "section": "Header", "positional": {"x": 400, "y": 20, "width": 200, "height": 32}},
+  {"ref": "e3", "role": "link", "name": "Help", "interactive": true, "section": "Sidebar", "positional": {"x": 20, "y": 300, "width": 100, "height": 24}},
+  {"ref": "e4", "role": "link", "name": "Contact", "interactive": true, "section": "Sidebar", "positional": {"x": 20, "y": 340, "width": 100, "height": 24}},
+  {"ref": "e5", "role": "button", "name": "Submit", "interactive": true, "section": "Main", "positional": {"x": 500, "y": 400, "width": 120, "height": 40}},
+  {"ref": "e6", "role": "button", "name": "Cancel", "interactive": true, "section": "Footer", "positional": {"x": 400, "y": 700, "width": 80, "height": 32}},
+  {"ref": "e7", "role": "button", "name": "Save", "interactive": true, "section": "Footer", "positional": {"x": 500, "y": 700, "width": 80, "height": 32}}
+]
diff --git a/tests/e2e/cases/15-find-visual.sh b/tests/e2e/cases/15-find-visual.sh
new file mode 100755
index 0000000..42ff027
--- /dev/null
+++ b/tests/e2e/cases/15-find-visual.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+CASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${CASE_DIR}/../lib.sh"
+
+echo "  -- Find: Visual Position Hints --"
+
+VISUAL="${ASSETS_DIR}/snapshots/visual-layout.json"
+
+result=$(semantic find "button in top right" --snapshot "$VISUAL" --format json)
+assert_json_field "$result" ".best_ref" "e1" "visual: button in top right → e1 (Settings)"
+
+result=$(semantic find "button on the left" --snapshot "$VISUAL" --format json)
+assert_json_field "$result" ".best_ref" "e0" "visual: button on the left → e0 (Menu)"
+
+result=$(semantic find "button at the bottom" --snapshot "$VISUAL" --format json)
+assert_json_field "$result" ".best_ref" "e7" "visual: button at the bottom → e7 (Save)"
+
+result=$(semantic find "link on left side" --snapshot "$VISUAL" --format json)
+assert_json_field "$result" ".best_ref" "e3" "visual: link on left side → e3 (Help)"
+
+summary "find-visual"

From e3e963f810b5f4636c1ecb68761b9b60473ebc54 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Thu, 23 Apr 2026 19:40:48 +0100
Subject: [PATCH 13/30] chore: small cleanup of bench scripts

---
 tests/benchmark/scripts/finalize-report.sh    |  2 +-
 tests/benchmark/scripts/run-benchmark.sh      | 37 +++++++++++++++----
 .../benchmark/scripts/run-corpus-benchmark.sh | 22 ++++++++++-
 tests/benchmark/scripts/run-full-benchmark.sh | 35 ++++++++++++++----
 4 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/tests/benchmark/scripts/finalize-report.sh b/tests/benchmark/scripts/finalize-report.sh
index 632a923..38d314f 100755
--- a/tests/benchmark/scripts/finalize-report.sh
+++ b/tests/benchmark/scripts/finalize-report.sh
@@ -18,7 +18,7 @@ SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
 # Calculate final metrics
 TMP_FILE=$(mktemp)
 jq '
-    .summary.accuracy = (if .summary.total > 0 then (.summary.passed / .summary.total * 100 | floor / 100) else 0 end) |
+    .summary.accuracy = (if .summary.total > 0 then (.summary.passed / .summary.total * 10000 | floor / 100) else 0 end) |
     .summary.avg_score = (if (.results | length) > 0 then ([.results[].score] | add / length | . * 1000 | floor / 1000) else 0 end) |
     .summary.avg_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | add / length | floor) else 0 end) |
     .summary.min_score = (if (.results | length) > 0 then ([.results[].score] | min) else 0 end) |
diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh
index a8b4492..4ce67d6 100755
--- a/tests/benchmark/scripts/run-benchmark.sh
+++ b/tests/benchmark/scripts/run-benchmark.sh
@@ -6,7 +6,7 @@
 #   ./run-benchmark.sh [--strategy <name>] [--cases <file>]
 #
 # Options:
-#   --strategy <name>   Strategy to benchmark (lexical, embedding, combined, all)
+#   --strategy <name>   Strategy to benchmark (lexical, embedding, combined)
 #   --cases <file>      Specific case file to run (default: all)
 #   --output <dir>      Output directory (default: ../results)
 #
@@ -31,6 +31,11 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+case "${STRATEGY}" in
+    lexical|embedding|combined) ;;
+    *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;;
+esac
+
 mkdir -p "${RESULTS_DIR}"
 
 # Build semantic binary
@@ -65,6 +70,12 @@ jq -n \
     }' > "${REPORT_FILE}"
 
 # Run cases
+score_at_least() {
+    # Usage: score_at_least SCORE MIN_SCORE
+    # Exit status is 0 when SCORE >= MIN_SCORE and 1 otherwise; "+ 0" makes awk compare as numbers.
+    awk -v score="$1" -v min_score="$2" 'BEGIN { exit (score + 0 >= min_score + 0) ? 0 : 1 }'
+}
+
 run_case() {
     local case_file="$1"
     local case_name
@@ -142,18 +153,30 @@ run_case() {
                 fi
             elif [[ "${expect_has_matches}" == "true" ]]; then
                 if [[ ${match_count} -gt 0 ]]; then
-                    status="pass"
-                    notes="${match_count} matches"
+                    if score_at_least "${got_score}" "${min_score}"; then
+                        status="pass"
+                        notes="${match_count} matches, score=${got_score}"
+                    else
+                        notes="${match_count} matches, score=${got_score} below min_score=${min_score}"
+                    fi
                 else
                     notes="expected matches, got 0"
                 fi
             elif [[ -n "${expect_ref}" ]]; then
                 if [[ "${got_ref}" == "${expect_ref}" ]]; then
-                    status="pass"
-                    notes="ref=${got_ref}, score=${got_score}"
+                    if score_at_least "${got_score}" "${min_score}"; then
+                        status="pass"
+                        notes="ref=${got_ref}, score=${got_score}"
+                    else
+                        notes="ref=${got_ref}, score=${got_score} below min_score=${min_score}"
+                    fi
                 elif [[ -n "${expect_ref_alt}" ]] && echo ",${expect_ref_alt}," | grep -q ",${got_ref},"; then
-                    status="pass"
-                    notes="ref=${got_ref} (alt), score=${got_score}"
+                    if score_at_least "${got_score}" "${min_score}"; then
+                        status="pass"
+                        notes="ref=${got_ref} (alt), score=${got_score}"
+                    else
+                        notes="ref=${got_ref} (alt), score=${got_score} below min_score=${min_score}"
+                    fi
                 else
                     notes="got ${got_ref}, want ${expect_ref}"
                 fi
diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh
index 67aca0c..44b97d6 100755
--- a/tests/benchmark/scripts/run-corpus-benchmark.sh
+++ b/tests/benchmark/scripts/run-corpus-benchmark.sh
@@ -31,6 +31,11 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+case "${STRATEGY}" in
+    lexical|embedding|combined) ;;
+    *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;;
+esac
+
 mkdir -p "${RESULTS_DIR}"
 
 # Build semantic binary
@@ -80,6 +85,9 @@ run_corpus() {
     local queries="${corpus_path}/queries.json"
 
     if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then
+        if [[ -f "${corpus_path}/cases.json" ]] || [[ -f "${corpus_path}/scenarios.json" ]]; then
+            return
+        fi
         echo "  Skipping ${corpus_name}: missing files"
         return
     fi
@@ -104,12 +112,22 @@ run_corpus() {
         local start_ns end_ns duration_ms result
         start_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))')
 
-        result=$("${SEMANTIC}" find "${query}" \
+        if ! result=$("${SEMANTIC}" find "${query}" \
             --snapshot "${snapshot}" \
             --strategy "${STRATEGY}" \
             --threshold 0.01 \
             --top-k "${TOP_K}" \
-            --format json 2>/dev/null || echo '{"matches":[]}')
+            --format json 2>&1); then
+            echo "  [${id}] ERROR: semantic find failed for query: ${query}" >&2
+            echo "${result}" >&2
+            exit 1
+        fi
+
+        if ! echo "$result" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then
+            echo "  [${id}] ERROR: semantic find returned invalid JSON" >&2
+            echo "${result}" >&2
+            exit 1
+        fi
 
         end_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))')
         duration_ms=$(( (end_ns - start_ns) / 1000 ))
diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh
index 89db077..eadaad7 100755
--- a/tests/benchmark/scripts/run-full-benchmark.sh
+++ b/tests/benchmark/scripts/run-full-benchmark.sh
@@ -61,11 +61,16 @@ echo "=============================================="
 FIND_OUTPUT=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" 2>&1)
 echo "$FIND_OUTPUT"
 
-# Extract metrics from output
-FIND_MRR=$(echo "$FIND_OUTPUT" | grep "MRR:" | tail -1 | awk '{print $2}')
-FIND_P1=$(echo "$FIND_OUTPUT" | grep "P@1:" | tail -1 | awk '{print $2}')
-FIND_TOTAL=$(echo "$FIND_OUTPUT" | grep "Queries:" | tail -1 | awk '{print $2}')
-FIND_LAT=$(echo "$FIND_OUTPUT" | grep "Latency P50:" | tail -1 | awk '{print $3}')
+# Extract metrics from the corpus report rather than the human-readable output.
+FIND_REPORT=$(echo "$FIND_OUTPUT" | awk '/^Report:/ {print $2}' | tail -1)
+if [[ -z "${FIND_REPORT}" ]] || [[ ! -f "${FIND_REPORT}" ]]; then
+    echo "error: could not locate corpus benchmark report" >&2
+    exit 1
+fi
+FIND_MRR=$(jq -r '.metrics.mrr' "$FIND_REPORT")
+FIND_P1=$(jq -r '.metrics.p_at_1' "$FIND_REPORT")
+FIND_TOTAL=$(jq -r '.metrics.total' "$FIND_REPORT")
+FIND_LAT=$(jq -r '.metrics.latency_p50_ms' "$FIND_REPORT")
 
 # Rebuild semantic binary (corpus benchmark deletes it)
 (cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
@@ -99,7 +104,18 @@ if [[ -f "$SCENARIOS_FILE" ]]; then
 
         # Run semantic find on after snapshot with the same minimum score
         # enforced by DefaultRecoveryConfig in the recovery engine.
-        RESULT=$("${SEMANTIC}" find "$QUERY" --snapshot "$AFTER_FILE" --format json --threshold 0.52 2>/dev/null || echo '{"matches":[]}')
+        if ! RESULT=$("${SEMANTIC}" find "$QUERY" --snapshot "$AFTER_FILE" --format json --threshold 0.52 2>&1); then
+            echo "  [$ID] ERROR: semantic find failed during recovery benchmark" >&2
+            echo "$RESULT" >&2
+            rm -f "$AFTER_FILE"
+            exit 1
+        fi
+        if ! echo "$RESULT" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then
+            echo "  [$ID] ERROR: semantic find returned invalid JSON during recovery benchmark" >&2
+            echo "$RESULT" >&2
+            rm -f "$AFTER_FILE"
+            exit 1
+        fi
         BEST_REF=$(echo "$RESULT" | jq -r '.best_ref // ""')
 
         rm -f "$AFTER_FILE"
@@ -150,7 +166,11 @@ if [[ -f "$CLASS_FILE" ]]; then
         EXPECTED=$(jq -r ".[$i].expected_type" "$CLASS_FILE")
 
         # Run semantic classify (extract just the type, first word)
-        RESULT=$("${SEMANTIC}" classify "$ERROR" 2>/dev/null || echo "unknown")
+        if ! RESULT=$("${SEMANTIC}" classify "$ERROR" 2>&1); then
+            echo "  [$ID] ERROR: semantic classify failed" >&2
+            echo "$RESULT" >&2
+            exit 1
+        fi
         GOT=$(echo "$RESULT" | awk '{print $1}')
 
         CLASS_TOTAL=$((CLASS_TOTAL + 1))
@@ -189,6 +209,7 @@ COMPOSITE=$(echo "scale=4; \
     ($FIND_MRR * 0.20) + \
     ($RECOVERY_RATE * 0.25) + \
     ($CLASS_ACCURACY * 0.15)" | bc)
+COMPOSITE=$(awk -v value="$COMPOSITE" 'BEGIN { printf "%.4f", value }')
 
 # Assign grade
 GRADE="F"

From e8f07db4f6541756d118144ab5a7054dbe4f8830 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Thu, 23 Apr 2026 21:16:42 +0100
Subject: [PATCH 14/30] chore: expand benchmark corpus and add tuning tools

---
 cmd/semantic/main.go                          |  10 +-
 docs/reference/cli.md                         |   5 +
 tests/benchmark/README.md                     |  33 ++-
 tests/benchmark/cases/negative-threshold.json | 117 +++++++++
 tests/benchmark/cases/visual.json             |  12 +-
 tests/benchmark/corpus/README.md              |  24 ++
 .../ambiguous-layout-context/queries.json     | 138 +++++++++++
 .../ambiguous-layout-context/snapshot.json    |  35 +++
 .../corpus/form-state-controls/queries.json   | 146 +++++++++++
 .../corpus/form-state-controls/snapshot.json  |  54 +++++
 .../corpus/icon-aria-labels/queries.json      | 227 ++++++++++++++++++
 .../corpus/icon-aria-labels/snapshot.json     |  59 +++++
 .../implicit-domain-intent/queries.json       | 146 +++++++++++
 .../implicit-domain-intent/snapshot.json      |  58 +++++
 .../overlays-menus-dialogs/queries.json       | 218 +++++++++++++++++
 .../overlays-menus-dialogs/snapshot.json      |  60 +++++
 .../benchmark/corpus/table-grid/queries.json  | 219 +++++++++++++++++
 .../benchmark/corpus/table-grid/snapshot.json |  78 ++++++
 tests/benchmark/scripts/lint-corpus.sh        | 197 +++++++++++++++
 .../benchmark/scripts/run-corpus-benchmark.sh | 154 +++++++++++-
 tests/benchmark/scripts/tune-weights.sh       | 157 ++++++++++++
 21 files changed, 2121 insertions(+), 26 deletions(-)
 create mode 100644 tests/benchmark/cases/negative-threshold.json
 create mode 100644 tests/benchmark/corpus/ambiguous-layout-context/queries.json
 create mode 100644 tests/benchmark/corpus/ambiguous-layout-context/snapshot.json
 create mode 100644 tests/benchmark/corpus/form-state-controls/queries.json
 create mode 100644 tests/benchmark/corpus/form-state-controls/snapshot.json
 create mode 100644 tests/benchmark/corpus/icon-aria-labels/queries.json
 create mode 100644 tests/benchmark/corpus/icon-aria-labels/snapshot.json
 create mode 100644 tests/benchmark/corpus/implicit-domain-intent/queries.json
 create mode 100644 tests/benchmark/corpus/implicit-domain-intent/snapshot.json
 create mode 100644 tests/benchmark/corpus/overlays-menus-dialogs/queries.json
 create mode 100644 tests/benchmark/corpus/overlays-menus-dialogs/snapshot.json
 create mode 100644 tests/benchmark/corpus/table-grid/queries.json
 create mode 100644 tests/benchmark/corpus/table-grid/snapshot.json
 create mode 100755 tests/benchmark/scripts/lint-corpus.sh
 create mode 100755 tests/benchmark/scripts/tune-weights.sh

diff --git a/cmd/semantic/main.go b/cmd/semantic/main.go
index fe7ebdf..b99815c 100644
--- a/cmd/semantic/main.go
+++ b/cmd/semantic/main.go
@@ -56,6 +56,8 @@ Flags (find/match):
   --threshold <n>     Minimum score (default: 0.3)
   --top-k <n>         Max results (default: 3)
   --strategy <name>   lexical, embedding, or combined (default: combined)
+  --lexical-weight <n>   Combined strategy lexical weight override
+  --embedding-weight <n> Combined strategy embedding weight override
   --format <fmt>      json, table, or refs (default: table)
 `)
 }
@@ -209,6 +211,8 @@ func runFind(args []string) {
 	threshold := fs.Float64("threshold", 0.3, "minimum score")
 	topK := fs.Int("top-k", 3, "max results")
 	strategy := fs.String("strategy", "combined", "matching strategy")
+	lexicalWeight := fs.Float64("lexical-weight", 0, "combined strategy lexical weight override")
+	embeddingWeight := fs.Float64("embedding-weight", 0, "combined strategy embedding weight override")
 	format := fs.String("format", "table", "output format: json, table, refs")
 	_ = fs.Parse(args)
 
@@ -226,8 +230,10 @@ func runFind(args []string) {
 
 	matcher := newMatcher(*strategy)
 	result, err := matcher.Find(context.Background(), query, elements, semantic.FindOptions{
-		Threshold: *threshold,
-		TopK:      *topK,
+		Threshold:       *threshold,
+		TopK:            *topK,
+		LexicalWeight:   *lexicalWeight,
+		EmbeddingWeight: *embeddingWeight,
 	})
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "error: %v\n", err)
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
index 195496e..1e7155d 100644
--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -16,6 +16,8 @@ semantic find <query> [flags]
 | `--threshold` | 0.3 | Minimum score |
 | `--top-k` | 3 | Maximum results |
 | `--strategy` | combined | `combined`, `lexical`, or `embedding` |
+| `--lexical-weight` | 0 | Combined strategy lexical weight override |
+| `--embedding-weight` | 0 | Combined strategy embedding weight override |
 | `--format` | table | `table`, `json`, or `refs` |
 
 **Examples:**
@@ -31,6 +33,9 @@ curl -s localhost:9999/snapshot | semantic find "search box"
 # Machine-readable
 semantic find "login" --snapshot page.json --format json
 
+# Tune combined scoring
+semantic find "login" --snapshot page.json --lexical-weight 0.7 --embedding-weight 0.3
+
 # Just refs (for piping)
 semantic find "submit" --snapshot page.json --format refs
 
diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md
index c2526c1..531f679 100644
--- a/tests/benchmark/README.md
+++ b/tests/benchmark/README.md
@@ -14,6 +14,9 @@ cd tests/benchmark
 ./scripts/run-corpus-benchmark.sh --strategy lexical
 ./scripts/run-corpus-benchmark.sh --strategy embedding
 ./scripts/run-corpus-benchmark.sh --strategy combined
+
+# Sweep combined lexical/embedding weights
+./scripts/tune-weights.sh
 ```
 
 ## Metrics
@@ -55,25 +58,27 @@ corpus/
 ## Current Results (combined strategy)
 
 ```
-Queries:     50
-MRR:         0.88
-P@1:         0.87
-P@3:         0.34
-Latency P50: 31 ms
-Latency P95: 52 ms
+Queries:     105
+MRR:         0.8897
+P@1:         0.8762
+P@3:         0.3412
+Latency P50: 23 ms
+Latency P95: 28 ms
 
 By Difficulty:
-  easy:   34 queries, P@1 = 0.95
-  medium: 14 queries, P@1 = 0.78
-  hard:    2 queries, P@1 = 0.00
+  easy:   76 queries, P@1 = 0.94
+  medium: 25 queries, P@1 = 0.74
+  hard:    4 queries, P@1 = 0.50
 ```
 
 ## Optimization Targets
 
-The 6 current misses are "hard" cases requiring:
+The current misses cluster around:
 - Synonym expansion (save for later → wishlist)
 - Implicit actions (clone → Code button)
 - Domain knowledge (CI status → Actions tab)
+- Form/input intent (type new query → search box)
+- Accessibility/navigation shortcuts (skip to content, homepage)
 
 ## Scripts
 
@@ -81,6 +86,7 @@ The 6 current misses are "hard" cases requiring:
 |--------|---------|
 | `run-corpus-benchmark.sh` | Main benchmark with MRR/P@K metrics |
 | `run-benchmark.sh` | Simple pass/fail test runner |
+| `tune-weights.sh` | Grid search over the combined matcher's lexical/embedding weights |
 
 ## Adding to Corpus
 
@@ -95,6 +101,13 @@ The 6 current misses are "hard" cases requiring:
 
 3. Run benchmark to establish baseline
 
+4. Add several related queries for the same behavior, not one isolated case.
+   Include easy, medium, hard, and at least one near-miss or partial match where
+   ambiguity matters.
+
+5. Re-run `./scripts/tune-weights.sh` after larger corpus changes to see whether
+   the best combined weights moved.
+
 ## CI Integration
 
 ```yaml
diff --git a/tests/benchmark/cases/negative-threshold.json b/tests/benchmark/cases/negative-threshold.json
new file mode 100644
index 0000000..dece15f
--- /dev/null
+++ b/tests/benchmark/cases/negative-threshold.json
@@ -0,0 +1,117 @@
+[
+  {
+    "id": "neg-001",
+    "query": "xyzzy plugh qwerty",
+    "snapshot": "login-page.json",
+    "expect_no_match": true,
+    "threshold": 0.3,
+    "tags": ["no-match", "nonsense"]
+  },
+  {
+    "id": "neg-002",
+    "query": "upload spreadsheet to cloud",
+    "snapshot": "login-page.json",
+    "expect_no_match": true,
+    "threshold": 0.4,
+    "tags": ["no-match", "absent-control"]
+  },
+  {
+    "id": "neg-003",
+    "query": "open video player",
+    "snapshot": "dashboard.json",
+    "expect_no_match": true,
+    "threshold": 0.4,
+    "tags": ["no-match", "absent-control"]
+  },
+  {
+    "id": "neg-004",
+    "query": "print receipt",
+    "snapshot": "login-page.json",
+    "expect_no_match": true,
+    "threshold": 0.4,
+    "tags": ["no-match", "absent-control"]
+  },
+  {
+    "id": "neg-005",
+    "query": "submit button",
+    "snapshot": "multi-form.json",
+    "expect_ref": "e11",
+    "expect_ref_alt": ["e3", "e7"],
+    "threshold": 0.3,
+    "min_score": 0.5,
+    "tags": ["threshold", "duplicate-labels"]
+  },
+  {
+    "id": "neg-006",
+    "query": "enter",
+    "snapshot": "login-page.json",
+    "expect_has_matches": true,
+    "threshold": 0.1,
+    "min_score": 0.15,
+    "tags": ["threshold", "weak-match"]
+  },
+  {
+    "id": "neg-007",
+    "query": "click",
+    "snapshot": "ecommerce-product.json",
+    "expect_has_matches": true,
+    "threshold": 0.1,
+    "tags": ["threshold", "generic-verb"]
+  },
+  {
+    "id": "neg-008",
+    "query": "the thing",
+    "snapshot": "dashboard.json",
+    "expect_has_matches": true,
+    "threshold": 0.05,
+    "tags": ["threshold", "vague-query"]
+  },
+  {
+    "id": "neg-009",
+    "query": "asdfghjkl",
+    "snapshot": "multi-form.json",
+    "expect_no_match": true,
+    "threshold": 0.3,
+    "tags": ["no-match", "keyboard-mash"]
+  },
+  {
+    "id": "neg-010",
+    "query": "stale element e999",
+    "snapshot": "login-page.json",
+    "expect_no_match": true,
+    "threshold": 0.3,
+    "tags": ["no-match", "stale-ref"]
+  },
+  {
+    "id": "neg-011",
+    "query": "a b c d e f",
+    "snapshot": "dashboard.json",
+    "expect_no_crash": true,
+    "threshold": 0.3,
+    "tags": ["threshold", "noise-tokens"]
+  },
+  {
+    "id": "neg-012",
+    "query": "configure webhook endpoint",
+    "snapshot": "login-page.json",
+    "expect_no_match": true,
+    "threshold": 0.4,
+    "tags": ["no-match", "absent-control", "domain-intent"]
+  },
+  {
+    "id": "neg-013",
+    "query": "invoice download",
+    "snapshot": "ecommerce-product.json",
+    "expect_no_match": true,
+    "threshold": 0.4,
+    "tags": ["no-match", "absent-control"]
+  },
+  {
+    "id": "neg-014",
+    "query": "share on twitter",
+    "snapshot": "login-page.json",
+    "expect_no_match": true,
+    "threshold": 0.4,
+    "tags": ["no-match", "absent-control"]
+  }
+]
diff --git a/tests/benchmark/cases/visual.json b/tests/benchmark/cases/visual.json
index 81add09..3a12219 100644
--- a/tests/benchmark/cases/visual.json
+++ b/tests/benchmark/cases/visual.json
@@ -1,6 +1,6 @@
 [
   {
-    "id": "visual-001",
+    "id": "vcase-001",
     "query": "button in top right",
     "snapshot": "visual-layout.json",
     "expect_ref": "e1",
@@ -8,7 +8,7 @@
     "tags": ["visual", "position", "directional"]
   },
   {
-    "id": "visual-002",
+    "id": "vcase-002",
     "query": "button on the left",
     "snapshot": "visual-layout.json",
     "expect_ref": "e0",
@@ -16,7 +16,7 @@
     "tags": ["visual", "position", "directional"]
   },
   {
-    "id": "visual-003",
+    "id": "vcase-003",
     "query": "button at bottom",
     "snapshot": "visual-layout.json",
     "expect_ref": "e7",
@@ -24,7 +24,7 @@
     "tags": ["visual", "position", "directional"]
   },
   {
-    "id": "visual-004",
+    "id": "vcase-004",
     "query": "link on left side",
     "snapshot": "visual-layout.json",
     "expect_ref": "e3",
@@ -32,7 +32,7 @@
     "tags": ["visual", "position", "directional"]
   },
   {
-    "id": "visual-005",
+    "id": "vcase-005",
     "query": "top left menu button",
     "snapshot": "visual-layout.json",
     "expect_ref": "e0",
@@ -40,7 +40,7 @@
     "tags": ["visual", "position", "compound"]
   },
   {
-    "id": "visual-006",
+    "id": "vcase-006",
     "query": "settings in upper right corner",
     "snapshot": "visual-layout.json",
     "expect_ref": "e1",
diff --git a/tests/benchmark/corpus/README.md b/tests/benchmark/corpus/README.md
index 37c353e..c1defda 100644
--- a/tests/benchmark/corpus/README.md
+++ b/tests/benchmark/corpus/README.md
@@ -38,6 +38,30 @@ Each corpus entry is a directory containing:
 - **P@3**: How many of top-3 are relevant?
 - **Margin**: Score gap between relevant and irrelevant
 
+## Expansion Groups
+
+### Expansion 1: Complex Query Patterns (2026-04)
+
+Added corpora for underrepresented query types:
+
+- **implicit-domain-intent/**: GitHub-like repo page with 56 elements. Tests implicit intents like "clone this repo", "check CI status", "switch branch", "save for later". 18 queries, 8 hard.
+
+- **form-state-controls/**: Settings page with checkboxes, radios, toggles, comboboxes. Tests stateful controls like "keep me logged in", "enable 2FA", "subscribe to newsletter". 18 queries, 7 hard.
+
+- **ambiguous-layout-context/**: Multi-section page with duplicate labels (4x Search: 3 searchboxes + 1 button, 2x Save, 2x Cancel, 2x Login, 2x Home, 2x Help). Tests positional and section disambiguation. 17 queries, 7 hard.
+
+Also added `tests/benchmark/cases/negative-threshold.json` with 14 no-match and threshold calibration cases.
+
+### Expansion 2: Enterprise UI Patterns (2026-04)
+
+Added corpora for complex enterprise UI scenarios:
+
+- **table-grid/**: Invoice table with 50+ elements. Tests row-level context, repeated buttons (Edit, Delete, More), ordinal references ("second invoice", "last row"), and bulk operations. 24 queries, 8 hard.
+
+- **overlays-menus-dialogs/**: Multi-layer UI with modal dialogs, dropdown menus, context menus, notifications. Tests duplicate controls across scopes ("cancel in modal", "save on page not dialog"), menu item selection, and overlay disambiguation. 24 queries, 8 hard.
+
+- **icon-aria-labels/**: Icon-only controls across toolbar, media player, navigation. Tests sparse accessible names, icon descriptions ("kebab menu", "hamburger", "pencil edit"), and section context for repeated icons. 25 queries, 6 hard.
+
 ## Sources
 
 Snapshots should be captured from real websites using pinchtab:
diff --git a/tests/benchmark/corpus/ambiguous-layout-context/queries.json b/tests/benchmark/corpus/ambiguous-layout-context/queries.json
new file mode 100644
index 0000000..458fe8a
--- /dev/null
+++ b/tests/benchmark/corpus/ambiguous-layout-context/queries.json
@@ -0,0 +1,138 @@
+[
+  {
+    "id": "alc-001",
+    "query": "header search box",
+    "relevant_refs": ["e1"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "section", "position"]
+  },
+  {
+    "id": "alc-002",
+    "query": "sidebar search",
+    "relevant_refs": ["e6"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "section", "position"]
+  },
+  {
+    "id": "alc-003",
+    "query": "search on the left",
+    "relevant_refs": ["e6"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["duplicate-labels", "visual", "position"]
+  },
+  {
+    "id": "alc-004",
+    "query": "search in top right area",
+    "relevant_refs": ["e1"],
+    "partially_relevant_refs": ["e2"],
+    "difficulty": "hard",
+    "tags": ["duplicate-labels", "visual", "position"]
+  },
+  {
+    "id": "alc-005",
+    "query": "save button in profile",
+    "relevant_refs": ["e18"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "section", "context-exclusion"]
+  },
+  {
+    "id": "alc-006",
+    "query": "save in billing section",
+    "relevant_refs": ["e22"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "section", "context-exclusion"]
+  },
+  {
+    "id": "alc-007",
+    "query": "second save button",
+    "relevant_refs": ["e22"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "ordinal"]
+  },
+  {
+    "id": "alc-008",
+    "query": "cancel button below password",
+    "relevant_refs": ["e19"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["duplicate-labels", "visual", "position"]
+  },
+  {
+    "id": "alc-009",
+    "query": "login link not in footer",
+    "relevant_refs": ["e3"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["duplicate-labels", "context-exclusion"]
+  },
+  {
+    "id": "alc-010",
+    "query": "footer login link",
+    "relevant_refs": ["e29"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "section"]
+  },
+  {
+    "id": "alc-011",
+    "query": "home link in sidebar",
+    "relevant_refs": ["e7"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "section"]
+  },
+  {
+    "id": "alc-012",
+    "query": "home link at bottom",
+    "relevant_refs": ["e25"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "visual", "position"]
+  },
+  {
+    "id": "alc-013",
+    "query": "help in sidebar not footer",
+    "relevant_refs": ["e10"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["duplicate-labels", "context-exclusion"]
+  },
+  {
+    "id": "alc-014",
+    "query": "main content save button",
+    "relevant_refs": ["e18"],
+    "partially_relevant_refs": ["e22"],
+    "difficulty": "hard",
+    "tags": ["duplicate-labels", "section"]
+  },
+  {
+    "id": "alc-015",
+    "query": "first cancel button",
+    "relevant_refs": ["e19"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["duplicate-labels", "ordinal"]
+  },
+  {
+    "id": "alc-016",
+    "query": "submit at the bottom of the page",
+    "relevant_refs": ["e32"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["visual", "position"]
+  },
+  {
+    "id": "alc-017",
+    "query": "input field below username",
+    "relevant_refs": ["e16"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["visual", "position"]
+  }
+]
diff --git a/tests/benchmark/corpus/ambiguous-layout-context/snapshot.json b/tests/benchmark/corpus/ambiguous-layout-context/snapshot.json
new file mode 100644
index 0000000..6d5a5aa
--- /dev/null
+++ b/tests/benchmark/corpus/ambiguous-layout-context/snapshot.json
@@ -0,0 +1,35 @@
+[
+  {"ref": "e0", "role": "banner", "name": "Header", "interactive": false, "section": "Header"},
+  {"ref": "e1", "role": "searchbox", "name": "Search", "interactive": true, "section": "Header", "parent": "Header search", "positional": {"x": 200, "y": 20, "width": 300, "height": 36}},
+  {"ref": "e2", "role": "button", "name": "Search", "interactive": true, "section": "Header", "parent": "Header search", "positional": {"x": 510, "y": 20, "width": 80, "height": 36}},
+  {"ref": "e3", "role": "link", "name": "Login", "interactive": true, "section": "Header", "parent": "Header actions", "positional": {"x": 850, "y": 20, "width": 60, "height": 36}},
+  {"ref": "e4", "role": "link", "name": "Sign up", "interactive": true, "section": "Header", "parent": "Header actions", "positional": {"x": 920, "y": 20, "width": 70, "height": 36}},
+  {"ref": "e5", "role": "navigation", "name": "Sidebar", "interactive": false, "section": "Sidebar", "positional": {"x": 0, "y": 80, "width": 200, "height": 600}},
+  {"ref": "e6", "role": "searchbox", "name": "Search", "interactive": true, "section": "Sidebar", "parent": "Sidebar search", "positional": {"x": 10, "y": 100, "width": 180, "height": 32}},
+  {"ref": "e7", "role": "link", "name": "Home", "interactive": true, "section": "Sidebar", "parent": "Main nav", "positional": {"x": 10, "y": 150, "width": 180, "height": 32}},
+  {"ref": "e8", "role": "link", "name": "Products", "interactive": true, "section": "Sidebar", "parent": "Main nav", "positional": {"x": 10, "y": 190, "width": 180, "height": 32}},
+  {"ref": "e9", "role": "link", "name": "Settings", "interactive": true, "section": "Sidebar", "parent": "Main nav", "positional": {"x": 10, "y": 230, "width": 180, "height": 32}},
+  {"ref": "e10", "role": "link", "name": "Help", "interactive": true, "section": "Sidebar", "parent": "Secondary nav", "positional": {"x": 10, "y": 600, "width": 180, "height": 32}},
+  {"ref": "e11", "role": "main", "name": "Main content", "interactive": false, "section": "Main", "positional": {"x": 220, "y": 80, "width": 780, "height": 600}},
+  {"ref": "e12", "role": "heading", "name": "Welcome", "interactive": false, "section": "Main", "parent": "Hero"},
+  {"ref": "e13", "role": "button", "name": "Get Started", "interactive": true, "section": "Main", "parent": "Hero", "positional": {"x": 400, "y": 200, "width": 120, "height": 40}},
+  {"ref": "e14", "role": "heading", "name": "Profile Settings", "interactive": false, "section": "Main", "parent": "Profile form"},
+  {"ref": "e15", "role": "textbox", "name": "Username", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 300, "y": 300, "width": 300, "height": 36}},
+  {"ref": "e16", "role": "textbox", "name": "Email", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 300, "y": 350, "width": 300, "height": 36}},
+  {"ref": "e17", "role": "textbox", "name": "Password", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 300, "y": 400, "width": 300, "height": 36}},
+  {"ref": "e18", "role": "button", "name": "Save", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 300, "y": 460, "width": 100, "height": 40}},
+  {"ref": "e19", "role": "button", "name": "Cancel", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 420, "y": 460, "width": 100, "height": 40}},
+  {"ref": "e20", "role": "heading", "name": "Billing Details", "interactive": false, "section": "Main", "parent": "Billing form"},
+  {"ref": "e21", "role": "textbox", "name": "Card number", "interactive": true, "section": "Main", "parent": "Billing form", "positional": {"x": 300, "y": 540, "width": 300, "height": 36}},
+  {"ref": "e22", "role": "button", "name": "Save", "interactive": true, "section": "Main", "parent": "Billing form", "positional": {"x": 300, "y": 600, "width": 100, "height": 40}},
+  {"ref": "e23", "role": "button", "name": "Cancel", "interactive": true, "section": "Main", "parent": "Billing form", "positional": {"x": 420, "y": 600, "width": 100, "height": 40}},
+  {"ref": "e24", "role": "contentinfo", "name": "Footer", "interactive": false, "section": "Footer", "positional": {"x": 0, "y": 700, "width": 1000, "height": 100}},
+  {"ref": "e25", "role": "link", "name": "Home", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 50, "y": 720, "width": 60, "height": 24}},
+  {"ref": "e26", "role": "link", "name": "Privacy", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 130, "y": 720, "width": 60, "height": 24}},
+  {"ref": "e27", "role": "link", "name": "Terms", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 210, "y": 720, "width": 60, "height": 24}},
+  {"ref": "e28", "role": "link", "name": "Contact", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 290, "y": 720, "width": 70, "height": 24}},
+  {"ref": "e29", "role": "link", "name": "Login", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 380, "y": 720, "width": 50, "height": 24}},
+  {"ref": "e30", "role": "link", "name": "Help", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 450, "y": 720, "width": 50, "height": 24}},
+  {"ref": "e31", "role": "searchbox", "name": "Search", "interactive": true, "section": "Footer", "parent": "Footer search", "positional": {"x": 700, "y": 720, "width": 200, "height": 32}},
+  {"ref": "e32", "role": "button", "name": "Submit", "interactive": true, "section": "Footer", "parent": "Footer newsletter", "positional": {"x": 700, "y": 760, "width": 100, "height": 32}}
+]
diff --git a/tests/benchmark/corpus/form-state-controls/queries.json b/tests/benchmark/corpus/form-state-controls/queries.json
new file mode 100644
index 0000000..7fec46c
--- /dev/null
+++ b/tests/benchmark/corpus/form-state-controls/queries.json
@@ -0,0 +1,146 @@
+[
+  {
+    "id": "fsc-001",
+    "query": "keep me logged in",
+    "relevant_refs": ["e10"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["implicit", "checkbox", "state"]
+  },
+  {
+    "id": "fsc-002",
+    "query": "remember me checkbox",
+    "relevant_refs": ["e10"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["checkbox", "state"]
+  },
+  {
+    "id": "fsc-003",
+    "query": "subscribe to newsletter",
+    "relevant_refs": ["e15"],
+    "partially_relevant_refs": ["e14", "e16"],
+    "difficulty": "hard",
+    "tags": ["implicit", "checkbox", "state"]
+  },
+  {
+    "id": "fsc-004",
+    "query": "opt out of marketing",
+    "relevant_refs": ["e15"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["implicit", "checkbox", "state"]
+  },
+  {
+    "id": "fsc-005",
+    "query": "enable 2FA",
+    "relevant_refs": ["e11"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["implicit", "checkbox", "state", "domain-intent"]
+  },
+  {
+    "id": "fsc-006",
+    "query": "use same address for shipping",
+    "relevant_refs": ["e28"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["implicit", "checkbox", "state"]
+  },
+  {
+    "id": "fsc-007",
+    "query": "select payment method",
+    "relevant_refs": ["e24", "e25", "e26"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["radio", "state"]
+  },
+  {
+    "id": "fsc-008",
+    "query": "pay with PayPal",
+    "relevant_refs": ["e25"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["radio", "state"]
+  },
+  {
+    "id": "fsc-009",
+    "query": "change my country",
+    "relevant_refs": ["e33"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["combobox", "state"]
+  },
+  {
+    "id": "fsc-010",
+    "query": "select language preference",
+    "relevant_refs": ["e35"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["combobox", "state"]
+  },
+  {
+    "id": "fsc-011",
+    "query": "turn on push notifications",
+    "relevant_refs": ["e17"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["toggle", "switch", "state"]
+  },
+  {
+    "id": "fsc-012",
+    "query": "enable text alerts",
+    "relevant_refs": ["e18"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["implicit", "toggle", "switch", "state"]
+  },
+  {
+    "id": "fsc-013",
+    "query": "export my data",
+    "relevant_refs": ["e42"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["domain-intent", "link"]
+  },
+  {
+    "id": "fsc-014",
+    "query": "delete my account",
+    "relevant_refs": ["e43"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["domain-intent", "link"]
+  },
+  {
+    "id": "fsc-015",
+    "query": "show advanced settings",
+    "relevant_refs": ["e44"],
+    "partially_relevant_refs": ["e45"],
+    "difficulty": "medium",
+    "tags": ["implicit", "button", "state"]
+  },
+  {
+    "id": "fsc-016",
+    "query": "enable beta features",
+    "relevant_refs": ["e47"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["checkbox", "state"]
+  },
+  {
+    "id": "fsc-017",
+    "query": "hide my profile from search",
+    "relevant_refs": ["e40"],
+    "partially_relevant_refs": ["e39"],
+    "difficulty": "hard",
+    "tags": ["implicit", "checkbox", "state"]
+  },
+  {
+    "id": "fsc-018",
+    "query": "stop sharing data with partners",
+    "relevant_refs": ["e41"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["implicit", "checkbox", "state"]
+  }
+]
diff --git a/tests/benchmark/corpus/form-state-controls/snapshot.json b/tests/benchmark/corpus/form-state-controls/snapshot.json
new file mode 100644
index 0000000..6819953
--- /dev/null
+++ b/tests/benchmark/corpus/form-state-controls/snapshot.json
@@ -0,0 +1,54 @@
+[
+  {"ref": "e0", "role": "heading", "name": "Account Settings", "interactive": false, "section": "Header"},
+  {"ref": "e1", "role": "link", "name": "Profile", "interactive": true, "section": "Settings nav", "parent": "Navigation"},
+  {"ref": "e2", "role": "link", "name": "Security", "interactive": true, "section": "Settings nav", "parent": "Navigation"},
+  {"ref": "e3", "role": "link", "name": "Notifications", "interactive": true, "section": "Settings nav", "parent": "Navigation"},
+  {"ref": "e4", "role": "link", "name": "Billing", "interactive": true, "section": "Settings nav", "parent": "Navigation"},
+  {"ref": "e5", "role": "link", "name": "Privacy", "interactive": true, "section": "Settings nav", "parent": "Navigation"},
+  {"ref": "e6", "role": "heading", "name": "Login & Security", "interactive": false, "section": "Security settings"},
+  {"ref": "e7", "role": "textbox", "name": "Current password", "interactive": true, "section": "Security settings", "parent": "Change password"},
+  {"ref": "e8", "role": "textbox", "name": "New password", "interactive": true, "section": "Security settings", "parent": "Change password"},
+  {"ref": "e9", "role": "textbox", "name": "Confirm password", "interactive": true, "section": "Security settings", "parent": "Change password"},
+  {"ref": "e10", "role": "checkbox", "name": "Remember me on this device", "interactive": true, "section": "Security settings", "parent": "Session options"},
+  {"ref": "e11", "role": "checkbox", "name": "Enable two-factor authentication", "interactive": true, "section": "Security settings", "parent": "Security options"},
+  {"ref": "e12", "role": "button", "name": "Update password", "interactive": true, "section": "Security settings", "parent": "Change password"},
+  {"ref": "e13", "role": "heading", "name": "Notification Preferences", "interactive": false, "section": "Notification settings"},
+  {"ref": "e14", "role": "checkbox", "name": "Email notifications", "interactive": true, "section": "Notification settings", "parent": "Email"},
+  {"ref": "e15", "role": "checkbox", "name": "Marketing emails", "interactive": true, "section": "Notification settings", "parent": "Email"},
+  {"ref": "e16", "role": "checkbox", "name": "Product updates", "interactive": true, "section": "Notification settings", "parent": "Email"},
+  {"ref": "e17", "role": "switch", "name": "Push notifications", "interactive": true, "section": "Notification settings", "parent": "Mobile"},
+  {"ref": "e18", "role": "switch", "name": "SMS alerts", "interactive": true, "section": "Notification settings", "parent": "Mobile"},
+  {"ref": "e19", "role": "heading", "name": "Billing Information", "interactive": false, "section": "Billing settings"},
+  {"ref": "e20", "role": "textbox", "name": "Cardholder name", "interactive": true, "section": "Billing settings", "parent": "Payment method"},
+  {"ref": "e21", "role": "textbox", "name": "Card number", "interactive": true, "section": "Billing settings", "parent": "Payment method"},
+  {"ref": "e22", "role": "textbox", "name": "Expiry date", "interactive": true, "section": "Billing settings", "parent": "Payment method"},
+  {"ref": "e23", "role": "textbox", "name": "CVV", "interactive": true, "section": "Billing settings", "parent": "Payment method"},
+  {"ref": "e24", "role": "radio", "name": "Credit Card", "interactive": true, "section": "Billing settings", "parent": "Payment type"},
+  {"ref": "e25", "role": "radio", "name": "PayPal", "interactive": true, "section": "Billing settings", "parent": "Payment type"},
+  {"ref": "e26", "role": "radio", "name": "Bank Transfer", "interactive": true, "section": "Billing settings", "parent": "Payment type"},
+  {"ref": "e27", "role": "heading", "name": "Shipping Address", "interactive": false, "section": "Shipping settings"},
+  {"ref": "e28", "role": "checkbox", "name": "Same as billing address", "interactive": true, "section": "Shipping settings"},
+  {"ref": "e29", "role": "textbox", "name": "Street address", "interactive": true, "section": "Shipping settings", "parent": "Address form"},
+  {"ref": "e30", "role": "textbox", "name": "City", "interactive": true, "section": "Shipping settings", "parent": "Address form"},
+  {"ref": "e31", "role": "textbox", "name": "State/Province", "interactive": true, "section": "Shipping settings", "parent": "Address form"},
+  {"ref": "e32", "role": "textbox", "name": "Postal code", "interactive": true, "section": "Shipping settings", "parent": "Address form"},
+  {"ref": "e33", "role": "combobox", "name": "Country", "interactive": true, "section": "Shipping settings", "parent": "Address form"},
+  {"ref": "e34", "role": "heading", "name": "Language & Region", "interactive": false, "section": "Preferences"},
+  {"ref": "e35", "role": "combobox", "name": "Language", "interactive": true, "section": "Preferences", "parent": "Language settings"},
+  {"ref": "e36", "role": "combobox", "name": "Timezone", "interactive": true, "section": "Preferences", "parent": "Regional settings"},
+  {"ref": "e37", "role": "combobox", "name": "Currency", "interactive": true, "section": "Preferences", "parent": "Regional settings"},
+  {"ref": "e38", "role": "heading", "name": "Privacy", "interactive": false, "section": "Privacy settings"},
+  {"ref": "e39", "role": "checkbox", "name": "Make profile public", "interactive": true, "section": "Privacy settings", "parent": "Visibility"},
+  {"ref": "e40", "role": "checkbox", "name": "Allow search engines to index my profile", "interactive": true, "section": "Privacy settings", "parent": "Visibility"},
+  {"ref": "e41", "role": "checkbox", "name": "Share activity with partners", "interactive": true, "section": "Privacy settings", "parent": "Data sharing"},
+  {"ref": "e42", "role": "link", "name": "Download my data", "interactive": true, "section": "Privacy settings", "parent": "Data export"},
+  {"ref": "e43", "role": "link", "name": "Delete my account", "interactive": true, "section": "Privacy settings", "parent": "Account"},
+  {"ref": "e44", "role": "button", "name": "Advanced options", "interactive": true, "section": "Footer", "parent": "Expandable section"},
+  {"ref": "e45", "role": "region", "name": "Advanced options", "interactive": false, "section": "Footer", "parent": "Expandable content"},
+  {"ref": "e46", "role": "checkbox", "name": "Developer mode", "interactive": true, "section": "Advanced", "parent": "Advanced options"},
+  {"ref": "e47", "role": "checkbox", "name": "Beta features", "interactive": true, "section": "Advanced", "parent": "Advanced options"},
+  {"ref": "e48", "role": "button", "name": "Save changes", "interactive": true, "section": "Footer"},
+  {"ref": "e49", "role": "button", "name": "Cancel", "interactive": true, "section": "Footer"},
+  {"ref": "e50", "role": "button", "name": "Reset to defaults", "interactive": false, "section": "Footer"},
+  {"ref": "e51", "role": "alert", "name": "Please fill in all required fields", "interactive": false, "section": "Form validation"}
+]
diff --git a/tests/benchmark/corpus/icon-aria-labels/queries.json b/tests/benchmark/corpus/icon-aria-labels/queries.json
new file mode 100644
index 0000000..a6daad0
--- /dev/null
+++ b/tests/benchmark/corpus/icon-aria-labels/queries.json
@@ -0,0 +1,227 @@
+[
+  {
+    "id": "icon-001",
+    "query": "settings gear",
+    "relevant_refs": ["e1"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "exact-match", "description"],
+    "notes": "Settings button with gear icon description"
+  },
+  {
+    "id": "icon-002",
+    "query": "delete trash icon",
+    "relevant_refs": ["e2", "e13"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["icon", "duplicate-labels", "description"],
+    "notes": "Multiple delete buttons with trash icon"
+  },
+  {
+    "id": "icon-003",
+    "query": "more options",
+    "relevant_refs": ["e3"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "exact-match"],
+    "notes": "Kebab/three dots menu"
+  },
+  {
+    "id": "icon-004",
+    "query": "kebab menu",
+    "relevant_refs": ["e3"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["icon", "synonym"],
+    "notes": "Three dots menu using slang term"
+  },
+  {
+    "id": "icon-005",
+    "query": "notifications bell",
+    "relevant_refs": ["e4"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "description"],
+    "notes": "Bell icon for notifications"
+  },
+  {
+    "id": "icon-006",
+    "query": "search magnifier",
+    "relevant_refs": ["e5"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "synonym", "description"],
+    "notes": "Search with magnifying glass"
+  },
+  {
+    "id": "icon-007",
+    "query": "copy link",
+    "relevant_refs": ["e6"],
+    "partially_relevant_refs": ["e14"],
+    "difficulty": "easy",
+    "tags": ["icon", "exact-match"],
+    "notes": "Copy link button"
+  },
+  {
+    "id": "icon-008",
+    "query": "share",
+    "relevant_refs": ["e7"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "exact-match"],
+    "notes": "Share button"
+  },
+  {
+    "id": "icon-009",
+    "query": "refresh",
+    "relevant_refs": ["e8"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "exact-match"],
+    "notes": "Refresh button with circular arrows"
+  },
+  {
+    "id": "icon-010",
+    "query": "download",
+    "relevant_refs": ["e9"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "exact-match"],
+    "notes": "Download button"
+  },
+  {
+    "id": "icon-011",
+    "query": "upload file",
+    "relevant_refs": ["e10"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "action"],
+    "notes": "Upload button"
+  },
+  {
+    "id": "icon-012",
+    "query": "calendar picker",
+    "relevant_refs": ["e20"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["icon", "section"],
+    "notes": "Calendar button in date picker"
+  },
+  {
+    "id": "icon-013",
+    "query": "hamburger menu",
+    "relevant_refs": ["e30"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["icon", "synonym"],
+    "notes": "Menu button using slang term"
+  },
+  {
+    "id": "icon-014",
+    "query": "play video",
+    "relevant_refs": ["e40"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "action", "media"],
+    "notes": "Play button in media controls"
+  },
+  {
+    "id": "icon-015",
+    "query": "mute audio",
+    "relevant_refs": ["e46"],
+    "partially_relevant_refs": ["e45"],
+    "difficulty": "medium",
+    "tags": ["icon", "action", "media"],
+    "notes": "Mute button or volume control"
+  },
+  {
+    "id": "icon-016",
+    "query": "fullscreen expand",
+    "relevant_refs": ["e47"],
+    "partially_relevant_refs": ["e52"],
+    "difficulty": "medium",
+    "tags": ["icon", "synonym"],
+    "notes": "Fullscreen or expand button"
+  },
+  {
+    "id": "icon-017",
+    "query": "plus add button",
+    "relevant_refs": ["e50"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "description"],
+    "notes": "Add button with plus icon"
+  },
+  {
+    "id": "icon-018",
+    "query": "star favorite",
+    "relevant_refs": ["e60"],
+    "partially_relevant_refs": ["e63"],
+    "difficulty": "medium",
+    "tags": ["icon", "synonym", "social"],
+    "notes": "Star or favorite button"
+  },
+  {
+    "id": "icon-019",
+    "query": "like heart",
+    "relevant_refs": ["e61"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "description", "social"],
+    "notes": "Like button with heart icon"
+  },
+  {
+    "id": "icon-020",
+    "query": "undo last action",
+    "relevant_refs": ["e70"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["icon", "action"],
+    "notes": "Undo button"
+  },
+  {
+    "id": "icon-021",
+    "query": "info circle",
+    "relevant_refs": ["e80"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["icon", "description"],
+    "notes": "Info button (i in circle)"
+  },
+  {
+    "id": "icon-022",
+    "query": "question help",
+    "relevant_refs": ["e81"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["icon", "synonym"],
+    "notes": "Help button"
+  },
+  {
+    "id": "icon-023",
+    "query": "trash in toolbar",
+    "relevant_refs": ["e2"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["icon", "section-context", "description"],
+    "notes": "Delete in toolbar section, not row actions"
+  },
+  {
+    "id": "icon-024",
+    "query": "pencil edit icon",
+    "relevant_refs": ["e11"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["icon", "description"],
+    "notes": "Edit button identified by pencil"
+  },
+  {
+    "id": "icon-025",
+    "query": "eye view button",
+    "relevant_refs": ["e12"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["icon", "description"],
+    "notes": "View button with eye icon"
+  }
+]
diff --git a/tests/benchmark/corpus/icon-aria-labels/snapshot.json b/tests/benchmark/corpus/icon-aria-labels/snapshot.json
new file mode 100644
index 0000000..f1c9c15
--- /dev/null
+++ b/tests/benchmark/corpus/icon-aria-labels/snapshot.json
@@ -0,0 +1,59 @@
+[
+  {"ref": "e1", "role": "button", "name": "Settings", "parent": null, "section": "Toolbar", "description": "gear icon"},
+  {"ref": "e2", "role": "button", "name": "Delete", "parent": null, "section": "Toolbar", "description": "trash icon"},
+  {"ref": "e3", "role": "button", "name": "More options", "parent": null, "section": "Toolbar", "description": "three dots vertical"},
+  {"ref": "e4", "role": "button", "name": "Notifications", "parent": null, "section": "Toolbar", "description": "bell icon"},
+  {"ref": "e5", "role": "button", "name": "Search", "parent": null, "section": "Toolbar", "description": "magnifying glass icon"},
+  {"ref": "e6", "role": "button", "name": "Copy link", "parent": null, "section": "Toolbar"},
+  {"ref": "e7", "role": "button", "name": "Share", "parent": null, "section": "Toolbar"},
+  {"ref": "e8", "role": "button", "name": "Refresh", "parent": null, "section": "Toolbar", "description": "circular arrows"},
+  {"ref": "e9", "role": "button", "name": "Download", "parent": null, "section": "Toolbar", "description": "down arrow"},
+  {"ref": "e10", "role": "button", "name": "Upload", "parent": null, "section": "Toolbar", "description": "up arrow"},
+
+  {"ref": "e11", "role": "button", "name": "Edit", "parent": null, "section": "Row Actions", "description": "pencil icon"},
+  {"ref": "e12", "role": "button", "name": "View", "parent": null, "section": "Row Actions", "description": "eye icon"},
+  {"ref": "e13", "role": "button", "name": "Delete", "parent": null, "section": "Row Actions", "description": "trash icon"},
+  {"ref": "e14", "role": "button", "name": "Copy", "parent": null, "section": "Row Actions", "description": "two rectangles"},
+  {"ref": "e15", "role": "button", "name": "Move", "parent": null, "section": "Row Actions", "description": "folder with arrow"},
+
+  {"ref": "e20", "role": "button", "name": "Calendar", "parent": null, "section": "Date Picker", "description": "calendar icon"},
+  {"ref": "e21", "role": "button", "name": "Previous month", "parent": "e20", "description": "left chevron"},
+  {"ref": "e22", "role": "button", "name": "Next month", "parent": "e20", "description": "right chevron"},
+  {"ref": "e23", "role": "button", "name": "Today", "parent": "e20"},
+
+  {"ref": "e30", "role": "button", "name": "Menu", "parent": null, "section": "Navigation", "description": "hamburger icon"},
+  {"ref": "e31", "role": "button", "name": "Close", "parent": null, "section": "Navigation", "description": "X icon"},
+  {"ref": "e32", "role": "button", "name": "Back", "parent": null, "section": "Navigation", "description": "left arrow"},
+  {"ref": "e33", "role": "button", "name": "Forward", "parent": null, "section": "Navigation", "description": "right arrow"},
+  {"ref": "e34", "role": "button", "name": "Home", "parent": null, "section": "Navigation", "description": "house icon"},
+
+  {"ref": "e40", "role": "button", "name": "Play", "parent": null, "section": "Media Controls", "description": "triangle pointing right"},
+  {"ref": "e41", "role": "button", "name": "Pause", "parent": null, "section": "Media Controls", "description": "two vertical bars"},
+  {"ref": "e42", "role": "button", "name": "Stop", "parent": null, "section": "Media Controls", "description": "square"},
+  {"ref": "e43", "role": "button", "name": "Skip forward", "parent": null, "section": "Media Controls", "description": "double right arrows"},
+  {"ref": "e44", "role": "button", "name": "Skip back", "parent": null, "section": "Media Controls", "description": "double left arrows"},
+  {"ref": "e45", "role": "button", "name": "Volume", "parent": null, "section": "Media Controls", "description": "speaker icon"},
+  {"ref": "e46", "role": "button", "name": "Mute", "parent": null, "section": "Media Controls", "description": "speaker with X"},
+  {"ref": "e47", "role": "button", "name": "Fullscreen", "parent": null, "section": "Media Controls", "description": "expand arrows"},
+
+  {"ref": "e50", "role": "button", "name": "Add", "parent": null, "section": "Quick Actions", "description": "plus icon"},
+  {"ref": "e51", "role": "button", "name": "Remove", "parent": null, "section": "Quick Actions", "description": "minus icon"},
+  {"ref": "e52", "role": "button", "name": "Expand", "parent": null, "section": "Quick Actions", "description": "chevron down"},
+  {"ref": "e53", "role": "button", "name": "Collapse", "parent": null, "section": "Quick Actions", "description": "chevron up"},
+  {"ref": "e54", "role": "button", "name": "Pin", "parent": null, "section": "Quick Actions", "description": "pin icon"},
+  {"ref": "e55", "role": "button", "name": "Unpin", "parent": null, "section": "Quick Actions", "description": "pin with slash"},
+
+  {"ref": "e60", "role": "button", "name": "Star", "parent": null, "section": "Social", "description": "star outline"},
+  {"ref": "e61", "role": "button", "name": "Like", "parent": null, "section": "Social", "description": "heart icon"},
+  {"ref": "e62", "role": "button", "name": "Comment", "parent": null, "section": "Social", "description": "speech bubble"},
+  {"ref": "e63", "role": "button", "name": "Bookmark", "parent": null, "section": "Social", "description": "bookmark icon"},
+  {"ref": "e64", "role": "button", "name": "Flag", "parent": null, "section": "Social", "description": "flag icon"},
+
+  {"ref": "e70", "role": "button", "name": "Undo", "parent": null, "section": "History", "description": "curved left arrow"},
+  {"ref": "e71", "role": "button", "name": "Redo", "parent": null, "section": "History", "description": "curved right arrow"},
+  {"ref": "e72", "role": "button", "name": "History", "parent": null, "section": "History", "description": "clock icon"},
+
+  {"ref": "e80", "role": "button", "name": "Info", "parent": null, "section": "Help", "description": "i in circle"},
+  {"ref": "e81", "role": "button", "name": "Help", "parent": null, "section": "Help", "description": "question mark in circle"},
+  {"ref": "e82", "role": "button", "name": "Warning", "parent": null, "section": "Help", "description": "triangle with exclamation"}
+]
diff --git a/tests/benchmark/corpus/implicit-domain-intent/queries.json b/tests/benchmark/corpus/implicit-domain-intent/queries.json
new file mode 100644
index 0000000..19652e9
--- /dev/null
+++ b/tests/benchmark/corpus/implicit-domain-intent/queries.json
@@ -0,0 +1,146 @@
+[
+  {
+    "id": "idi-001",
+    "query": "clone this repository",
+    "relevant_refs": ["e24"],
+    "partially_relevant_refs": ["e28", "e29"],
+    "difficulty": "medium",
+    "tags": ["implicit", "domain-intent", "action"]
+  },
+  {
+    "id": "idi-002",
+    "query": "download the source code",
+    "relevant_refs": ["e30"],
+    "partially_relevant_refs": ["e24"],
+    "difficulty": "hard",
+    "tags": ["implicit", "domain-intent", "action"]
+  },
+  {
+    "id": "idi-003",
+    "query": "switch to a different branch",
+    "relevant_refs": ["e21"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["implicit", "domain-intent", "action"]
+  },
+  {
+    "id": "idi-004",
+    "query": "check CI status",
+    "relevant_refs": ["e15", "e33"],
+    "partially_relevant_refs": ["e32"],
+    "difficulty": "hard",
+    "tags": ["implicit", "domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-005",
+    "query": "view build results",
+    "relevant_refs": ["e15", "e33"],
+    "partially_relevant_refs": ["e32"],
+    "difficulty": "hard",
+    "tags": ["implicit", "domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-006",
+    "query": "save this project for later",
+    "relevant_refs": ["e11", "e39"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["implicit", "domain-intent", "action"]
+  },
+  {
+    "id": "idi-007",
+    "query": "bookmark this repo",
+    "relevant_refs": ["e11"],
+    "partially_relevant_refs": ["e39"],
+    "difficulty": "hard",
+    "tags": ["implicit", "domain-intent", "action"]
+  },
+  {
+    "id": "idi-008",
+    "query": "compare branches",
+    "relevant_refs": ["e37"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["implicit", "domain-intent", "action"]
+  },
+  {
+    "id": "idi-009",
+    "query": "go to my profile",
+    "relevant_refs": ["e7"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["implicit", "domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-010",
+    "query": "check my notifications",
+    "relevant_refs": ["e6"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["implicit", "domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-011",
+    "query": "contact support",
+    "relevant_refs": ["e52"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["implicit", "domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-012",
+    "query": "privacy policy",
+    "relevant_refs": ["e48"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["implicit", "domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-013",
+    "query": "type a new search query",
+    "relevant_refs": ["e5"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["implicit", "domain-intent", "input"]
+  },
+  {
+    "id": "idi-014",
+    "query": "fork this project",
+    "relevant_refs": ["e10"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["domain-intent", "action", "button"]
+  },
+  {
+    "id": "idi-015",
+    "query": "open pull requests",
+    "relevant_refs": ["e14"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-016",
+    "query": "see who contributed",
+    "relevant_refs": ["e42"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["implicit", "domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-017",
+    "query": "check latest releases",
+    "relevant_refs": ["e40"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["implicit", "domain-intent", "navigation"]
+  },
+  {
+    "id": "idi-018",
+    "query": "create a new file",
+    "relevant_refs": ["e23"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["implicit", "domain-intent", "action"]
+  }
+]
diff --git a/tests/benchmark/corpus/implicit-domain-intent/snapshot.json b/tests/benchmark/corpus/implicit-domain-intent/snapshot.json
new file mode 100644
index 0000000..1ff13a7
--- /dev/null
+++ b/tests/benchmark/corpus/implicit-domain-intent/snapshot.json
@@ -0,0 +1,58 @@
+[
+  {"ref": "e0", "role": "banner", "name": "", "interactive": false, "section": "Header"},
+  {"ref": "e1", "role": "link", "name": "Dashboard", "interactive": true, "section": "Header", "parent": "Navigation"},
+  {"ref": "e2", "role": "link", "name": "Projects", "interactive": true, "section": "Header", "parent": "Navigation"},
+  {"ref": "e3", "role": "link", "name": "Settings", "interactive": true, "section": "Header", "parent": "Navigation"},
+  {"ref": "e4", "role": "button", "name": "New", "interactive": true, "section": "Header", "parent": "Actions"},
+  {"ref": "e5", "role": "searchbox", "name": "Search or jump to...", "interactive": true, "section": "Header"},
+  {"ref": "e6", "role": "button", "name": "Notifications", "interactive": true, "section": "Header", "parent": "User menu"},
+  {"ref": "e7", "role": "button", "name": "Profile", "interactive": true, "section": "Header", "parent": "User menu"},
+  {"ref": "e8", "role": "heading", "name": "acme/webapp", "interactive": false, "section": "Repository header"},
+  {"ref": "e9", "role": "button", "name": "Watch", "interactive": true, "section": "Repository header", "parent": "Repository actions"},
+  {"ref": "e10", "role": "button", "name": "Fork", "interactive": true, "section": "Repository header", "parent": "Repository actions"},
+  {"ref": "e11", "role": "button", "name": "Star", "interactive": true, "section": "Repository header", "parent": "Repository actions"},
+  {"ref": "e12", "role": "link", "name": "Code", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e13", "role": "link", "name": "Issues", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e14", "role": "link", "name": "Pull requests", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e15", "role": "link", "name": "Actions", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e16", "role": "link", "name": "Projects", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e17", "role": "link", "name": "Wiki", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e18", "role": "link", "name": "Security", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e19", "role": "link", "name": "Insights", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e20", "role": "link", "name": "Settings", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"},
+  {"ref": "e21", "role": "button", "name": "main", "interactive": true, "section": "Code view", "parent": "Branch selector"},
+  {"ref": "e22", "role": "button", "name": "Go to file", "interactive": true, "section": "Code view", "parent": "File actions"},
+  {"ref": "e23", "role": "button", "name": "Add file", "interactive": true, "section": "Code view", "parent": "File actions"},
+  {"ref": "e24", "role": "button", "name": "Code", "interactive": true, "section": "Code view", "parent": "Clone dropdown"},
+  {"ref": "e25", "role": "link", "name": "HTTPS", "interactive": true, "section": "Clone panel", "parent": "Clone options"},
+  {"ref": "e26", "role": "link", "name": "SSH", "interactive": true, "section": "Clone panel", "parent": "Clone options"},
+  {"ref": "e27", "role": "link", "name": "GitHub CLI", "interactive": true, "section": "Clone panel", "parent": "Clone options"},
+  {"ref": "e28", "role": "textbox", "name": "Clone URL", "value": "https://github.com/acme/webapp.git", "interactive": true, "section": "Clone panel"},
+  {"ref": "e29", "role": "button", "name": "Copy URL", "interactive": true, "section": "Clone panel"},
+  {"ref": "e30", "role": "link", "name": "Download ZIP", "interactive": true, "section": "Clone panel"},
+  {"ref": "e31", "role": "link", "name": "Open with GitHub Desktop", "interactive": true, "section": "Clone panel"},
+  {"ref": "e32", "role": "status", "name": "All checks have passed", "interactive": false, "section": "Commit status"},
+  {"ref": "e33", "role": "link", "name": "View workflow runs", "interactive": true, "section": "Commit status"},
+  {"ref": "e34", "role": "link", "name": "README.md", "interactive": true, "section": "File tree"},
+  {"ref": "e35", "role": "link", "name": "package.json", "interactive": true, "section": "File tree"},
+  {"ref": "e36", "role": "link", "name": "src", "interactive": true, "section": "File tree"},
+  {"ref": "e37", "role": "link", "name": "Compare", "interactive": true, "section": "Branch actions"},
+  {"ref": "e38", "role": "link", "name": "Contribute", "interactive": true, "section": "Branch actions"},
+  {"ref": "e39", "role": "button", "name": "Add to list", "interactive": true, "section": "Repository sidebar", "parent": "Lists"},
+  {"ref": "e40", "role": "link", "name": "Releases", "interactive": true, "section": "Repository sidebar"},
+  {"ref": "e41", "role": "link", "name": "Packages", "interactive": true, "section": "Repository sidebar"},
+  {"ref": "e42", "role": "link", "name": "Contributors", "interactive": true, "section": "Repository sidebar"},
+  {"ref": "e43", "role": "link", "name": "Activity", "interactive": true, "section": "Repository sidebar"},
+  {"ref": "e44", "role": "link", "name": "Report repository", "interactive": true, "section": "Footer"},
+  {"ref": "e45", "role": "link", "name": "About", "interactive": true, "section": "Footer"},
+  {"ref": "e46", "role": "link", "name": "Blog", "interactive": true, "section": "Footer"},
+  {"ref": "e47", "role": "link", "name": "Terms", "interactive": true, "section": "Footer"},
+  {"ref": "e48", "role": "link", "name": "Privacy", "interactive": true, "section": "Footer"},
+  {"ref": "e49", "role": "link", "name": "Security", "interactive": true, "section": "Footer"},
+  {"ref": "e50", "role": "link", "name": "Status", "interactive": true, "section": "Footer"},
+  {"ref": "e51", "role": "link", "name": "Docs", "interactive": true, "section": "Footer"},
+  {"ref": "e52", "role": "link", "name": "Contact", "interactive": true, "section": "Footer"},
+  {"ref": "e53", "role": "link", "name": "Pricing", "interactive": true, "section": "Footer"},
+  {"ref": "e54", "role": "link", "name": "API", "interactive": true, "section": "Footer"},
+  {"ref": "e55", "role": "link", "name": "Training", "interactive": true, "section": "Footer"}
+]
diff --git a/tests/benchmark/corpus/overlays-menus-dialogs/queries.json b/tests/benchmark/corpus/overlays-menus-dialogs/queries.json
new file mode 100644
index 0000000..95990d7
--- /dev/null
+++ b/tests/benchmark/corpus/overlays-menus-dialogs/queries.json
@@ -0,0 +1,218 @@
+[
+  {
+    "id": "overlay-001",
+    "query": "close dialog",
+    "relevant_refs": ["e15", "e56"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["dialog", "action", "duplicate-labels"],
+    "notes": "Multiple dialogs have close buttons"
+  },
+  {
+    "id": "overlay-002",
+    "query": "confirm delete",
+    "relevant_refs": ["e14"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["dialog", "action", "context-exclusion"],
+    "notes": "Delete button in the Confirm Delete dialog"
+  },
+  {
+    "id": "overlay-003",
+    "query": "cancel inside modal",
+    "relevant_refs": ["e13", "e55"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["dialog", "action", "context-exclusion"],
+    "notes": "Cancel buttons inside modals vs page"
+  },
+  {
+    "id": "overlay-004",
+    "query": "cancel on page not modal",
+    "relevant_refs": ["e1", "e71"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["dialog", "negative-context", "context-exclusion"],
+    "notes": "Cancel buttons on the page, excluding modals"
+  },
+  {
+    "id": "overlay-005",
+    "query": "open account menu",
+    "relevant_refs": ["e4"],
+    "partially_relevant_refs": ["e20"],
+    "difficulty": "easy",
+    "tags": ["menu", "action"],
+    "notes": "Account Menu button or the open menu"
+  },
+  {
+    "id": "overlay-006",
+    "query": "select export CSV from menu",
+    "relevant_refs": ["e31"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["menu", "menuitem", "action"],
+    "notes": "Export CSV menuitem in Repository Options"
+  },
+  {
+    "id": "overlay-007",
+    "query": "choose billing from dropdown",
+    "relevant_refs": ["e23"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["menu", "menuitem", "synonym"],
+    "notes": "Billing menuitem in Account Menu"
+  },
+  {
+    "id": "overlay-008",
+    "query": "more options for repository",
+    "relevant_refs": ["e6"],
+    "partially_relevant_refs": ["e30"],
+    "difficulty": "medium",
+    "tags": ["menu", "action"],
+    "notes": "More Options button in Repository Actions"
+  },
+  {
+    "id": "overlay-009",
+    "query": "dismiss notification",
+    "relevant_refs": ["e42"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["alertdialog", "action", "exact-match"],
+    "notes": "Dismiss button in notification"
+  },
+  {
+    "id": "overlay-010",
+    "query": "save changes in dialog",
+    "relevant_refs": ["e54"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["dialog", "action", "context-exclusion"],
+    "notes": "Save button inside Settings dialog, not page"
+  },
+  {
+    "id": "overlay-011",
+    "query": "sign out from menu",
+    "relevant_refs": ["e24"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["menu", "menuitem", "action"],
+    "notes": "Sign Out menuitem"
+  },
+  {
+    "id": "overlay-012",
+    "query": "archive repository",
+    "relevant_refs": ["e34"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["menu", "menuitem", "action"],
+    "notes": "Archive menuitem in Repository Options"
+  },
+  {
+    "id": "overlay-013",
+    "query": "delete in context menu",
+    "relevant_refs": ["e94"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["menu", "context-exclusion", "duplicate-labels"],
+    "notes": "Delete in Context Menu vs other menus"
+  },
+  {
+    "id": "overlay-014",
+    "query": "copy from right click menu",
+    "relevant_refs": ["e92"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["menu", "menuitem", "synonym"],
+    "notes": "Copy in Context Menu (right-click equivalent)"
+  },
+  {
+    "id": "overlay-015",
+    "query": "theme dropdown options",
+    "relevant_refs": ["e60"],
+    "partially_relevant_refs": ["e61", "e62", "e63"],
+    "difficulty": "medium",
+    "tags": ["combobox", "dialog"],
+    "notes": "Select Theme combobox in Settings"
+  },
+  {
+    "id": "overlay-016",
+    "query": "choose dark theme",
+    "relevant_refs": ["e62"],
+    "partially_relevant_refs": ["e52"],
+    "difficulty": "medium",
+    "tags": ["option", "combobox"],
+    "notes": "Dark option in theme dropdown"
+  },
+  {
+    "id": "overlay-017",
+    "query": "help in footer not menu",
+    "relevant_refs": ["e72"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["link", "negative-context", "context-exclusion"],
+    "notes": "Help link in Footer, excluding menu"
+  },
+  {
+    "id": "overlay-018",
+    "query": "settings menuitem",
+    "relevant_refs": ["e22"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["menu", "menuitem", "duplicate-labels"],
+    "notes": "Settings in Account Menu, not the nav button"
+  },
+  {
+    "id": "overlay-019",
+    "query": "undo notification action",
+    "relevant_refs": ["e43"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["alertdialog", "action"],
+    "notes": "Undo button in notification"
+  },
+  {
+    "id": "overlay-020",
+    "query": "profile in account dropdown",
+    "relevant_refs": ["e21"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["menu", "menuitem"],
+    "notes": "Profile menuitem"
+  },
+  {
+    "id": "overlay-021",
+    "query": "delete from repository options menu",
+    "relevant_refs": ["e35"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["menu", "context-exclusion", "duplicate-labels"],
+    "notes": "Delete in Repository Options, not Context Menu"
+  },
+  {
+    "id": "overlay-022",
+    "query": "enable dark mode checkbox",
+    "relevant_refs": ["e52"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["dialog", "checkbox"],
+    "notes": "Dark Mode checkbox in Settings"
+  },
+  {
+    "id": "overlay-023",
+    "query": "page save button not dialog",
+    "relevant_refs": ["e2", "e70"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["button", "negative-context", "context-exclusion"],
+    "notes": "Save buttons on page, excluding dialogs"
+  },
+  {
+    "id": "overlay-024",
+    "query": "paste from clipboard",
+    "relevant_refs": ["e93"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["menu", "menuitem", "action"],
+    "notes": "Paste menuitem in Context Menu"
+  }
+]
diff --git a/tests/benchmark/corpus/overlays-menus-dialogs/snapshot.json b/tests/benchmark/corpus/overlays-menus-dialogs/snapshot.json
new file mode 100644
index 0000000..7622665
--- /dev/null
+++ b/tests/benchmark/corpus/overlays-menus-dialogs/snapshot.json
@@ -0,0 +1,60 @@
+[
+  {"ref": "e1", "role": "button", "name": "Cancel", "parent": null, "section": "Page Header"},
+  {"ref": "e2", "role": "button", "name": "Save", "parent": null, "section": "Page Header"},
+  {"ref": "e3", "role": "button", "name": "Settings", "parent": null, "section": "Navigation"},
+  {"ref": "e4", "role": "button", "name": "Account Menu", "parent": null, "section": "Navigation", "expanded": false},
+  {"ref": "e5", "role": "link", "name": "Home", "parent": null, "section": "Navigation"},
+  {"ref": "e6", "role": "button", "name": "More Options", "parent": null, "section": "Repository Actions"},
+  {"ref": "e7", "role": "button", "name": "Delete Repository", "parent": null, "section": "Repository Actions"},
+
+  {"ref": "e10", "role": "dialog", "name": "Confirm Delete", "parent": null, "modal": true},
+  {"ref": "e11", "role": "heading", "name": "Delete Repository?", "parent": "e10", "section": "Confirm Delete"},
+  {"ref": "e12", "role": "StaticText", "name": "This action cannot be undone.", "parent": "e10", "section": "Confirm Delete"},
+  {"ref": "e13", "role": "button", "name": "Cancel", "parent": "e10", "section": "Confirm Delete"},
+  {"ref": "e14", "role": "button", "name": "Delete", "parent": "e10", "section": "Confirm Delete"},
+  {"ref": "e15", "role": "button", "name": "Close", "parent": "e10", "section": "Confirm Delete", "description": "Close dialog"},
+
+  {"ref": "e20", "role": "menu", "name": "Account Menu", "parent": null, "expanded": true},
+  {"ref": "e21", "role": "menuitem", "name": "Profile", "parent": "e20", "section": "Account Menu"},
+  {"ref": "e22", "role": "menuitem", "name": "Settings", "parent": "e20", "section": "Account Menu"},
+  {"ref": "e23", "role": "menuitem", "name": "Billing", "parent": "e20", "section": "Account Menu"},
+  {"ref": "e24", "role": "menuitem", "name": "Sign Out", "parent": "e20", "section": "Account Menu"},
+  {"ref": "e25", "role": "menuitem", "name": "Help", "parent": "e20", "section": "Account Menu"},
+
+  {"ref": "e30", "role": "menu", "name": "Repository Options", "parent": null, "expanded": true},
+  {"ref": "e31", "role": "menuitem", "name": "Export CSV", "parent": "e30", "section": "Repository Options"},
+  {"ref": "e32", "role": "menuitem", "name": "Export JSON", "parent": "e30", "section": "Repository Options"},
+  {"ref": "e33", "role": "menuitem", "name": "Duplicate", "parent": "e30", "section": "Repository Options"},
+  {"ref": "e34", "role": "menuitem", "name": "Archive", "parent": "e30", "section": "Repository Options"},
+  {"ref": "e35", "role": "menuitem", "name": "Delete", "parent": "e30", "section": "Repository Options"},
+
+  {"ref": "e40", "role": "alertdialog", "name": "Notification", "parent": null},
+  {"ref": "e41", "role": "StaticText", "name": "Changes saved successfully", "parent": "e40", "section": "Notification"},
+  {"ref": "e42", "role": "button", "name": "Dismiss", "parent": "e40", "section": "Notification"},
+  {"ref": "e43", "role": "button", "name": "Undo", "parent": "e40", "section": "Notification"},
+
+  {"ref": "e50", "role": "dialog", "name": "Settings", "parent": null, "modal": true},
+  {"ref": "e51", "role": "heading", "name": "Settings", "parent": "e50", "section": "Settings Dialog"},
+  {"ref": "e52", "role": "checkbox", "name": "Dark Mode", "parent": "e50", "section": "Settings Dialog"},
+  {"ref": "e53", "role": "checkbox", "name": "Notifications", "parent": "e50", "section": "Settings Dialog"},
+  {"ref": "e54", "role": "button", "name": "Save", "parent": "e50", "section": "Settings Dialog"},
+  {"ref": "e55", "role": "button", "name": "Cancel", "parent": "e50", "section": "Settings Dialog"},
+  {"ref": "e56", "role": "button", "name": "Close", "parent": "e50", "section": "Settings Dialog"},
+
+  {"ref": "e60", "role": "combobox", "name": "Select Theme", "parent": "e50", "section": "Settings Dialog", "expanded": true},
+  {"ref": "e61", "role": "option", "name": "Light", "parent": "e60"},
+  {"ref": "e62", "role": "option", "name": "Dark", "parent": "e60"},
+  {"ref": "e63", "role": "option", "name": "System", "parent": "e60"},
+
+  {"ref": "e70", "role": "button", "name": "Save", "parent": null, "section": "Footer"},
+  {"ref": "e71", "role": "button", "name": "Cancel", "parent": null, "section": "Footer"},
+  {"ref": "e72", "role": "link", "name": "Help", "parent": null, "section": "Footer"},
+
+  {"ref": "e80", "role": "tooltip", "name": "Save your changes", "parent": null},
+
+  {"ref": "e90", "role": "menu", "name": "Context Menu", "parent": null, "expanded": true},
+  {"ref": "e91", "role": "menuitem", "name": "Cut", "parent": "e90"},
+  {"ref": "e92", "role": "menuitem", "name": "Copy", "parent": "e90"},
+  {"ref": "e93", "role": "menuitem", "name": "Paste", "parent": "e90"},
+  {"ref": "e94", "role": "menuitem", "name": "Delete", "parent": "e90"}
+]
diff --git a/tests/benchmark/corpus/table-grid/queries.json b/tests/benchmark/corpus/table-grid/queries.json
new file mode 100644
index 0000000..f7aeb28
--- /dev/null
+++ b/tests/benchmark/corpus/table-grid/queries.json
@@ -0,0 +1,219 @@
+[
+  {
+    "id": "table-001",
+    "query": "edit Alice",
+    "relevant_refs": ["e15"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "row-context", "action"],
+    "notes": "Edit button for Alice Johnson's row"
+  },
+  {
+    "id": "table-002",
+    "query": "delete second invoice",
+    "relevant_refs": ["e26"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["table", "ordinal", "action"],
+    "notes": "Delete button for second row (Acme Corp)"
+  },
+  {
+    "id": "table-003",
+    "query": "status for Acme Corp",
+    "relevant_refs": ["e24"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "row-context", "cell"],
+    "notes": "Status cell showing Pending for Acme Corp"
+  },
+  {
+    "id": "table-004",
+    "query": "open failed payment row",
+    "relevant_refs": ["e40"],
+    "partially_relevant_refs": ["e48"],
+    "difficulty": "hard",
+    "tags": ["table", "state", "row-context"],
+    "notes": "Row with Failed status (Maya Chen)"
+  },
+  {
+    "id": "table-005",
+    "query": "download invoice INV-1024",
+    "relevant_refs": [],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["table", "no-match", "absent-control"],
+    "expect_no_match": true,
+    "notes": "Invoice doesn't exist"
+  },
+  {
+    "id": "table-006",
+    "query": "approve pending request from Maya",
+    "relevant_refs": [],
+    "partially_relevant_refs": ["e48"],
+    "difficulty": "hard",
+    "tags": ["table", "domain-intent", "row-context"],
+    "notes": "Maya has Failed status, not Pending. Retry Payment is closest."
+  },
+  {
+    "id": "table-007",
+    "query": "sort by due date",
+    "relevant_refs": ["e60"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["table", "action", "exact-match"],
+    "notes": "Exact match for sort control"
+  },
+  {
+    "id": "table-008",
+    "query": "filter unpaid invoices",
+    "relevant_refs": ["e62"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "filter", "domain-intent"],
+    "notes": "Filter by Status combobox for non-paid statuses"
+  },
+  {
+    "id": "table-009",
+    "query": "open actions menu for third row",
+    "relevant_refs": ["e82"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["table", "ordinal", "action"],
+    "notes": "More button for Bob Smith (third row)"
+  },
+  {
+    "id": "table-010",
+    "query": "select all overdue invoices",
+    "relevant_refs": ["e72"],
+    "partially_relevant_refs": ["e64"],
+    "difficulty": "hard",
+    "tags": ["table", "state", "checkbox"],
+    "notes": "Only INV-1003 is overdue; select all is partial"
+  },
+  {
+    "id": "table-011",
+    "query": "download Bob Smith invoice",
+    "relevant_refs": ["e37"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "row-context", "action"],
+    "notes": "Download button for Bob Smith's row"
+  },
+  {
+    "id": "table-012",
+    "query": "send reminder to Acme",
+    "relevant_refs": ["e28"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "row-context", "action"],
+    "notes": "Send Reminder button for Acme Corp"
+  },
+  {
+    "id": "table-013",
+    "query": "retry payment Maya Chen",
+    "relevant_refs": ["e48"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["table", "row-context", "action"],
+    "notes": "Retry Payment button in Maya's row"
+  },
+  {
+    "id": "table-014",
+    "query": "mark overdue as bad debt",
+    "relevant_refs": ["e39"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "state", "action"],
+    "notes": "Only the overdue invoice has Mark as Bad Debt"
+  },
+  {
+    "id": "table-015",
+    "query": "export table to CSV",
+    "relevant_refs": ["e63"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["table", "action", "synonym"],
+    "notes": "Export CSV button"
+  },
+  {
+    "id": "table-016",
+    "query": "delete all selected",
+    "relevant_refs": ["e65"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "bulk-action", "synonym"],
+    "notes": "Bulk Delete button"
+  },
+  {
+    "id": "table-017",
+    "query": "edit first invoice",
+    "relevant_refs": ["e15"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "ordinal", "action"],
+    "notes": "Edit button for first row (Alice)"
+  },
+  {
+    "id": "table-018",
+    "query": "select invoice 1003",
+    "relevant_refs": ["e72"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "row-context", "checkbox"],
+    "notes": "Checkbox for INV-1003"
+  },
+  {
+    "id": "table-019",
+    "query": "last row more actions",
+    "relevant_refs": ["e84"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["table", "ordinal", "action"],
+    "notes": "More button in last row (Delta Inc)"
+  },
+  {
+    "id": "table-020",
+    "query": "search invoices",
+    "relevant_refs": ["e66"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["table", "searchbox", "exact-match"],
+    "notes": "Search invoices searchbox"
+  },
+  {
+    "id": "table-021",
+    "query": "edit Delta Inc invoice",
+    "relevant_refs": ["e55"],
+    "partially_relevant_refs": [],
+    "difficulty": "medium",
+    "tags": ["table", "row-context", "action"],
+    "notes": "Edit button for Delta Inc row"
+  },
+  {
+    "id": "table-022",
+    "query": "invoice with highest amount",
+    "relevant_refs": ["e20"],
+    "partially_relevant_refs": ["e23"],
+    "difficulty": "hard",
+    "tags": ["table", "domain-intent", "implicit"],
+    "notes": "Acme Corp has highest amount at $1500"
+  },
+  {
+    "id": "table-023",
+    "query": "select checkbox for pending invoice",
+    "relevant_refs": ["e71"],
+    "partially_relevant_refs": [],
+    "difficulty": "hard",
+    "tags": ["table", "state", "checkbox"],
+    "notes": "Only INV-1002 (Acme) is pending"
+  },
+  {
+    "id": "table-024",
+    "query": "sort invoices by amount",
+    "relevant_refs": ["e61"],
+    "partially_relevant_refs": [],
+    "difficulty": "easy",
+    "tags": ["table", "action", "exact-match"],
+    "notes": "Sort by Amount button"
+  }
+]
diff --git a/tests/benchmark/corpus/table-grid/snapshot.json b/tests/benchmark/corpus/table-grid/snapshot.json
new file mode 100644
index 0000000..ade656e
--- /dev/null
+++ b/tests/benchmark/corpus/table-grid/snapshot.json
@@ -0,0 +1,78 @@
+[
+  {"ref": "e1", "role": "table", "name": "Invoices", "parent": null},
+  {"ref": "e2", "role": "row", "name": "", "parent": "e1", "section": "Invoices"},
+  {"ref": "e3", "role": "columnheader", "name": "Invoice", "parent": "e2"},
+  {"ref": "e4", "role": "columnheader", "name": "Customer", "parent": "e2"},
+  {"ref": "e5", "role": "columnheader", "name": "Amount", "parent": "e2"},
+  {"ref": "e6", "role": "columnheader", "name": "Status", "parent": "e2"},
+  {"ref": "e7", "role": "columnheader", "name": "Actions", "parent": "e2"},
+
+  {"ref": "e10", "role": "row", "name": "INV-1001 Alice Johnson $250.00 Paid", "parent": "e1"},
+  {"ref": "e11", "role": "cell", "name": "INV-1001", "parent": "e10"},
+  {"ref": "e12", "role": "cell", "name": "Alice Johnson", "parent": "e10"},
+  {"ref": "e13", "role": "cell", "name": "$250.00", "parent": "e10"},
+  {"ref": "e14", "role": "cell", "name": "Paid", "parent": "e10"},
+  {"ref": "e15", "role": "button", "name": "Edit", "parent": "e10", "description": "Edit invoice INV-1001"},
+  {"ref": "e16", "role": "button", "name": "Delete", "parent": "e10", "description": "Delete invoice INV-1001"},
+  {"ref": "e17", "role": "button", "name": "Download", "parent": "e10", "description": "Download PDF"},
+
+  {"ref": "e20", "role": "row", "name": "INV-1002 Acme Corp $1,500.00 Pending", "parent": "e1"},
+  {"ref": "e21", "role": "cell", "name": "INV-1002", "parent": "e20"},
+  {"ref": "e22", "role": "cell", "name": "Acme Corp", "parent": "e20"},
+  {"ref": "e23", "role": "cell", "name": "$1,500.00", "parent": "e20"},
+  {"ref": "e24", "role": "cell", "name": "Pending", "parent": "e20"},
+  {"ref": "e25", "role": "button", "name": "Edit", "parent": "e20", "description": "Edit invoice INV-1002"},
+  {"ref": "e26", "role": "button", "name": "Delete", "parent": "e20", "description": "Delete invoice INV-1002"},
+  {"ref": "e27", "role": "button", "name": "Download", "parent": "e20", "description": "Download PDF"},
+  {"ref": "e28", "role": "button", "name": "Send Reminder", "parent": "e20"},
+
+  {"ref": "e30", "role": "row", "name": "INV-1003 Bob Smith $75.00 Overdue", "parent": "e1"},
+  {"ref": "e31", "role": "cell", "name": "INV-1003", "parent": "e30"},
+  {"ref": "e32", "role": "cell", "name": "Bob Smith", "parent": "e30"},
+  {"ref": "e33", "role": "cell", "name": "$75.00", "parent": "e30"},
+  {"ref": "e34", "role": "cell", "name": "Overdue", "parent": "e30"},
+  {"ref": "e35", "role": "button", "name": "Edit", "parent": "e30", "description": "Edit invoice INV-1003"},
+  {"ref": "e36", "role": "button", "name": "Delete", "parent": "e30", "description": "Delete invoice INV-1003"},
+  {"ref": "e37", "role": "button", "name": "Download", "parent": "e30", "description": "Download PDF"},
+  {"ref": "e38", "role": "button", "name": "Send Reminder", "parent": "e30"},
+  {"ref": "e39", "role": "button", "name": "Mark as Bad Debt", "parent": "e30"},
+
+  {"ref": "e40", "role": "row", "name": "INV-1004 Maya Chen $320.00 Failed", "parent": "e1"},
+  {"ref": "e41", "role": "cell", "name": "INV-1004", "parent": "e40"},
+  {"ref": "e42", "role": "cell", "name": "Maya Chen", "parent": "e40"},
+  {"ref": "e43", "role": "cell", "name": "$320.00", "parent": "e40"},
+  {"ref": "e44", "role": "cell", "name": "Failed", "parent": "e40"},
+  {"ref": "e45", "role": "button", "name": "Edit", "parent": "e40", "description": "Edit invoice INV-1004"},
+  {"ref": "e46", "role": "button", "name": "Delete", "parent": "e40", "description": "Delete invoice INV-1004"},
+  {"ref": "e47", "role": "button", "name": "Download", "parent": "e40", "description": "Download PDF"},
+  {"ref": "e48", "role": "button", "name": "Retry Payment", "parent": "e40"},
+
+  {"ref": "e50", "role": "row", "name": "INV-1005 Delta Inc $890.00 Paid", "parent": "e1"},
+  {"ref": "e51", "role": "cell", "name": "INV-1005", "parent": "e50"},
+  {"ref": "e52", "role": "cell", "name": "Delta Inc", "parent": "e50"},
+  {"ref": "e53", "role": "cell", "name": "$890.00", "parent": "e50"},
+  {"ref": "e54", "role": "cell", "name": "Paid", "parent": "e50"},
+  {"ref": "e55", "role": "button", "name": "Edit", "parent": "e50", "description": "Edit invoice INV-1005"},
+  {"ref": "e56", "role": "button", "name": "Delete", "parent": "e50", "description": "Delete invoice INV-1005"},
+  {"ref": "e57", "role": "button", "name": "Download", "parent": "e50", "description": "Download PDF"},
+
+  {"ref": "e60", "role": "button", "name": "Sort by Due Date", "parent": null, "section": "Table Controls"},
+  {"ref": "e61", "role": "button", "name": "Sort by Amount", "parent": null, "section": "Table Controls"},
+  {"ref": "e62", "role": "combobox", "name": "Filter by Status", "parent": null, "section": "Table Controls"},
+  {"ref": "e63", "role": "button", "name": "Export CSV", "parent": null, "section": "Table Controls"},
+  {"ref": "e64", "role": "checkbox", "name": "Select All", "parent": null, "section": "Table Controls"},
+  {"ref": "e65", "role": "button", "name": "Bulk Delete", "parent": null, "section": "Table Controls"},
+  {"ref": "e66", "role": "searchbox", "name": "Search invoices", "parent": null, "section": "Table Controls"},
+
+  {"ref": "e70", "role": "checkbox", "name": "Select INV-1001", "parent": "e10"},
+  {"ref": "e71", "role": "checkbox", "name": "Select INV-1002", "parent": "e20"},
+  {"ref": "e72", "role": "checkbox", "name": "Select INV-1003", "parent": "e30"},
+  {"ref": "e73", "role": "checkbox", "name": "Select INV-1004", "parent": "e40"},
+  {"ref": "e74", "role": "checkbox", "name": "Select INV-1005", "parent": "e50"},
+
+  {"ref": "e80", "role": "button", "name": "More", "parent": "e10", "description": "More actions"},
+  {"ref": "e81", "role": "button", "name": "More", "parent": "e20", "description": "More actions"},
+  {"ref": "e82", "role": "button", "name": "More", "parent": "e30", "description": "More actions"},
+  {"ref": "e83", "role": "button", "name": "More", "parent": "e40", "description": "More actions"},
+  {"ref": "e84", "role": "button", "name": "More", "parent": "e50", "description": "More actions"}
+]
diff --git a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh
new file mode 100755
index 0000000..29f81b2
--- /dev/null
+++ b/tests/benchmark/scripts/lint-corpus.sh
@@ -0,0 +1,197 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+CORPUS_DIR="${BENCHMARK_DIR}/corpus"
+CASES_DIR="${BENCHMARK_DIR}/cases"
+SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+NC='\033[0m'
+
+ERRORS=0
+WARNINGS=0
+
+error() {  # Print a red ERROR line for message $1 and count it; any counted error makes the summary exit 1.
+    echo -e "${RED}ERROR:${NC} $1"
+    ERRORS=$((ERRORS + 1))  # not ((ERRORS++)): that returns status 1 when ERRORS is 0 and aborts under `set -e`
+}
+
+warn() {  # Print a yellow WARN line for message $1 and count it; warnings alone still let the summary exit 0.
+    echo -e "${YELLOW}WARN:${NC} $1"
+    WARNINGS=$((WARNINGS + 1))  # not ((WARNINGS++)): that returns status 1 when WARNINGS is 0 and aborts under `set -e`
+}
+
+ok() {
+    echo -e "${GREEN}✓${NC} $1"
+}
+
+echo "=== Corpus Lint ==="
+echo ""
+
+# 1. Check for invalid JSON in all benchmark files
+echo "Checking JSON validity..."
+for f in "${CORPUS_DIR}"/*/*.json "${CASES_DIR}"/*.json; do
+    if [[ -f "$f" ]]; then
+        if ! jq . "$f" >/dev/null 2>&1; then
+            error "Invalid JSON: $f"
+        fi
+    fi
+done
+
+# 2. Check for duplicate query IDs across corpus files
+echo "Checking for duplicate query IDs..."
+declare -A QUERY_IDS
+for f in "${CORPUS_DIR}"/*/queries.json; do
+    if [[ -f "$f" ]]; then
+        while IFS= read -r id; do
+            if [[ -n "$id" && "$id" != "null" ]]; then
+                if [[ -n "${QUERY_IDS[$id]:-}" ]]; then
+                    error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})"
+                else
+                    QUERY_IDS[$id]="$f"
+                fi
+            fi
+        done < <(jq -r '.[].id // empty' "$f" 2>/dev/null)
+    fi
+done
+
+# Also check cases files
+for f in "${CASES_DIR}"/*.json; do
+    if [[ -f "$f" ]]; then
+        while IFS= read -r id; do
+            if [[ -n "$id" && "$id" != "null" ]]; then
+                if [[ -n "${QUERY_IDS[$id]:-}" ]]; then
+                    error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})"
+                else
+                    QUERY_IDS[$id]="$f"
+                fi
+            fi
+        done < <(jq -r '.[].id // empty' "$f" 2>/dev/null)
+    fi
+done
+
+# 3. Check for duplicate refs within snapshots
+echo "Checking for duplicate refs in snapshots..."
+for f in "${CORPUS_DIR}"/*/snapshot.json; do
+    if [[ -f "$f" ]]; then
+        dupes=$(jq -r '.[].ref' "$f" 2>/dev/null | sort | uniq -d)
+        if [[ -n "$dupes" ]]; then
+            error "Duplicate refs in $f: $dupes"
+        fi
+    fi
+done
+
+# 4. Check that relevant_refs exist in snapshot
+echo "Checking relevant_refs exist in snapshots..."
+for corpus_dir in "${CORPUS_DIR}"/*/; do
+    corpus_name=$(basename "$corpus_dir")
+    snapshot="${corpus_dir}snapshot.json"
+    queries="${corpus_dir}queries.json"
+
+    if [[ -f "$snapshot" && -f "$queries" ]]; then
+        # Get all refs from snapshot
+        refs=$(jq -r '.[].ref' "$snapshot" 2>/dev/null | sort | uniq)
+
+        # Check relevant_refs
+        while IFS= read -r ref; do
+            if [[ -n "$ref" && "$ref" != "null" ]]; then
+                if ! echo "$refs" | grep -qx "$ref"; then
+                    error "[$corpus_name] relevant_ref '$ref' not found in snapshot"
+                fi
+            fi
+        done < <(jq -r '.[].relevant_refs[]? // empty' "$queries" 2>/dev/null)
+
+        # Check partially_relevant_refs
+        while IFS= read -r ref; do
+            if [[ -n "$ref" && "$ref" != "null" ]]; then
+                if ! echo "$refs" | grep -qx "$ref"; then
+                    error "[$corpus_name] partially_relevant_ref '$ref' not found in snapshot"
+                fi
+            fi
+        done < <(jq -r '.[].partially_relevant_refs[]? // empty' "$queries" 2>/dev/null)
+    fi
+done
+
+# 5. Check for empty relevant_refs (except no-match cases)
+echo "Checking for empty relevant_refs..."
+for f in "${CORPUS_DIR}"/*/queries.json; do
+    if [[ -f "$f" ]]; then
+        empty_relevant=$(jq -r '.[] | select(.relevant_refs | length == 0) | select(.partially_relevant_refs | length == 0) | select(.expect_no_match != true) | .id' "$f" 2>/dev/null)
+        for id in $empty_relevant; do
+            if [[ -n "$id" ]]; then
+                warn "Query '$id' in $f has empty relevant_refs"
+            fi
+        done
+    fi
+done
+
+# 6. Check difficulty values
+echo "Checking difficulty values..."
+VALID_DIFFICULTIES="easy medium hard"
+for f in "${CORPUS_DIR}"/*/queries.json; do
+    if [[ -f "$f" ]]; then
+        while IFS= read -r line; do
+            id=$(echo "$line" | cut -d'|' -f1)
+            diff=$(echo "$line" | cut -d'|' -f2)
+            if [[ -n "$diff" && "$diff" != "null" ]]; then
+                if ! echo "$VALID_DIFFICULTIES" | grep -qw "$diff"; then
+                    error "Invalid difficulty '$diff' for query '$id' in $f"
+                fi
+            fi
+        done < <(jq -r '.[] | "\(.id)|\(.difficulty // "null")"' "$f" 2>/dev/null)
+    fi
+done
+
+# 7. Check for known tags (warn on unknown)
+echo "Checking tags..."
+KNOWN_TAGS="absent-control accessibility action action-synonym action-verb adversarial alertdialog all-stopwords auth basket-cart bulk-action button cell checkbox combobox compound context-exclusion conversational dashboard description descriptive dialog directional disambiguation domain-intent download-export duplicate-labels ecommerce empty-query empty-snapshot exact exact-match filter find-search generic-verb github guard icon implicit input interactive-boost keyboard-mash legal link literal-text login login-signin long-query lookup-search media menu menuitem missing-letter name-match natural-language navigation negative-context no-match noise-tokens nonsense option ordinal pagination parent-context partial position preferences-settings purchase-buy question-form radio register-create registration repeated-word row-context search searchbox section section-context signout-logout single-char social special-chars spinbutton stale-ref state switch synonym synonym-chain tab table textbox threshold toggle transposition typo vague-query visual weak-match wikipedia"
+for f in "${CORPUS_DIR}"/*/queries.json "${CASES_DIR}"/*.json; do  # every corpus query file plus every case file
+    if [[ -f "$f" ]]; then
+        while IFS= read -r tag; do
+            if [[ -n "$tag" && "$tag" != "null" ]]; then
+                if [[ " ${KNOWN_TAGS} " != *" ${tag} "* ]]; then  # literal whole-entry match; `grep -w` accepted "row" via "row-context"
+                    warn "Unknown tag '$tag' in $f"
+                fi
+            fi
+        done < <(jq -r '.[].tags[]? // empty' "$f" 2>/dev/null)
+    fi
+done
+
+# 8. Check case files reference existing snapshots
+echo "Checking case file snapshot references..."
+for f in "${CASES_DIR}"/*.json; do
+    if [[ -f "$f" ]]; then
+        while IFS= read -r snapshot; do
+            if [[ -n "$snapshot" && "$snapshot" != "null" ]]; then
+                if [[ ! -f "${SNAPSHOTS_DIR}/${snapshot}" ]]; then
+                    error "Case file $f references missing snapshot: $snapshot"
+                fi
+            fi
+        done < <(jq -r '.[].snapshot // empty' "$f" 2>/dev/null)
+    fi
+done
+
+# 9. Check for generated result files in source tree
+echo "Checking for generated result files..."
+if ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | grep -v '.gitkeep' | head -1 >/dev/null 2>&1; then
+    result_count=$(ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | wc -l | tr -d ' ')
+    warn "Found $result_count generated result files in tests/benchmark/results/ (should be gitignored)"
+fi
+
+echo ""
+echo "=== Summary ==="
+if [[ $ERRORS -eq 0 && $WARNINGS -eq 0 ]]; then
+    ok "All checks passed"
+    exit 0
+elif [[ $ERRORS -eq 0 ]]; then
+    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
+    exit 0
+else
+    echo -e "${RED}Errors: $ERRORS${NC}"
+    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
+    exit 1
+fi
diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh
index 44b97d6..b5579bf 100755
--- a/tests/benchmark/scripts/run-corpus-benchmark.sh
+++ b/tests/benchmark/scripts/run-corpus-benchmark.sh
@@ -3,7 +3,7 @@
 # Run semantic matching benchmark with ranking metrics
 #
 # Usage:
-#   ./run-corpus-benchmark.sh [--strategy <name>] [--corpus <dir>]
+#   ./run-corpus-benchmark.sh [--strategy <name>] [--corpus <dir>] [--lexical-weight <n>] [--embedding-weight <n>]
 #
 # Metrics:
 #   - MRR (Mean Reciprocal Rank)
@@ -22,11 +22,15 @@ RESULTS_DIR="${BENCHMARK_DIR}/results"
 STRATEGY="combined"
 SPECIFIC_CORPUS=""
 TOP_K=5
+LEXICAL_WEIGHT=0.6
+EMBEDDING_WEIGHT=0.4
 while [[ $# -gt 0 ]]; do
     case "$1" in
         --strategy) STRATEGY="$2"; shift 2 ;;
         --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
         --top-k) TOP_K="$2"; shift 2 ;;
+        --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;;
+        --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;;
         *) echo "Unknown option: $1"; exit 1 ;;
     esac
 done
@@ -51,12 +55,18 @@ jq -n \
     --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
     --arg strategy "${STRATEGY}" \
     --argjson top_k "${TOP_K}" \
+    --argjson lexical_weight "${LEXICAL_WEIGHT}" \
+    --argjson embedding_weight "${EMBEDDING_WEIGHT}" \
     '{
         benchmark: {
             timestamp: $ts,
             strategy: $strategy,
             top_k: $top_k,
-            type: "corpus"
+            type: "corpus",
+            weights: {
+                lexical: $lexical_weight,
+                embedding: $embedding_weight
+            }
         },
         results: [],
         metrics: {
@@ -74,6 +84,9 @@ jq -n \
 declare -a ALL_RRS=()
 declare -a ALL_P1=()
 declare -a ALL_P3=()
+declare -a ALL_HIT3=()
+declare -a ALL_HIT5=()
+declare -a ALL_MARGINS=()
 declare -a ALL_LATENCIES=()
 
 run_corpus() {
@@ -117,6 +130,8 @@ run_corpus() {
             --strategy "${STRATEGY}" \
             --threshold 0.01 \
             --top-k "${TOP_K}" \
+            --lexical-weight "${LEXICAL_WEIGHT}" \
+            --embedding-weight "${EMBEDDING_WEIGHT}" \
             --format json 2>&1); then
             echo "  [${id}] ERROR: semantic find failed for query: ${query}" >&2
             echo "${result}" >&2
@@ -160,22 +175,61 @@ run_corpus() {
         # Calculate P@3 (count relevant in top 3, partials count as 0.5)
         local relevant_in_top3=0
         local partial_in_top3=0
-        for rank in 1 2 3; do
+        local hit_at_3=0
+        local hit_at_5=0
+        local best_relevant_rank="null"
+        for rank in 1 2 3 4 5; do
             local ref_at_rank
             ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"")
             if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                relevant_in_top3=$((relevant_in_top3 + 1))
-            elif echo "$partial_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                partial_in_top3=$((partial_in_top3 + 1))
+                if [[ "$best_relevant_rank" == "null" ]]; then
+                    best_relevant_rank=$rank
+                fi
+                if [[ $rank -le 3 ]]; then
+                    relevant_in_top3=$((relevant_in_top3 + 1))
+                    hit_at_3=1
+                fi
+                hit_at_5=1
+            elif [[ $rank -le 3 ]]; then
+                if echo "$partial_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
+                    partial_in_top3=$((partial_in_top3 + 1))
+                fi
             fi
         done
         local p3
         p3=$(echo "scale=4; (${relevant_in_top3} + ${partial_in_top3} * 0.5) / 3" | bc)
 
+        # Calculate best_relevant_score, best_wrong_score, and margin
+        local best_relevant_score=0
+        local best_wrong_score=0
+        local num_matches
+        num_matches=$(echo "$result" | jq '.matches | length')
+        for idx in $(seq 0 $((num_matches - 1))); do
+            local ref_at_idx score_at_idx
+            ref_at_idx=$(echo "$result" | jq -r ".matches[$idx].ref // \"\"")
+            score_at_idx=$(echo "$result" | jq -r ".matches[$idx].score // 0")
+            if echo "$relevant_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then
+                if (( $(echo "$score_at_idx > $best_relevant_score" | bc -l) )); then
+                    best_relevant_score=$score_at_idx
+                fi
+            elif echo "$partial_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then
+                : # partials don't count as wrong
+            else
+                if (( $(echo "$score_at_idx > $best_wrong_score" | bc -l) )); then
+                    best_wrong_score=$score_at_idx
+                fi
+            fi
+        done
+        local margin
+        margin=$(echo "scale=4; $best_relevant_score - $best_wrong_score" | bc)
+
         # Collect metrics
         ALL_RRS+=("$rr")
         ALL_P1+=("$p1")
         ALL_P3+=("$p3")
+        ALL_HIT3+=("$hit_at_3")
+        ALL_HIT5+=("$hit_at_5")
+        ALL_MARGINS+=("$margin")
         ALL_LATENCIES+=("$duration_ms")
 
         # Status indicator
@@ -204,6 +258,12 @@ run_corpus() {
             --argjson rr "$rr" \
             --argjson p1 "$p1" \
             --argjson p3 "$p3" \
+            --argjson hit_at_3 "$hit_at_3" \
+            --argjson hit_at_5 "$hit_at_5" \
+            --argjson best_relevant_rank "$best_relevant_rank" \
+            --argjson best_relevant_score "$best_relevant_score" \
+            --argjson best_wrong_score "$best_wrong_score" \
+            --argjson margin "$margin" \
             --argjson latency "$duration_ms" \
             '{
                 id: $id, query: $query, corpus: $corpus,
@@ -211,6 +271,11 @@ run_corpus() {
                 best_ref: $best_ref, best_score: $best_score,
                 matches: $matches, relevant_refs: $relevant,
                 rr: $rr, p_at_1: $p1, p_at_3: $p3,
+                hit_at_3: $hit_at_3, hit_at_5: $hit_at_5,
+                best_relevant_rank: $best_relevant_rank,
+                best_relevant_score: $best_relevant_score,
+                best_wrong_score: $best_wrong_score,
+                margin: $margin,
                 latency_ms: $latency
             }')
 
@@ -251,6 +316,15 @@ P1=$(printf '%s\n' "${ALL_P1[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
 # P@3
 P3=$(printf '%s\n' "${ALL_P3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
 
+# Hit@3
+HIT3=$(printf '%s\n' "${ALL_HIT3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
+
+# Hit@5
+HIT5=$(printf '%s\n' "${ALL_HIT5[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
+
+# Average margin
+AVG_MARGIN=$(printf '%s\n' "${ALL_MARGINS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
+
 # Latency percentiles
 SORTED_LAT=($(printf '%s\n' "${ALL_LATENCIES[@]}" | sort -n))
 P50_IDX=$(( TOTAL * 50 / 100 ))
@@ -268,6 +342,9 @@ jq \
     --argjson mrr "$MRR" \
     --argjson p1 "$P1" \
     --argjson p3 "$P3" \
+    --argjson hit3 "$HIT3" \
+    --argjson hit5 "$HIT5" \
+    --argjson avg_margin "$AVG_MARGIN" \
     --argjson lat_avg "$LAT_AVG" \
     --argjson lat_p50 "$LAT_P50" \
     --argjson lat_p95 "$LAT_P95" \
@@ -277,6 +354,9 @@ jq \
         mrr: $mrr,
         p_at_1: $p1,
         p_at_3: $p3,
+        hit_at_3: $hit3,
+        hit_at_5: $hit5,
+        avg_margin: $avg_margin,
         latency_avg_ms: $lat_avg,
         latency_p50_ms: $lat_p50,
         latency_p95_ms: $lat_p95,
@@ -292,12 +372,53 @@ jq '.metrics.by_difficulty = (
         value: {
             count: length,
             mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length)
+            p_at_1: ([.[].p_at_1] | add / length),
+            hit_at_3: ([.[].hit_at_3] | add / length),
+            hit_at_5: ([.[].hit_at_5] | add / length),
+            avg_margin: ([.[].margin] | add / length)
         }
     }) | from_entries
 )' "$REPORT_FILE" > "$tmp"
 mv "$tmp" "$REPORT_FILE"
 
+# Add by-corpus breakdown
+tmp=$(mktemp)
+jq '.metrics.by_corpus = (
+    .results | group_by(.corpus) | map({
+        key: .[0].corpus,
+        value: {
+            count: length,
+            mrr: ([.[].rr] | add / length),
+            p_at_1: ([.[].p_at_1] | add / length),
+            hit_at_3: ([.[].hit_at_3] | add / length),
+            hit_at_5: ([.[].hit_at_5] | add / length),
+            avg_margin: ([.[].margin] | add / length)
+        }
+    }) | from_entries
+)' "$REPORT_FILE" > "$tmp"
+mv "$tmp" "$REPORT_FILE"
+
+# Add by-tag breakdown
+tmp=$(mktemp)
+jq '.metrics.by_tag = (
+    [.results[] | {tags: .tags, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}]
+    | [.[] | .tags[] as $tag | {tag: $tag, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}]
+    | group_by(.tag)
+    | map({
+        key: .[0].tag,
+        value: {
+            count: length,
+            mrr: ([.[].rr] | add / length),
+            p_at_1: ([.[].p_at_1] | add / length),
+            hit_at_3: ([.[].hit_at_3] | add / length),
+            hit_at_5: ([.[].hit_at_5] | add / length),
+            avg_margin: ([.[].margin] | add / length)
+        }
+    })
+    | from_entries
+)' "$REPORT_FILE" > "$tmp"
+mv "$tmp" "$REPORT_FILE"
+
 # Generate summary
 SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
 
@@ -310,6 +431,8 @@ cat > "${SUMMARY_FILE}" << EOF
 |-------|-------|
 | Timestamp | $(jq -r '.benchmark.timestamp' "$REPORT_FILE") |
 | Strategy | ${STRATEGY} |
+| Lexical Weight | ${LEXICAL_WEIGHT} |
+| Embedding Weight | ${EMBEDDING_WEIGHT} |
 | Top-K | ${TOP_K} |
 | Total Queries | ${TOTAL} |
 
@@ -320,6 +443,9 @@ cat > "${SUMMARY_FILE}" << EOF
 | **MRR** | **${MRR}** | Mean Reciprocal Rank |
 | **P@1** | **${P1}** | Precision at rank 1 |
 | **P@3** | **${P3}** | Precision at rank 3 |
+| **Hit@3** | **${HIT3}** | Any relevant in top 3 |
+| **Hit@5** | **${HIT5}** | Any relevant in top 5 |
+| **Avg Margin** | **${AVG_MARGIN}** | best_relevant - best_wrong |
 
 ## Latency
 
@@ -332,7 +458,15 @@ cat > "${SUMMARY_FILE}" << EOF
 
 ## By Difficulty
 
-$(jq -r '.metrics.by_difficulty | to_entries | .[] | "| \(.key) | \(.value.count) queries | MRR: \(.value.mrr | . * 100 | floor / 100) | P@1: \(.value.p_at_1 | . * 100 | floor / 100) |"' "$REPORT_FILE")
+| Difficulty | Count | MRR | P@1 | Hit@3 | Margin |
+|------------|-------|-----|-----|-------|--------|
+$(jq -r '.metrics.by_difficulty | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE")
+
+## By Corpus
+
+| Corpus | Count | MRR | P@1 | Hit@3 | Margin |
+|--------|-------|-----|-----|-------|--------|
+$(jq -r '.metrics.by_corpus | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE")
 
 ## Misses (P@1 = 0)
 
@@ -350,10 +484,14 @@ echo "================================================"
 echo "  CORPUS BENCHMARK RESULTS"
 echo "================================================"
 echo "  Strategy:    ${STRATEGY}"
+echo "  Weights:     lexical=${LEXICAL_WEIGHT} embedding=${EMBEDDING_WEIGHT}"
 echo "  Queries:     ${TOTAL}"
 echo "  MRR:         ${MRR}"
 echo "  P@1:         ${P1}"
 echo "  P@3:         ${P3}"
+echo "  Hit@3:       ${HIT3}"
+echo "  Hit@5:       ${HIT5}"
+echo "  Avg Margin:  ${AVG_MARGIN}"
 echo "  Latency P50: ${LAT_P50} ms"
 echo "  Latency P95: ${LAT_P95} ms"
 echo "================================================"
diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh
new file mode 100755
index 0000000..ef61d88
--- /dev/null
+++ b/tests/benchmark/scripts/tune-weights.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+#
+# Grid-search combined matcher lexical/embedding weights against the corpus.
+#
+# Usage:
+#   ./tune-weights.sh [--corpus <dir>] [--step <n>] [--output <dir>]
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+RESULTS_DIR="${BENCHMARK_DIR}/results"
+
+SPECIFIC_CORPUS=""
+STEP="0.1"
+while [[ $# -gt 0 ]]; do  # every option takes one value; ${2?...} names the option when it is missing
+    case "$1" in
+        --corpus) SPECIFIC_CORPUS="${2?--corpus requires a value}"; shift 2 ;;
+        --step) STEP="${2?--step requires a value}"; shift 2 ;;
+        --output) RESULTS_DIR="${2?--output requires a value}"; shift 2 ;;
+        *) echo "Unknown option: $1" >&2; exit 1 ;;
+    esac
+done
+
+mkdir -p "${RESULTS_DIR}"
+
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+REPORT_FILE="${RESULTS_DIR}/tuning_weights_${TIMESTAMP}.json"
+SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
+
+jq -n \
+    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+    --arg step "${STEP}" \
+    '{
+        benchmark: {
+            timestamp: $ts,
+            type: "weight-tuning",
+            strategy: "combined",
+            step: ($step | tonumber)
+        },
+        results: [],
+        best: null
+    }' > "${REPORT_FILE}"
+
+# Under `set -e` a bare failing assignment aborts before any message is printed,
+# so the awk range check is tested directly to keep the diagnostic reachable.
+if ! weights=$(awk -v step="${STEP}" 'BEGIN {
+    if (step <= 0 || step > 1) {
+        exit 1
+    }
+    for (w = 0; w <= 1.000001; w += step) {
+        printf "%.4f\n", w
+    }
+}') || [[ -z "${weights}" ]]; then
+    echo "Invalid step: ${STEP}" >&2
+    exit 1
+fi
+
+echo "Weight tuning: step=${STEP}"
+echo ""
+printf "%-10s %-10s %-8s %-8s %-8s %-8s %-8s\n" "lexical" "embedding" "MRR" "P@1" "P@3" "P50" "report"
+
+while IFS= read -r lexical_weight; do
+    embedding_weight=$(awk -v w="${lexical_weight}" 'BEGIN { printf "%.4f", 1 - w }')
+
+    args=(
+        --strategy combined
+        --lexical-weight "${lexical_weight}"
+        --embedding-weight "${embedding_weight}"
+    )
+    if [[ -n "${SPECIFIC_CORPUS}" ]]; then
+        args+=(--corpus "${SPECIFIC_CORPUS}")
+    fi
+
+    if ! output=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" "${args[@]}" 2>&1); then
+        echo "$output" >&2
+        exit 1
+    fi
+
+    corpus_report=$(echo "$output" | awk '/^Report:/ {print $2}' | tail -1)
+    if [[ -z "${corpus_report}" || ! -f "${corpus_report}" ]]; then
+        echo "Could not find corpus report for lexical=${lexical_weight}" >&2
+        echo "$output" >&2
+        exit 1
+    fi
+
+    mrr=$(jq -r '.metrics.mrr' "$corpus_report")
+    p1=$(jq -r '.metrics.p_at_1' "$corpus_report")
+    p3=$(jq -r '.metrics.p_at_3' "$corpus_report")
+    p50=$(jq -r '.metrics.latency_p50_ms' "$corpus_report")
+    total=$(jq -r '.metrics.total' "$corpus_report")
+
+    printf "%-10s %-10s %-8s %-8s %-8s %-8s %s\n" \
+        "${lexical_weight}" "${embedding_weight}" "${mrr}" "${p1}" "${p3}" "${p50}" "$(basename "$corpus_report")"
+
+    result_json=$(jq -n \
+        --argjson lexical_weight "${lexical_weight}" \
+        --argjson embedding_weight "${embedding_weight}" \
+        --argjson total "${total}" \
+        --argjson mrr "${mrr}" \
+        --argjson p1 "${p1}" \
+        --argjson p3 "${p3}" \
+        --argjson p50 "${p50}" \
+        --arg report "${corpus_report}" \
+        '{
+            lexical_weight: $lexical_weight,
+            embedding_weight: $embedding_weight,
+            total: $total,
+            mrr: $mrr,
+            p_at_1: $p1,
+            p_at_3: $p3,
+            latency_p50_ms: $p50,
+            report: $report
+        }')
+
+    tmp=$(mktemp)
+    jq --argjson result "${result_json}" '.results += [$result]' "${REPORT_FILE}" > "$tmp"
+    mv "$tmp" "${REPORT_FILE}"
+done <<< "${weights}"
+
+tmp=$(mktemp)
+jq '
+    .best = (
+        .results
+        | sort_by(.p_at_1, .mrr, .p_at_3, -(.latency_p50_ms))
+        | last
+    )
+' "${REPORT_FILE}" > "$tmp"
+mv "$tmp" "${REPORT_FILE}"
+
+cat > "${SUMMARY_FILE}" << EOF
+# Combined Weight Tuning
+
+## Best
+
+| Field | Value |
+|-------|-------|
+| Lexical Weight | $(jq -r '.best.lexical_weight' "$REPORT_FILE") |
+| Embedding Weight | $(jq -r '.best.embedding_weight' "$REPORT_FILE") |
+| MRR | $(jq -r '.best.mrr' "$REPORT_FILE") |
+| P@1 | $(jq -r '.best.p_at_1' "$REPORT_FILE") |
+| P@3 | $(jq -r '.best.p_at_3' "$REPORT_FILE") |
+| Latency P50 | $(jq -r '.best.latency_p50_ms' "$REPORT_FILE") ms |
+
+## All Runs
+
+| Lexical | Embedding | MRR | P@1 | P@3 | P50 |
+|---------|-----------|-----|-----|-----|-----|
+$(jq -r '.results | sort_by(-.p_at_1, -.mrr, -.p_at_3, .latency_p50_ms)[] | "| \(.lexical_weight) | \(.embedding_weight) | \(.mrr) | \(.p_at_1) | \(.p_at_3) | \(.latency_p50_ms) ms |"' "$REPORT_FILE")
+EOF
+
+echo ""
+echo "Best weights:"
+jq '.best' "${REPORT_FILE}"
+echo ""
+echo "Report:  ${REPORT_FILE}"
+echo "Summary: ${SUMMARY_FILE}"

From 71c093270035f77b8cd56839cc21b6a5116c5714 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Thu, 23 Apr 2026 22:38:29 +0100
Subject: [PATCH 15/30] feat: harden matchers with input sanitization and
 context cancellation

---
 .gitignore                              |   4 +-
 internal/engine/combined.go             |  67 +++++++++--
 internal/engine/combined_test.go        | 149 +++++++++++++++++++++++-
 internal/engine/embedding.go            |  95 ++++++++++++---
 internal/engine/embedding_test.go       | 109 ++++++++++++++++-
 internal/engine/hashing.go              |  17 +++
 internal/engine/hashing_test.go         |  27 +++++
 internal/engine/lexical.go              |  40 ++++---
 internal/engine/lexical_test.go         |  20 +++-
 internal/engine/options.go              |  36 ++++++
 internal/engine/testing_helpers_test.go |  13 +++
 tests/e2e/cases/16-input-hardening.sh   |  60 ++++++++++
 12 files changed, 597 insertions(+), 40 deletions(-)
 create mode 100644 internal/engine/options.go
 create mode 100755 tests/e2e/cases/16-input-hardening.sh

diff --git a/.gitignore b/.gitignore
index 09584bd..2f3b5cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
-# Binary (root only, not cmd/semantic/)
+# Binary
 /semantic
+tests/benchmark/semantic
+tests/e2e/semantic
 *.exe
 
 # Test
diff --git a/internal/engine/combined.go b/internal/engine/combined.go
index d7dd814..c42597f 100644
--- a/internal/engine/combined.go
+++ b/internal/engine/combined.go
@@ -3,6 +3,7 @@ package engine
 import (
 	"context"
 	"fmt"
+	"math"
 	"sort"
 
 	"github.com/pinchtab/semantic/internal/types"
@@ -41,10 +42,12 @@ func (c *CombinedMatcher) Strategy() string {
 }
 
 func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
-	if opts.TopK <= 0 {
-		opts.TopK = 3
+	if ctx == nil {
+		ctx = context.Background()
 	}
 
+	opts = sanitizeFindOptions(opts, len(elements), 3)
+
 	parsed := ParseQueryContext(query)
 	visualHints := parseVisualQueryHints(query)
 	mergeOpts := opts
@@ -67,10 +70,49 @@ func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []typ
 }
 
 func (c *CombinedMatcher) weights(opts types.FindOptions) (float64, float64) {
-	if opts.LexicalWeight > 0 || opts.EmbeddingWeight > 0 {
-		return opts.LexicalWeight, opts.EmbeddingWeight
+	baseLex, baseEmb := normalizeWeights(c.LexicalWeight, c.EmbeddingWeight)
+	if baseLex == 0 && baseEmb == 0 {
+		baseLex, baseEmb = 0.6, 0.4
+	}
+
+	reqLex := sanitizeWeight(opts.LexicalWeight)
+	reqEmb := sanitizeWeight(opts.EmbeddingWeight)
+	if reqLex == 0 && reqEmb == 0 {
+		return baseLex, baseEmb
+	}
+
+	if reqLex > 0 && reqEmb == 0 && reqLex <= 1 {
+		reqEmb = 1 - reqLex
+	}
+	if reqEmb > 0 && reqLex == 0 && reqEmb <= 1 {
+		reqLex = 1 - reqEmb
+	}
+
+	lex, emb := normalizeWeights(reqLex, reqEmb)
+	if lex == 0 && emb == 0 {
+		return baseLex, baseEmb
+	}
+
+	return lex, emb
+}
+
+func sanitizeWeight(weight float64) float64 {
+	if weight >= 0 && !math.IsInf(weight, 1) { // NaN fails >=, so it is rejected too
+		return weight
+	}
+	return 0
+}
+
+// normalizeWeights sanitizes both weights and divides each by their sum.
+// It returns (0, 0) when neither weight is positive after sanitizing.
+func normalizeWeights(lexicalWeight, embeddingWeight float64) (float64, float64) {
+	lex, emb := sanitizeWeight(lexicalWeight), sanitizeWeight(embeddingWeight)
+
+	if sum := lex + emb; sum > 0 {
+		return lex / sum, emb / sum
 	}
-	return c.LexicalWeight, c.EmbeddingWeight
+
+	return 0, 0
 }
 
 type matcherResult struct {
@@ -108,8 +150,19 @@ func (c *CombinedMatcher) runBothParsed(ctx context.Context, parsed QueryContext
 		embCh <- matcherResult{r, err}
 	}()
 
-	lexRes := <-lexCh
-	embRes := <-embCh
+	var lexRes, embRes matcherResult
+	gotLex, gotEmb := false, false
+
+	for !gotLex || !gotEmb {
+		select {
+		case <-ctx.Done():
+			return types.FindResult{}, types.FindResult{}, ctx.Err()
+		case lexRes = <-lexCh:
+			gotLex = true
+		case embRes = <-embCh:
+			gotEmb = true
+		}
+	}
 
 	if lexRes.err != nil {
 		return types.FindResult{}, types.FindResult{}, lexRes.err
diff --git a/internal/engine/combined_test.go b/internal/engine/combined_test.go
index c1356cd..424b609 100644
--- a/internal/engine/combined_test.go
+++ b/internal/engine/combined_test.go
@@ -2,9 +2,13 @@ package engine
 
 import (
 	"context"
+	"errors"
 	"fmt"
-	"github.com/pinchtab/semantic/internal/types"
+	"math"
 	"testing"
+	"time"
+
+	"github.com/pinchtab/semantic/internal/types"
 )
 
 // CATEGORY 6: Role Boost Accumulation Test (Bug Fix)
@@ -595,3 +599,146 @@ func TestCombinedMatcher_DeterministicTieBreak(t *testing.T) {
 		}
 	}
 }
+
+// Hardening tests
+
+func TestCombinedMatcher_Weights_AdversarialNormalizationGrid(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+
+	inputs := []float64{
+		-10, -1, -0.25, 0, 0.001, 0.2, 0.5, 0.9, 1, 2, 10,
+		math.NaN(), math.Inf(1),
+	}
+
+	for _, baseLex := range inputs {
+		for _, baseEmb := range inputs {
+			m.LexicalWeight = baseLex
+			m.EmbeddingWeight = baseEmb
+
+			for _, reqLex := range inputs {
+				for _, reqEmb := range inputs {
+					lex, emb := m.weights(types.FindOptions{LexicalWeight: reqLex, EmbeddingWeight: reqEmb})
+
+					if math.IsNaN(lex) || math.IsNaN(emb) || math.IsInf(lex, 0) || math.IsInf(emb, 0) {
+						t.Fatalf("non-finite weights from base=(%v,%v) req=(%v,%v): lexical=%v embedding=%v",
+							baseLex, baseEmb, reqLex, reqEmb, lex, emb)
+					}
+					if lex < 0 || emb < 0 {
+						t.Fatalf("negative weights from base=(%v,%v) req=(%v,%v): lexical=%v embedding=%v",
+							baseLex, baseEmb, reqLex, reqEmb, lex, emb)
+					}
+
+					sum := lex + emb
+					if math.Abs(sum-1) > 1e-9 {
+						t.Fatalf("weights do not normalize to 1 from base=(%v,%v) req=(%v,%v): sum=%v",
+							baseLex, baseEmb, reqLex, reqEmb, sum)
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestCombinedMatcher_ScoreBoundedUnderInvalidModelWeights(t *testing.T) {
+	m := NewCombinedMatcher(NewHashingEmbedder(128))
+	m.LexicalWeight = 9
+	m.EmbeddingWeight = 9
+
+	elements := []types.ElementDescriptor{
+		{Ref: "e0", Role: "button", Name: "Sign in"},
+		{Ref: "e1", Role: "link", Name: "Register"},
+	}
+
+	result, err := m.Find(context.Background(), "sign in button", elements, types.FindOptions{Threshold: 0, TopK: 2})
+	if err != nil {
+		t.Fatalf("Find returned error: %v", err)
+	}
+	if result.BestScore < 0 || result.BestScore > 1 {
+		t.Fatalf("best score out of [0,1]: %f", result.BestScore)
+	}
+	for _, match := range result.Matches {
+		if match.Score < 0 || match.Score > 1 {
+			t.Fatalf("match %s score out of [0,1]: %f", match.Ref, match.Score)
+		}
+	}
+}
+
+func TestCombinedMatcher_Find_ContextCanceledWhileEmbeddingBlocked(t *testing.T) {
+	e := &blockingEmbedder{started: make(chan struct{}), release: make(chan struct{})}
+	defer close(e.release)
+
+	m := NewCombinedMatcher(e)
+	elements := []types.ElementDescriptor{
+		{Ref: "e1", Role: "button", Name: "Save"},
+		{Ref: "e2", Role: "link", Name: "Cancel"},
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	done := make(chan error, 1)
+	go func() {
+		_, err := m.Find(ctx, "save button", elements, types.FindOptions{Threshold: 0, TopK: 2})
+		done <- err
+	}()
+
+	select {
+	case <-e.started:
+	case <-time.After(500 * time.Millisecond):
+		t.Fatal("expected embedding to start")
+	}
+
+	cancel()
+
+	select {
+	case err := <-done:
+		if !errors.Is(err, context.Canceled) {
+			t.Fatalf("expected context canceled error, got %v", err)
+		}
+	case <-time.After(500 * time.Millisecond):
+		t.Fatal("Find did not return promptly after context cancellation")
+	}
+}
+
+type blockingEmbedder struct {
+	started chan struct{}
+	release chan struct{}
+}
+
+func (e *blockingEmbedder) Strategy() string { return "blocking" }
+
+func (e *blockingEmbedder) Embed(texts []string) ([][]float32, error) {
+	close(e.started)
+	<-e.release
+	return nil, errors.New("released")
+}
+
+func TestSanitizeFindOptions(t *testing.T) {
+	tests := []struct {
+		name        string
+		opts        types.FindOptions
+		elemCount   int
+		defaultTopK int
+		wantTopK    int
+		wantThresh  float64
+	}{
+		{"zero topk uses default", types.FindOptions{TopK: 0}, 10, 3, 3, 0},
+		{"negative topk uses default", types.FindOptions{TopK: -5}, 10, 3, 3, 0},
+		{"topk exceeds elements clamped", types.FindOptions{TopK: 20}, 5, 3, 5, 0},
+		{"NaN threshold becomes 0", types.FindOptions{Threshold: math.NaN()}, 10, 3, 3, 0},
+		{"Inf threshold becomes 0", types.FindOptions{Threshold: math.Inf(1)}, 10, 3, 3, 0},
+		{"negative threshold becomes 0", types.FindOptions{Threshold: -0.5}, 10, 3, 3, 0},
+		{"threshold > 1 becomes 1", types.FindOptions{Threshold: 1.5}, 10, 3, 3, 1},
+		{"valid threshold preserved", types.FindOptions{Threshold: 0.5, TopK: 5}, 10, 3, 5, 0.5},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := sanitizeFindOptions(tt.opts, tt.elemCount, tt.defaultTopK)
+			if got.TopK != tt.wantTopK {
+				t.Errorf("TopK = %d, want %d", got.TopK, tt.wantTopK)
+			}
+			if got.Threshold != tt.wantThresh {
+				t.Errorf("Threshold = %f, want %f", got.Threshold, tt.wantThresh)
+			}
+		})
+	}
+}
diff --git a/internal/engine/embedding.go b/internal/engine/embedding.go
index e0bdf24..48922dc 100644
--- a/internal/engine/embedding.go
+++ b/internal/engine/embedding.go
@@ -2,10 +2,12 @@ package engine
 
 import (
 	"context"
-	"github.com/pinchtab/semantic/internal/types"
+	"fmt"
 	"math"
 	"sort"
 	"strings"
+
+	"github.com/pinchtab/semantic/internal/types"
 )
 
 // Embedder converts text into dense vectors. See NewHashingEmbedder.
@@ -48,17 +50,28 @@ func (m *EmbeddingMatcher) Strategy() string {
 	return "embedding:" + m.embedder.Strategy()
 }
 
-func (m *EmbeddingMatcher) Find(_ context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
-	ctx := ParseQueryContext(query)
-	return m.findWithParsed(ctx, elements, opts)
+// contextAwareEmbedder is an optional interface for embedders that support
+// context cancellation during embedding.
+type contextAwareEmbedder interface {
+	EmbedContext(ctx context.Context, texts []string) ([][]float32, error)
 }
 
-func (m *EmbeddingMatcher) findWithParsed(ctx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
-	parsed := ctx.Base
-	if opts.TopK <= 0 {
-		opts.TopK = 3
+func (m *EmbeddingMatcher) Find(ctx context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	if err := ctx.Err(); err != nil {
+		return types.FindResult{}, err
 	}
 
+	queryCtx := ParseQueryContext(query)
+	return m.findWithParsedContext(ctx, queryCtx, elements, opts)
+}
+
+func (m *EmbeddingMatcher) findWithParsedContext(ctx context.Context, queryCtx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
+	parsed := queryCtx.Base
+	opts = sanitizeFindOptions(opts, len(elements), 3)
+
 	if len(parsed.Positive) == 0 && len(parsed.Negative) == 0 {
 		return types.FindResult{
 			Strategy:     m.Strategy(),
@@ -66,17 +79,25 @@ func (m *EmbeddingMatcher) findWithParsed(ctx QueryContext, elements []types.Ele
 		}, nil
 	}
 
-	filtered := filterContextExcludedElements(elements, ctx)
+	filtered := filterContextExcludedElements(elements, queryCtx)
 	if len(filtered) == 0 {
 		return types.FindResult{Strategy: m.Strategy(), ElementCount: len(elements)}, nil
 	}
 
-	vectors, err := m.embedQueryAndElements(parsed, filtered)
+	vectors, err := m.embedQueryAndElementsWithContext(ctx, parsed, filtered)
 	if err != nil {
 		return types.FindResult{}, err
 	}
 
-	candidates := m.scoreCandidates(parsed, filtered, vectors, opts.Threshold)
+	if err := validateEmbeddedVectors(vectors, len(filtered)+countQueryVectors(parsed)); err != nil {
+		return types.FindResult{}, err
+	}
+
+	candidates := m.scoreCandidatesWithContext(ctx, parsed, filtered, vectors, opts.Threshold)
+	// Scoring stops early on cancellation; never rank and return that partial set as success.
+	if err := ctx.Err(); err != nil {
+		return types.FindResult{}, err
+	}
 	sort.Slice(candidates, func(i, j int) bool {
 		return rankedMatchLess(
 			candidates[i].score, candidates[i].desc, candidates[i].order,
@@ -91,6 +112,10 @@ func (m *EmbeddingMatcher) findWithParsed(ctx QueryContext, elements []types.Ele
 	return buildEmbeddingResult(m.Strategy(), len(elements), candidates), nil
 }
 
+func (m *EmbeddingMatcher) findWithParsed(queryCtx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
+	return m.findWithParsedContext(context.Background(), queryCtx, elements, opts) // Background is never canceled
+}
+
 func filterContextExcludedElements(elements []types.ElementDescriptor, ctx QueryContext) []types.ElementDescriptor {
 	filtered := make([]types.ElementDescriptor, 0, len(elements))
 	for _, el := range elements {
@@ -102,7 +127,7 @@ func filterContextExcludedElements(elements []types.ElementDescriptor, ctx Query
 	return filtered
 }
 
-func (m *EmbeddingMatcher) embedQueryAndElements(parsed types.ParsedQuery, elements []types.ElementDescriptor) ([][]float32, error) {
+func (m *EmbeddingMatcher) embedQueryAndElementsWithContext(ctx context.Context, parsed types.ParsedQuery, elements []types.ElementDescriptor) ([][]float32, error) {
 	positiveQuery := strings.Join(parsed.Positive, " ")
 	negativeQuery := strings.Join(parsed.Negative, " ")
 
@@ -119,7 +144,44 @@ func (m *EmbeddingMatcher) embedQueryAndElements(parsed types.ParsedQuery, eleme
 		texts = append(texts, negativeQuery)
 	}
 	texts = append(texts, descs...)
-	return m.embedder.Embed(texts)
+	return embedWithContext(ctx, m.embedder, texts)
+}
+
+func embedWithContext(ctx context.Context, embedder Embedder, texts []string) ([][]float32, error) {
+	if ce, ok := embedder.(contextAwareEmbedder); ok { // prefer the context-aware method when present
+		return ce.EmbedContext(ctx, texts)
+	}
+	if err := ctx.Err(); err != nil { // Embed takes no ctx, so check once before calling it
+		return nil, err
+	}
+	return embedder.Embed(texts)
+}
+
+// countQueryVectors returns 0-2: one per non-empty side (positive, negative) of the parsed query.
+func countQueryVectors(parsed types.ParsedQuery) int {
+	switch hasPos, hasNeg := len(parsed.Positive) > 0, len(parsed.Negative) > 0; {
+	case hasPos && hasNeg:
+		return 2
+	case hasPos || hasNeg:
+		return 1
+	}
+	return 0
+}
+
+// validateEmbeddedVectors checks that the embedder returned exactly expected
+// vectors and that every vector has the same length as the first one.
+func validateEmbeddedVectors(vectors [][]float32, expected int) error {
+	if got := len(vectors); got != expected {
+		return fmt.Errorf("embedder returned %d vectors, expected %d", got, expected)
+	}
+
+	for i, vec := range vectors {
+		// Index 0 trivially matches itself, and an empty slice skips the loop.
+		if want := len(vectors[0]); len(vec) != want {
+			return fmt.Errorf("embedder returned inconsistent vector dimensions at index %d: %d vs %d", i, len(vec), want)
+		}
+	}
+	return nil
 }
 
 type embeddingScored struct {
@@ -128,7 +190,7 @@ type embeddingScored struct {
 	order int
 }
 
-func (m *EmbeddingMatcher) scoreCandidates(parsed types.ParsedQuery, elements []types.ElementDescriptor, vectors [][]float32, threshold float64) []embeddingScored {
+func (m *EmbeddingMatcher) scoreCandidatesWithContext(ctx context.Context, parsed types.ParsedQuery, elements []types.ElementDescriptor, vectors [][]float32, threshold float64) []embeddingScored {
 	negativeOnly := len(parsed.Positive) == 0 && len(parsed.Negative) > 0
 	idx := 0
 	var posVec []float32
@@ -151,6 +213,11 @@ func (m *EmbeddingMatcher) scoreCandidates(parsed types.ParsedQuery, elements []
 
 	var candidates []embeddingScored
 	for i, el := range elements {
+		if i%64 == 0 {
+			if ctx.Err() != nil {
+				return candidates
+			}
+		}
 		score := scoreEmbeddingCandidate(parsed, posVec, negVec, contextVecs[i], elemVecs[i])
 		if negativeOnly && score == 0 {
 			continue
diff --git a/internal/engine/embedding_test.go b/internal/engine/embedding_test.go
index 4926855..6c70074 100644
--- a/internal/engine/embedding_test.go
+++ b/internal/engine/embedding_test.go
@@ -2,10 +2,12 @@ package engine
 
 import (
 	"context"
+	"errors"
 	"fmt"
-	"github.com/pinchtab/semantic/internal/types"
 	"math"
 	"testing"
+
+	"github.com/pinchtab/semantic/internal/types"
 )
 
 // dummyEmbedder tests
@@ -382,3 +384,108 @@ func findMatchScore(matches []types.ElementMatch, ref string) (float64, bool) {
 }
 
 // FindResult.ConfidenceLabel tests
+
+// Hardening tests
+
+func TestEmbeddingMatcher_Find_ContextCanceledBeforeEmbed(t *testing.T) {
+	m := NewEmbeddingMatcher(newDummyEmbedder(64))
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	_, err := m.Find(ctx, "submit button", []types.ElementDescriptor{
+		{Ref: "e1", Role: "button", Name: "Submit"},
+	}, types.FindOptions{Threshold: 0, TopK: 1})
+
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context canceled error, got %v", err)
+	}
+}
+
+func TestEmbeddingMatcher_Find_EmbedderVectorCountMismatchReturnsError(t *testing.T) {
+	e := &malformedEmbedder{vectors: fixedVectors(1, 64)}
+	m := NewEmbeddingMatcher(e)
+
+	_, err := m.Find(context.Background(), "submit", []types.ElementDescriptor{
+		{Ref: "e1", Role: "button", Name: "Submit"},
+		{Ref: "e2", Role: "button", Name: "Cancel"},
+	}, types.FindOptions{Threshold: 0, TopK: 2})
+
+	if err == nil {
+		t.Fatal("expected error for vector count mismatch")
+	}
+}
+
+func TestEmbeddingMatcher_Find_InconsistentVectorDimensionsReturnsError(t *testing.T) {
+	e := &malformedEmbedder{vectors: [][]float32{
+		{1, 0, 0},
+		{0, 1},
+		{0, 0, 1},
+	}}
+	m := NewEmbeddingMatcher(e)
+
+	_, err := m.Find(context.Background(), "submit", []types.ElementDescriptor{
+		{Ref: "e1", Role: "button", Name: "Submit"},
+		{Ref: "e2", Role: "button", Name: "Cancel"},
+	}, types.FindOptions{Threshold: 0, TopK: 2})
+
+	if err == nil {
+		t.Fatal("expected error for inconsistent vector dimensions")
+	}
+}
+
+type malformedEmbedder struct {
+	vectors [][]float32
+}
+
+func (e *malformedEmbedder) Strategy() string { return "malformed" }
+
+func (e *malformedEmbedder) Embed(texts []string) ([][]float32, error) {
+	return e.vectors, nil
+}
+
+func TestValidateEmbeddedVectors(t *testing.T) {
+	tests := []struct {
+		name     string
+		vectors  [][]float32
+		expected int
+		wantErr  bool
+	}{
+		{"empty valid", [][]float32{}, 0, false},
+		{"correct count", fixedVectors(3, 64), 3, false},
+		{"wrong count", fixedVectors(2, 64), 3, true},
+		{"inconsistent dims", [][]float32{{1, 0}, {0, 1, 0}}, 2, true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := validateEmbeddedVectors(tt.vectors, tt.expected)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("validateEmbeddedVectors() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+func TestEmbedWithContext_UsesContextAwareEmbedder(t *testing.T) {
+	e := NewHashingEmbedder(64)
+	ctx := context.Background()
+
+	vecs, err := embedWithContext(ctx, e, []string{"test"})
+	if err != nil {
+		t.Fatalf("embedWithContext error: %v", err)
+	}
+	if len(vecs) != 1 {
+		t.Fatalf("expected 1 vector, got %d", len(vecs))
+	}
+}
+
+func TestEmbedWithContext_CanceledBeforeNonContextAware(t *testing.T) {
+	e := newDummyEmbedder(64) // doesn't implement contextAwareEmbedder
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	_, err := embedWithContext(ctx, e, []string{"test"})
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context canceled, got %v", err)
+	}
+}
diff --git a/internal/engine/hashing.go b/internal/engine/hashing.go
index 9963d02..6c02276 100644
--- a/internal/engine/hashing.go
+++ b/internal/engine/hashing.go
@@ -1,6 +1,7 @@
 package engine
 
 import (
+	"context"
 	"hash/fnv"
 	"math"
 	"strings"
@@ -39,8 +40,24 @@ func NewHashingEmbedder(dim int) *HashingEmbedder {
 func (h *HashingEmbedder) Strategy() string { return "hashing" }
 
 func (h *HashingEmbedder) Embed(texts []string) ([][]float32, error) {
+	return h.EmbedContext(context.Background(), texts)
+}
+
+func (h *HashingEmbedder) EmbedContext(ctx context.Context, texts []string) ([][]float32, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
 	result := make([][]float32, len(texts))
 	for i, text := range texts {
+		if i%64 == 0 {
+			if err := ctx.Err(); err != nil {
+				return nil, err
+			}
+		}
 		result[i] = h.vectorize(text)
 	}
 	return result, nil
diff --git a/internal/engine/hashing_test.go b/internal/engine/hashing_test.go
index d8459fa..ed6dcb6 100644
--- a/internal/engine/hashing_test.go
+++ b/internal/engine/hashing_test.go
@@ -1,6 +1,8 @@
 package engine
 
 import (
+	"context"
+	"errors"
 	"math"
 	"testing"
 )
@@ -257,4 +259,29 @@ func TestHashingEmbedder_BatchConsistency(t *testing.T) {
 	}
 }
 
+// Hardening tests
+
+func TestHashingEmbedder_EmbedContext_Canceled(t *testing.T) {
+	e := NewHashingEmbedder(128)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	_, err := e.EmbedContext(ctx, []string{"test"})
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context canceled error, got %v", err)
+	}
+}
+
+func TestHashingEmbedder_EmbedContext_NilContext(t *testing.T) {
+	e := NewHashingEmbedder(128)
+	//nolint:staticcheck // intentionally testing nil context handling
+	vecs, err := e.EmbedContext(nil, []string{"test"})
+	if err != nil {
+		t.Fatalf("expected no error with nil context, got %v", err)
+	}
+	if len(vecs) != 1 || len(vecs[0]) != 128 {
+		t.Fatalf("expected 1 vector of dim 128, got %d vectors", len(vecs))
+	}
+}
+
 // Phase 3: CombinedMatcher tests
diff --git a/internal/engine/lexical.go b/internal/engine/lexical.go
index 95dd3a2..ffde616 100644
--- a/internal/engine/lexical.go
+++ b/internal/engine/lexical.go
@@ -47,16 +47,25 @@ func NewLexicalMatcher() *LexicalMatcher {
 
 func (m *LexicalMatcher) Strategy() string { return "lexical" }
 
-func (m *LexicalMatcher) Find(_ context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
-	ctx := ParseQueryContext(query)
-	return m.findWithParsed(ctx, elements, opts), nil
+func (m *LexicalMatcher) Find(ctx context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	result := m.findWithParsedContext(ctx, ParseQueryContext(query), elements, opts)
+	// Scoring stops early on cancellation; surface that instead of a truncated ranking.
+	if err := ctx.Err(); err != nil {
+		return types.FindResult{}, err
+	}
+	return result, nil
 }
 
-func (m *LexicalMatcher) findWithParsed(ctx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) types.FindResult {
-	parsed := ctx.Base
-	if opts.TopK <= 0 {
-		opts.TopK = 3
-	}
+func (m *LexicalMatcher) findWithParsed(queryCtx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) types.FindResult {
+	return m.findWithParsedContext(context.Background(), queryCtx, elements, opts) // Background is never canceled
+}
+
+func (m *LexicalMatcher) findWithParsedContext(ctx context.Context, queryCtx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) types.FindResult {
+	parsed := queryCtx.Base
+	opts = sanitizeFindOptions(opts, len(elements), 3)
 
 	if len(parsed.Positive) == 0 && len(parsed.Negative) == 0 {
 		return types.FindResult{
@@ -76,8 +85,14 @@ func (m *LexicalMatcher) findWithParsed(ctx QueryContext, elements []types.Eleme
 	}
 
 	var candidates []scored
-	for _, el := range elements {
-		if ctx.HasScope && matchesExcludedContext(el, ctx.Exclude) {
+	for i, el := range elements {
+		if i%64 == 0 {
+			if ctx.Err() != nil {
+				break
+			}
+		}
+
+		if queryCtx.HasScope && matchesExcludedContext(el, queryCtx.Exclude) {
 			continue
 		}
 
@@ -85,7 +100,6 @@ func (m *LexicalMatcher) findWithParsed(ctx QueryContext, elements []types.Eleme
 		descTokens := tokenize(composite)
 		score := 0.0
 		if len(parsed.Positive) == 0 {
-			// Negative-only query means "everything except negatives".
 			score = 1.0
 		} else {
 			score = lexicalScoreTokens(parsed.Positive, descTokens, el.Interactive, ef)
@@ -93,15 +107,11 @@ func (m *LexicalMatcher) findWithParsed(ctx QueryContext, elements []types.Eleme
 		}
 
 		if len(parsed.Negative) > 0 {
-			// Debug note: negativeScore reflects how strongly negative tokens match
-			// this element; hasStrongNegativeHit indicates exact/synonym token hit.
 			negativeScore := lexicalScoreTokens(parsed.Negative, descTokens, el.Interactive, ef)
 			switch {
 			case hasStrongNegativeHit(parsed.Negative, descTokens) || negativeScore > 0.7:
-				// Applied penalty: full exclusion.
 				score = 0
 			case negativeScore > 0.4:
-				// Applied penalty: multiplicative down-weight.
 				score *= 1 - negativeScore
 			}
 		}
diff --git a/internal/engine/lexical_test.go b/internal/engine/lexical_test.go
index 109e05b..cbe82fa 100644
--- a/internal/engine/lexical_test.go
+++ b/internal/engine/lexical_test.go
@@ -2,10 +2,12 @@ package engine
 
 import (
 	"context"
-	"github.com/pinchtab/semantic/internal/types"
+	"errors"
 	"math"
 	"strconv"
 	"testing"
+
+	"github.com/pinchtab/semantic/internal/types"
 )
 
 func TestTokenPrefixScore_BtnButton(t *testing.T) {
@@ -643,4 +645,20 @@ func TestLexicalMatcher_EmptyQueryReturnsNoResults(t *testing.T) {
 	}
 }
 
+// Hardening tests
+
+func TestLexicalMatcher_Find_ContextCanceled(t *testing.T) {
+	m := NewLexicalMatcher()
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	_, err := m.Find(ctx, "submit button", []types.ElementDescriptor{
+		{Ref: "e1", Role: "button", Name: "Submit"},
+	}, types.FindOptions{Threshold: 0, TopK: 1})
+
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context canceled error, got %v", err)
+	}
+}
+
 // dummyEmbedder tests
diff --git a/internal/engine/options.go b/internal/engine/options.go
new file mode 100644
index 0000000..15a2ec6
--- /dev/null
+++ b/internal/engine/options.go
@@ -0,0 +1,36 @@
+package engine
+
+import (
+	"math"
+
+	"github.com/pinchtab/semantic/internal/types"
+)
+
+// sanitizeFindOptions returns opts with TopK defaulted (defaultTopK, or 3 when that is
+// not positive) and capped at elementCount, and Threshold passed through sanitizeThreshold.
+func sanitizeFindOptions(opts types.FindOptions, elementCount int, defaultTopK int) types.FindOptions {
+	if opts.TopK <= 0 {
+		opts.TopK = defaultTopK
+		if opts.TopK <= 0 {
+			opts.TopK = 3
+		}
+	}
+	if elementCount >= 0 && elementCount < opts.TopK {
+		opts.TopK = elementCount
+	}
+	opts.Threshold = sanitizeThreshold(opts.Threshold)
+	return opts
+}
+
+// sanitizeThreshold clamps threshold into [0, 1].
+// NaN and ±Inf become 0 rather than being clamped to the nearest bound.
+func sanitizeThreshold(threshold float64) float64 {
+	switch {
+	case math.IsNaN(threshold), math.IsInf(threshold, 0), threshold < 0:
+		return 0
+	case threshold > 1:
+		return 1
+	default:
+		return threshold
+	}
+}
diff --git a/internal/engine/testing_helpers_test.go b/internal/engine/testing_helpers_test.go
index 7c5197c..244578c 100644
--- a/internal/engine/testing_helpers_test.go
+++ b/internal/engine/testing_helpers_test.go
@@ -47,3 +47,16 @@ func (d *dummyEmbedder) hashVec(s string) []float32 {
 	}
 	return vec
 }
+
+func fixedVectors(n, dim int) [][]float32 { // builds n vectors of length dim, each with a single element set to 1
+	if dim <= 0 { // keeps the modulo and make() below valid for a non-positive dim
+		dim = 1
+	}
+	vectors := make([][]float32, n)
+	for i := 0; i < n; i++ {
+		vec := make([]float32, dim)
+		vec[i%dim] = 1 // the set index cycles through 0..dim-1, so vectors repeat every dim entries
+		vectors[i] = vec
+	}
+	return vectors
+}
diff --git a/tests/e2e/cases/16-input-hardening.sh b/tests/e2e/cases/16-input-hardening.sh
new file mode 100755
index 0000000..506c398
--- /dev/null
+++ b/tests/e2e/cases/16-input-hardening.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+CASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${CASE_DIR}/../lib.sh"
+
+echo "  ── Find: Input Hardening / Edge Cases ──"
+
+SNAPSHOT="${ASSETS_DIR}/snapshots/login-page.json"
+
+# Negative threshold (should be clamped to 0)
+set +e
+result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --threshold -0.5 2>&1)
+exit_code=$?
+set -e
+assert_eq "$exit_code" "0" "hardening: negative threshold doesn't crash"
+
+# Threshold > 1 (should be clamped to 1, returning no matches)
+result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --threshold 1.5)
+count=$(echo "$result" | jq '.matches | length')
+assert_eq "$count" "0" "hardening: threshold > 1 returns no matches"
+
+# Zero topk (should use default)
+set +e
+result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --top-k 0 2>&1)
+exit_code=$?
+set -e
+assert_eq "$exit_code" "0" "hardening: zero topk doesn't crash"
+
+# Negative topk (should use default)
+set +e
+result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --top-k -5 2>&1)
+exit_code=$?
+set -e
+assert_eq "$exit_code" "0" "hardening: negative topk doesn't crash"
+
+# Very large topk (should be clamped to element count)
+result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --top-k 10000 --threshold 0)
+count=$(echo "$result" | jq '.matches | length')
+elem_count=$(jq 'length' "$SNAPSHOT")
+if [ "$count" -le "$elem_count" ]; then
+  pass "hardening: large topk clamped to element count"
+else
+  fail "hardening: large topk clamped to element count" "got $count matches, expected <= $elem_count"
+fi
+
+# Custom weights that sum to > 1
+set +e
+result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --lexical-weight 2 --embedding-weight 2 2>&1)
+exit_code=$?
+set -e
+assert_eq "$exit_code" "0" "hardening: weights > 1 don't crash"
+
+# Verify scores are still bounded [0,1] with extreme weights
+best_score=$(echo "$result" | jq '.best_score')
+if awk "BEGIN {exit !($best_score >= 0 && $best_score <= 1)}"; then
+  pass "hardening: scores bounded with extreme weights"
+else
+  fail "hardening: scores bounded with extreme weights" "got score $best_score"
+fi
+
+summary "input-hardening"

From 913191489b1e7fc951dd58c37acad91a931338a6 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Thu, 23 Apr 2026 22:58:12 +0100
Subject: [PATCH 16/30] test: build fresh CLI for local e2e hardening checks

---
 tests/e2e/cases/16-input-hardening.sh | 13 +++++++++----
 tests/e2e/lib.sh                      | 14 ++++++++++++++
 tests/e2e/run.sh                      |  4 +++-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/tests/e2e/cases/16-input-hardening.sh b/tests/e2e/cases/16-input-hardening.sh
index 506c398..3ebc4c2 100755
--- a/tests/e2e/cases/16-input-hardening.sh
+++ b/tests/e2e/cases/16-input-hardening.sh
@@ -50,11 +50,16 @@ set -e
 assert_eq "$exit_code" "0" "hardening: weights > 1 don't crash"
 
 # Verify scores are still bounded [0,1] with extreme weights
-best_score=$(echo "$result" | jq '.best_score')
-if awk "BEGIN {exit !($best_score >= 0 && $best_score <= 1)}"; then
-  pass "hardening: scores bounded with extreme weights"
+if [ "$exit_code" != "0" ]; then
+  fail "hardening: scores bounded with extreme weights" "semantic find failed: $result"
+elif best_score=$(echo "$result" | jq -er '.best_score' 2>/dev/null); then
+  if awk "BEGIN {exit !($best_score >= 0 && $best_score <= 1)}"; then
+    pass "hardening: scores bounded with extreme weights"
+  else
+    fail "hardening: scores bounded with extreme weights" "got score $best_score"
+  fi
 else
-  fail "hardening: scores bounded with extreme weights" "got score $best_score"
+  fail "hardening: scores bounded with extreme weights" "semantic find returned invalid JSON: $result"
 fi
 
 summary "input-hardening"
diff --git a/tests/e2e/lib.sh b/tests/e2e/lib.sh
index c6d1ad0..9c903a0 100755
--- a/tests/e2e/lib.sh
+++ b/tests/e2e/lib.sh
@@ -11,6 +11,20 @@ else
   ASSETS_DIR="${E2E_DIR}/assets"
 fi
 
+if [ "$E2E_DIR" != "/e2e" ] && [ -z "${SEMANTIC_E2E_BOOTSTRAPPED:-}" ]; then  # build once per run; skipped when E2E_DIR is /e2e (presumably the container layout — confirm)
+  REPO_ROOT="$(cd "${E2E_DIR}/../.." && pwd)"
+  if ! command -v go >/dev/null 2>&1; then
+    echo "ERROR: go is required to run local E2E tests" >&2
+    exit 1
+  fi
+  if ! (cd "$REPO_ROOT" && go build -o "${E2E_DIR}/semantic" ./cmd/semantic); then  # subshell keeps the caller's working directory unchanged
+    echo "ERROR: failed to build semantic binary for local E2E tests" >&2
+    exit 1
+  fi
+  export PATH="${E2E_DIR}:$PATH"  # prepend so the freshly built binary is found before any other semantic on PATH
+  export SEMANTIC_E2E_BOOTSTRAPPED=1  # exported so child scripts that source lib.sh again skip the rebuild
+fi
+
 PASSED=0
 FAILED=0
 ERRORS=""
diff --git a/tests/e2e/run.sh b/tests/e2e/run.sh
index 1af644c..29de4ea 100755
--- a/tests/e2e/run.sh
+++ b/tests/e2e/run.sh
@@ -7,6 +7,9 @@ echo "  semantic E2E tests"
 echo "═══════════════════════════════════════════════════"
 echo ""
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/lib.sh"
+
 # Verify binary is available
 if ! command -v semantic &>/dev/null; then
   echo "ERROR: semantic binary not found"
@@ -24,7 +27,6 @@ run_suite() {
 }
 
 # Run all test suites
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 for suite in "${SCRIPT_DIR}"/cases/*.sh; do
   [ -f "$suite" ] || continue
   echo ""

From 1a481aa08a5694211764773ad1044e5db80b46cc Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 11:03:34 +0100
Subject: [PATCH 17/30] chore: expand benchmark corpus and add tuning tools

Add baseline management scripts (create/check/update), threshold
calibration, and runtime baseline tracking. Centralize benchmark
config. Update dev tool with ./dev pr command for pre-PR validation.
---
 .gitignore                                    |   3 +-
 dev                                           | 110 ++++++++-
 skills/semantic-dev/SKILL.md                  |  36 +--
 tests/benchmark/baselines/.gitkeep            |   0
 tests/benchmark/config/benchmark.json         |  40 +++-
 .../benchmark/scripts/calibrate-thresholds.sh | 208 ++++++++++++++++++
 tests/benchmark/scripts/check-baseline.sh     | 140 ++++++++++++
 .../scripts/check-runtime-baseline.sh         | 137 ++++++++++++
 tests/benchmark/scripts/create-baseline.sh    |  86 ++++++++
 tests/benchmark/scripts/lint-corpus.sh        |   4 +-
 tests/benchmark/scripts/run-benchmark.sh      |  13 +-
 .../benchmark/scripts/run-corpus-benchmark.sh |  26 ++-
 tests/benchmark/scripts/run-full-benchmark.sh |  13 ++
 tests/benchmark/scripts/tune-weights.sh       |  10 +
 tests/benchmark/scripts/update-baseline.sh    |  70 ++++++
 15 files changed, 855 insertions(+), 41 deletions(-)
 create mode 100644 tests/benchmark/baselines/.gitkeep
 create mode 100755 tests/benchmark/scripts/calibrate-thresholds.sh
 create mode 100755 tests/benchmark/scripts/check-baseline.sh
 create mode 100755 tests/benchmark/scripts/check-runtime-baseline.sh
 create mode 100755 tests/benchmark/scripts/create-baseline.sh
 create mode 100755 tests/benchmark/scripts/update-baseline.sh

diff --git a/.gitignore b/.gitignore
index 2f3b5cc..8a46978 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,5 @@ cover.out
 .claude
 tests/e2e/results/*.txt
 tests/benchmark/results/*.json
-tests/benchmark/results/*.md
\ No newline at end of file
+tests/benchmark/results/*.md
+tests/benchmark/baselines/*.backup.json
\ No newline at end of file
diff --git a/dev b/dev
index dc15e75..215b566 100755
--- a/dev
+++ b/dev
@@ -11,17 +11,26 @@ ERROR=$'\033[38;2;230;57;70m'
 NC=$'\033[0m'
 
 commands=(
+  "pr:🚀:Pre-PR checks (check + e2e + bench)"
   "doctor:🩺:Setup dev environment"
   "test:🧪:Run unit tests"
   "test verbose:🧪:Run unit tests (verbose)"
   "test race:🧪:Run unit tests with race detector"
   "coverage:📊:Run tests with coverage report"
   "lint:🔍:Run golangci-lint"
+  "lint corpus:🔍:Lint benchmark corpus"
   "fmt:✨:Format code"
   "vet:🔬:Run go vet"
   "check:✅:Run all checks (fmt + vet + lint + test)"
   "build:📦:Build CLI binary"
-  "bench:🏋:Run corpus benchmark suite"
+  "bench:🏋:Run corpus benchmark"
+  "bench full:🏋:Run full benchmark suite"
+  "baseline:📏:Create quality baseline"
+  "baseline check:📏:Check against baseline"
+  "baseline update:📏:Update baseline (--accept)"
+  "calibrate:🎯:Calibrate threshold recommendations"
+  "runtime:⏱️:Check runtime baseline"
+  "tune:🎛️:Tune combined weights"
   "e2e:🐳:Run E2E tests (Docker)"
 )
 
@@ -36,6 +45,36 @@ show_help() {
   echo ""
 }
 
+run_pr() {  # Pre-PR gate: checks, E2E, corpus lint, corpus benchmark; reports and stops at the first failing step
+  echo "  ${ACCENT}${BOLD}🚀 Pre-PR checks${NC}"
+  echo ""
+
+  echo "  ${MUTED}1/4 All checks (fmt + vet + lint + test)${NC}"
+  run_check  # left bare on purpose: putting it in a || list would disable errexit inside run_check
+
+  echo ""
+  echo "  ${MUTED}2/4 E2E tests${NC}"
+  if [[ -f tests/e2e/run.sh ]]; then
+    go build -o /tmp/semantic ./cmd/semantic || return 1
+    PATH="/tmp:$PATH" bash tests/e2e/run.sh || { echo "  ${ERROR}✗${NC} E2E failed"; return 1; }
+    echo "  ${SUCCESS}✓${NC} E2E passed"
+  else
+    echo "  ${MUTED}Skipped (no tests/e2e/run.sh)${NC}"
+  fi
+
+  echo ""
+  echo "  ${MUTED}3/4 Lint corpus${NC}"
+  run_lint_corpus || return 1
+
+  echo ""
+  echo "  ${MUTED}4/4 Corpus benchmark${NC}"
+  run_bench > /dev/null 2>&1 || { echo "  ${ERROR}✗${NC} Benchmark failed (re-run ./dev bench for output)"; return 1; }
+  echo "  ${SUCCESS}✓${NC} Benchmark complete"
+
+  echo ""
+  echo "  ${SUCCESS}${BOLD}🚀 Ready for PR${NC}"
+}
+
 run_test() {
   echo "  ${ACCENT}${BOLD}🧪 Running tests${NC}"
   go test ./... -count=1
@@ -115,8 +154,48 @@ run_build() {
 }
 
 run_bench() {
-  echo "  ${ACCENT}${BOLD}⏱️  Running corpus benchmark suite${NC}"
-  bash tests/benchmark/scripts/run-corpus-benchmark.sh
+  echo "  ${ACCENT}${BOLD}🏋 Running corpus benchmark${NC}"
+  bash tests/benchmark/scripts/run-corpus-benchmark.sh "$@"
+}
+
+run_bench_full() {
+  echo "  ${ACCENT}${BOLD}🏋 Running full benchmark suite${NC}"
+  bash tests/benchmark/scripts/run-full-benchmark.sh
+}
+
+run_lint_corpus() {
+  echo "  ${ACCENT}${BOLD}🔍 Linting benchmark corpus${NC}"
+  bash tests/benchmark/scripts/lint-corpus.sh
+}
+
+run_baseline() {
+  echo "  ${ACCENT}${BOLD}📏 Creating quality baseline${NC}"
+  bash tests/benchmark/scripts/create-baseline.sh "$@"
+}
+
+run_baseline_check() {
+  echo "  ${ACCENT}${BOLD}📏 Checking against baseline${NC}"
+  bash tests/benchmark/scripts/check-baseline.sh "$@"
+}
+
+run_baseline_update() {
+  echo "  ${ACCENT}${BOLD}📏 Updating baseline${NC}"
+  bash tests/benchmark/scripts/update-baseline.sh --accept "$@"
+}
+
+run_calibrate() {
+  echo "  ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}"
+  bash tests/benchmark/scripts/calibrate-thresholds.sh "$@"
+}
+
+run_runtime() {
+  echo "  ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}"
+  bash tests/benchmark/scripts/check-runtime-baseline.sh "$@"
+}
+
+run_tune() {
+  echo "  ${ACCENT}${BOLD}🎛️ Tuning combined weights${NC}"
+  bash tests/benchmark/scripts/tune-weights.sh "$@"
 }
 
 run_e2e() {
@@ -129,6 +208,7 @@ run_e2e() {
 }
 
 case "${1:-help}" in
+  pr)        run_pr ;;
   doctor)    exec bash scripts/doctor.sh ;;
   test)
     case "${2:-}" in
@@ -138,12 +218,32 @@ case "${1:-help}" in
     esac
     ;;
   coverage)  run_coverage ;;
-  lint)      run_lint ;;
+  lint)
+    case "${2:-}" in
+      corpus) run_lint_corpus ;;
+      *) run_lint ;;
+    esac
+    ;;
   fmt)       run_fmt ;;
   vet)       run_vet ;;
   check)     run_check ;;
   build)     run_build ;;
-  bench|benchmark) run_bench ;;
+  bench|benchmark)
+    case "${2:-}" in
+      full) run_bench_full ;;
+      *) shift; run_bench "$@" ;;
+    esac
+    ;;
+  baseline)
+    case "${2:-}" in
+      check) shift 2; run_baseline_check "$@" ;;
+      update) shift 2; run_baseline_update "$@" ;;
+      *) shift; run_baseline "$@" ;;
+    esac
+    ;;
+  calibrate) shift; run_calibrate "$@" ;;
+  runtime)   shift; run_runtime "$@" ;;
+  tune)      shift; run_tune "$@" ;;
   e2e)       run_e2e ;;
   help|*)    show_help ;;
 esac
diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index 84ade33..b813297 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -15,22 +15,26 @@ cd ~/dev/semantic
 
 ## Dev Commands
 
-All development commands run via `./dev`:
-
-| Command | Description |
-|---------|-------------|
-| `./dev doctor` | Setup dev environment |
-| `./dev test` | Run unit tests |
-| `./dev test verbose` | Run unit tests (verbose) |
-| `./dev test race` | Run unit tests with race detector |
-| `./dev coverage` | Run tests with coverage report |
-| `./dev lint` | Run golangci-lint |
-| `./dev fmt` | Format code |
-| `./dev vet` | Run go vet |
-| `./dev check` | All checks (fmt + vet + lint + test) |
-| `./dev build` | Build CLI binary |
-| `./dev bench` | Run corpus benchmark suite |
-| `./dev e2e` | Run E2E tests (Docker) |
+```bash
+# Before opening a PR (runs all checks + e2e + benchmark)
+./dev pr
+
+# Quick iteration
+./dev test              # unit tests
+./dev check             # fmt + vet + lint + test race
+
+# Benchmarking
+./dev bench             # corpus benchmark
+./dev baseline          # create baseline (first time)
+./dev baseline check    # check for regressions
+
+# Other
+./dev build             # build ./semantic binary
+./dev e2e               # e2e tests (Docker)
+./dev lint corpus       # validate benchmark data
+./dev calibrate         # find optimal thresholds
+./dev tune              # grid-search weights
+```
 
 ## Architecture
 
diff --git a/tests/benchmark/baselines/.gitkeep b/tests/benchmark/baselines/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/benchmark/config/benchmark.json b/tests/benchmark/config/benchmark.json
index 23b5661..7b06060 100644
--- a/tests/benchmark/config/benchmark.json
+++ b/tests/benchmark/config/benchmark.json
@@ -1,13 +1,35 @@
 {
-  "version": "1.0.0",
-  "strategies": ["lexical", "embedding", "combined"],
-  "default_strategy": "combined",
-  "default_threshold": 0.3,
-  "default_top_k": 3,
-  "metrics": {
-    "min_accuracy": 0.85,
-    "min_avg_score": 0.5,
-    "max_latency_ms": 100
+  "version": "1.1.0",
+  "defaults": {
+    "strategy": "combined",
+    "threshold": 0.01,
+    "top_k": 5,
+    "weights": {
+      "lexical": 0.6,
+      "embedding": 0.4
+    }
+  },
+  "baseline": {
+    "quality": {
+      "max_overall_p_at_1_drop": 0.02,
+      "max_overall_mrr_drop": 0.02,
+      "max_overall_hit_at_3_drop": 0.02,
+      "max_corpus_p_at_1_drop": 0.08,
+      "max_difficulty_p_at_1_drop": 0.08,
+      "max_margin_drop_report": 0.15
+    },
+    "runtime": {
+      "max_ns_op_regression_ratio": 1.25,
+      "max_alloc_regression_ratio": 1.25,
+      "max_corpus_latency_p50_ms": 75,
+      "max_corpus_latency_p95_ms": 200
+    }
   },
+  "results": {
+    "dir": "tests/benchmark/results",
+    "baselines_dir": "tests/benchmark/baselines",
+    "generated_files_policy": "warn"
+  },
+  "strategies": ["lexical", "embedding", "combined"],
   "snapshots_dir": "../e2e/assets/snapshots"
 }
diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh
new file mode 100755
index 0000000..ef5603d
--- /dev/null
+++ b/tests/benchmark/scripts/calibrate-thresholds.sh
@@ -0,0 +1,208 @@
+#!/bin/bash
+#
+# Calibrate threshold recommendations for find and recovery.
+#
+# Usage:
+#   ./calibrate-thresholds.sh [--corpus <dir>]
+#
+# Reports recall/precision/false-positive-rate by threshold.
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+CORPUS_DIR="${BENCHMARK_DIR}/corpus"
+RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read config
+if [[ -f "$CONFIG_FILE" ]]; then
+    STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+    LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
+    EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
+else
+    STRATEGY="combined"
+    LEXICAL_WEIGHT=0.6
+    EMBEDDING_WEIGHT=0.4
+fi
+
+SPECIFIC_CORPUS=""
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+mkdir -p "${RESULTS_DIR}"
+
+# Build semantic binary
+echo "Building semantic..."
+(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
+
+SEMANTIC="${BENCHMARK_DIR}/semantic"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json"
+
+# Thresholds to test
+THRESHOLDS=(0.01 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.60 0.70 0.80 0.90)
+
+echo "Testing ${#THRESHOLDS[@]} thresholds: ${THRESHOLDS[*]}"
+echo ""
+
+# Initialize report
+jq -n \
+    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+    --arg strategy "${STRATEGY}" \
+    '{
+        timestamp: $ts,
+        strategy: $strategy,
+        thresholds: [],
+        recommendations: {}
+    }' > "${REPORT_FILE}"
+
+# Collect results for each threshold
+for thresh in "${THRESHOLDS[@]}"; do
+    echo "Testing threshold: ${thresh}"
+
+    total=0
+    true_positives=0
+    false_positives=0
+    false_negatives=0
+
+    for corpus in "${CORPUS_DIR}"/*/; do
+        [[ -d "$corpus" ]] || continue
+
+        if [[ -n "$SPECIFIC_CORPUS" ]] && [[ "$(basename "$corpus")" != "$SPECIFIC_CORPUS" ]]; then
+            continue
+        fi
+
+        snapshot="${corpus}/snapshot.json"
+        queries="${corpus}/queries.json"
+
+        [[ -f "$snapshot" ]] && [[ -f "$queries" ]] || continue
+
+        count=$(jq length "$queries")
+
+        for i in $(seq 0 $((count - 1))); do
+            query=$(jq -r ".[$i].query" "$queries")
+            relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries")
+
+            result=$("${SEMANTIC}" find "${query}" \
+                --snapshot "${snapshot}" \
+                --strategy "${STRATEGY}" \
+                --threshold "${thresh}" \
+                --top-k 5 \
+                --lexical-weight "${LEXICAL_WEIGHT}" \
+                --embedding-weight "${EMBEDDING_WEIGHT}" \
+                --format json 2>/dev/null) || continue
+
+            best_ref=$(echo "$result" | jq -r '.best_ref // ""')
+            num_matches=$(echo "$result" | jq '.matches | length')
+
+            total=$((total + 1))
+
+            # Check if best match is relevant
+            if [[ -n "$best_ref" ]] && echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
+                true_positives=$((true_positives + 1))
+            elif [[ -n "$best_ref" ]] && [[ "$num_matches" -gt 0 ]]; then
+                false_positives=$((false_positives + 1))
+            fi
+
+            # If no match but there should be one
+            if [[ -z "$best_ref" ]] || [[ "$num_matches" -eq 0 ]]; then
+                rel_count=$(echo "$relevant_refs" | jq 'length')
+                if [[ "$rel_count" -gt 0 ]]; then
+                    false_negatives=$((false_negatives + 1))
+                fi
+            fi
+        done
+    done
+
+    # Calculate metrics
+    if [[ $total -eq 0 ]]; then
+        echo "  No queries processed"
+        continue
+    fi
+
+    precision=0
+    recall=0
+    fpr=0
+
+    if [[ $((true_positives + false_positives)) -gt 0 ]]; then
+        precision=$(echo "scale=4; $true_positives / ($true_positives + $false_positives)" | bc)
+    fi
+
+    if [[ $((true_positives + false_negatives)) -gt 0 ]]; then
+        recall=$(echo "scale=4; $true_positives / ($true_positives + $false_negatives)" | bc)
+    fi
+
+    if [[ $((false_positives + true_positives)) -gt 0 ]]; then
+        fpr=$(echo "scale=4; $false_positives / $total" | bc)
+    fi
+
+    f1=0
+    if (( $(echo "$precision + $recall > 0" | bc -l) )); then
+        f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc)
+    fi
+
+    printf "  Precision: %.3f | Recall: %.3f | FPR: %.3f | F1: %.3f\n" "$precision" "$recall" "$fpr" "$f1"
+
+    # Append to report
+    tmp=$(mktemp)
+    jq --argjson thresh "$thresh" \
+       --argjson total "$total" \
+       --argjson tp "$true_positives" \
+       --argjson fp "$false_positives" \
+       --argjson fn "$false_negatives" \
+       --argjson precision "$precision" \
+       --argjson recall "$recall" \
+       --argjson fpr "$fpr" \
+       --argjson f1 "$f1" \
+       '.thresholds += [{
+           threshold: $thresh,
+           total: $total,
+           true_positives: $tp,
+           false_positives: $fp,
+           false_negatives: $fn,
+           precision: $precision,
+           recall: $recall,
+           false_positive_rate: $fpr,
+           f1: $f1
+       }]' "$REPORT_FILE" > "$tmp"
+    mv "$tmp" "$REPORT_FILE"
+done
+
+# Calculate recommendations
+echo ""
+echo "Calculating recommendations..."
+
+# Best F1 for general find
+BEST_FIND=$(jq -r '[.thresholds[] | select(.f1 > 0)] | max_by(.f1) | .threshold // 0.3' "$REPORT_FILE")
+
+# Best recall with precision >= 0.7 for recovery (prioritize not missing)
+BEST_RECOVERY=$(jq -r '[.thresholds[] | select(.precision >= 0.7)] | max_by(.recall) | .threshold // 0.2' "$REPORT_FILE")
+
+# Update recommendations
+tmp=$(mktemp)
+jq --argjson find "$BEST_FIND" \
+   --argjson recovery "$BEST_RECOVERY" \
+   '.recommendations = {
+       find: $find,
+       recovery: $recovery,
+       note: "find optimizes F1; recovery optimizes recall with precision >= 0.7"
+   }' "$REPORT_FILE" > "$tmp"
+mv "$tmp" "$REPORT_FILE"
+
+# Cleanup
+rm -f "${BENCHMARK_DIR}/semantic"
+
+echo ""
+echo "================================================"
+echo "  THRESHOLD CALIBRATION RESULTS"
+echo "================================================"
+echo "  Recommended for Find:     ${BEST_FIND}"
+echo "  Recommended for Recovery: ${BEST_RECOVERY}"
+echo "================================================"
+echo ""
+echo "Report: ${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/check-baseline.sh b/tests/benchmark/scripts/check-baseline.sh
new file mode 100755
index 0000000..f6e95ae
--- /dev/null
+++ b/tests/benchmark/scripts/check-baseline.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+#
+# Check current benchmark results against a baseline.
+#
+# Usage:
+#   ./check-baseline.sh [--baseline <file>] [--fail-on-regression]
+#
+# Exit codes:
+#   0 - No regressions detected
+#   1 - Regressions detected (if --fail-on-regression)
+#   2 - Error (missing files, invalid config)
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+BASELINES_DIR="${BENCHMARK_DIR}/baselines"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+NC='\033[0m'
+
+# Read config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 2
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+MAX_P1_DROP=$(jq -r '.baseline.quality.max_overall_p_at_1_drop // 0.02' "$CONFIG_FILE")
+MAX_MRR_DROP=$(jq -r '.baseline.quality.max_overall_mrr_drop // 0.02' "$CONFIG_FILE")
+MAX_HIT3_DROP=$(jq -r '.baseline.quality.max_overall_hit_at_3_drop // 0.02' "$CONFIG_FILE")
+MAX_CORPUS_P1_DROP=$(jq -r '.baseline.quality.max_corpus_p_at_1_drop // 0.08' "$CONFIG_FILE")
+MAX_MARGIN_DROP=$(jq -r '.baseline.quality.max_margin_drop_report // 0.15' "$CONFIG_FILE")
+
+# Parse args
+BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json"
+FAIL_ON_REGRESSION=false
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --baseline) BASELINE_FILE="$2"; shift 2 ;;
+        --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;;
+        *) echo "Unknown option: $1"; exit 2 ;;
+    esac
+done
+
+if [[ ! -f "$BASELINE_FILE" ]]; then
+    echo "ERROR: Baseline not found: $BASELINE_FILE" >&2
+    echo "Run ./create-baseline.sh first" >&2
+    exit 2
+fi
+
+echo "Checking against baseline: ${BASELINE_FILE}"
+echo "Tolerances: P@1=${MAX_P1_DROP}, MRR=${MAX_MRR_DROP}, Hit@3=${MAX_HIT3_DROP}"
+echo ""
+
+# Run current benchmark
+TEMP_DIR=$(mktemp -d)
+trap 'rm -rf "$TEMP_DIR"' EXIT
+
+"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" > "${TEMP_DIR}/output.log" 2>&1
+
+# Find the latest report
+LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1)
+
+if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then
+    echo "ERROR: Could not find benchmark report" >&2
+    exit 2
+fi
+
+# Compare metrics
+REGRESSIONS=0
+WARNINGS=0
+
+compare_metric() {
+    local name="$1"
+    local baseline_val="$2"
+    local current_val="$3"
+    local max_drop="$4"
+
+    local diff
+    diff=$(echo "scale=4; $current_val - $baseline_val" | bc)
+    local drop
+    drop=$(echo "scale=4; $baseline_val - $current_val" | bc)
+
+    if (( $(echo "$drop > $max_drop" | bc -l) )); then
+        echo -e "${RED}REGRESSION${NC} $name: $baseline_val -> $current_val (drop: $drop, max: $max_drop)"
+        REGRESSIONS=$((REGRESSIONS + 1))
+    elif (( $(echo "$drop > 0" | bc -l) )); then
+        echo -e "${YELLOW}WARNING${NC} $name: $baseline_val -> $current_val (drop: $drop)"
+        WARNINGS=$((WARNINGS + 1))
+    else
+        echo -e "${GREEN}OK${NC} $name: $baseline_val -> $current_val (${diff:0:6})"
+    fi
+}
+
+echo "=== Overall Metrics ==="
+echo ""
+
+BASELINE_MRR=$(jq -r '.metrics.mrr' "$BASELINE_FILE")
+CURRENT_MRR=$(jq -r '.metrics.mrr' "$LATEST_REPORT")
+compare_metric "MRR" "$BASELINE_MRR" "$CURRENT_MRR" "$MAX_MRR_DROP"
+
+BASELINE_P1=$(jq -r '.metrics.p_at_1' "$BASELINE_FILE")
+CURRENT_P1=$(jq -r '.metrics.p_at_1' "$LATEST_REPORT")
+compare_metric "P@1" "$BASELINE_P1" "$CURRENT_P1" "$MAX_P1_DROP"
+
+BASELINE_HIT3=$(jq -r '.metrics.hit_at_3' "$BASELINE_FILE")
+CURRENT_HIT3=$(jq -r '.metrics.hit_at_3' "$LATEST_REPORT")
+compare_metric "Hit@3" "$BASELINE_HIT3" "$CURRENT_HIT3" "$MAX_HIT3_DROP"
+
+BASELINE_MARGIN=$(jq -r '.metrics.avg_margin' "$BASELINE_FILE")
+CURRENT_MARGIN=$(jq -r '.metrics.avg_margin' "$LATEST_REPORT")
+compare_metric "Margin" "$BASELINE_MARGIN" "$CURRENT_MARGIN" "$MAX_MARGIN_DROP"
+
+echo ""
+echo "=== Per-Corpus ==="
+echo ""
+
+for corpus in $(jq -r '.by_corpus | keys[]' "$BASELINE_FILE"); do
+    BASELINE_CORPUS_P1=$(jq -r ".by_corpus[\"$corpus\"].p_at_1 // 0" "$BASELINE_FILE")
+    CURRENT_CORPUS_P1=$(jq -r ".metrics.by_corpus[\"$corpus\"].p_at_1 // 0" "$LATEST_REPORT")
+    compare_metric "$corpus P@1" "$BASELINE_CORPUS_P1" "$CURRENT_CORPUS_P1" "$MAX_CORPUS_P1_DROP"
+done
+
+echo ""
+echo "================================================"
+if [[ $REGRESSIONS -gt 0 ]]; then
+    echo -e "${RED}REGRESSIONS: $REGRESSIONS${NC}"
+    if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
+        exit 1
+    fi
+elif [[ $WARNINGS -gt 0 ]]; then
+    echo -e "${YELLOW}WARNINGS: $WARNINGS (no regressions)${NC}"
+else
+    echo -e "${GREEN}ALL CHECKS PASSED${NC}"
+fi
+echo "================================================"
diff --git a/tests/benchmark/scripts/check-runtime-baseline.sh b/tests/benchmark/scripts/check-runtime-baseline.sh
new file mode 100755
index 0000000..75bc4fc
--- /dev/null
+++ b/tests/benchmark/scripts/check-runtime-baseline.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+#
+# Check Go benchmark results against runtime baseline.
+#
+# Usage:
+#   ./check-runtime-baseline.sh [--fail-on-regression]
+#
+# Runs Go benchmarks and compares against saved baseline.
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+BASELINES_DIR="${BENCHMARK_DIR}/baselines"
+RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+PROJECT_ROOT="${BENCHMARK_DIR}/../.."
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+NC='\033[0m'
+
+# Read tolerances from config
+if [[ -f "$CONFIG_FILE" ]]; then
+    MAX_NS_RATIO=$(jq -r '.baseline.runtime.max_ns_op_regression_ratio // 1.25' "$CONFIG_FILE")
+    MAX_ALLOC_RATIO=$(jq -r '.baseline.runtime.max_alloc_regression_ratio // 1.25' "$CONFIG_FILE")
+else
+    MAX_NS_RATIO=1.25
+    MAX_ALLOC_RATIO=1.25
+fi
+
+# Parse args
+FAIL_ON_REGRESSION=false
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+mkdir -p "${RESULTS_DIR}"
+mkdir -p "${BASELINES_DIR}"
+
+BASELINE_FILE="${BASELINES_DIR}/runtime.json"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+REPORT_FILE="${RESULTS_DIR}/runtime_${TIMESTAMP}.json"
+
+echo "Running Go benchmarks..."
+echo ""
+
+# Run benchmarks
+BENCH_OUTPUT=$(mktemp)
+(cd "$PROJECT_ROOT" && go test -bench=. -benchmem ./internal/engine/... 2>&1) | tee "$BENCH_OUTPUT"
+
+# Parse benchmark output into JSON
+echo ""
+echo "Parsing results..."
+
+jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" '{timestamp: $ts, benchmarks: []}' > "$REPORT_FILE"
+
+while IFS= read -r line; do
+    if [[ "$line" =~ ^Benchmark ]]; then
+        # Parse: BenchmarkName-N  iterations  ns/op  bytes/op  allocs/op
+        name=$(echo "$line" | awk '{print $1}' | sed 's/-[0-9]*$//')
+        ns_op=$(echo "$line" | grep -oE '[0-9.]+ ns/op' | awk '{print $1}' || echo "0")
+        bytes_op=$(echo "$line" | grep -oE '[0-9]+ B/op' | awk '{print $1}' || echo "0")
+        allocs_op=$(echo "$line" | grep -oE '[0-9]+ allocs/op' | awk '{print $1}' || echo "0")
+
+        if [[ -n "$ns_op" ]] && [[ "$ns_op" != "0" ]]; then
+            tmp=$(mktemp)
+            jq --arg name "$name" \
+               --argjson ns "$ns_op" \
+               --argjson bytes "${bytes_op:-0}" \
+               --argjson allocs "${allocs_op:-0}" \
+               '.benchmarks += [{name: $name, ns_op: $ns, bytes_op: $bytes, allocs_op: $allocs}]' \
+               "$REPORT_FILE" > "$tmp"
+            mv "$tmp" "$REPORT_FILE"
+        fi
+    fi
+done < "$BENCH_OUTPUT"
+
+rm -f "$BENCH_OUTPUT"
+
+# If no baseline exists, create one
+if [[ ! -f "$BASELINE_FILE" ]]; then
+    echo ""
+    echo "No runtime baseline found. Creating initial baseline..."
+    cp "$REPORT_FILE" "$BASELINE_FILE"
+    echo "Baseline saved to: $BASELINE_FILE"
+    exit 0
+fi
+
+# Compare against baseline
+echo ""
+echo "=== Comparing against baseline ==="
+echo ""
+
+REGRESSIONS=0
+
+for name in $(jq -r '.benchmarks[].name' "$REPORT_FILE"); do  # first(...) // 0 yields 0 even when no entry matches, so benchmarks missing from the baseline reach the NEW branch
+    baseline_ns=$(jq -r --arg n "$name" 'first(.benchmarks[] | select(.name == $n) | .ns_op) // 0' "$BASELINE_FILE")
+    current_ns=$(jq -r --arg n "$name" 'first(.benchmarks[] | select(.name == $n) | .ns_op) // 0' "$REPORT_FILE")
+    baseline_allocs=$(jq -r --arg n "$name" 'first(.benchmarks[] | select(.name == $n) | .allocs_op) // 0' "$BASELINE_FILE")
+    current_allocs=$(jq -r --arg n "$name" 'first(.benchmarks[] | select(.name == $n) | .allocs_op) // 0' "$REPORT_FILE")
+    if [[ "$baseline_ns" == "0" ]] || [[ "$baseline_ns" == "null" ]]; then
+        echo -e "${YELLOW}NEW${NC} $name: ${current_ns} ns/op"
+        continue
+    fi
+    ratio=$(echo "scale=4; $current_ns / $baseline_ns" | bc)
+    # alloc_ratio stays 0 (never a regression) when the baseline made no allocations, avoiding a division by zero
+    alloc_ratio=0; [[ "$baseline_allocs" == "0" ]] || alloc_ratio=$(echo "scale=4; $current_allocs / $baseline_allocs" | bc)
+    if (( $(echo "$ratio > $MAX_NS_RATIO" | bc -l) )); then
+        echo -e "${RED}REGRESSION${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x, max: ${MAX_NS_RATIO}x)"
+        REGRESSIONS=$((REGRESSIONS + 1))
+    elif (( $(echo "$alloc_ratio > $MAX_ALLOC_RATIO" | bc -l) )); then
+        echo -e "${RED}REGRESSION${NC} $name: ${baseline_allocs} -> ${current_allocs} allocs/op (${alloc_ratio}x, max: ${MAX_ALLOC_RATIO}x)"
+        REGRESSIONS=$((REGRESSIONS + 1))
+    elif (( $(echo "$ratio > 1.1" | bc -l) )); then
+        echo -e "${YELLOW}WARNING${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)"
+    else
+        echo -e "${GREEN}OK${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)"
+    fi
+done
+
+echo ""
+echo "================================================"
+if [[ $REGRESSIONS -gt 0 ]]; then
+    echo -e "${RED}RUNTIME REGRESSIONS: $REGRESSIONS${NC}"
+    if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
+        exit 1
+    fi
+else
+    echo -e "${GREEN}NO RUNTIME REGRESSIONS${NC}"
+fi
+echo "================================================"
+echo ""
+echo "Report: ${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/create-baseline.sh b/tests/benchmark/scripts/create-baseline.sh
new file mode 100755
index 0000000..cd4696a
--- /dev/null
+++ b/tests/benchmark/scripts/create-baseline.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+#
+# Create a quality baseline from current corpus benchmark results.
+#
+# Usage:
+#   ./create-baseline.sh [--name <name>]
+#
+# This runs run-corpus-benchmark.sh and saves the results as a baseline.
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+BASELINES_DIR="${BENCHMARK_DIR}/baselines"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read defaults from config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+
+# Parse args
+BASELINE_NAME="${STRATEGY}"
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --name) BASELINE_NAME="$2"; shift 2 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+mkdir -p "${BASELINES_DIR}"
+
+BASELINE_FILE="${BASELINES_DIR}/${BASELINE_NAME}.json"
+
+echo "Creating baseline: ${BASELINE_NAME}"
+echo "Strategy: ${STRATEGY}"
+echo ""
+
+# Run corpus benchmark
+TEMP_DIR=$(mktemp -d)
+trap 'rm -rf "$TEMP_DIR"' EXIT
+
+"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" 2>&1 | tee "${TEMP_DIR}/output.log"
+
+# Find the latest report; "|| true" stops set -e/pipefail aborting on no match so the check below can report it
+LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results/corpus_${STRATEGY}_"*.json 2>/dev/null | head -1 || true)
+
+if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then
+    echo "ERROR: Could not find benchmark report" >&2
+    exit 1
+fi
+
+# Extract baseline data
+jq '{
+    created_at: .benchmark.timestamp,
+    strategy: .benchmark.strategy,
+    threshold: .benchmark.threshold,
+    top_k: .benchmark.top_k,
+    weights: .benchmark.weights,
+    metrics: {
+        total: .metrics.total,
+        mrr: .metrics.mrr,
+        p_at_1: .metrics.p_at_1,
+        p_at_3: .metrics.p_at_3,
+        hit_at_3: .metrics.hit_at_3,
+        hit_at_5: .metrics.hit_at_5,
+        avg_margin: .metrics.avg_margin,
+        latency_p50_ms: .metrics.latency_p50_ms,
+        latency_p95_ms: .metrics.latency_p95_ms
+    },
+    by_difficulty: .metrics.by_difficulty,
+    by_corpus: .metrics.by_corpus,
+    per_query: [.results[] | {id, corpus, difficulty, p_at_1, rr, margin}]
+}' "$LATEST_REPORT" > "$BASELINE_FILE"
+
+echo ""
+echo "================================================"
+echo "  BASELINE CREATED"
+echo "================================================"
+echo "  File: ${BASELINE_FILE}"
+echo ""
+jq -r '"  MRR:     \(.metrics.mrr)\n  P@1:     \(.metrics.p_at_1)\n  Hit@3:   \(.metrics.hit_at_3)\n  Margin:  \(.metrics.avg_margin)"' "$BASELINE_FILE"
+echo "================================================"
diff --git a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh
index 29f81b2..783e546 100755
--- a/tests/benchmark/scripts/lint-corpus.sh
+++ b/tests/benchmark/scripts/lint-corpus.sh
@@ -17,12 +17,12 @@ WARNINGS=0
 
 error() {
     echo -e "${RED}ERROR:${NC} $1"
-    ((ERRORS++))
+    ERRORS=$((ERRORS + 1))
 }
 
 warn() {
     echo -e "${YELLOW}WARN:${NC} $1"
-    ((WARNINGS++))
+    WARNINGS=$((WARNINGS + 1))
 }
 
 ok() {
diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh
index 4ce67d6..29c8a22 100755
--- a/tests/benchmark/scripts/run-benchmark.sh
+++ b/tests/benchmark/scripts/run-benchmark.sh
@@ -19,9 +19,18 @@ CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
 SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
 RESULTS_DIR="${BENCHMARK_DIR}/results"
 
-# Parse args
-STRATEGY="combined"
+# Read defaults from config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
+TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
 CASE_FILE=""
+
+# Parse args (override config)
 while [[ $# -gt 0 ]]; do
     case "$1" in
         --strategy) STRATEGY="$2"; shift 2 ;;
diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh
index b5579bf..53216af 100755
--- a/tests/benchmark/scripts/run-corpus-benchmark.sh
+++ b/tests/benchmark/scripts/run-corpus-benchmark.sh
@@ -17,17 +17,27 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCHMARK_DIR="${SCRIPT_DIR}/.."
 CORPUS_DIR="${BENCHMARK_DIR}/corpus"
 RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
 
-# Parse args
-STRATEGY="combined"
+# Read defaults from config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
+TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
+LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
+EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
 SPECIFIC_CORPUS=""
-TOP_K=5
-LEXICAL_WEIGHT=0.6
-EMBEDDING_WEIGHT=0.4
+
+# Parse args (override config)
 while [[ $# -gt 0 ]]; do
     case "$1" in
         --strategy) STRATEGY="$2"; shift 2 ;;
         --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
+        --threshold) THRESHOLD="$2"; shift 2 ;;
         --top-k) TOP_K="$2"; shift 2 ;;
         --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;;
         --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;;
@@ -54,15 +64,19 @@ REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json"
 jq -n \
     --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
     --arg strategy "${STRATEGY}" \
+    --argjson threshold "${THRESHOLD}" \
     --argjson top_k "${TOP_K}" \
     --argjson lexical_weight "${LEXICAL_WEIGHT}" \
     --argjson embedding_weight "${EMBEDDING_WEIGHT}" \
+    --arg config_file "${CONFIG_FILE}" \
     '{
         benchmark: {
             timestamp: $ts,
             strategy: $strategy,
+            threshold: $threshold,
             top_k: $top_k,
             type: "corpus",
+            config_source: $config_file,
             weights: {
                 lexical: $lexical_weight,
                 embedding: $embedding_weight
@@ -128,7 +142,7 @@ run_corpus() {
         if ! result=$("${SEMANTIC}" find "${query}" \
             --snapshot "${snapshot}" \
             --strategy "${STRATEGY}" \
-            --threshold 0.01 \
+            --threshold "${THRESHOLD}" \
             --top-k "${TOP_K}" \
             --lexical-weight "${LEXICAL_WEIGHT}" \
             --embedding-weight "${EMBEDDING_WEIGHT}" \
diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh
index eadaad7..5c759dc 100755
--- a/tests/benchmark/scripts/run-full-benchmark.sh
+++ b/tests/benchmark/scripts/run-full-benchmark.sh
@@ -10,6 +10,19 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCHMARK_DIR="${SCRIPT_DIR}/.."
 CORPUS_DIR="${BENCHMARK_DIR}/corpus"
 RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read defaults from config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
+TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
+LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
+EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
 
 mkdir -p "${RESULTS_DIR}"
 
diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh
index ef61d88..011b1b2 100755
--- a/tests/benchmark/scripts/tune-weights.sh
+++ b/tests/benchmark/scripts/tune-weights.sh
@@ -10,6 +10,16 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCHMARK_DIR="${SCRIPT_DIR}/.."
 RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read defaults from config (used for threshold/top_k in grid runs)
+if [[ -f "$CONFIG_FILE" ]]; then
+    THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
+    TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
+else
+    THRESHOLD=0.01
+    TOP_K=5
+fi
 
 SPECIFIC_CORPUS=""
 STEP="0.1"
diff --git a/tests/benchmark/scripts/update-baseline.sh b/tests/benchmark/scripts/update-baseline.sh
new file mode 100755
index 0000000..ba93089
--- /dev/null
+++ b/tests/benchmark/scripts/update-baseline.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Update baseline after reviewing regressions.
+#
+# Usage:
+#   ./update-baseline.sh --accept [--baseline <file>]
+#
+# This re-runs the benchmark and overwrites the baseline file.
+# Use after reviewing check-baseline.sh output and confirming
+# the changes are intentional.
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+BASELINES_DIR="${BENCHMARK_DIR}/baselines"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+
+# Parse args
+BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json"
+ACCEPT=false
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --accept) ACCEPT=true; shift ;;
+        --baseline) BASELINE_FILE="$2"; shift 2 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+if [[ "$ACCEPT" != "true" ]]; then
+    echo "Usage: $0 --accept [--baseline <file>]"
+    echo ""
+    echo "This will overwrite the baseline. Run check-baseline.sh first"
+    echo "to review changes before accepting."
+    exit 1
+fi
+
+if [[ ! -f "$BASELINE_FILE" ]]; then
+    echo "Baseline not found: $BASELINE_FILE"
+    echo "Creating new baseline instead..."
+    exec "${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")"
+fi
+
+# Show what will change
+echo "Current baseline: ${BASELINE_FILE}"
+echo ""
+jq -r '"  MRR:   \(.metrics.mrr)\n  P@1:   \(.metrics.p_at_1)\n  Hit@3: \(.metrics.hit_at_3)"' "$BASELINE_FILE"
+echo ""
+echo "Running benchmark to generate new baseline..."
+echo ""
+
+# Backup old baseline
+BACKUP_FILE="${BASELINE_FILE%.json}_$(date +%Y%m%d_%H%M%S).backup.json"
+cp "$BASELINE_FILE" "$BACKUP_FILE"
+echo "Backed up old baseline to: $BACKUP_FILE"
+
+# Create new baseline (overwrites)
+"${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")"
+
+echo ""
+echo "Baseline updated. Old baseline backed up to:"
+echo "  $BACKUP_FILE"

From 6cab0f7f8588a3eb3a38a4947c5b2b8bb8cb9772 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 11:04:00 +0100
Subject: [PATCH 18/30] docs: improve SKILL.md for LLM usage

Add scenario-based command table to help LLM assistants pick
the right dev command for each situation.
---
 skills/semantic-dev/SKILL.md | 51 ++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index b813297..16e70b4 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -5,37 +5,44 @@ description: Develop and contribute to the Semantic project. Use when working on
 
 # Semantic Development
 
-Semantic is a zero-dependency Go library for matching natural language queries against accessibility tree elements.
+Zero-dependency Go library for matching natural language queries against accessibility tree elements.
 
-## Project Location
+## Essential Commands
 
+**Before any PR:**
 ```bash
-cd ~/dev/semantic
+./dev pr                # runs: check + e2e + lint corpus + bench
 ```
 
-## Dev Commands
+**During development:**
+```bash
+./dev test              # unit tests (fast)
+./dev check             # fmt + vet + lint + test race (full validation)
+./dev build             # build ./semantic CLI binary
+```
 
+**Quality regression checks:**
 ```bash
-# Before opening a PR (runs all checks + e2e + benchmark)
-./dev pr
-
-# Quick iteration
-./dev test              # unit tests
-./dev check             # fmt + vet + lint + test race
-
-# Benchmarking
-./dev bench             # corpus benchmark
-./dev baseline          # create baseline (first time)
-./dev baseline check    # check for regressions
-
-# Other
-./dev build             # build ./semantic binary
-./dev e2e               # e2e tests (Docker)
-./dev lint corpus       # validate benchmark data
-./dev calibrate         # find optimal thresholds
-./dev tune              # grid-search weights
+./dev baseline check    # compare quality against baseline
+./dev runtime           # compare performance against baseline
 ```
 
+**When quality changes intentionally:**
+```bash
+./dev baseline update   # accept new quality baseline (after review)
+```
+
+## When to Use Each
+
+| Scenario | Command |
+|----------|---------|
+| Made code changes, quick sanity | `./dev test` |
+| Ready to commit | `./dev check` |
+| Before opening PR | `./dev pr` |
+| Changed scoring/matching logic | `./dev baseline check` |
+| Performance-sensitive changes | `./dev runtime` |
+| Tuning weights | `./dev tune` then `./dev bench` |
+
 ## Architecture
 
 ```

From c69f054a4f588b194ed233b4b9517e97ca3e4702 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 15:37:42 +0100
Subject: [PATCH 19/30] refactor: use Go CLI instead of bash scripts in dev
 tool

Replace bash implementations of bench, lint corpus, and loop commands
with calls to go run ./cmd/semantic-bench. Removes ~100 lines of
duplicate bash logic.
---
 dev                                           |  20 +-
 recovery/benchmark_test.go                    | 250 ++++++++++++
 skills/semantic-dev/SKILL.md                  |  54 +++
 .../benchmark/scripts/calibrate-thresholds.sh | 368 ++++++++++++------
 .../scripts/run-recovery-benchmark.sh         |  42 ++
 5 files changed, 613 insertions(+), 121 deletions(-)
 create mode 100644 recovery/benchmark_test.go
 create mode 100755 tests/benchmark/scripts/run-recovery-benchmark.sh

diff --git a/dev b/dev
index 215b566..a7f6247 100755
--- a/dev
+++ b/dev
@@ -19,6 +19,7 @@ commands=(
   "coverage:📊:Run tests with coverage report"
   "lint:🔍:Run golangci-lint"
   "lint corpus:🔍:Lint benchmark corpus"
+  "lint docs:🔍:Check documentation links"
   "fmt:✨:Format code"
   "vet:🔬:Run go vet"
   "check:✅:Run all checks (fmt + vet + lint + test)"
@@ -32,6 +33,7 @@ commands=(
   "runtime:⏱️:Check runtime baseline"
   "tune:🎛️:Tune combined weights"
   "e2e:🐳:Run E2E tests (Docker)"
+  "loop:🔄:Benchmark loop (bench → compare → report)"
 )
 
 show_help() {
@@ -155,17 +157,22 @@ run_build() {
 
 run_bench() {
   echo "  ${ACCENT}${BOLD}🏋 Running corpus benchmark${NC}"
-  bash tests/benchmark/scripts/run-corpus-benchmark.sh "$@"
+  go run ./cmd/semantic-bench check "$@"
 }
 
 run_bench_full() {
   echo "  ${ACCENT}${BOLD}🏋 Running full benchmark suite${NC}"
-  bash tests/benchmark/scripts/run-full-benchmark.sh
+  go run ./cmd/semantic-bench run -suite=all "$@"
 }
 
 run_lint_corpus() {
   echo "  ${ACCENT}${BOLD}🔍 Linting benchmark corpus${NC}"
-  bash tests/benchmark/scripts/lint-corpus.sh
+  go run ./cmd/semantic-bench lint "$@"
+}
+
+run_lint_docs() {
+  echo "  ${ACCENT}${BOLD}🔍 Checking documentation links${NC}"
+  bash scripts/check-docs-links.sh
 }
 
 run_baseline() {
@@ -207,6 +214,11 @@ run_e2e() {
   bash scripts/e2e.sh
 }
 
+run_loop() {
+  echo "  ${ACCENT}${BOLD}🔄 Benchmark Loop${NC}"
+  go run ./cmd/semantic-bench check -verbose "$@"
+}
+
 case "${1:-help}" in
   pr)        run_pr ;;
   doctor)    exec bash scripts/doctor.sh ;;
@@ -221,6 +233,7 @@ case "${1:-help}" in
   lint)
     case "${2:-}" in
       corpus) run_lint_corpus ;;
+      docs) run_lint_docs ;;
       *) run_lint ;;
     esac
     ;;
@@ -245,5 +258,6 @@ case "${1:-help}" in
   runtime)   shift; run_runtime "$@" ;;
   tune)      shift; run_tune "$@" ;;
   e2e)       run_e2e ;;
+  loop)      shift; run_loop "$@" ;;
   help|*)    show_help ;;
 esac
diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go
new file mode 100644
index 0000000..9670a68
--- /dev/null
+++ b/recovery/benchmark_test.go
@@ -0,0 +1,250 @@
+package recovery
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"testing"
+	"time"
+
+	"github.com/pinchtab/semantic"
+)
+
+type BenchmarkScenario struct {
+	ID            string                       `json:"id"`
+	Name          string                       `json:"name"`
+	Description   string                       `json:"description"`
+	OriginalQuery string                       `json:"original_query"`
+	OriginalRef   string                       `json:"original_ref"`
+	Before        []semantic.ElementDescriptor `json:"before"`
+	After         []semantic.ElementDescriptor `json:"after"`
+	ExpectedRef   *string                      `json:"expected_ref"`
+	ExpectedAlt   []string                     `json:"expected_alt"`
+	ExpectNoMatch bool                         `json:"expect_no_match"`
+	Difficulty    string                       `json:"difficulty"`
+}
+
+// loadScenarios reads and decodes the recovery scenarios JSON, failing the test on any error.
+func loadScenarios(t *testing.T) []BenchmarkScenario {
+	t.Helper() // attribute Fatalf failures to the calling test, not this helper
+	_, thisFile, _, _ := runtime.Caller(0)
+	repoRoot := filepath.Join(filepath.Dir(thisFile), "..")
+	scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json")
+	data, err := os.ReadFile(scenariosPath)
+	if err != nil {
+		t.Fatalf("failed to read scenarios: %v", err)
+	}
+
+	var scenarios []BenchmarkScenario
+	if err := json.Unmarshal(data, &scenarios); err != nil {
+		t.Fatalf("failed to parse scenarios: %v", err)
+	}
+	return scenarios
+}
+
+func TestRecoveryBenchmark_Scenarios(t *testing.T) {
+	scenarios := loadScenarios(t)
+	matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128))
+
+	passed, failed := 0, 0
+
+	for _, sc := range scenarios {
+		t.Run(sc.ID, func(t *testing.T) {
+			result := runBenchmarkScenario(t, matcher, sc)
+
+			if result.pass {
+				passed++
+				t.Logf("PASS: recovered=%v got=%s expected=%s score=%.3f",
+					result.recovered, result.gotRef, result.expectedRef, result.score)
+			} else {
+				failed++
+				t.Errorf("FAIL: recovered=%v got=%s expected=%s score=%.3f error=%s",
+					result.recovered, result.gotRef, result.expectedRef, result.score, result.err)
+			}
+		})
+	}
+
+	t.Logf("Summary: %d passed, %d failed out of %d scenarios", passed, failed, len(scenarios))
+}
+
+type scenarioResult struct {
+	pass        bool
+	recovered   bool
+	gotRef      string
+	expectedRef string
+	score       float64
+	confidence  string
+	latencyMs   int64
+	err         string
+}
+
+func runBenchmarkScenario(t *testing.T, matcher semantic.ElementMatcher, sc BenchmarkScenario) scenarioResult {
+	result := scenarioResult{}
+
+	if sc.ExpectedRef != nil {
+		result.expectedRef = *sc.ExpectedRef
+	}
+
+	var origDesc semantic.ElementDescriptor
+	for _, d := range sc.Before {
+		if d.Ref == sc.OriginalRef {
+			origDesc = d
+			break
+		}
+	}
+
+	cache := NewIntentCache(100, 5*time.Minute)
+	cache.Store("test-tab", sc.OriginalRef, IntentEntry{
+		Query:      sc.OriginalQuery,
+		Descriptor: origDesc,
+		Score:      0.95,
+		Confidence: "high",
+		Strategy:   "combined",
+	})
+
+	re := NewRecoveryEngine(
+		DefaultRecoveryConfig(),
+		matcher,
+		cache,
+		func(_ context.Context, _ string) error { return nil },
+		func(_, ref string) (int64, bool) {
+			for i, d := range sc.After {
+				if d.Ref == ref {
+					return int64(1000 + i), true
+				}
+			}
+			return 0, false
+		},
+		func(_ string) []semantic.ElementDescriptor { return sc.After },
+	)
+
+	start := time.Now()
+
+	err := fmt.Errorf("could not find node with id %s", sc.OriginalRef)
+
+	if !re.ShouldAttempt(err, sc.OriginalRef) {
+		result.err = "ShouldAttempt returned false"
+		result.pass = sc.ExpectNoMatch
+		result.latencyMs = time.Since(start).Milliseconds()
+		return result
+	}
+
+	rr, _, recErr := re.AttemptWithClassification(
+		context.Background(),
+		"test-tab",
+		sc.OriginalRef,
+		"click",
+		ClassifyFailure(err),
+		func(_ context.Context, kind string, nodeID int64) (map[string]any, error) {
+			return map[string]any{"clicked": true}, nil
+		},
+	)
+
+	result.latencyMs = time.Since(start).Milliseconds()
+	result.recovered = rr.Recovered
+	result.gotRef = rr.NewRef
+	result.score = rr.Score
+	result.confidence = rr.Confidence
+
+	if recErr != nil {
+		result.err = recErr.Error()
+	}
+
+	if sc.ExpectNoMatch {
+		result.pass = !rr.Recovered
+	} else if sc.ExpectedRef != nil {
+		if rr.NewRef == *sc.ExpectedRef {
+			result.pass = true
+		} else {
+			for _, alt := range sc.ExpectedAlt {
+				if rr.NewRef == alt {
+					result.pass = true
+					break
+				}
+			}
+		}
+	}
+
+	return result
+}
+
+func BenchmarkRecoveryEngine_Scenarios(b *testing.B) {
+	scenarios := loadScenariosB(b)
+	matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128))
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, sc := range scenarios {
+			runBenchmarkScenarioB(b, matcher, sc)
+		}
+	}
+}
+
+// loadScenariosB reads and decodes the recovery scenarios JSON, failing the benchmark on any error.
+func loadScenariosB(b *testing.B) []BenchmarkScenario {
+	b.Helper() // attribute Fatalf failures to the calling benchmark, not this helper
+	_, thisFile, _, _ := runtime.Caller(0)
+	repoRoot := filepath.Join(filepath.Dir(thisFile), "..")
+	scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json")
+	data, err := os.ReadFile(scenariosPath)
+	if err != nil {
+		b.Fatalf("failed to read scenarios: %v", err)
+	}
+
+	var scenarios []BenchmarkScenario
+	if err := json.Unmarshal(data, &scenarios); err != nil {
+		b.Fatalf("failed to parse scenarios: %v", err)
+	}
+	return scenarios
+}
+
+func runBenchmarkScenarioB(b *testing.B, matcher semantic.ElementMatcher, sc BenchmarkScenario) {
+	var origDesc semantic.ElementDescriptor
+	for _, d := range sc.Before {
+		if d.Ref == sc.OriginalRef {
+			origDesc = d
+			break
+		}
+	}
+
+	cache := NewIntentCache(100, 5*time.Minute)
+	cache.Store("test-tab", sc.OriginalRef, IntentEntry{
+		Query:      sc.OriginalQuery,
+		Descriptor: origDesc,
+		Score:      0.95,
+		Confidence: "high",
+		Strategy:   "combined",
+	})
+
+	re := NewRecoveryEngine(
+		DefaultRecoveryConfig(),
+		matcher,
+		cache,
+		func(_ context.Context, _ string) error { return nil },
+		func(_, ref string) (int64, bool) {
+			for i, d := range sc.After {
+				if d.Ref == ref {
+					return int64(1000 + i), true
+				}
+			}
+			return 0, false
+		},
+		func(_ string) []semantic.ElementDescriptor { return sc.After },
+	)
+
+	err := fmt.Errorf("could not find node with id %s", sc.OriginalRef)
+
+	re.AttemptWithClassification(
+		context.Background(),
+		"test-tab",
+		sc.OriginalRef,
+		"click",
+		ClassifyFailure(err),
+		func(_ context.Context, kind string, nodeID int64) (map[string]any, error) {
+			return map[string]any{"clicked": true}, nil
+		},
+	)
+}
diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index 16e70b4..7cbb684 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -90,6 +90,60 @@ cmd/semantic/main.go       CLI tool (find, match, classify)
 
 4. **Pre-commit hook** runs gofmt + golangci-lint automatically on staged files.
 
+## Benchmark Improvement Loop
+
+When implementing changes that affect matching quality, follow this loop:
+
+### Step 1: Ensure baseline exists
+
+```bash
+./dev baseline
+```
+
+Creates `tests/benchmark/baselines/combined.json` if missing.
+
+### Step 2: Implement change
+
+Make one focused improvement at a time.
+
+### Step 3: Run benchmark loop
+
+```bash
+./dev loop
+```
+
+Shows comparison table with deltas:
+- **Green (+)** = improved
+- **Red (-)** = regressed
+- **Gray** = unchanged
+
+### Step 4: Evaluate and decide
+
+| Result | Action |
+|--------|--------|
+| All metrics improved/unchanged | `./dev baseline update` |
+| Mixed (some up, some down) | Investigate tradeoff |
+| Key metrics regressed | Fix before merging |
+
+### Step 5: Iterate
+
+Repeat steps 2-4. Each `baseline update` sets a new goalpost.
+
+### Key metrics
+
+- **MRR** — Mean Reciprocal Rank (higher = finds correct element faster)
+- **P@1** — Precision at 1 (is top result correct?)
+- **Hit@3** — Any correct result in top 3?
+- **Margin** — Score gap between best correct and best wrong
+
+### Adding test cases
+
+When a query should work better:
+
+1. Add to `tests/benchmark/corpus/*/queries.json` or `cases/*.json`
+2. Run `./dev lint corpus`
+3. Run `./dev loop` — benchmark will show regression until fixed
+
 ## Public API Surface
 
 Only these symbols are visible to consumers:
diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh
index ef5603d..84d68d1 100755
--- a/tests/benchmark/scripts/calibrate-thresholds.sh
+++ b/tests/benchmark/scripts/calibrate-thresholds.sh
@@ -1,30 +1,20 @@
 #!/bin/bash
 #
-# Calibrate threshold recommendations for find and recovery.
+# Threshold Calibration Benchmark
+#
+# Calculates optimal thresholds for semantic matching by evaluating
+# recall, precision, and false-positive rates across threshold levels.
 #
 # Usage:
 #   ./calibrate-thresholds.sh [--corpus <dir>]
 #
-# Reports recall/precision/false-positive-rate by threshold.
-#
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCHMARK_DIR="${SCRIPT_DIR}/.."
 CORPUS_DIR="${BENCHMARK_DIR}/corpus"
+CASES_DIR="${BENCHMARK_DIR}/cases"
 RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read config
-if [[ -f "$CONFIG_FILE" ]]; then
-    STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-    LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
-    EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
-else
-    STRATEGY="combined"
-    LEXICAL_WEIGHT=0.6
-    EMBEDDING_WEIGHT=0.4
-fi
 
 SPECIFIC_CORPUS=""
 while [[ $# -gt 0 ]]; do
@@ -45,164 +35,306 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json"
 
 # Thresholds to test
-THRESHOLDS=(0.01 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.60 0.70 0.80 0.90)
-
-echo "Testing ${#THRESHOLDS[@]} thresholds: ${THRESHOLDS[*]}"
-echo ""
+THRESHOLDS=(0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60)
 
 # Initialize report
 jq -n \
     --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg strategy "${STRATEGY}" \
+    --argjson thresholds "$(printf '%s\n' "${THRESHOLDS[@]}" | jq -s '.')" \
     '{
-        timestamp: $ts,
-        strategy: $strategy,
-        thresholds: [],
+        calibration: {
+            timestamp: $ts,
+            thresholds_tested: $thresholds
+        },
+        by_threshold: {},
+        by_tag: {},
         recommendations: {}
     }' > "${REPORT_FILE}"
 
-# Collect results for each threshold
-for thresh in "${THRESHOLDS[@]}"; do
-    echo "Testing threshold: ${thresh}"
+echo ""
+echo "=== Threshold Calibration ==="
+echo "Testing thresholds: ${THRESHOLDS[*]}"
+echo ""
 
-    total=0
-    true_positives=0
-    false_positives=0
-    false_negatives=0
+# Collect all test cases
+declare -a ALL_QUERIES=()
+declare -a ALL_SNAPSHOTS=()
+declare -a ALL_RELEVANT=()
+declare -a ALL_EXPECT_NO_MATCH=()
+declare -a ALL_IDS=()
 
-    for corpus in "${CORPUS_DIR}"/*/; do
-        [[ -d "$corpus" ]] || continue
+load_corpus() {
+    local corpus_path="$1"
+    local snapshot="${corpus_path}/snapshot.json"
+    local queries="${corpus_path}/queries.json"
 
-        if [[ -n "$SPECIFIC_CORPUS" ]] && [[ "$(basename "$corpus")" != "$SPECIFIC_CORPUS" ]]; then
+    if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then
+        return
+    fi
+
+    local count i
+    count=$(jq length "$queries")
+    # C-style loop: BSD/macOS `seq 0 -1` counts down, so an empty file would still iterate.
+    for ((i = 0; i < count; i++)); do
+        local query relevant id expect_no_match
+        id=$(jq -r ".[$i].id" "$queries")
+        query=$(jq -r ".[$i].query" "$queries")
+        relevant=$(jq -c ".[$i].relevant_refs // []" "$queries")
+        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$queries")
+
+        ALL_IDS+=("$id")
+        ALL_QUERIES+=("$query")
+        ALL_SNAPSHOTS+=("$snapshot")
+        ALL_RELEVANT+=("$relevant")
+        ALL_EXPECT_NO_MATCH+=("$expect_no_match")
+    done
+}
+
+load_cases() {
+    local cases_file="$1"
+    local snapshots_dir="${BENCHMARK_DIR}/../e2e/assets/snapshots"
+
+    if [[ ! -f "$cases_file" ]]; then
+        return
+    fi
+
+    local count i
+    count=$(jq length "$cases_file")
+    # C-style loop: BSD/macOS `seq 0 -1` counts down, so an empty file would still iterate.
+    for ((i = 0; i < count; i++)); do
+        local id query snapshot_name expect_no_match expect_ref expect_ref_alt relevant
+        id=$(jq -r ".[$i].id" "$cases_file")
+        query=$(jq -r ".[$i].query" "$cases_file")
+        snapshot_name=$(jq -r ".[$i].snapshot" "$cases_file")
+        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$cases_file")
+        expect_ref=$(jq -r ".[$i].expect_ref // \"\"" "$cases_file")
+        expect_ref_alt=$(jq -c ".[$i].expect_ref_alt // []" "$cases_file")
+
+        if [[ -n "$expect_ref" && "$expect_ref" != "null" ]]; then
+            relevant=$(echo "$expect_ref_alt" | jq --arg r "$expect_ref" '. + [$r]')
+        else
+            relevant="[]"
+        fi
+
+        local snapshot="${snapshots_dir}/${snapshot_name}"
+        if [[ ! -f "$snapshot" ]]; then
             continue
         fi
 
-        snapshot="${corpus}/snapshot.json"
-        queries="${corpus}/queries.json"
+        ALL_IDS+=("$id")
+        ALL_QUERIES+=("$query")
+        ALL_SNAPSHOTS+=("$snapshot")
+        ALL_RELEVANT+=("$relevant")
+        ALL_EXPECT_NO_MATCH+=("$expect_no_match")
+    done
+}
 
-        [[ -f "$snapshot" ]] && [[ -f "$queries" ]] || continue
+echo "Loading test cases..."
+if [[ -n "${SPECIFIC_CORPUS}" ]]; then
+    load_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}"
+else
+    for corpus in "${CORPUS_DIR}"/*/; do
+        [[ -d "$corpus" ]] || continue
+        load_corpus "$corpus"
+    done
+fi
 
-        count=$(jq length "$queries")
+load_cases "${CASES_DIR}/negative-threshold.json"
 
-        for i in $(seq 0 $((count - 1))); do
-            query=$(jq -r ".[$i].query" "$queries")
-            relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries")
+TOTAL_CASES=${#ALL_QUERIES[@]}
+echo "Loaded ${TOTAL_CASES} test cases"
+echo ""
 
-            result=$("${SEMANTIC}" find "${query}" \
-                --snapshot "${snapshot}" \
-                --strategy "${STRATEGY}" \
-                --threshold "${thresh}" \
-                --top-k 5 \
-                --lexical-weight "${LEXICAL_WEIGHT}" \
-                --embedding-weight "${EMBEDDING_WEIGHT}" \
-                --format json 2>/dev/null) || continue
+for threshold in "${THRESHOLDS[@]}"; do
+    echo "Testing threshold ${threshold}..."
 
-            best_ref=$(echo "$result" | jq -r '.best_ref // ""')
-            num_matches=$(echo "$result" | jq '.matches | length')
+    tp=0 fp=0 fn=0 tn=0
 
-            total=$((total + 1))
+    for i in $(seq 0 $((TOTAL_CASES - 1))); do
+        query="${ALL_QUERIES[$i]}"
+        snapshot="${ALL_SNAPSHOTS[$i]}"
+        relevant="${ALL_RELEVANT[$i]}"
+        expect_no_match="${ALL_EXPECT_NO_MATCH[$i]}"
 
-            # Check if best match is relevant
-            if [[ -n "$best_ref" ]] && echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-                true_positives=$((true_positives + 1))
-            elif [[ -n "$best_ref" ]] && [[ "$num_matches" -gt 0 ]]; then
-                false_positives=$((false_positives + 1))
+        result=$("${SEMANTIC}" find "${query}" \
+            --snapshot "${snapshot}" \
+            --strategy combined \
+            --threshold "${threshold}" \
+            --top-k 5 \
+            --format json 2>/dev/null) || result='{"matches":[]}'
+
+        match_count=$(echo "$result" | jq '.matches | length')
+        best_ref=$(echo "$result" | jq -r '.best_ref // ""')
+
+        if [[ "$expect_no_match" == "true" ]]; then
+            if [[ $match_count -eq 0 ]]; then
+                tn=$((tn + 1))
+            else
+                fp=$((fp + 1))
+            fi
+        else
+            relevant_count=$(echo "$relevant" | jq 'length')
+            if [[ $relevant_count -eq 0 ]]; then
+                continue
             fi
 
-            # If no match but there should be one
-            if [[ -z "$best_ref" ]] || [[ "$num_matches" -eq 0 ]]; then
-                rel_count=$(echo "$relevant_refs" | jq 'length')
-                if [[ "$rel_count" -gt 0 ]]; then
-                    false_negatives=$((false_negatives + 1))
-                fi
+            if [[ $match_count -eq 0 ]]; then
+                fn=$((fn + 1))
+            elif echo "$relevant" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
+                tp=$((tp + 1))
+            else
+                fp=$((fp + 1))
             fi
-        done
+        fi
     done
 
-    # Calculate metrics
-    if [[ $total -eq 0 ]]; then
-        echo "  No queries processed"
-        continue
-    fi
-
-    precision=0
-    recall=0
-    fpr=0
+    total_positive=$((tp + fn))
+    total_negative=$((tn + fp))
 
-    if [[ $((true_positives + false_positives)) -gt 0 ]]; then
-        precision=$(echo "scale=4; $true_positives / ($true_positives + $false_positives)" | bc)
+    if [[ $total_positive -gt 0 ]]; then
+        recall=$(echo "scale=4; $tp / $total_positive" | bc)
+    else
+        recall="0"
     fi
 
-    if [[ $((true_positives + false_negatives)) -gt 0 ]]; then
-        recall=$(echo "scale=4; $true_positives / ($true_positives + $false_negatives)" | bc)
+    if [[ $((tp + fp)) -gt 0 ]]; then
+        precision=$(echo "scale=4; $tp / ($tp + $fp)" | bc)
+    else
+        precision="1"
     fi
 
-    if [[ $((false_positives + true_positives)) -gt 0 ]]; then
-        fpr=$(echo "scale=4; $false_positives / $total" | bc)
+    if [[ $total_negative -gt 0 ]]; then
+        fpr=$(echo "scale=4; $fp / $total_negative" | bc)
+    else
+        fpr="0"
     fi
 
-    f1=0
-    if (( $(echo "$precision + $recall > 0" | bc -l) )); then
+    if [[ $(echo "$precision + $recall > 0" | bc) -eq 1 ]]; then
         f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc)
+    else
+        f1="0"
     fi
 
-    printf "  Precision: %.3f | Recall: %.3f | FPR: %.3f | F1: %.3f\n" "$precision" "$recall" "$fpr" "$f1"
+    printf "  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f FPR=%.3f F1=%.3f\n" \
+        "$threshold" "$tp" "$fp" "$fn" "$tn" "$recall" "$precision" "$fpr" "$f1"
 
-    # Append to report
     tmp=$(mktemp)
-    jq --argjson thresh "$thresh" \
-       --argjson total "$total" \
-       --argjson tp "$true_positives" \
-       --argjson fp "$false_positives" \
-       --argjson fn "$false_negatives" \
-       --argjson precision "$precision" \
-       --argjson recall "$recall" \
-       --argjson fpr "$fpr" \
-       --argjson f1 "$f1" \
-       '.thresholds += [{
-           threshold: $thresh,
-           total: $total,
-           true_positives: $tp,
-           false_positives: $fp,
-           false_negatives: $fn,
-           precision: $precision,
-           recall: $recall,
-           false_positive_rate: $fpr,
-           f1: $f1
-       }]' "$REPORT_FILE" > "$tmp"
+    jq --arg t "$threshold" \
+       --argjson tp "$tp" --argjson fp "$fp" --argjson fn "$fn" --argjson tn "$tn" \
+       --argjson recall "$recall" --argjson precision "$precision" \
+       --argjson fpr "$fpr" --argjson f1 "$f1" \
+       '.by_threshold[$t] = {
+           tp: $tp, fp: $fp, fn: $fn, tn: $tn,
+           recall: $recall, precision: $precision,
+           false_positive_rate: $fpr, f1: $f1
+       }' "$REPORT_FILE" > "$tmp"
     mv "$tmp" "$REPORT_FILE"
 done
 
-# Calculate recommendations
 echo ""
 echo "Calculating recommendations..."
 
-# Best F1 for general find
-BEST_FIND=$(jq -r '[.thresholds[] | select(.f1 > 0)] | max_by(.f1) | .threshold // 0.3' "$REPORT_FILE")
+best_f1_threshold="" best_f1=0
+best_recall_threshold="" best_recall=0
+
+for threshold in "${THRESHOLDS[@]}"; do
+    metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE")
+    f1=$(echo "$metrics" | jq -r '.f1')
+    recall=$(echo "$metrics" | jq -r '.recall')
+
+    if (( $(echo "$f1 > $best_f1" | bc -l) )); then
+        best_f1=$f1
+        best_f1_threshold=$threshold
+    fi
+    if (( $(echo "$recall > $best_recall" | bc -l) )); then
+        best_recall=$recall
+        best_recall_threshold=$threshold
+    fi
+done
+
+recovery_threshold=""
+recovery_precision=0
+for threshold in "${THRESHOLDS[@]}"; do
+    metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE")
+    recall=$(echo "$metrics" | jq -r '.recall')
+    precision=$(echo "$metrics" | jq -r '.precision')
+
+    if (( $(echo "$recall >= 0.85" | bc -l) )); then
+        if (( $(echo "$precision > $recovery_precision" | bc -l) )); then
+            recovery_precision=$precision
+            recovery_threshold=$threshold
+        fi
+    fi
+done
+
+if [[ -z "$recovery_threshold" ]]; then
+    recovery_threshold="${THRESHOLDS[0]}"
+fi
 
-# Best recall with precision > 0.8 for recovery (prioritize not missing)
-BEST_RECOVERY=$(jq -r '[.thresholds[] | select(.precision >= 0.7)] | max_by(.recall) | .threshold // 0.2' "$REPORT_FILE")
+default_threshold="$best_f1_threshold"
 
-# Update recommendations
 tmp=$(mktemp)
-jq --argjson find "$BEST_FIND" \
-   --argjson recovery "$BEST_RECOVERY" \
+jq --arg default "$default_threshold" \
+   --arg recovery "$recovery_threshold" \
+   --arg best_f1 "$best_f1_threshold" \
+   --argjson best_f1_val "$best_f1" \
    '.recommendations = {
-       find: $find,
-       recovery: $recovery,
-       note: "find optimizes F1; recovery optimizes recall with precision >= 0.7"
+       default_threshold: $default,
+       recovery_threshold: $recovery,
+       best_f1: { threshold: $best_f1, value: $best_f1_val },
+       notes: "default_threshold optimizes F1. recovery_threshold prioritizes recall (>=85%)."
    }' "$REPORT_FILE" > "$tmp"
 mv "$tmp" "$REPORT_FILE"
 
-# Cleanup
+SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
+
+cat > "${SUMMARY_FILE}" << EOF
+# Threshold Calibration Report
+
+Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)
+
+## Recommendations
+
+| Use Case | Threshold | Rationale |
+|----------|-----------|-----------|
+| **Default (find)** | **${default_threshold}** | Best F1 score (${best_f1}) |
+| **Recovery** | **${recovery_threshold}** | High recall for element recovery |
+
+## Metrics by Threshold
+
+| Threshold | TP | FP | FN | TN | Recall | Precision | FPR | F1 |
+|-----------|----|----|----|----|--------|-----------|-----|-----|
+$(for t in "${THRESHOLDS[@]}"; do
+    m=$(jq -r ".by_threshold[\"$t\"]" "$REPORT_FILE")
+    printf "| %.2f | %d | %d | %d | %d | %.3f | %.3f | %.3f | %.3f |\n" \
+        "$t" \
+        "$(echo "$m" | jq -r '.tp')" \
+        "$(echo "$m" | jq -r '.fp')" \
+        "$(echo "$m" | jq -r '.fn')" \
+        "$(echo "$m" | jq -r '.tn')" \
+        "$(echo "$m" | jq -r '.recall')" \
+        "$(echo "$m" | jq -r '.precision')" \
+        "$(echo "$m" | jq -r '.false_positive_rate')" \
+        "$(echo "$m" | jq -r '.f1')"
+done)
+
+## Trade-offs
+
+- **Lower threshold** (0.10-0.20): High recall, more false positives. Good for recovery.
+- **Medium threshold** (0.25-0.35): Balanced. Good default for find operations.
+- **Higher threshold** (0.40+): High precision, misses weaker matches.
+EOF
+
 rm -f "${BENCHMARK_DIR}/semantic"
 
 echo ""
 echo "================================================"
-echo "  THRESHOLD CALIBRATION RESULTS"
+echo "  THRESHOLD CALIBRATION COMPLETE"
 echo "================================================"
-echo "  Recommended for Find:     ${BEST_FIND}"
-echo "  Recommended for Recovery: ${BEST_RECOVERY}"
+echo "  Test cases:         ${TOTAL_CASES}"
+echo "  Default threshold:  ${default_threshold} (F1=${best_f1})"
+echo "  Recovery threshold: ${recovery_threshold}"
 echo "================================================"
 echo ""
-echo "Report: ${REPORT_FILE}"
+echo "Report:  ${REPORT_FILE}"
+echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/run-recovery-benchmark.sh b/tests/benchmark/scripts/run-recovery-benchmark.sh
new file mode 100755
index 0000000..93fc88a
--- /dev/null
+++ b/tests/benchmark/scripts/run-recovery-benchmark.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Recovery Engine Benchmark
+#
+# Exercises RecoveryEngine directly using before/after snapshots
+# and intent cache entries from recovery scenarios.
+#
+# Usage:
+#   ./run-recovery-benchmark.sh
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+RESULTS_DIR="${BENCHMARK_DIR}/results"
+
+mkdir -p "${RESULTS_DIR}"
+
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+REPORT_FILE="${RESULTS_DIR}/recovery_benchmark_${TIMESTAMP}.txt"
+
+echo "=== Recovery Engine Benchmark ==="
+echo ""
+
+cd "${BENCHMARK_DIR}/../.."
+
+# Run the Go test that exercises RecoveryEngine with scenarios
+echo "Running recovery scenarios..."
+echo ""
+
+go test -v -run TestRecoveryBenchmark_Scenarios ./recovery/ 2>&1 | tee "$REPORT_FILE"
+
+# Also run the Go benchmark for performance
+echo ""
+echo "Running performance benchmark..."
+go test -bench=BenchmarkRecoveryEngine_Scenarios -benchmem ./recovery/ 2>&1 | tee -a "$REPORT_FILE"
+
+echo ""
+echo "================================================"
+echo "  RECOVERY BENCHMARK COMPLETE"
+echo "================================================"
+echo "Report: $REPORT_FILE"

From 201b9c40a3942b431488fe68e88e2e4c7e41346b Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 15:38:06 +0100
Subject: [PATCH 20/30] feat: add semantic-bench CLI for benchmark management

Go CLI with commands: check, run, compare, lint, catalog.
Replaces bash scripts with structured benchmark framework.
---
 cmd/semantic-bench/main.go     | 113 ++++++++
 internal/benchmark/commands.go | 510 +++++++++++++++++++++++++++++++++
 internal/benchmark/config.go   | 247 ++++++++++++++++
 internal/benchmark/dataset.go  | 117 ++++++++
 internal/benchmark/runner.go   | 384 +++++++++++++++++++++++++
 5 files changed, 1371 insertions(+)
 create mode 100644 cmd/semantic-bench/main.go
 create mode 100644 internal/benchmark/commands.go
 create mode 100644 internal/benchmark/config.go
 create mode 100644 internal/benchmark/dataset.go
 create mode 100644 internal/benchmark/runner.go

diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go
new file mode 100644
index 0000000..35bf051
--- /dev/null
+++ b/cmd/semantic-bench/main.go
@@ -0,0 +1,113 @@
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/pinchtab/semantic/internal/benchmark"
+)
+
+const usage = `semantic-bench - Benchmark runner for semantic matching
+
+Usage:
+  semantic-bench <command> [flags]
+
+Commands:
+  check     Run benchmark and compare against baseline (default)
+  run       Run benchmark suites
+  compare   Compare two reports
+  lint      Validate dataset
+  catalog   Print dataset inventory
+
+Flags:
+  -h, --help    Show help
+
+Run 'semantic-bench <command> --help' for command-specific help.
+`
+
+func main() {
+	if len(os.Args) < 2 {
+		runCheck(os.Args[1:])
+		return
+	}
+
+	cmd := os.Args[1]
+	args := os.Args[2:]
+
+	switch cmd {
+	case "check":
+		runCheck(args)
+	case "run":
+		runRun(args)
+	case "compare":
+		runCompare(args)
+	case "lint":
+		runLint(args)
+	case "catalog":
+		runCatalog(args)
+	case "-h", "--help", "help":
+		fmt.Print(usage)
+	default:
+		fmt.Fprintf(os.Stderr, "unknown command: %s\n\n%s", cmd, usage)
+		os.Exit(2)
+	}
+}
+
+func runCheck(args []string) {
+	cfg := benchmark.ParseCheckFlags(args)
+	result, err := benchmark.RunCheck(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintCheckResult(result, cfg)
+	if result.Status == "fail" {
+		os.Exit(1)
+	}
+}
+
+func runRun(args []string) {
+	cfg := benchmark.ParseRunFlags(args)
+	result, err := benchmark.RunBenchmark(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintRunResult(result, cfg)
+}
+
+func runCompare(args []string) {
+	cfg := benchmark.ParseCompareFlags(args)
+	result, err := benchmark.RunCompare(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintCompareResult(result, cfg)
+	if result.Status == "fail" {
+		os.Exit(1)
+	}
+}
+
+func runLint(args []string) {
+	cfg := benchmark.ParseLintFlags(args)
+	result, err := benchmark.RunLint(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintLintResult(result, cfg)
+	if result.Errors > 0 {
+		os.Exit(1)
+	}
+}
+
+func runCatalog(args []string) {
+	cfg := benchmark.ParseCatalogFlags(args)
+	result, err := benchmark.RunCatalog(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintCatalogResult(result, cfg)
+}
diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go
new file mode 100644
index 0000000..ad22ea3
--- /dev/null
+++ b/internal/benchmark/commands.go
@@ -0,0 +1,510 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+)
+
+type CheckResult struct {
+	Status    string        `json:"status"`
+	Summary   CheckSummary  `json:"summary"`
+	Delta     *MetricsDelta `json:"delta,omitempty"`
+	TopRegs   []Regression  `json:"top_regressions,omitempty"`
+	Artifacts Artifacts     `json:"artifacts"`
+	Report    *Report       `json:"-"`
+}
+
+type CheckSummary struct {
+	PAt1        float64 `json:"p_at_1"`
+	MRR         float64 `json:"mrr"`
+	HitAt3      float64 `json:"hit_at_3"`
+	Total       int     `json:"total"`
+	Regressions int     `json:"regressions"`
+	Warnings    int     `json:"warnings"`
+}
+
+type MetricsDelta struct {
+	PAt1   float64 `json:"p_at_1"`
+	MRR    float64 `json:"mrr"`
+	HitAt3 float64 `json:"hit_at_3"`
+}
+
+type Regression struct {
+	ID           string   `json:"id"`
+	Corpus       string   `json:"corpus"`
+	Query        string   `json:"query"`
+	Expected     []string `json:"expected"`
+	BaselineRef  string   `json:"baseline_ref,omitempty"`
+	CurrentRef   string   `json:"current_ref"`
+	Reason       string   `json:"reason"`
+	DebugCommand string   `json:"debug_command"`
+}
+
+type Artifacts struct {
+	ReportJSON string `json:"report_json"`
+	SummaryMD  string `json:"summary_md"`
+}
+
+type CompareResult struct {
+	Status       string       `json:"status"`
+	Delta        MetricsDelta `json:"delta"`
+	Regressions  []Regression `json:"regressions"`
+	Improvements []string     `json:"improvements"`
+}
+
+type LintResult struct {
+	Errors   int      `json:"errors"`
+	Warnings int      `json:"warnings"`
+	Messages []string `json:"messages"`
+}
+
+type CatalogResult struct {
+	Corpora      []CorpusSummary `json:"corpora"`
+	TotalQueries int             `json:"total_queries"`
+	ByTag        map[string]int  `json:"by_tag,omitempty"`
+	ByDifficulty map[string]int  `json:"by_difficulty,omitempty"`
+}
+
+type CorpusSummary struct {
+	ID      string   `json:"id"`
+	Queries int      `json:"queries"`
+	Tags    []string `json:"tags"`
+}
+
+func RunCheck(cfg CheckConfig) (*CheckResult, error) {
+	root := FindBenchmarkRoot()
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	benchCfg, _ := LoadConfig(root)
+	profile := Profile{
+		Strategy:  "combined",
+		Threshold: 0.01,
+		TopK:      5,
+		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
+	}
+	if benchCfg != nil {
+		profile = ResolveProfile(benchCfg, cfg.Profile)
+	}
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        profile.Strategy,
+		Threshold:       profile.Threshold,
+		TopK:            profile.TopK,
+		LexicalWeight:   profile.Weights.Lexical,
+		EmbeddingWeight: profile.Weights.Embedding,
+		Profile:         cfg.Profile,
+		Mode:            "library",
+		Verbose:         cfg.Verbose,
+		Explain:         cfg.Explain,
+		OutputDir:       cfg.OutputDir,
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	result := &CheckResult{
+		Status: "pass",
+		Report: report,
+	}
+	result.Summary.PAt1 = report.Metrics.Overall.PAt1
+	result.Summary.MRR = report.Metrics.Overall.MRR
+	result.Summary.HitAt3 = report.Metrics.Overall.HitAt3
+	result.Summary.Total = report.Metrics.Overall.Total
+
+	// Count misses
+	for _, r := range report.Results {
+		if r.Status == "miss" {
+			result.TopRegs = append(result.TopRegs, Regression{
+				ID:           r.ID,
+				Corpus:       r.Corpus,
+				Query:        r.Query,
+				Expected:     r.Expected.RelevantRefs,
+				CurrentRef:   r.Actual.BestRef,
+				Reason:       "miss",
+				DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID),
+			})
+		}
+	}
+	result.Summary.Regressions = len(result.TopRegs)
+
+	// Compare to baseline if exists
+	baselinePath := cfg.BaselinePath
+	if baselinePath == "" {
+		baselinePath = filepath.Join(root, "baselines", "combined.json")
+	}
+	if _, err := os.Stat(baselinePath); err == nil {
+		baseline, err := loadReport(baselinePath)
+		if err == nil {
+			result.Delta = &MetricsDelta{
+				PAt1:   report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+				MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+			}
+			if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) {
+				result.Status = "fail"
+			}
+		}
+	}
+
+	// Write artifacts
+	os.MkdirAll(cfg.OutputDir, 0755)
+	ts := time.Now().Format("20060102_150405")
+	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
+	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
+
+	reportJSON, _ := json.MarshalIndent(report, "", "  ")
+	os.WriteFile(reportPath, reportJSON, 0644)
+
+	summaryMD := generateSummaryMD(report, result)
+	os.WriteFile(summaryPath, []byte(summaryMD), 0644)
+
+	result.Artifacts.ReportJSON = reportPath
+	result.Artifacts.SummaryMD = summaryPath
+
+	return result, nil
+}
+
+func RunBenchmark(cfg RunConfig) (*Report, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, err
+	}
+	return RunCorpusBenchmark(ds, cfg)
+}
+
+func RunCompare(cfg CompareConfig) (*CompareResult, error) {
+	baseline, err := loadReport(cfg.BaselinePath)
+	if err != nil {
+		return nil, fmt.Errorf("load baseline: %w", err)
+	}
+	current, err := loadReport(cfg.CurrentPath)
+	if err != nil {
+		return nil, fmt.Errorf("load current: %w", err)
+	}
+
+	result := &CompareResult{
+		Status: "pass",
+		Delta: MetricsDelta{
+			PAt1:   current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+			MRR:    current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+			HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+		},
+	}
+
+	if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 {
+		result.Status = "fail"
+	}
+
+	// Find regressions
+	baselineResults := make(map[string]QueryResult)
+	for _, r := range baseline.Results {
+		baselineResults[r.ID] = r
+	}
+	for _, r := range current.Results {
+		if base, ok := baselineResults[r.ID]; ok {
+			if base.Status == "hit" && r.Status != "hit" {
+				result.Regressions = append(result.Regressions, Regression{
+					ID:          r.ID,
+					Corpus:      r.Corpus,
+					Query:       r.Query,
+					BaselineRef: base.Actual.BestRef,
+					CurrentRef:  r.Actual.BestRef,
+					Reason:      fmt.Sprintf("%s -> %s", base.Status, r.Status),
+				})
+			}
+		}
+	}
+
+	return result, nil
+}
+
+func RunLint(cfg LintConfig) (*LintResult, error) {
+	root := FindBenchmarkRoot()
+	result := &LintResult{}
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		result.Errors++
+		result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err))
+		return result, nil
+	}
+
+	// Check for duplicate IDs
+	ids := make(map[string]string)
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if existing, ok := ids[q.ID]; ok {
+				result.Errors++
+				result.Messages = append(result.Messages,
+					fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing))
+			} else {
+				ids[q.ID] = c.ID
+			}
+		}
+	}
+
+	// Check refs exist
+	for _, c := range ds.Corpora {
+		refs := make(map[string]bool)
+		for _, d := range c.Snapshot {
+			refs[d.Ref] = true
+		}
+		for _, q := range c.Queries {
+			for _, r := range q.RelevantRefs {
+				if !refs[r] {
+					result.Errors++
+					result.Messages = append(result.Messages,
+						fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r))
+				}
+			}
+		}
+	}
+
+	// Check difficulty values
+	validDiff := map[string]bool{"easy": true, "medium": true, "hard": true}
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if q.Difficulty != "" && !validDiff[q.Difficulty] {
+				result.Errors++
+				result.Messages = append(result.Messages,
+					fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID))
+			}
+		}
+	}
+
+	if result.Errors == 0 && result.Warnings == 0 {
+		result.Messages = append(result.Messages, "All checks passed")
+	}
+
+	return result, nil
+}
+
+func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, err
+	}
+
+	result := &CatalogResult{
+		ByTag:        make(map[string]int),
+		ByDifficulty: make(map[string]int),
+	}
+
+	for _, c := range ds.Corpora {
+		tags := make(map[string]bool)
+		for _, q := range c.Queries {
+			result.TotalQueries++
+			result.ByDifficulty[q.Difficulty]++
+			for _, t := range q.Tags {
+				tags[t] = true
+				result.ByTag[t]++
+			}
+		}
+		var tagList []string
+		for t := range tags {
+			tagList = append(tagList, t)
+		}
+		sort.Strings(tagList)
+		result.Corpora = append(result.Corpora, CorpusSummary{
+			ID:      c.ID,
+			Queries: len(c.Queries),
+			Tags:    tagList,
+		})
+	}
+
+	return result, nil
+}
+
+func loadReport(path string) (*Report, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var r Report
+	if err := json.Unmarshal(data, &r); err != nil {
+		return nil, err
+	}
+	return &r, nil
+}
+
+func generateSummaryMD(report *Report, result *CheckResult) string {
+	var sb strings.Builder
+
+	sb.WriteString("# Benchmark Summary\n\n")
+	sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp))
+
+	sb.WriteString("## Overall Metrics\n\n")
+	sb.WriteString("| Metric | Value |\n")
+	sb.WriteString("|--------|-------|\n")
+	sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total))
+	sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR))
+	sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1))
+	sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3))
+	sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin))
+
+	if result.Delta != nil {
+		sb.WriteString("\n## Delta from Baseline\n\n")
+		sb.WriteString("| Metric | Delta |\n")
+		sb.WriteString("|--------|-------|\n")
+		sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1))
+		sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR))
+		sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3))
+	}
+	// Misses table: list at most the first ten entries so the summary stays short.
+	if len(result.TopRegs) > 0 {
+		sb.WriteString("\n## Misses\n\n")
+		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
+		sb.WriteString("|----|--------|-------|-----|----------|\n")
+		for i, r := range result.TopRegs {
+			if i >= 10 {
+				break
+			}
+			sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n",
+				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")))
+		}
+	}
+
+	return sb.String()
+}
+
+func PrintCheckResult(result *CheckResult, cfg CheckConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m Benchmark passed\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Benchmark failed\n")
+	}
+	fmt.Printf("\n")
+
+	fmt.Printf("  %-12s %8.4f\n", "MRR", result.Summary.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", result.Summary.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", result.Summary.Total)
+	fmt.Printf("  %-12s %8d\n", "Misses", result.Summary.Regressions)
+
+	if result.Delta != nil {
+		fmt.Printf("\n  Delta from baseline:\n")
+		printDelta("P@1", result.Delta.PAt1)
+		printDelta("MRR", result.Delta.MRR)
+		printDelta("Hit@3", result.Delta.HitAt3)
+	}
+
+	fmt.Printf("\n  Artifacts:\n")
+	fmt.Printf("    Report:  %s\n", result.Artifacts.ReportJSON)
+	fmt.Printf("    Summary: %s\n", result.Artifacts.SummaryMD)
+	fmt.Printf("\n")
+}
+
+func printDelta(name string, delta float64) {
+	color := "\033[0m"
+	sign := ""
+	if delta > 0.001 {
+		color = "\033[32m"
+		sign = "+"
+	} else if delta < -0.001 {
+		color = "\033[31m"
+	}
+	fmt.Printf("    %s%-8s %s%.4f\033[0m\n", color, name, sign, delta)
+}
+
+func PrintRunResult(report *Report, cfg RunConfig) {
+	fmt.Printf("\n")
+	fmt.Printf("  %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", report.Metrics.Overall.Total)
+	fmt.Printf("\n")
+
+	if cfg.Verbose {
+		for _, r := range report.Results {
+			status := "\033[32mHIT \033[0m"
+			switch r.Status {
+			case "miss":
+				status = "\033[31mMISS\033[0m"
+			case "partial":
+				status = "\033[33mPART\033[0m"
+			}
+			fmt.Printf("  [%s] %s | %s | got=%s score=%.3f\n",
+				r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore)
+		}
+	}
+}
+
+func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m No regression\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Regression detected\n")
+	}
+	fmt.Printf("\n")
+	printDelta("P@1", result.Delta.PAt1)
+	printDelta("MRR", result.Delta.MRR)
+	printDelta("Hit@3", result.Delta.HitAt3)
+
+	if len(result.Regressions) > 0 {
+		fmt.Printf("\n  Regressions:\n")
+		for _, r := range result.Regressions {
+			fmt.Printf("    %s: %s (%s)\n", r.ID, r.Reason, r.Query)
+		}
+	}
+	fmt.Printf("\n")
+}
+
+func PrintLintResult(result *LintResult, cfg LintConfig) {
+	for _, msg := range result.Messages {
+		fmt.Println(msg)
+	}
+	fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings)
+}
+
+func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n  Corpora: %d\n", len(result.Corpora))
+	fmt.Printf("  Total Queries: %d\n\n", result.TotalQueries)
+
+	fmt.Printf("  %-30s %8s\n", "Corpus", "Queries")
+	fmt.Printf("  %-30s %8s\n", "------", "-------")
+	for _, c := range result.Corpora {
+		fmt.Printf("  %-30s %8d\n", c.ID, c.Queries)
+	}
+
+	switch cfg.By {
+	case "difficulty":
+		fmt.Printf("\n  By Difficulty:\n")
+		for d, n := range result.ByDifficulty {
+			fmt.Printf("    %-10s %4d\n", d, n)
+		}
+	case "tag":
+		fmt.Printf("\n  By Tag:\n")
+		for t, n := range result.ByTag {
+			fmt.Printf("    %-20s %4d\n", t, n)
+		}
+	}
+	fmt.Printf("\n")
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
new file mode 100644
index 0000000..c8ac10d
--- /dev/null
+++ b/internal/benchmark/config.go
@@ -0,0 +1,247 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"flag"
+	"os"
+	"path/filepath"
+)
+
+type Config struct { // Config is the JSON document that LoadConfig decodes from config/benchmark.json.
+	Version  string             `json:"version"`
+	Defaults DefaultsConfig     `json:"defaults"`
+	Profiles map[string]Profile `json:"profiles"` // keyed by profile name; looked up by ResolveProfile
+	Baseline BaselineConfig     `json:"baseline"`
+}
+
+type DefaultsConfig struct { // DefaultsConfig is the "defaults" section of Config.
+	Profile string `json:"profile"` // presumably the profile name used when none is requested — consumer not visible here
+}
+
+type Profile struct { // Profile is one named benchmark configuration under Config.Profiles.
+	Strategy   string   `json:"strategy"`
+	Threshold  float64  `json:"threshold"`
+	TopK       int      `json:"top_k"`
+	Weights    Weights  `json:"weights"`
+	Suites     []string `json:"suites"`
+	Mode       string   `json:"mode"`
+	Inherits   string   `json:"inherits"` // name of a base profile; ResolveProfile fills zero-valued fields above from it
+	Verbose    bool     `json:"verbose"` // Verbose, Explain and FailOnReg are not inherited by ResolveProfile
+	Explain    bool     `json:"explain"`
+	FailOnReg  bool     `json:"fail_on_regression"`
+}
+
+type Weights struct { // Weights pairs the lexical and embedding weights of a combined run.
+	Lexical   float64 `json:"lexical"`
+	Embedding float64 `json:"embedding"`
+}
+
+type BaselineConfig struct { // BaselineConfig is the "baseline" section of Config.
+	Quality BaselineQuality `json:"quality"`
+	Runtime BaselineRuntime `json:"runtime"`
+}
+
+type BaselineQuality struct { // BaselineQuality holds quality limits; presumably maximum tolerated metric drops — consumer not visible here.
+	MaxOverallPAt1Drop  float64 `json:"max_overall_p_at_1_drop"`
+	MaxOverallMRRDrop   float64 `json:"max_overall_mrr_drop"`
+	MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"`
+	MaxCorpusPAt1Drop   float64 `json:"max_corpus_p_at_1_drop"`
+	MaxTagPAt1Drop      float64 `json:"max_tag_p_at_1_drop"`
+}
+
+type BaselineRuntime struct { // BaselineRuntime holds runtime limits; presumably regression ratios versus a baseline — consumer not visible here.
+	MaxNsOpRegressionRatio   float64 `json:"max_ns_op_regression_ratio"`
+	MaxAllocRegressionRatio  float64 `json:"max_alloc_regression_ratio"`
+}
+
+type CheckConfig struct { // CheckConfig carries the options of the "check" command; filled by ParseCheckFlags.
+	Profile      string // -profile, defaults to "default"
+	BaselinePath string // -baseline, empty when not given
+	OutputDir    string // -out, defaults to <benchmark root>/results
+	Format       string // -format, defaults to "text"
+	FailOnReg    bool   // -fail-on-regression
+	Quick        bool   // -quick
+	Verbose      bool   // -verbose
+	Explain      bool   // -explain
+}
+
+type RunConfig struct { // RunConfig carries the options of one benchmark run; filled by ParseRunFlags or built directly.
+	Suite           string
+	Corpus          string // when non-empty, RunCorpusBenchmark runs only the corpus with this ID
+	QueryID         string // when non-empty, RunCorpusBenchmark runs only queries with this ID
+	Strategy        string // "lexical", "embedding", or anything else for the combined matcher (see createMatcher)
+	Threshold       float64
+	TopK            int
+	LexicalWeight   float64
+	EmbeddingWeight float64
+	Profile         string // appended to the generated report run ID
+	Mode            string
+	Verbose         bool
+	Explain         bool
+	OutputDir       string
+	ReportName      string
+}
+
+type CompareConfig struct { // CompareConfig carries the options of the "compare" command; filled by ParseCompareFlags.
+	BaselinePath string // -baseline
+	CurrentPath  string // -current
+	Format       string // -format, defaults to "text"
+	Verbose      bool   // -verbose
+}
+
+type LintConfig struct { // LintConfig carries the options of the "lint" command; filled by ParseLintFlags.
+	Format  string // -format, defaults to "text"
+	Verbose bool   // -verbose
+}
+
+type CatalogConfig struct { // CatalogConfig carries the options of the "catalog" command; filled by ParseCatalogFlags.
+	Format string // -format, defaults to "table"; "json" switches PrintCatalogResult to JSON output
+	By     string // -by, optional grouping printed after the table
+}
+
+func FindBenchmarkRoot() string { // FindBenchmarkRoot returns <dir>/tests/benchmark for the nearest ancestor of the cwd holding the benchmark config or a go.mod.
+	cwd, _ := os.Getwd() // on failure cwd is "", the walk below is skipped and the relative fallback is returned
+	for d, prev := cwd, ""; d != prev; d, prev = filepath.Dir(d), d { // stop once filepath.Dir reaches a fixed point ("/", "C:\", ".")
+		if _, err := os.Stat(filepath.Join(d, "tests/benchmark/config/benchmark.json")); err == nil {
+			return filepath.Join(d, "tests/benchmark")
+		}
+		if _, err := os.Stat(filepath.Join(d, "go.mod")); err == nil { // a module root also ends the search, even without a config file
+			return filepath.Join(d, "tests/benchmark")
+		}
+	}
+	return filepath.Join(cwd, "tests/benchmark") // nothing found up to the filesystem root: fall back to the cwd
+}
+
+func LoadConfig(benchmarkRoot string) (*Config, error) { // LoadConfig reads and decodes <benchmarkRoot>/config/benchmark.json.
+	path := filepath.Join(benchmarkRoot, "config/benchmark.json")
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err // read error is returned unwrapped
+	}
+	var cfg Config
+	if err := json.Unmarshal(data, &cfg); err != nil {
+		return nil, err // decode error is returned unwrapped
+	}
+	return &cfg, nil
+}
+
+func ResolveProfile(cfg *Config, name string) Profile { // ResolveProfile returns the named profile with unset fields filled from its Inherits chain.
+	p, ok := cfg.Profiles[name]
+	if !ok { // unknown name: return the built-in defaults below instead of an error
+		return Profile{
+			Strategy:  "combined",
+			Threshold: 0.01,
+			TopK:      5,
+			Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
+			Suites:    []string{"corpus"},
+			Mode:      "library",
+		}
+	}
+	if p.Inherits != "" {
+		base := ResolveProfile(cfg, p.Inherits) // NOTE(review): a cyclic inherits chain recurses without bound — no cycle guard here
+		if p.Strategy == "" {
+			p.Strategy = base.Strategy
+		}
+		if p.Threshold == 0 { // zero means "unset", so an explicit 0 cannot override the base value
+			p.Threshold = base.Threshold
+		}
+		if p.TopK == 0 {
+			p.TopK = base.TopK
+		}
+		if p.Weights.Lexical == 0 && p.Weights.Embedding == 0 { // weights are inherited only as a pair, when both are zero
+			p.Weights = base.Weights
+		}
+		if len(p.Suites) == 0 {
+			p.Suites = base.Suites
+		}
+		if p.Mode == "" {
+			p.Mode = base.Mode
+		}
+	}
+	return p // Verbose, Explain and FailOnReg are never taken from the base profile
+}
+
+func ParseCheckFlags(args []string) CheckConfig { // ParseCheckFlags parses the "check" command-line flags into a CheckConfig.
+	fs := flag.NewFlagSet("check", flag.ExitOnError)
+	cfg := CheckConfig{ // values set here double as the flag defaults below
+		Profile:   "default",
+		OutputDir: filepath.Join(FindBenchmarkRoot(), "results"),
+		Format:    "text",
+	}
+	fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile")
+	fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline file path")
+	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (text|json|github)")
+	fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression")
+	fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details")
+	fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations")
+	fs.Parse(args) // return value unused: with flag.ExitOnError a parse failure exits the process
+	return cfg
+}
+
+func ParseRunFlags(args []string) RunConfig { // ParseRunFlags parses the "run" command-line flags into a RunConfig.
+	fs := flag.NewFlagSet("run", flag.ExitOnError)
+	cfg := RunConfig{ // values set here double as the flag defaults below
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Profile:         "default",
+		Mode:            "library",
+		OutputDir:       filepath.Join(FindBenchmarkRoot(), "results"),
+	}
+	fs.StringVar(&cfg.Suite, "suite", cfg.Suite, "suite to run (corpus|recovery|classification|runtime|all)")
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to run")
+	fs.StringVar(&cfg.QueryID, "query", "", "specific query ID to run")
+	fs.StringVar(&cfg.Strategy, "strategy", cfg.Strategy, "matching strategy")
+	fs.Float64Var(&cfg.Threshold, "threshold", cfg.Threshold, "score threshold")
+	fs.IntVar(&cfg.TopK, "top-k", cfg.TopK, "number of results")
+	fs.Float64Var(&cfg.LexicalWeight, "lexical-weight", cfg.LexicalWeight, "lexical weight")
+	fs.Float64Var(&cfg.EmbeddingWeight, "embedding-weight", cfg.EmbeddingWeight, "embedding weight")
+	fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile")
+	fs.StringVar(&cfg.Mode, "mode", cfg.Mode, "execution mode (cli|library|both)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.BoolVar(&cfg.Explain, "explain", false, "include explanations")
+	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
+	fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name")
+	fs.Parse(args) // return value unused: with flag.ExitOnError a parse failure exits the process
+	return cfg
+}
+
+func ParseCompareFlags(args []string) CompareConfig { // ParseCompareFlags parses the "compare" command-line flags into a CompareConfig.
+	fs := flag.NewFlagSet("compare", flag.ExitOnError)
+	cfg := CompareConfig{
+		Format: "text",
+	}
+	fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline report path (required)") // "required" is not enforced in this function
+	fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)")
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args) // return value unused: with flag.ExitOnError a parse failure exits the process
+	return cfg
+}
+
+func ParseLintFlags(args []string) LintConfig { // ParseLintFlags parses the "lint" command-line flags into a LintConfig.
+	fs := flag.NewFlagSet("lint", flag.ExitOnError)
+	cfg := LintConfig{
+		Format: "text",
+	}
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args) // return value unused: with flag.ExitOnError a parse failure exits the process
+	return cfg
+}
+
+func ParseCatalogFlags(args []string) CatalogConfig { // ParseCatalogFlags parses the "catalog" command-line flags into a CatalogConfig.
+	fs := flag.NewFlagSet("catalog", flag.ExitOnError)
+	cfg := CatalogConfig{
+		Format: "table",
+	}
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)")
+	fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)") // NOTE(review): PrintCatalogResult only prints "tag" and "difficulty" breakdowns
+	fs.Parse(args) // return value unused: with flag.ExitOnError a parse failure exits the process
+	return cfg
+}
diff --git a/internal/benchmark/dataset.go b/internal/benchmark/dataset.go
new file mode 100644
index 0000000..555b503
--- /dev/null
+++ b/internal/benchmark/dataset.go
@@ -0,0 +1,117 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+
+	"github.com/pinchtab/semantic"
+)
+
+type Query struct { // Query is one entry of a corpus queries.json file.
+	ID                    string   `json:"id"`
+	QueryText             string   `json:"query"` // text handed to the matcher
+	RelevantRefs          []string `json:"relevant_refs"` // refs that count as full hits in computeQueryMetrics
+	PartiallyRelevantRefs []string `json:"partially_relevant_refs"` // refs that earn half credit in P@1 and P@3
+	Difficulty            string   `json:"difficulty"`
+	Tags                  []string `json:"tags"`
+	Intent                string   `json:"intent,omitempty"`
+	PageType              string   `json:"page_type,omitempty"`
+	Threshold             *float64 `json:"threshold,omitempty"` // when set, overrides the run threshold for this query (see runQuery)
+	TopK                  *int     `json:"top_k,omitempty"` // when set, overrides the run top-k for this query (see runQuery)
+	ExpectNoMatch         bool     `json:"expect_no_match,omitempty"` // marks a negative case: no matches is the expected outcome
+	MinScore              *float64 `json:"min_score,omitempty"`
+	Notes                 string   `json:"notes,omitempty"`
+}
+
+type Corpus struct { // Corpus is one loaded corpus directory: a snapshot plus the queries run against it.
+	ID        string // directory name under <root>/corpus
+	Path      string // full path of the corpus directory
+	Snapshot  []semantic.ElementDescriptor // decoded from snapshot.json
+	Queries   []Query // decoded from queries.json
+}
+
+type Dataset struct { // Dataset is the set of corpora loaded from a benchmark root.
+	Root    string // benchmark root passed to LoadDataset
+	Corpora []Corpus
+}
+
+func LoadDataset(benchmarkRoot string) (*Dataset, error) { // LoadDataset loads every corpus directory found under <benchmarkRoot>/corpus.
+	corpusDir := filepath.Join(benchmarkRoot, "corpus")
+	entries, err := os.ReadDir(corpusDir)
+	if err != nil {
+		return nil, err
+	}
+
+	ds := &Dataset{Root: benchmarkRoot}
+
+	for _, entry := range entries {
+		if !entry.IsDir() { // plain files directly under corpus/ are ignored
+			continue
+		}
+
+		corpusPath := filepath.Join(corpusDir, entry.Name())
+		snapshotPath := filepath.Join(corpusPath, "snapshot.json")
+		queriesPath := filepath.Join(corpusPath, "queries.json")
+
+		if _, err := os.Stat(snapshotPath); os.IsNotExist(err) { // directories lacking either file are skipped silently
+			continue
+		}
+		if _, err := os.Stat(queriesPath); os.IsNotExist(err) {
+			continue
+		}
+
+		corpus, err := loadCorpus(entry.Name(), corpusPath)
+		if err != nil {
+			return nil, err // the first corpus that fails to load aborts the whole dataset
+		}
+
+		ds.Corpora = append(ds.Corpora, *corpus)
+	}
+
+	return ds, nil
+}
+
+func loadCorpus(id, path string) (*Corpus, error) { // loadCorpus decodes snapshot.json and queries.json from path into a Corpus with the given id.
+	snapshotPath := filepath.Join(path, "snapshot.json")
+	queriesPath := filepath.Join(path, "queries.json")
+
+	snapshotData, err := os.ReadFile(snapshotPath)
+	if err != nil {
+		return nil, err
+	}
+
+	var snapshot []semantic.ElementDescriptor // snapshot.json must hold a JSON array
+	if err := json.Unmarshal(snapshotData, &snapshot); err != nil {
+		return nil, err
+	}
+
+	queriesData, err := os.ReadFile(queriesPath)
+	if err != nil {
+		return nil, err
+	}
+
+	var queries []Query // queries.json must hold a JSON array
+	if err := json.Unmarshal(queriesData, &queries); err != nil {
+		return nil, err
+	}
+
+	return &Corpus{
+		ID:       id,
+		Path:     path,
+		Snapshot: snapshot,
+		Queries:  queries,
+	}, nil
+}
+
+func (ds *Dataset) QueryCount() int { // QueryCount returns the number of queries summed over all corpora.
+	count := 0
+	for _, c := range ds.Corpora {
+		count += len(c.Queries)
+	}
+	return count
+}
+
+func (ds *Dataset) CorpusCount() int { // CorpusCount returns the number of loaded corpora.
+	return len(ds.Corpora)
+}
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
new file mode 100644
index 0000000..391cc0a
--- /dev/null
+++ b/internal/benchmark/runner.go
@@ -0,0 +1,384 @@
+package benchmark
+
+import (
+	"context"
+	"time"
+
+	"github.com/pinchtab/semantic"
+)
+
+type QueryResult struct { // QueryResult is the per-query record of a report: inputs, matcher output and derived metrics.
+	ID       string   `json:"id"`
+	Corpus   string   `json:"corpus"`
+	Query    string   `json:"query"`
+	Difficulty string `json:"difficulty"`
+	Tags     []string `json:"tags"`
+	Intent   string   `json:"intent,omitempty"`
+	PageType string   `json:"page_type,omitempty"`
+	Expected struct { // copied from the Query definition by runQuery
+		RelevantRefs          []string `json:"relevant_refs"`
+		PartiallyRelevantRefs []string `json:"partially_relevant_refs"`
+	} `json:"expected"`
+	Actual struct { // copied from the matcher result by runQuery
+		BestRef   string  `json:"best_ref"`
+		BestScore float64 `json:"best_score"`
+		Matches   []Match `json:"matches"`
+	} `json:"actual"`
+	Metrics struct { // filled by computeQueryMetrics
+		RR                float64 `json:"rr"` // 1/rank of the first fully relevant match, 0 if none
+		PAt1              float64 `json:"p_at_1"` // 1.0 relevant top match, 0.5 partially relevant, else 0
+		PAt3              float64 `json:"p_at_3"`
+		HitAt3            int     `json:"hit_at_3"` // 1 when a relevant ref is in the first three matches
+		HitAt5            int     `json:"hit_at_5"` // 1 when a relevant ref is in the first five matches
+		BestRelevantRank  *int    `json:"best_relevant_rank"` // 1-based; nil when no relevant ref is in the first five
+		BestRelevantScore float64 `json:"best_relevant_score"`
+		BestWrongScore    float64 `json:"best_wrong_score"`
+		Margin            float64 `json:"margin"` // BestRelevantScore minus BestWrongScore
+	} `json:"metrics"`
+	Latency struct {
+		LibraryMs int64 `json:"library_ms"` // wall time of the matcher.Find call in runQuery
+		CLIMs     *int64 `json:"cli_ms,omitempty"` // not set anywhere in the visible code
+	} `json:"latency"`
+	Status string `json:"status"` // "hit", "partial", "miss", "no_match_expected" or "unexpected_match"
+}
+
+type Match struct { // Match is one ranked matcher hit as stored in a report.
+	Ref   string  `json:"ref"`
+	Score float64 `json:"score"`
+	Role  string  `json:"role"`
+	Name  string  `json:"name"`
+}
+
+type Report struct { // Report is the full JSON output of one benchmark run.
+	SchemaVersion string `json:"schema_version"`
+	Run           struct {
+		ID        string `json:"id"` // local timestamp plus "-" plus profile (see RunCorpusBenchmark)
+		Timestamp string `json:"timestamp"` // UTC, RFC 3339
+		Tool      string `json:"tool"`
+		GitSHA    string `json:"git_sha,omitempty"`
+		GitDirty  bool   `json:"git_dirty,omitempty"`
+		Command   string `json:"command"`
+	} `json:"run"`
+	Dataset struct {
+		Name         string `json:"name"`
+		Version      string `json:"version,omitempty"`
+		QueryCount   int    `json:"query_count"`
+		CorpusCount  int    `json:"corpus_count"`
+	} `json:"dataset"`
+	Config struct { // echo of the RunConfig values used for the run
+		Profile   string  `json:"profile"`
+		Strategy  string  `json:"strategy"`
+		Threshold float64 `json:"threshold"`
+		TopK      int     `json:"top_k"`
+		Weights   Weights `json:"weights"`
+	} `json:"config"`
+	Status  string `json:"status"`
+	Metrics struct { // filled by aggregateMetrics
+		Overall    OverallMetrics           `json:"overall"`
+		Latency    LatencyMetrics           `json:"latency"`
+		ByCorpus   map[string]CorpusMetrics `json:"by_corpus"`
+		ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"`
+		ByTag      map[string]CorpusMetrics `json:"by_tag"`
+	} `json:"metrics"`
+	Results []QueryResult `json:"results"`
+}
+
+type OverallMetrics struct { // OverallMetrics holds per-query metrics averaged over every result of a run.
+	Total     int     `json:"total"` // number of query results aggregated
+	MRR       float64 `json:"mrr"`
+	PAt1      float64 `json:"p_at_1"`
+	PAt3      float64 `json:"p_at_3"`
+	HitAt3    float64 `json:"hit_at_3"`
+	HitAt5    float64 `json:"hit_at_5"`
+	AvgMargin float64 `json:"avg_margin"`
+}
+
+type LatencyMetrics struct { // LatencyMetrics holds latency percentiles in milliseconds.
+	LibraryP50Ms int64  `json:"library_p50_ms"`
+	LibraryP95Ms int64  `json:"library_p95_ms"`
+	CLIP50Ms     *int64 `json:"cli_p50_ms,omitempty"` // CLI percentiles are not set anywhere in the visible code
+	CLIP95Ms     *int64 `json:"cli_p95_ms,omitempty"`
+}
+
+type CorpusMetrics struct { // CorpusMetrics holds averaged metrics for one group (corpus, difficulty or tag).
+	Count     int     `json:"count"` // number of query results in the group
+	MRR       float64 `json:"mrr"`
+	PAt1      float64 `json:"p_at_1"`
+	HitAt3    float64 `json:"hit_at_3"`
+	AvgMargin float64 `json:"avg_margin"`
+}
+
+func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) { // RunCorpusBenchmark runs every selected query of ds and returns the aggregated report; the error is always nil here.
+	matcher := createMatcher(cfg)
+
+	report := &Report{
+		SchemaVersion: "1.0.0",
+		Status:        "pass",
+	}
+	report.Run.ID = time.Now().Format("20060102-150405") + "-" + cfg.Profile // local time, unlike the UTC Timestamp below
+	report.Run.Timestamp = time.Now().UTC().Format(time.RFC3339)
+	report.Run.Tool = "semantic-bench"
+	report.Dataset.Name = "semantic-ui-matching-corpus"
+	report.Dataset.QueryCount = ds.QueryCount() // whole-dataset counts, independent of the corpus/query filters below
+	report.Dataset.CorpusCount = ds.CorpusCount()
+	report.Config.Profile = cfg.Profile
+	report.Config.Strategy = cfg.Strategy
+	report.Config.Threshold = cfg.Threshold
+	report.Config.TopK = cfg.TopK
+	report.Config.Weights = Weights{Lexical: cfg.LexicalWeight, Embedding: cfg.EmbeddingWeight}
+
+	report.Metrics.ByCorpus = make(map[string]CorpusMetrics)
+	report.Metrics.ByDifficulty = make(map[string]CorpusMetrics)
+	report.Metrics.ByTag = make(map[string]CorpusMetrics)
+
+	var allLatencies []int64
+
+	for _, corpus := range ds.Corpora {
+		if cfg.Corpus != "" && corpus.ID != cfg.Corpus { // optional single-corpus filter
+			continue
+		}
+
+		for _, query := range corpus.Queries {
+			if cfg.QueryID != "" && query.ID != cfg.QueryID { // optional single-query filter
+				continue
+			}
+
+			result := runQuery(matcher, corpus, query, cfg)
+			report.Results = append(report.Results, result)
+			allLatencies = append(allLatencies, result.Latency.LibraryMs)
+		}
+	}
+
+	aggregateMetrics(report, allLatencies)
+	return report, nil
+}
+
+func createMatcher(cfg RunConfig) semantic.ElementMatcher { // createMatcher picks the matcher for cfg.Strategy; no other cfg field is read here.
+	embedder := semantic.NewHashingEmbedder(128) // built even for "lexical", where it goes unused
+	switch cfg.Strategy {
+	case "lexical":
+		return semantic.NewLexicalMatcher()
+	case "embedding":
+		return semantic.NewEmbeddingMatcher(embedder)
+	default: // "combined" and any unrecognised strategy
+		return semantic.NewCombinedMatcher(embedder)
+	}
+}
+
+func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg RunConfig) QueryResult { // runQuery executes one query against the corpus snapshot, times it and scores the outcome.
+	result := QueryResult{
+		ID:         query.ID,
+		Corpus:     corpus.ID,
+		Query:      query.QueryText,
+		Difficulty: query.Difficulty,
+		Tags:       query.Tags,
+		Intent:     query.Intent,
+		PageType:   query.PageType,
+	}
+	result.Expected.RelevantRefs = query.RelevantRefs
+	result.Expected.PartiallyRelevantRefs = query.PartiallyRelevantRefs
+
+	threshold := cfg.Threshold
+	if query.Threshold != nil { // a per-query threshold wins over the run-level one
+		threshold = *query.Threshold
+	}
+	topK := cfg.TopK
+	if query.TopK != nil { // a per-query top-k wins over the run-level one
+		topK = *query.TopK
+	}
+
+	start := time.Now()
+	findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{ // NOTE(review): the Find error is discarded
+		Threshold: threshold,
+		TopK:      topK,
+	})
+	result.Latency.LibraryMs = time.Since(start).Milliseconds() // whole milliseconds; sub-millisecond calls record 0
+
+	result.Actual.BestRef = findResult.BestRef
+	result.Actual.BestScore = findResult.BestScore
+	for _, m := range findResult.Matches { // copy into the report's own Match type, keeping the matcher's order
+		result.Actual.Matches = append(result.Actual.Matches, Match{
+			Ref:   m.Ref,
+			Score: m.Score,
+			Role:  m.Role,
+			Name:  m.Name,
+		})
+	}
+
+	computeQueryMetrics(&result, query)
+	return result
+}
+
+func computeQueryMetrics(result *QueryResult, query Query) { // computeQueryMetrics fills result.Metrics and result.Status from result.Actual.Matches and the query's expected refs.
+	relevantSet := make(map[string]bool)
+	for _, r := range query.RelevantRefs {
+		relevantSet[r] = true
+	}
+	partialSet := make(map[string]bool)
+	for _, r := range query.PartiallyRelevantRefs {
+		partialSet[r] = true
+	}
+
+	// Reciprocal Rank
+	for i, m := range result.Actual.Matches { // scans every match, not only the first five; partial refs do not count
+		if relevantSet[m.Ref] {
+			result.Metrics.RR = 1.0 / float64(i+1)
+			break
+		}
+	}
+
+	// P@1
+	if len(result.Actual.Matches) > 0 {
+		if relevantSet[result.Actual.Matches[0].Ref] {
+			result.Metrics.PAt1 = 1.0
+		} else if partialSet[result.Actual.Matches[0].Ref] { // a partially relevant top match earns half credit
+			result.Metrics.PAt1 = 0.5
+		}
+	}
+
+	// P@3, Hit@3, Hit@5
+	relevantInTop3 := 0
+	partialInTop3 := 0
+	for i, m := range result.Actual.Matches {
+		if i >= 5 { // everything below is computed over the first five matches only
+			break
+		}
+		if relevantSet[m.Ref] {
+			if result.Metrics.BestRelevantRank == nil { // first relevant match seen gives the best (lowest) rank
+				rank := i + 1
+				result.Metrics.BestRelevantRank = &rank
+			}
+			if result.Metrics.BestRelevantScore == 0 || m.Score > result.Metrics.BestRelevantScore {
+				result.Metrics.BestRelevantScore = m.Score
+			}
+			if i < 3 {
+				relevantInTop3++
+				result.Metrics.HitAt3 = 1
+			}
+			result.Metrics.HitAt5 = 1
+		} else if partialSet[m.Ref] { // partial refs feed P@3 only; they are neither a hit nor a wrong match
+			if i < 3 {
+				partialInTop3++
+			}
+		} else {
+			if m.Score > result.Metrics.BestWrongScore { // highest score among matches that are neither relevant nor partial
+				result.Metrics.BestWrongScore = m.Score
+			}
+		}
+	}
+	result.Metrics.PAt3 = (float64(relevantInTop3) + float64(partialInTop3)*0.5) / 3.0 // always divided by 3, even with fewer matches
+	result.Metrics.Margin = result.Metrics.BestRelevantScore - result.Metrics.BestWrongScore // negative when a wrong match outscores the best relevant one
+
+	// Status
+	if query.ExpectNoMatch { // negative cases are judged only on whether anything matched
+		if len(result.Actual.Matches) == 0 {
+			result.Status = "no_match_expected"
+		} else {
+			result.Status = "unexpected_match"
+		}
+	} else if result.Metrics.PAt1 >= 1.0 {
+		result.Status = "hit"
+	} else if result.Metrics.PAt1 >= 0.5 {
+		result.Status = "partial"
+	} else {
+		result.Status = "miss"
+	}
+}
+
+func aggregateMetrics(report *Report, latencies []int64) { // aggregateMetrics averages report.Results into report.Metrics (overall, per group, latency percentiles).
+	n := len(report.Results)
+	if n == 0 { // nothing ran: leave every metric at its zero value
+		return
+	}
+
+	report.Metrics.Overall.Total = n
+
+	var sumRR, sumP1, sumP3, sumHit3, sumHit5, sumMargin float64
+	corpusAgg := make(map[string]*aggregator)
+	diffAgg := make(map[string]*aggregator)
+	tagAgg := make(map[string]*aggregator)
+
+	for _, r := range report.Results {
+		sumRR += r.Metrics.RR
+		sumP1 += r.Metrics.PAt1
+		sumP3 += r.Metrics.PAt3
+		sumHit3 += float64(r.Metrics.HitAt3)
+		sumHit5 += float64(r.Metrics.HitAt5)
+		sumMargin += r.Metrics.Margin
+
+		addToAgg(corpusAgg, r.Corpus, r)
+		addToAgg(diffAgg, r.Difficulty, r)
+		for _, t := range r.Tags { // a result with several tags is counted once under each tag
+			addToAgg(tagAgg, t, r)
+		}
+	}
+
+	report.Metrics.Overall.MRR = sumRR / float64(n)
+	report.Metrics.Overall.PAt1 = sumP1 / float64(n)
+	report.Metrics.Overall.PAt3 = sumP3 / float64(n)
+	report.Metrics.Overall.HitAt3 = sumHit3 / float64(n)
+	report.Metrics.Overall.HitAt5 = sumHit5 / float64(n)
+	report.Metrics.Overall.AvgMargin = sumMargin / float64(n)
+
+	for k, a := range corpusAgg { // writes into maps of report.Metrics; they must already be non-nil
+		report.Metrics.ByCorpus[k] = a.toMetrics()
+	}
+	for k, a := range diffAgg {
+		report.Metrics.ByDifficulty[k] = a.toMetrics()
+	}
+	for k, a := range tagAgg {
+		report.Metrics.ByTag[k] = a.toMetrics()
+	}
+
+	// Latency percentiles
+	if len(latencies) > 0 {
+		sorted := make([]int64, len(latencies)) // sort a copy so the caller's slice keeps its order
+		copy(sorted, latencies)
+		sortInt64(sorted)
+		report.Metrics.Latency.LibraryP50Ms = sorted[len(sorted)*50/100] // index len*p/100 (integer division), always < len
+		report.Metrics.Latency.LibraryP95Ms = sorted[len(sorted)*95/100]
+	}
+}
+
+type aggregator struct { // aggregator accumulates metric sums for one group before averaging in toMetrics.
+	count     int
+	sumRR     float64
+	sumP1     float64
+	sumHit3   float64
+	sumMargin float64
+}
+
+func addToAgg(m map[string]*aggregator, key string, r QueryResult) { // addToAgg adds r's metrics to the aggregator stored under key, creating it on first use.
+	if _, ok := m[key]; !ok {
+		m[key] = &aggregator{}
+	}
+	a := m[key]
+	a.count++
+	a.sumRR += r.Metrics.RR
+	a.sumP1 += r.Metrics.PAt1
+	a.sumHit3 += float64(r.Metrics.HitAt3)
+	a.sumMargin += r.Metrics.Margin
+}
+
+func (a *aggregator) toMetrics() CorpusMetrics { // toMetrics converts the accumulated sums into per-group averages.
+	if a.count == 0 { // avoid dividing by zero for an empty group
+		return CorpusMetrics{}
+	}
+	return CorpusMetrics{
+		Count:     a.count,
+		MRR:       a.sumRR / float64(a.count),
+		PAt1:      a.sumP1 / float64(a.count),
+		HitAt3:    a.sumHit3 / float64(a.count),
+		AvgMargin: a.sumMargin / float64(a.count),
+	}
+}
+
+func sortInt64(s []int64) { // sortInt64 sorts s ascending in place with a pairwise exchange sort (quadratic in len(s)).
+	for i := range s {
+		for j := i + 1; j < len(s); j++ {
+			if s[j] < s[i] { // keep the smallest remaining value at position i
+				s[i], s[j] = s[j], s[i]
+			}
+		}
+	}
+}

From 39729ab3f2b6eb5ef6f6e9aa86bcc6857b78bd37 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 15:38:26 +0100
Subject: [PATCH 21/30] chore: ignore semantic-bench binary

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 8a46978..9a58d8e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Binary
 /semantic
+/semantic-bench
 tests/benchmark/semantic
 tests/e2e/semantic
 *.exe

From 510be9585c3bdaa0c94f127aaf07b91ae186ace0 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 15:48:00 +0100
Subject: [PATCH 22/30] feat: add baseline, calibrate, tune commands to Go CLI

Move benchmark management from bash scripts to Go:
- `semantic-bench baseline create/update` - manage quality baselines
- `semantic-bench calibrate` - threshold optimization via precision/recall
- `semantic-bench tune` - grid-search lexical/embedding weights

Update dev tool to use Go CLI for all benchmark commands.
---
 cmd/semantic-bench/main.go     |  49 ++++-
 dev                            |  10 +-
 internal/benchmark/commands.go | 363 +++++++++++++++++++++++++++++++++
 internal/benchmark/config.go   |  59 ++++++
 4 files changed, 471 insertions(+), 10 deletions(-)

diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go
index 35bf051..4866601 100644
--- a/cmd/semantic-bench/main.go
+++ b/cmd/semantic-bench/main.go
@@ -13,11 +13,14 @@ Usage:
   semantic-bench <command> [flags]
 
 Commands:
-  check     Run benchmark and compare against baseline (default)
-  run       Run benchmark suites
-  compare   Compare two reports
-  lint      Validate dataset
-  catalog   Print dataset inventory
+  check       Run benchmark and compare against baseline (default)
+  run         Run benchmark suites
+  compare     Compare two reports
+  lint        Validate dataset
+  catalog     Print dataset inventory
+  baseline    Manage quality baselines (create, update)
+  calibrate   Find optimal thresholds via precision/recall analysis
+  tune        Grid-search lexical/embedding weights
 
 Flags:
   -h, --help    Show help
@@ -45,6 +48,12 @@ func main() {
 		runLint(args)
 	case "catalog":
 		runCatalog(args)
+	case "baseline":
+		runBaseline(args)
+	case "calibrate":
+		runCalibrate(args)
+	case "tune":
+		runTune(args)
 	case "-h", "--help", "help":
 		fmt.Print(usage)
 	default:
@@ -111,3 +120,33 @@ func runCatalog(args []string) {
 	}
 	benchmark.PrintCatalogResult(result, cfg)
 }
+
+func runBaseline(args []string) { // runBaseline handles the "baseline" subcommand: parse flags, run, print.
+	cfg := benchmark.ParseBaselineFlags(args)
+	result, err := benchmark.RunBaseline(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2) // failures are reported on stderr and exit with status 2
+	}
+	benchmark.PrintBaselineResult(result, cfg)
+}
+
+func runCalibrate(args []string) { // runCalibrate handles the "calibrate" subcommand: parse flags, run, print.
+	cfg := benchmark.ParseCalibrateFlags(args)
+	result, err := benchmark.RunCalibrate(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2) // failures are reported on stderr and exit with status 2
+	}
+	benchmark.PrintCalibrateResult(result, cfg)
+}
+
+func runTune(args []string) { // runTune handles the "tune" subcommand: parse flags, run, print.
+	cfg := benchmark.ParseTuneFlags(args)
+	result, err := benchmark.RunTune(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2) // failures are reported on stderr and exit with status 2
+	}
+	benchmark.PrintTuneResult(result, cfg)
+}
diff --git a/dev b/dev
index a7f6247..da0f70c 100755
--- a/dev
+++ b/dev
@@ -177,22 +177,22 @@ run_lint_docs() {
 
 run_baseline() {
   echo "  ${ACCENT}${BOLD}📏 Creating quality baseline${NC}"
-  bash tests/benchmark/scripts/create-baseline.sh "$@"
+  go run ./cmd/semantic-bench baseline create "$@"
 }
 
 run_baseline_check() {
   echo "  ${ACCENT}${BOLD}📏 Checking against baseline${NC}"
-  bash tests/benchmark/scripts/check-baseline.sh "$@"
+  go run ./cmd/semantic-bench check "$@"
 }
 
 run_baseline_update() {
   echo "  ${ACCENT}${BOLD}📏 Updating baseline${NC}"
-  bash tests/benchmark/scripts/update-baseline.sh --accept "$@"
+  go run ./cmd/semantic-bench baseline update --accept "$@"
 }
 
 run_calibrate() {
   echo "  ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}"
-  bash tests/benchmark/scripts/calibrate-thresholds.sh "$@"
+  go run ./cmd/semantic-bench calibrate -verbose "$@"
 }
 
 run_runtime() {
@@ -202,7 +202,7 @@ run_runtime() {
 
 run_tune() {
   echo "  ${ACCENT}${BOLD}🎛️ Tuning combined weights${NC}"
-  bash tests/benchmark/scripts/tune-weights.sh "$@"
+  go run ./cmd/semantic-bench tune -verbose "$@"
 }
 
 run_e2e() {
diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go
index ad22ea3..7f37ed5 100644
--- a/internal/benchmark/commands.go
+++ b/internal/benchmark/commands.go
@@ -8,6 +8,8 @@ import (
 	"sort"
 	"strings"
 	"time"
+
+	"github.com/pinchtab/semantic"
 )
 
 type CheckResult struct {
@@ -508,3 +510,364 @@ func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
 	}
 	fmt.Printf("\n")
 }
+
+// Baseline management
+
+type BaselineResult struct { // BaselineResult describes the outcome of a baseline create or update.
+	Action   string         `json:"action"` // "create" or "update"
+	Path     string         `json:"path"` // baseline file that was written
+	Metrics  OverallMetrics `json:"metrics"` // overall metrics of the freshly written baseline
+	Previous *OverallMetrics `json:"previous,omitempty"` // metrics of the replaced baseline; nil on create or when the old file could not be read or decoded
+}
+
+func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) { // RunBaseline creates or updates <root>/baselines/<cfg.Name>.json according to cfg.Action.
+	root := FindBenchmarkRoot()
+	baselinesDir := filepath.Join(root, "baselines")
+	if err := os.MkdirAll(baselinesDir, 0755); err != nil { // runs before the action is validated, so the directory is created even for an unknown action
+		return nil, err
+	}
+
+	baselinePath := filepath.Join(baselinesDir, cfg.Name+".json")
+
+	switch cfg.Action {
+	case "create":
+		return createBaseline(root, baselinePath, cfg)
+	case "update":
+		if !cfg.Accept { // updating replaces an existing baseline, so it needs explicit confirmation
+			return nil, fmt.Errorf("use --accept to confirm baseline update")
+		}
+		return updateBaseline(root, baselinePath, cfg)
+	default:
+		return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action)
+	}
+}
+
+func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { // createBaseline runs the corpus benchmark with fixed settings and writes the report to baselinePath; cfg is not read here.
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	runCfg := RunConfig{ // hard-coded run settings; Profile stays empty, so the report run ID ends in "-"
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Mode:            "library",
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	data, err := json.MarshalIndent(report, "", "  ")
+	if err != nil {
+		return nil, err
+	}
+	if err := os.WriteFile(baselinePath, data, 0644); err != nil { // overwrites any existing file at baselinePath
+		return nil, err
+	}
+
+	return &BaselineResult{
+		Action:  "create",
+		Path:    baselinePath,
+		Metrics: report.Metrics.Overall,
+	}, nil
+}
+
+func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { // updateBaseline backs up the existing baseline (if any), regenerates it, and reports the previous metrics.
+	var previous *OverallMetrics
+	if data, err := os.ReadFile(baselinePath); err == nil { // no readable baseline yet: nothing to back up, behaves like create
+		var old Report
+		if json.Unmarshal(data, &old) == nil { // an undecodable old baseline is still backed up, just without Previous
+			previous = &old.Metrics.Overall
+		}
+		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
+		if err := os.WriteFile(backupPath, data, 0644); err != nil { // never overwrite the baseline unless its backup is on disk
+			return nil, fmt.Errorf("backup baseline to %s: %w", backupPath, err)
+		}
+	}
+	result, err := createBaseline(root, baselinePath, cfg)
+	if err != nil {
+		return nil, err
+	}
+	result.Action, result.Previous = "update", previous
+	return result, nil
+}
+
+func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) { // PrintBaselineResult prints the new baseline metrics and, when present, the previous ones; cfg is not read here.
+	fmt.Printf("\n  Baseline %sd: %s\n\n", result.Action, result.Path) // "%sd" turns the action "create"/"update" into "created"/"updated"
+	fmt.Printf("  MRR:    %.4f\n", result.Metrics.MRR)
+	fmt.Printf("  P@1:    %.4f\n", result.Metrics.PAt1)
+	fmt.Printf("  Hit@3:  %.4f\n", result.Metrics.HitAt3)
+
+	if result.Previous != nil { // set only by updateBaseline when the old baseline decoded successfully
+		fmt.Printf("\n  Previous:\n")
+		fmt.Printf("    MRR:    %.4f\n", result.Previous.MRR)
+		fmt.Printf("    P@1:    %.4f\n", result.Previous.PAt1)
+		fmt.Printf("    Hit@3:  %.4f\n", result.Previous.HitAt3)
+	}
+	fmt.Println()
+}
+
+// Threshold calibration
+
+type CalibrateResult struct { // CalibrateResult is the output of RunCalibrate.
+	ByThreshold     map[string]ThresholdMetrics `json:"by_threshold"` // keyed by the threshold formatted with "%.2f"
+	Recommendations CalibrateRecommendations    `json:"recommendations"`
+	TotalCases      int                         `json:"total_cases"` // number of queries evaluated per threshold
+}
+
+type ThresholdMetrics struct { // ThresholdMetrics holds the confusion counts and derived rates for one threshold.
+	TP        int     `json:"tp"` // top match is a relevant ref
+	FP        int     `json:"fp"` // a match was returned for a no-match query, or the top match is not a relevant ref
+	FN        int     `json:"fn"` // nothing matched although relevant refs exist
+	TN        int     `json:"tn"` // nothing matched for a no-match query
+	Recall    float64 `json:"recall"` // TP / (TP + FN)
+	Precision float64 `json:"precision"` // TP / (TP + FP)
+	FPR       float64 `json:"false_positive_rate"` // FP / (TN + FP)
+	F1        float64 `json:"f1"`
+}
+
+type CalibrateRecommendations struct { // CalibrateRecommendations holds the thresholds RunCalibrate suggests.
+	DefaultThreshold  float64 `json:"default_threshold"` // threshold with the highest F1
+	RecoveryThreshold float64 `json:"recovery_threshold"` // most precise threshold with recall >= 0.85, else the first tested threshold
+	BestF1            float64 `json:"best_f1"`
+}
+
+func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) { // RunCalibrate evaluates each threshold in cfg.Thresholds over the dataset and recommends thresholds.
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &CalibrateResult{
+		ByThreshold: make(map[string]ThresholdMetrics),
+	}
+
+	type testCase struct { // one query paired with the corpus it runs against
+		query         Query
+		corpus        *Corpus
+	}
+
+	var cases []testCase
+	for i := range ds.Corpora {
+		corpus := &ds.Corpora[i] // index into the slice so every case points at the dataset's own Corpus
+		if cfg.Corpus != "" && corpus.ID != cfg.Corpus { // optional single-corpus filter
+			continue
+		}
+		for _, q := range corpus.Queries {
+			cases = append(cases, testCase{query: q, corpus: corpus})
+		}
+	}
+	result.TotalCases = len(cases)
+
+	if cfg.Verbose {
+		fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases))
+	}
+
+	runCfg := RunConfig{ // createMatcher reads only Strategy; the other fields have no effect on the matcher here
+		Strategy:        "combined",
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+	}
+	matcher := createMatcher(runCfg)
+
+	var bestF1, bestF1Threshold float64
+	var bestRecallThreshold float64
+	var bestRecallWithPrecision float64
+
+	for _, threshold := range cfg.Thresholds {
+		tp, fp, fn, tn := 0, 0, 0, 0
+
+		for _, tc := range cases {
+			findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{ // NOTE(review): nil context (context docs say use context.TODO()) and the error is discarded
+				Threshold: threshold,
+				TopK:      5,
+			})
+
+			hasMatch := len(findResult.Matches) > 0
+			topRef := ""
+			if hasMatch {
+				topRef = findResult.Matches[0].Ref
+			}
+
+			if tc.query.ExpectNoMatch { // negative case: any returned match is a false positive
+				if hasMatch {
+					fp++
+				} else {
+					tn++
+				}
+			} else if len(tc.query.RelevantRefs) > 0 { // positive case: judged on the top match only
+				if !hasMatch {
+					fn++
+				} else if contains(tc.query.RelevantRefs, topRef) {
+					tp++
+				} else {
+					fp++
+				}
+			} // queries with neither flag nor relevant refs fall through and are not counted
+		}
+
+		totalPos := tp + fn
+		totalNeg := tn + fp
+
+		var recall, precision, fpr, f1 float64 // each rate stays 0 when its denominator is 0
+		if totalPos > 0 {
+			recall = float64(tp) / float64(totalPos)
+		}
+		if tp+fp > 0 {
+			precision = float64(tp) / float64(tp+fp)
+		}
+		if totalNeg > 0 {
+			fpr = float64(fp) / float64(totalNeg)
+		}
+		if precision+recall > 0 {
+			f1 = 2 * precision * recall / (precision + recall)
+		}
+
+		key := fmt.Sprintf("%.2f", threshold) // thresholds that agree to two decimals share a key and overwrite each other
+		result.ByThreshold[key] = ThresholdMetrics{
+			TP: tp, FP: fp, FN: fn, TN: tn,
+			Recall: recall, Precision: precision, FPR: fpr, F1: f1,
+		}
+
+		if f1 > bestF1 { // strict comparison: the first threshold reaching the best F1 wins ties
+			bestF1 = f1
+			bestF1Threshold = threshold
+		}
+		if recall >= 0.85 && precision > bestRecallWithPrecision { // among thresholds with recall >= 0.85, keep the most precise
+			bestRecallWithPrecision = precision
+			bestRecallThreshold = threshold
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n",
+				threshold, tp, fp, fn, tn, recall, precision, f1)
+		}
+	}
+
+	if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 { // nothing met the recall bar (or 0 was chosen): fall back to the first threshold
+		bestRecallThreshold = cfg.Thresholds[0]
+	}
+
+	result.Recommendations = CalibrateRecommendations{
+		DefaultThreshold:  bestF1Threshold,
+		RecoveryThreshold: bestRecallThreshold,
+		BestF1:            bestF1,
+	}
+
+	return result, nil
+}
+
+func contains(refs []string, ref string) bool {
+	for _, r := range refs {
+		if r == ref {
+			return true
+		}
+	}
+	return false
+}
+
+func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) {
+	fmt.Printf("\n  Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold))
+
+	fmt.Printf("  Recommendations:\n")
+	fmt.Printf("    Default (best F1):   %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1)
+	fmt.Printf("    Recovery (recall):   %.2f\n", result.Recommendations.RecoveryThreshold)
+	fmt.Println()
+}
+
+// Weight tuning
+
+type TuneResult struct {
+	Results []TuneRun `json:"results"`
+	Best    *TuneRun  `json:"best"`
+}
+
+type TuneRun struct {
+	LexicalWeight   float64 `json:"lexical_weight"`
+	EmbeddingWeight float64 `json:"embedding_weight"`
+	MRR             float64 `json:"mrr"`
+	PAt1            float64 `json:"p_at_1"`
+	HitAt3          float64 `json:"hit_at_3"`
+}
+
+// RunTune sweeps the lexical weight from 0 to 1 in cfg.Step increments (embedding
+// = 1 - lexical) and returns every run plus the best by P@1, ties broken on MRR.
+func RunTune(cfg TuneConfig) (*TuneResult, error) {
+	if cfg.Step <= 0 { // w would never advance and the sweep below would not end
+		return nil, fmt.Errorf("invalid step %v: must be greater than 0", cfg.Step)
+	}
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &TuneResult{}
+
+	if cfg.Verbose {
+		fmt.Printf("  %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3")
+	}
+
+	for w := 0.0; w <= 1.0001; w += cfg.Step {
+		lexW, embW := w, 1.0-w
+		runCfg := RunConfig{
+			Suite:           "corpus",
+			Corpus:          cfg.Corpus,
+			Strategy:        "combined",
+			Threshold:       0.01,
+			TopK:            5,
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			Mode:            "library",
+		}
+
+		report, err := RunCorpusBenchmark(ds, runCfg)
+		if err != nil {
+			return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err)
+		}
+
+		run := TuneRun{
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			MRR:             report.Metrics.Overall.MRR,
+			PAt1:            report.Metrics.Overall.PAt1,
+			HitAt3:          report.Metrics.Overall.HitAt3,
+		}
+		result.Results = append(result.Results, run)
+
+		if result.Best == nil || run.PAt1 > result.Best.PAt1 ||
+			(run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) {
+			best := run
+			result.Best = &best
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n",
+				lexW, embW, run.MRR, run.PAt1, run.HitAt3)
+		}
+	}
+
+	return result, nil
+}
+
+func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
+	fmt.Printf("\n  Tested %d weight combinations\n\n", len(result.Results))
+
+	if result.Best != nil {
+		fmt.Printf("  Best weights:\n")
+		fmt.Printf("    Lexical:   %.2f\n", result.Best.LexicalWeight)
+		fmt.Printf("    Embedding: %.2f\n", result.Best.EmbeddingWeight)
+		fmt.Printf("    MRR:       %.4f\n", result.Best.MRR)
+		fmt.Printf("    P@1:       %.4f\n", result.Best.PAt1)
+		fmt.Printf("    Hit@3:     %.4f\n", result.Best.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index c8ac10d..eb2fe57 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -99,6 +99,25 @@ type CatalogConfig struct {
 	By     string
 }
 
+type BaselineCmdConfig struct {
+	Action  string // "create" or "update"
+	Name    string
+	Accept  bool
+	Verbose bool
+}
+
+type CalibrateConfig struct {
+	Corpus     string
+	Thresholds []float64
+	Verbose    bool
+}
+
+type TuneConfig struct {
+	Corpus  string
+	Step    float64
+	Verbose bool
+}
+
 func FindBenchmarkRoot() string {
 	cwd, _ := os.Getwd()
 	for d := cwd; d != "/"; d = filepath.Dir(d) {
@@ -245,3 +264,43 @@ func ParseCatalogFlags(args []string) CatalogConfig {
 	fs.Parse(args)
 	return cfg
 }
+
+func ParseBaselineFlags(args []string) BaselineCmdConfig {
+	fs := flag.NewFlagSet("baseline", flag.ExitOnError)
+	cfg := BaselineCmdConfig{
+		Action: "create",
+		Name:   "combined",
+	}
+	fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name")
+	fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+
+	if len(fs.Args()) > 0 {
+		cfg.Action = fs.Args()[0]
+	}
+	return cfg
+}
+
+func ParseCalibrateFlags(args []string) CalibrateConfig {
+	fs := flag.NewFlagSet("calibrate", flag.ExitOnError)
+	cfg := CalibrateConfig{
+		Thresholds: []float64{0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60},
+	}
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}
+
+func ParseTuneFlags(args []string) TuneConfig {
+	fs := flag.NewFlagSet("tune", flag.ExitOnError)
+	cfg := TuneConfig{
+		Step: 0.1,
+	}
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against")
+	fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}

From 33bc06f56ee9de99c81c201c7ff7dedf094edf9a Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 17:31:21 +0100
Subject: [PATCH 23/30] chore: remove bash scripts replaced by Go CLI

Keep only check-runtime-baseline.sh (wraps go test -bench).
---
 .../benchmark/scripts/calibrate-thresholds.sh | 340 ------------
 tests/benchmark/scripts/check-baseline.sh     | 140 -----
 tests/benchmark/scripts/create-baseline.sh    |  86 ---
 tests/benchmark/scripts/finalize-report.sh    | 115 ----
 tests/benchmark/scripts/lint-corpus.sh        | 197 -------
 tests/benchmark/scripts/record-result.sh      |  44 --
 tests/benchmark/scripts/run-benchmark.sh      | 226 --------
 .../benchmark/scripts/run-corpus-benchmark.sh | 514 ------------------
 tests/benchmark/scripts/run-full-benchmark.sh | 317 -----------
 .../scripts/run-recovery-benchmark.sh         |  42 --
 tests/benchmark/scripts/tune-weights.sh       | 167 ------
 tests/benchmark/scripts/update-baseline.sh    |  70 ---
 12 files changed, 2258 deletions(-)
 delete mode 100755 tests/benchmark/scripts/calibrate-thresholds.sh
 delete mode 100755 tests/benchmark/scripts/check-baseline.sh
 delete mode 100755 tests/benchmark/scripts/create-baseline.sh
 delete mode 100755 tests/benchmark/scripts/finalize-report.sh
 delete mode 100755 tests/benchmark/scripts/lint-corpus.sh
 delete mode 100755 tests/benchmark/scripts/record-result.sh
 delete mode 100755 tests/benchmark/scripts/run-benchmark.sh
 delete mode 100755 tests/benchmark/scripts/run-corpus-benchmark.sh
 delete mode 100755 tests/benchmark/scripts/run-full-benchmark.sh
 delete mode 100755 tests/benchmark/scripts/run-recovery-benchmark.sh
 delete mode 100755 tests/benchmark/scripts/tune-weights.sh
 delete mode 100755 tests/benchmark/scripts/update-baseline.sh

diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh
deleted file mode 100755
index 84d68d1..0000000
--- a/tests/benchmark/scripts/calibrate-thresholds.sh
+++ /dev/null
@@ -1,340 +0,0 @@
-#!/bin/bash
-#
-# Threshold Calibration Benchmark
-#
-# Calculates optimal thresholds for semantic matching by evaluating
-# recall, precision, and false-positive rates across threshold levels.
-#
-# Usage:
-#   ./calibrate-thresholds.sh [--corpus <dir>]
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-CASES_DIR="${BENCHMARK_DIR}/cases"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-SPECIFIC_CORPUS=""
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json"
-
-# Thresholds to test
-THRESHOLDS=(0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60)
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --argjson thresholds "$(printf '%s\n' "${THRESHOLDS[@]}" | jq -s '.')" \
-    '{
-        calibration: {
-            timestamp: $ts,
-            thresholds_tested: $thresholds
-        },
-        by_threshold: {},
-        by_tag: {},
-        recommendations: {}
-    }' > "${REPORT_FILE}"
-
-echo ""
-echo "=== Threshold Calibration ==="
-echo "Testing thresholds: ${THRESHOLDS[*]}"
-echo ""
-
-# Collect all test cases
-declare -a ALL_QUERIES=()
-declare -a ALL_SNAPSHOTS=()
-declare -a ALL_RELEVANT=()
-declare -a ALL_EXPECT_NO_MATCH=()
-declare -a ALL_IDS=()
-
-load_corpus() {
-    local corpus_path="$1"
-    local snapshot="${corpus_path}/snapshot.json"
-    local queries="${corpus_path}/queries.json"
-
-    if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then
-        return
-    fi
-
-    local count
-    count=$(jq length "$queries")
-
-    for i in $(seq 0 $((count - 1))); do
-        local query relevant id expect_no_match
-        id=$(jq -r ".[$i].id" "$queries")
-        query=$(jq -r ".[$i].query" "$queries")
-        relevant=$(jq -c ".[$i].relevant_refs // []" "$queries")
-        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$queries")
-
-        ALL_IDS+=("$id")
-        ALL_QUERIES+=("$query")
-        ALL_SNAPSHOTS+=("$snapshot")
-        ALL_RELEVANT+=("$relevant")
-        ALL_EXPECT_NO_MATCH+=("$expect_no_match")
-    done
-}
-
-load_cases() {
-    local cases_file="$1"
-    local snapshots_dir="${BENCHMARK_DIR}/../e2e/assets/snapshots"
-
-    if [[ ! -f "$cases_file" ]]; then
-        return
-    fi
-
-    local count
-    count=$(jq length "$cases_file")
-
-    for i in $(seq 0 $((count - 1))); do
-        local id query snapshot_name expect_no_match expect_ref expect_ref_alt relevant
-        id=$(jq -r ".[$i].id" "$cases_file")
-        query=$(jq -r ".[$i].query" "$cases_file")
-        snapshot_name=$(jq -r ".[$i].snapshot" "$cases_file")
-        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$cases_file")
-        expect_ref=$(jq -r ".[$i].expect_ref // \"\"" "$cases_file")
-        expect_ref_alt=$(jq -c ".[$i].expect_ref_alt // []" "$cases_file")
-
-        if [[ -n "$expect_ref" && "$expect_ref" != "null" ]]; then
-            relevant=$(echo "$expect_ref_alt" | jq --arg r "$expect_ref" '. + [$r]')
-        else
-            relevant="[]"
-        fi
-
-        local snapshot="${snapshots_dir}/${snapshot_name}"
-        if [[ ! -f "$snapshot" ]]; then
-            continue
-        fi
-
-        ALL_IDS+=("$id")
-        ALL_QUERIES+=("$query")
-        ALL_SNAPSHOTS+=("$snapshot")
-        ALL_RELEVANT+=("$relevant")
-        ALL_EXPECT_NO_MATCH+=("$expect_no_match")
-    done
-}
-
-echo "Loading test cases..."
-if [[ -n "${SPECIFIC_CORPUS}" ]]; then
-    load_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}"
-else
-    for corpus in "${CORPUS_DIR}"/*/; do
-        [[ -d "$corpus" ]] || continue
-        load_corpus "$corpus"
-    done
-fi
-
-load_cases "${CASES_DIR}/negative-threshold.json"
-
-TOTAL_CASES=${#ALL_QUERIES[@]}
-echo "Loaded ${TOTAL_CASES} test cases"
-echo ""
-
-for threshold in "${THRESHOLDS[@]}"; do
-    echo "Testing threshold ${threshold}..."
-
-    tp=0 fp=0 fn=0 tn=0
-
-    for i in $(seq 0 $((TOTAL_CASES - 1))); do
-        query="${ALL_QUERIES[$i]}"
-        snapshot="${ALL_SNAPSHOTS[$i]}"
-        relevant="${ALL_RELEVANT[$i]}"
-        expect_no_match="${ALL_EXPECT_NO_MATCH[$i]}"
-
-        result=$("${SEMANTIC}" find "${query}" \
-            --snapshot "${snapshot}" \
-            --strategy combined \
-            --threshold "${threshold}" \
-            --top-k 5 \
-            --format json 2>/dev/null) || result='{"matches":[]}'
-
-        match_count=$(echo "$result" | jq '.matches | length')
-        best_ref=$(echo "$result" | jq -r '.best_ref // ""')
-
-        if [[ "$expect_no_match" == "true" ]]; then
-            if [[ $match_count -eq 0 ]]; then
-                tn=$((tn + 1))
-            else
-                fp=$((fp + 1))
-            fi
-        else
-            relevant_count=$(echo "$relevant" | jq 'length')
-            if [[ $relevant_count -eq 0 ]]; then
-                continue
-            fi
-
-            if [[ $match_count -eq 0 ]]; then
-                fn=$((fn + 1))
-            elif echo "$relevant" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-                tp=$((tp + 1))
-            else
-                fp=$((fp + 1))
-            fi
-        fi
-    done
-
-    total_positive=$((tp + fn))
-    total_negative=$((tn + fp))
-
-    if [[ $total_positive -gt 0 ]]; then
-        recall=$(echo "scale=4; $tp / $total_positive" | bc)
-    else
-        recall="0"
-    fi
-
-    if [[ $((tp + fp)) -gt 0 ]]; then
-        precision=$(echo "scale=4; $tp / ($tp + $fp)" | bc)
-    else
-        precision="1"
-    fi
-
-    if [[ $total_negative -gt 0 ]]; then
-        fpr=$(echo "scale=4; $fp / $total_negative" | bc)
-    else
-        fpr="0"
-    fi
-
-    if [[ $(echo "$precision + $recall > 0" | bc) -eq 1 ]]; then
-        f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc)
-    else
-        f1="0"
-    fi
-
-    printf "  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f FPR=%.3f F1=%.3f\n" \
-        "$threshold" "$tp" "$fp" "$fn" "$tn" "$recall" "$precision" "$fpr" "$f1"
-
-    tmp=$(mktemp)
-    jq --arg t "$threshold" \
-       --argjson tp "$tp" --argjson fp "$fp" --argjson fn "$fn" --argjson tn "$tn" \
-       --argjson recall "$recall" --argjson precision "$precision" \
-       --argjson fpr "$fpr" --argjson f1 "$f1" \
-       '.by_threshold[$t] = {
-           tp: $tp, fp: $fp, fn: $fn, tn: $tn,
-           recall: $recall, precision: $precision,
-           false_positive_rate: $fpr, f1: $f1
-       }' "$REPORT_FILE" > "$tmp"
-    mv "$tmp" "$REPORT_FILE"
-done
-
-echo ""
-echo "Calculating recommendations..."
-
-best_f1_threshold="" best_f1=0
-best_recall_threshold="" best_recall=0
-
-for threshold in "${THRESHOLDS[@]}"; do
-    metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE")
-    f1=$(echo "$metrics" | jq -r '.f1')
-    recall=$(echo "$metrics" | jq -r '.recall')
-
-    if (( $(echo "$f1 > $best_f1" | bc -l) )); then
-        best_f1=$f1
-        best_f1_threshold=$threshold
-    fi
-    if (( $(echo "$recall > $best_recall" | bc -l) )); then
-        best_recall=$recall
-        best_recall_threshold=$threshold
-    fi
-done
-
-recovery_threshold=""
-recovery_precision=0
-for threshold in "${THRESHOLDS[@]}"; do
-    metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE")
-    recall=$(echo "$metrics" | jq -r '.recall')
-    precision=$(echo "$metrics" | jq -r '.precision')
-
-    if (( $(echo "$recall >= 0.85" | bc -l) )); then
-        if (( $(echo "$precision > $recovery_precision" | bc -l) )); then
-            recovery_precision=$precision
-            recovery_threshold=$threshold
-        fi
-    fi
-done
-
-if [[ -z "$recovery_threshold" ]]; then
-    recovery_threshold="${THRESHOLDS[0]}"
-fi
-
-default_threshold="$best_f1_threshold"
-
-tmp=$(mktemp)
-jq --arg default "$default_threshold" \
-   --arg recovery "$recovery_threshold" \
-   --arg best_f1 "$best_f1_threshold" \
-   --argjson best_f1_val "$best_f1" \
-   '.recommendations = {
-       default_threshold: $default,
-       recovery_threshold: $recovery,
-       best_f1: { threshold: $best_f1, value: $best_f1_val },
-       notes: "default_threshold optimizes F1. recovery_threshold prioritizes recall (>=85%)."
-   }' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-cat > "${SUMMARY_FILE}" << EOF
-# Threshold Calibration Report
-
-Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)
-
-## Recommendations
-
-| Use Case | Threshold | Rationale |
-|----------|-----------|-----------|
-| **Default (find)** | **${default_threshold}** | Best F1 score (${best_f1}) |
-| **Recovery** | **${recovery_threshold}** | High recall for element recovery |
-
-## Metrics by Threshold
-
-| Threshold | TP | FP | FN | TN | Recall | Precision | FPR | F1 |
-|-----------|----|----|----|----|--------|-----------|-----|-----|
-$(for t in "${THRESHOLDS[@]}"; do
-    m=$(jq -r ".by_threshold[\"$t\"]" "$REPORT_FILE")
-    printf "| %.2f | %d | %d | %d | %d | %.3f | %.3f | %.3f | %.3f |\n" \
-        "$t" \
-        "$(echo "$m" | jq -r '.tp')" \
-        "$(echo "$m" | jq -r '.fp')" \
-        "$(echo "$m" | jq -r '.fn')" \
-        "$(echo "$m" | jq -r '.tn')" \
-        "$(echo "$m" | jq -r '.recall')" \
-        "$(echo "$m" | jq -r '.precision')" \
-        "$(echo "$m" | jq -r '.false_positive_rate')" \
-        "$(echo "$m" | jq -r '.f1')"
-done)
-
-## Trade-offs
-
-- **Lower threshold** (0.10-0.20): High recall, more false positives. Good for recovery.
-- **Medium threshold** (0.25-0.35): Balanced. Good default for find operations.
-- **Higher threshold** (0.40+): High precision, misses weaker matches.
-EOF
-
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "================================================"
-echo "  THRESHOLD CALIBRATION COMPLETE"
-echo "================================================"
-echo "  Test cases:         ${TOTAL_CASES}"
-echo "  Default threshold:  ${default_threshold} (F1=${best_f1})"
-echo "  Recovery threshold: ${recovery_threshold}"
-echo "================================================"
-echo ""
-echo "Report:  ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/check-baseline.sh b/tests/benchmark/scripts/check-baseline.sh
deleted file mode 100755
index f6e95ae..0000000
--- a/tests/benchmark/scripts/check-baseline.sh
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/bin/bash
-#
-# Check current benchmark results against a baseline.
-#
-# Usage:
-#   ./check-baseline.sh [--baseline <file>] [--fail-on-regression]
-#
-# Exit codes:
-#   0 - No regressions detected
-#   1 - Regressions detected (if --fail-on-regression)
-#   2 - Error (missing files, invalid config)
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-BASELINES_DIR="${BENCHMARK_DIR}/baselines"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-NC='\033[0m'
-
-# Read config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 2
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-MAX_P1_DROP=$(jq -r '.baseline.quality.max_overall_p_at_1_drop // 0.02' "$CONFIG_FILE")
-MAX_MRR_DROP=$(jq -r '.baseline.quality.max_overall_mrr_drop // 0.02' "$CONFIG_FILE")
-MAX_HIT3_DROP=$(jq -r '.baseline.quality.max_overall_hit_at_3_drop // 0.02' "$CONFIG_FILE")
-MAX_CORPUS_P1_DROP=$(jq -r '.baseline.quality.max_corpus_p_at_1_drop // 0.08' "$CONFIG_FILE")
-MAX_MARGIN_DROP=$(jq -r '.baseline.quality.max_margin_drop_report // 0.15' "$CONFIG_FILE")
-
-# Parse args
-BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json"
-FAIL_ON_REGRESSION=false
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --baseline) BASELINE_FILE="$2"; shift 2 ;;
-        --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;;
-        *) echo "Unknown option: $1"; exit 2 ;;
-    esac
-done
-
-if [[ ! -f "$BASELINE_FILE" ]]; then
-    echo "ERROR: Baseline not found: $BASELINE_FILE" >&2
-    echo "Run ./create-baseline.sh first" >&2
-    exit 2
-fi
-
-echo "Checking against baseline: ${BASELINE_FILE}"
-echo "Tolerances: P@1=${MAX_P1_DROP}, MRR=${MAX_MRR_DROP}, Hit@3=${MAX_HIT3_DROP}"
-echo ""
-
-# Run current benchmark
-TEMP_DIR=$(mktemp -d)
-trap 'rm -rf "$TEMP_DIR"' EXIT
-
-"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" > "${TEMP_DIR}/output.log" 2>&1
-
-# Find the latest report
-LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1)
-
-if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then
-    echo "ERROR: Could not find benchmark report" >&2
-    exit 2
-fi
-
-# Compare metrics
-REGRESSIONS=0
-WARNINGS=0
-
-compare_metric() {
-    local name="$1"
-    local baseline_val="$2"
-    local current_val="$3"
-    local max_drop="$4"
-
-    local diff
-    diff=$(echo "scale=4; $current_val - $baseline_val" | bc)
-    local drop
-    drop=$(echo "scale=4; $baseline_val - $current_val" | bc)
-
-    if (( $(echo "$drop > $max_drop" | bc -l) )); then
-        echo -e "${RED}REGRESSION${NC} $name: $baseline_val -> $current_val (drop: $drop, max: $max_drop)"
-        REGRESSIONS=$((REGRESSIONS + 1))
-    elif (( $(echo "$drop > 0" | bc -l) )); then
-        echo -e "${YELLOW}WARNING${NC} $name: $baseline_val -> $current_val (drop: $drop)"
-        WARNINGS=$((WARNINGS + 1))
-    else
-        echo -e "${GREEN}OK${NC} $name: $baseline_val -> $current_val (${diff:0:6})"
-    fi
-}
-
-echo "=== Overall Metrics ==="
-echo ""
-
-BASELINE_MRR=$(jq -r '.metrics.mrr' "$BASELINE_FILE")
-CURRENT_MRR=$(jq -r '.metrics.mrr' "$LATEST_REPORT")
-compare_metric "MRR" "$BASELINE_MRR" "$CURRENT_MRR" "$MAX_MRR_DROP"
-
-BASELINE_P1=$(jq -r '.metrics.p_at_1' "$BASELINE_FILE")
-CURRENT_P1=$(jq -r '.metrics.p_at_1' "$LATEST_REPORT")
-compare_metric "P@1" "$BASELINE_P1" "$CURRENT_P1" "$MAX_P1_DROP"
-
-BASELINE_HIT3=$(jq -r '.metrics.hit_at_3' "$BASELINE_FILE")
-CURRENT_HIT3=$(jq -r '.metrics.hit_at_3' "$LATEST_REPORT")
-compare_metric "Hit@3" "$BASELINE_HIT3" "$CURRENT_HIT3" "$MAX_HIT3_DROP"
-
-BASELINE_MARGIN=$(jq -r '.metrics.avg_margin' "$BASELINE_FILE")
-CURRENT_MARGIN=$(jq -r '.metrics.avg_margin' "$LATEST_REPORT")
-compare_metric "Margin" "$BASELINE_MARGIN" "$CURRENT_MARGIN" "$MAX_MARGIN_DROP"
-
-echo ""
-echo "=== Per-Corpus ==="
-echo ""
-
-for corpus in $(jq -r '.by_corpus | keys[]' "$BASELINE_FILE"); do
-    BASELINE_CORPUS_P1=$(jq -r ".by_corpus[\"$corpus\"].p_at_1 // 0" "$BASELINE_FILE")
-    CURRENT_CORPUS_P1=$(jq -r ".metrics.by_corpus[\"$corpus\"].p_at_1 // 0" "$LATEST_REPORT")
-    compare_metric "$corpus P@1" "$BASELINE_CORPUS_P1" "$CURRENT_CORPUS_P1" "$MAX_CORPUS_P1_DROP"
-done
-
-echo ""
-echo "================================================"
-if [[ $REGRESSIONS -gt 0 ]]; then
-    echo -e "${RED}REGRESSIONS: $REGRESSIONS${NC}"
-    if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
-        exit 1
-    fi
-elif [[ $WARNINGS -gt 0 ]]; then
-    echo -e "${YELLOW}WARNINGS: $WARNINGS (no regressions)${NC}"
-else
-    echo -e "${GREEN}ALL CHECKS PASSED${NC}"
-fi
-echo "================================================"
diff --git a/tests/benchmark/scripts/create-baseline.sh b/tests/benchmark/scripts/create-baseline.sh
deleted file mode 100755
index cd4696a..0000000
--- a/tests/benchmark/scripts/create-baseline.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-#
-# Create a quality baseline from current corpus benchmark results.
-#
-# Usage:
-#   ./create-baseline.sh [--name <name>]
-#
-# This runs run-corpus-benchmark.sh and saves the results as a baseline.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-BASELINES_DIR="${BENCHMARK_DIR}/baselines"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read defaults from config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-
-# Parse args
-BASELINE_NAME="${STRATEGY}"
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --name) BASELINE_NAME="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${BASELINES_DIR}"
-
-BASELINE_FILE="${BASELINES_DIR}/${BASELINE_NAME}.json"
-
-echo "Creating baseline: ${BASELINE_NAME}"
-echo "Strategy: ${STRATEGY}"
-echo ""
-
-# Run corpus benchmark
-TEMP_DIR=$(mktemp -d)
-trap 'rm -rf "$TEMP_DIR"' EXIT
-
-"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" 2>&1 | tee "${TEMP_DIR}/output.log"
-
-# Find the latest report
-LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1)
-
-if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then
-    echo "ERROR: Could not find benchmark report" >&2
-    exit 1
-fi
-
-# Extract baseline data
-jq '{
-    created_at: .benchmark.timestamp,
-    strategy: .benchmark.strategy,
-    threshold: .benchmark.threshold,
-    top_k: .benchmark.top_k,
-    weights: .benchmark.weights,
-    metrics: {
-        total: .metrics.total,
-        mrr: .metrics.mrr,
-        p_at_1: .metrics.p_at_1,
-        p_at_3: .metrics.p_at_3,
-        hit_at_3: .metrics.hit_at_3,
-        hit_at_5: .metrics.hit_at_5,
-        avg_margin: .metrics.avg_margin,
-        latency_p50_ms: .metrics.latency_p50_ms,
-        latency_p95_ms: .metrics.latency_p95_ms
-    },
-    by_difficulty: .metrics.by_difficulty,
-    by_corpus: .metrics.by_corpus,
-    per_query: [.results[] | {id, corpus, difficulty, p_at_1, rr, margin}]
-}' "$LATEST_REPORT" > "$BASELINE_FILE"
-
-echo ""
-echo "================================================"
-echo "  BASELINE CREATED"
-echo "================================================"
-echo "  File: ${BASELINE_FILE}"
-echo ""
-jq -r '"  MRR:     \(.metrics.mrr)\n  P@1:     \(.metrics.p_at_1)\n  Hit@3:   \(.metrics.hit_at_3)\n  Margin:  \(.metrics.avg_margin)"' "$BASELINE_FILE"
-echo "================================================"
diff --git a/tests/benchmark/scripts/finalize-report.sh b/tests/benchmark/scripts/finalize-report.sh
deleted file mode 100755
index 38d314f..0000000
--- a/tests/benchmark/scripts/finalize-report.sh
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/bin/bash
-#
-# Finalize benchmark report and generate summary
-#
-# Usage:
-#   ./finalize-report.sh <report_file>
-#
-set -euo pipefail
-
-if [[ $# -lt 1 ]]; then
-    echo "Usage: $0 <report_file>"
-    exit 1
-fi
-
-REPORT_FILE="$1"
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-# Calculate final metrics
-TMP_FILE=$(mktemp)
-jq '
-    .summary.accuracy = (if .summary.total > 0 then (.summary.passed / .summary.total * 10000 | floor / 100) else 0 end) |
-    .summary.avg_score = (if (.results | length) > 0 then ([.results[].score] | add / length | . * 1000 | floor / 1000) else 0 end) |
-    .summary.avg_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | add / length | floor) else 0 end) |
-    .summary.min_score = (if (.results | length) > 0 then ([.results[].score] | min) else 0 end) |
-    .summary.max_score = (if (.results | length) > 0 then ([.results[].score] | max) else 0 end) |
-    .summary.min_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | min) else 0 end) |
-    .summary.max_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | max) else 0 end)
-' "${REPORT_FILE}" > "${TMP_FILE}"
-mv "${TMP_FILE}" "${REPORT_FILE}"
-
-# Generate markdown summary
-TIMESTAMP=$(jq -r '.benchmark.timestamp' "${REPORT_FILE}")
-STRATEGY=$(jq -r '.benchmark.strategy' "${REPORT_FILE}")
-VERSION=$(jq -r '.benchmark.version' "${REPORT_FILE}")
-TOTAL=$(jq -r '.summary.total' "${REPORT_FILE}")
-PASSED=$(jq -r '.summary.passed' "${REPORT_FILE}")
-FAILED=$(jq -r '.summary.failed' "${REPORT_FILE}")
-SKIPPED=$(jq -r '.summary.skipped' "${REPORT_FILE}")
-ACCURACY=$(jq -r '.summary.accuracy' "${REPORT_FILE}")
-AVG_SCORE=$(jq -r '.summary.avg_score' "${REPORT_FILE}")
-AVG_LATENCY=$(jq -r '.summary.avg_latency_ms' "${REPORT_FILE}")
-MIN_SCORE=$(jq -r '.summary.min_score' "${REPORT_FILE}")
-MAX_SCORE=$(jq -r '.summary.max_score' "${REPORT_FILE}")
-MIN_LATENCY=$(jq -r '.summary.min_latency_ms' "${REPORT_FILE}")
-MAX_LATENCY=$(jq -r '.summary.max_latency_ms' "${REPORT_FILE}")
-
-cat > "${SUMMARY_FILE}" << EOF
-# Semantic Matching Benchmark Results
-
-## Benchmark Info
-
-| Field | Value |
-|-------|-------|
-| Timestamp | ${TIMESTAMP} |
-| Strategy | ${STRATEGY} |
-| Version | ${VERSION} |
-
-## Results Summary
-
-| Metric | Value |
-|--------|-------|
-| Total Cases | ${TOTAL} |
-| Passed | ${PASSED} |
-| Failed | ${FAILED} |
-| Skipped | ${SKIPPED} |
-| **Accuracy** | **${ACCURACY}%** |
-
-## Score Distribution
-
-| Metric | Value |
-|--------|-------|
-| Average Score | ${AVG_SCORE} |
-| Min Score | ${MIN_SCORE} |
-| Max Score | ${MAX_SCORE} |
-
-## Latency
-
-| Metric | Value |
-|--------|-------|
-| Average | ${AVG_LATENCY} ms |
-| Min | ${MIN_LATENCY} ms |
-| Max | ${MAX_LATENCY} ms |
-
-## Failed Cases
-
-EOF
-
-# Add failed cases
-jq -r '.results[] | select(.status == "fail") | "| \(.id) | \(.notes) |"' "${REPORT_FILE}" >> "${SUMMARY_FILE}"
-
-if [[ $(jq '[.results[] | select(.status == "fail")] | length' "${REPORT_FILE}") -eq 0 ]]; then
-    echo "_No failures_" >> "${SUMMARY_FILE}"
-else
-    # Add header
-    sed -i.bak '/## Failed Cases/a\
-| ID | Notes |\
-|-----|-------|' "${SUMMARY_FILE}"
-    rm -f "${SUMMARY_FILE}.bak"
-fi
-
-echo ""
-echo "================================================"
-echo "  BENCHMARK SUMMARY"
-echo "================================================"
-echo "  Strategy:  ${STRATEGY}"
-echo "  Total:     ${TOTAL}"
-echo "  Passed:    ${PASSED}"
-echo "  Failed:    ${FAILED}"
-echo "  Accuracy:  ${ACCURACY}%"
-echo "  Avg Score: ${AVG_SCORE}"
-echo "  Avg Latency: ${AVG_LATENCY} ms"
-echo "================================================"
-echo ""
-echo "Report: ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh
deleted file mode 100755
index 783e546..0000000
--- a/tests/benchmark/scripts/lint-corpus.sh
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-CASES_DIR="${BENCHMARK_DIR}/cases"
-SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
-
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-NC='\033[0m'
-
-ERRORS=0
-WARNINGS=0
-
-error() {
-    echo -e "${RED}ERROR:${NC} $1"
-    ERRORS=$((ERRORS + 1))
-}
-
-warn() {
-    echo -e "${YELLOW}WARN:${NC} $1"
-    WARNINGS=$((WARNINGS + 1))
-}
-
-ok() {
-    echo -e "${GREEN}✓${NC} $1"
-}
-
-echo "=== Corpus Lint ==="
-echo ""
-
-# 1. Check for invalid JSON in all benchmark files
-echo "Checking JSON validity..."
-for f in "${CORPUS_DIR}"/*/*.json "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        if ! jq . "$f" >/dev/null 2>&1; then
-            error "Invalid JSON: $f"
-        fi
-    fi
-done
-
-# 2. Check for duplicate query IDs across corpus files
-echo "Checking for duplicate query IDs..."
-declare -A QUERY_IDS
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r id; do
-            if [[ -n "$id" && "$id" != "null" ]]; then
-                if [[ -n "${QUERY_IDS[$id]:-}" ]]; then
-                    error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})"
-                else
-                    QUERY_IDS[$id]="$f"
-                fi
-            fi
-        done < <(jq -r '.[].id // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# Also check cases files
-for f in "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r id; do
-            if [[ -n "$id" && "$id" != "null" ]]; then
-                if [[ -n "${QUERY_IDS[$id]:-}" ]]; then
-                    error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})"
-                else
-                    QUERY_IDS[$id]="$f"
-                fi
-            fi
-        done < <(jq -r '.[].id // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 3. Check for duplicate refs within snapshots
-echo "Checking for duplicate refs in snapshots..."
-for f in "${CORPUS_DIR}"/*/snapshot.json; do
-    if [[ -f "$f" ]]; then
-        dupes=$(jq -r '.[].ref' "$f" 2>/dev/null | sort | uniq -d)
-        if [[ -n "$dupes" ]]; then
-            error "Duplicate refs in $f: $dupes"
-        fi
-    fi
-done
-
-# 4. Check that relevant_refs exist in snapshot
-echo "Checking relevant_refs exist in snapshots..."
-for corpus_dir in "${CORPUS_DIR}"/*/; do
-    corpus_name=$(basename "$corpus_dir")
-    snapshot="${corpus_dir}snapshot.json"
-    queries="${corpus_dir}queries.json"
-
-    if [[ -f "$snapshot" && -f "$queries" ]]; then
-        # Get all refs from snapshot
-        refs=$(jq -r '.[].ref' "$snapshot" 2>/dev/null | sort | uniq)
-
-        # Check relevant_refs
-        while IFS= read -r ref; do
-            if [[ -n "$ref" && "$ref" != "null" ]]; then
-                if ! echo "$refs" | grep -qx "$ref"; then
-                    error "[$corpus_name] relevant_ref '$ref' not found in snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].relevant_refs[]? // empty' "$queries" 2>/dev/null)
-
-        # Check partially_relevant_refs
-        while IFS= read -r ref; do
-            if [[ -n "$ref" && "$ref" != "null" ]]; then
-                if ! echo "$refs" | grep -qx "$ref"; then
-                    error "[$corpus_name] partially_relevant_ref '$ref' not found in snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].partially_relevant_refs[]? // empty' "$queries" 2>/dev/null)
-    fi
-done
-
-# 5. Check for empty relevant_refs (except no-match cases)
-echo "Checking for empty relevant_refs..."
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        empty_relevant=$(jq -r '.[] | select(.relevant_refs | length == 0) | select(.partially_relevant_refs | length == 0) | select(.expect_no_match != true) | .id' "$f" 2>/dev/null)
-        for id in $empty_relevant; do
-            if [[ -n "$id" ]]; then
-                warn "Query '$id' in $f has empty relevant_refs"
-            fi
-        done
-    fi
-done
-
-# 6. Check difficulty values
-echo "Checking difficulty values..."
-VALID_DIFFICULTIES="easy medium hard"
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r line; do
-            id=$(echo "$line" | cut -d'|' -f1)
-            diff=$(echo "$line" | cut -d'|' -f2)
-            if [[ -n "$diff" && "$diff" != "null" ]]; then
-                if ! echo "$VALID_DIFFICULTIES" | grep -qw "$diff"; then
-                    error "Invalid difficulty '$diff' for query '$id' in $f"
-                fi
-            fi
-        done < <(jq -r '.[] | "\(.id)|\(.difficulty // "null")"' "$f" 2>/dev/null)
-    fi
-done
-
-# 7. Check for known tags (warn on unknown)
-echo "Checking tags..."
-KNOWN_TAGS="absent-control accessibility action action-synonym action-verb adversarial alertdialog all-stopwords auth basket-cart bulk-action button cell checkbox combobox compound context-exclusion conversational dashboard description descriptive dialog directional disambiguation domain-intent download-export duplicate-labels ecommerce empty-query empty-snapshot exact exact-match filter find-search generic-verb github guard icon implicit input interactive-boost keyboard-mash legal link literal-text login login-signin long-query lookup-search media menu menuitem missing-letter name-match natural-language navigation negative-context no-match noise-tokens nonsense option ordinal pagination parent-context partial position preferences-settings purchase-buy question-form radio register-create registration repeated-word row-context search searchbox section section-context signout-logout single-char social special-chars spinbutton stale-ref state switch synonym synonym-chain tab table textbox threshold toggle transposition typo vague-query visual weak-match wikipedia"
-for f in "${CORPUS_DIR}"/*/queries.json "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r tag; do
-            if [[ -n "$tag" && "$tag" != "null" ]]; then
-                if ! echo "$KNOWN_TAGS" | grep -qw "$tag"; then
-                    warn "Unknown tag '$tag' in $f"
-                fi
-            fi
-        done < <(jq -r '.[].tags[]? // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 8. Check case files reference existing snapshots
-echo "Checking case file snapshot references..."
-for f in "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r snapshot; do
-            if [[ -n "$snapshot" && "$snapshot" != "null" ]]; then
-                if [[ ! -f "${SNAPSHOTS_DIR}/${snapshot}" ]]; then
-                    error "Case file $f references missing snapshot: $snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].snapshot // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 9. Check for generated result files in source tree
-echo "Checking for generated result files..."
-if ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | grep -v '.gitkeep' | head -1 >/dev/null 2>&1; then
-    result_count=$(ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | wc -l | tr -d ' ')
-    warn "Found $result_count generated result files in tests/benchmark/results/ (should be gitignored)"
-fi
-
-echo ""
-echo "=== Summary ==="
-if [[ $ERRORS -eq 0 && $WARNINGS -eq 0 ]]; then
-    ok "All checks passed"
-    exit 0
-elif [[ $ERRORS -eq 0 ]]; then
-    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
-    exit 0
-else
-    echo -e "${RED}Errors: $ERRORS${NC}"
-    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
-    exit 1
-fi
diff --git a/tests/benchmark/scripts/record-result.sh b/tests/benchmark/scripts/record-result.sh
deleted file mode 100755
index 2288f7c..0000000
--- a/tests/benchmark/scripts/record-result.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-#
-# Record a benchmark result
-#
-# Usage:
-#   ./record-result.sh <report_file> <id> <pass|fail|skip> <score> <latency_ms> "notes"
-#
-set -euo pipefail
-
-if [[ $# -lt 5 ]]; then
-    echo "Usage: $0 <report_file> <id> <pass|fail|skip> <score> <latency_ms> [notes]"
-    exit 1
-fi
-
-REPORT_FILE="$1"
-ID="$2"
-STATUS="$3"
-SCORE="$4"
-LATENCY_MS="$5"
-NOTES="${6:-}"
-TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
-
-# Create result entry
-RESULT_JSON=$(jq -n \
-    --arg id "${ID}" \
-    --arg status "${STATUS}" \
-    --argjson score "${SCORE}" \
-    --argjson latency "${LATENCY_MS}" \
-    --arg notes "${NOTES}" \
-    --arg ts "${TIMESTAMP}" \
-    '{id: $id, status: $status, score: $score, latency_ms: $latency, notes: $notes, timestamp: $ts}')
-
-# Append to report
-TMP_FILE=$(mktemp)
-jq --argjson result "${RESULT_JSON}" \
-   --arg status "${STATUS}" \
-   '.results += [$result] |
-    .summary.total += 1 |
-    if $status == "pass" then .summary.passed += 1
-    elif $status == "fail" then .summary.failed += 1
-    else .summary.skipped += 1 end' \
-   "${REPORT_FILE}" > "${TMP_FILE}"
-
-mv "${TMP_FILE}" "${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh
deleted file mode 100755
index 29c8a22..0000000
--- a/tests/benchmark/scripts/run-benchmark.sh
+++ /dev/null
@@ -1,226 +0,0 @@
-#!/bin/bash
-#
-# Run semantic matching benchmark
-#
-# Usage:
-#   ./run-benchmark.sh [--strategy <name>] [--cases <file>]
-#
-# Options:
-#   --strategy <name>   Strategy to benchmark (lexical, embedding, combined)
-#   --cases <file>      Specific case file to run (default: all)
-#   --output <dir>      Output directory (default: ../results)
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CASES_DIR="${BENCHMARK_DIR}/cases"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-# Read defaults from config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
-TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
-CASE_FILE=""
-
-# Parse args (override config)
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --strategy) STRATEGY="$2"; shift 2 ;;
-        --cases) CASE_FILE="$2"; shift 2 ;;
-        --output) RESULTS_DIR="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-case "${STRATEGY}" in
-    lexical|embedding|combined) ;;
-    *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;;
-esac
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.json"
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg strategy "${STRATEGY}" \
-    --arg version "$(${SEMANTIC} --version 2>/dev/null || echo 'dev')" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            strategy: $strategy,
-            version: $version
-        },
-        results: [],
-        summary: {
-            total: 0,
-            passed: 0,
-            failed: 0,
-            skipped: 0,
-            accuracy: 0,
-            avg_score: 0,
-            avg_latency_ms: 0
-        }
-    }' > "${REPORT_FILE}"
-
-# Run cases
-score_at_least() {
-    local score="$1"
-    local min_score="$2"
-    awk -v score="${score}" -v min_score="${min_score}" 'BEGIN { exit (score + 0 >= min_score + 0) ? 0 : 1 }'
-}
-
-run_case() {
-    local case_file="$1"
-    local case_name
-    case_name=$(basename "$case_file" .json)
-
-    echo ""
-    echo "=== Running: ${case_name} ==="
-
-    local count
-    count=$(jq length "$case_file")
-
-    for i in $(seq 0 $((count - 1))); do
-        local id query snapshot expect_ref expect_ref_alt expect_no_match expect_no_crash expect_has_matches threshold min_score
-
-        id=$(jq -r ".[$i].id" "$case_file")
-        query=$(jq -r ".[$i].query" "$case_file")
-        snapshot=$(jq -r ".[$i].snapshot" "$case_file")
-        expect_ref=$(jq -r ".[$i].expect_ref // empty" "$case_file")
-        expect_ref_alt=$(jq -r ".[$i].expect_ref_alt // [] | join(\",\")" "$case_file")
-        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$case_file")
-        expect_no_crash=$(jq -r ".[$i].expect_no_crash // false" "$case_file")
-        expect_has_matches=$(jq -r ".[$i].expect_has_matches // false" "$case_file")
-        threshold=$(jq -r ".[$i].threshold // 0.3" "$case_file")
-        min_score=$(jq -r ".[$i].min_score // 0" "$case_file")
-
-        local snapshot_path="${SNAPSHOTS_DIR}/${snapshot}"
-        if [[ ! -f "${snapshot_path}" ]]; then
-            echo "  [${id}] SKIP: snapshot not found: ${snapshot}"
-            "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "skip" 0 0 "snapshot not found"
-            continue
-        fi
-
-        # Run query and measure time
-        local start_ms end_ms duration_ms result exit_code
-        start_ms=$(python3 -c 'import time; print(int(time.time() * 1000))')
-
-        set +e
-        result=$("${SEMANTIC}" find "${query}" \
-            --snapshot "${snapshot_path}" \
-            --strategy "${STRATEGY}" \
-            --threshold "${threshold}" \
-            --format json 2>&1)
-        exit_code=$?
-        set -e
-
-        end_ms=$(python3 -c 'import time; print(int(time.time() * 1000))')
-        duration_ms=$((end_ms - start_ms))
-
-        # Evaluate result
-        local status="fail"
-        local got_ref=""
-        local got_score=0
-        local notes=""
-
-        if [[ ${exit_code} -ne 0 ]]; then
-            if [[ "${expect_no_crash}" == "true" ]]; then
-                # Some crashes are expected (empty query, etc)
-                status="pass"
-                notes="exit ${exit_code} (expected)"
-            else
-                notes="exit ${exit_code}: ${result}"
-            fi
-        else
-            got_ref=$(echo "$result" | jq -r '.best_ref // empty')
-            got_score=$(echo "$result" | jq -r '.best_score // 0')
-            local match_count
-            match_count=$(echo "$result" | jq -r '.matches | length')
-
-            if [[ "${expect_no_match}" == "true" ]]; then
-                if [[ ${match_count} -eq 0 ]]; then
-                    status="pass"
-                    notes="no matches (expected)"
-                else
-                    notes="expected no matches, got ${match_count}"
-                fi
-            elif [[ "${expect_has_matches}" == "true" ]]; then
-                if [[ ${match_count} -gt 0 ]]; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="${match_count} matches, score=${got_score}"
-                    else
-                        notes="${match_count} matches, score=${got_score} below min_score=${min_score}"
-                    fi
-                else
-                    notes="expected matches, got 0"
-                fi
-            elif [[ -n "${expect_ref}" ]]; then
-                if [[ "${got_ref}" == "${expect_ref}" ]]; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="ref=${got_ref}, score=${got_score}"
-                    else
-                        notes="ref=${got_ref}, score=${got_score} below min_score=${min_score}"
-                    fi
-                elif [[ -n "${expect_ref_alt}" ]] && echo ",${expect_ref_alt}," | grep -q ",${got_ref},"; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="ref=${got_ref} (alt), score=${got_score}"
-                    else
-                        notes="ref=${got_ref} (alt), score=${got_score} below min_score=${min_score}"
-                    fi
-                else
-                    notes="got ${got_ref}, want ${expect_ref}"
-                fi
-            elif [[ "${expect_no_crash}" == "true" ]]; then
-                status="pass"
-                notes="no crash"
-            fi
-        fi
-
-        # Record result
-        "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "${status}" "${got_score}" "${duration_ms}" "${notes}"
-
-        if [[ "${status}" == "pass" ]]; then
-            echo "  [${id}] PASS: ${notes}"
-        else
-            echo "  [${id}] FAIL: ${notes}"
-        fi
-    done
-}
-
-# Find case files
-if [[ -n "${CASE_FILE}" ]]; then
-    run_case "${CASES_DIR}/${CASE_FILE}"
-else
-    for case_file in "${CASES_DIR}"/*.json; do
-        [[ -f "$case_file" ]] || continue
-        run_case "$case_file"
-    done
-fi
-
-# Finalize report
-"${SCRIPT_DIR}/finalize-report.sh" "${REPORT_FILE}"
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "Benchmark complete: ${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh
deleted file mode 100755
index 53216af..0000000
--- a/tests/benchmark/scripts/run-corpus-benchmark.sh
+++ /dev/null
@@ -1,514 +0,0 @@
-#!/bin/bash
-#
-# Run semantic matching benchmark with ranking metrics
-#
-# Usage:
-#   ./run-corpus-benchmark.sh [--strategy <name>] [--corpus <dir>] [--lexical-weight <n>] [--embedding-weight <n>]
-#
-# Metrics:
-#   - MRR (Mean Reciprocal Rank)
-#   - P@1 (Precision at 1)
-#   - P@3 (Precision at 3)
-#   - Latency distribution (p50, p95, p99)
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read defaults from config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
-TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
-LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
-EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
-SPECIFIC_CORPUS=""
-
-# Parse args (override config)
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --strategy) STRATEGY="$2"; shift 2 ;;
-        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
-        --threshold) THRESHOLD="$2"; shift 2 ;;
-        --top-k) TOP_K="$2"; shift 2 ;;
-        --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;;
-        --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-case "${STRATEGY}" in
-    lexical|embedding|combined) ;;
-    *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;;
-esac
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json"
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg strategy "${STRATEGY}" \
-    --argjson threshold "${THRESHOLD}" \
-    --argjson top_k "${TOP_K}" \
-    --argjson lexical_weight "${LEXICAL_WEIGHT}" \
-    --argjson embedding_weight "${EMBEDDING_WEIGHT}" \
-    --arg config_file "${CONFIG_FILE}" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            strategy: $strategy,
-            threshold: $threshold,
-            top_k: $top_k,
-            type: "corpus",
-            config_source: $config_file,
-            weights: {
-                lexical: $lexical_weight,
-                embedding: $embedding_weight
-            }
-        },
-        results: [],
-        metrics: {
-            total: 0,
-            mrr: 0,
-            p_at_1: 0,
-            p_at_3: 0,
-            latencies_ms: [],
-            by_difficulty: {},
-            by_tag: {}
-        }
-    }' > "${REPORT_FILE}"
-
-# Arrays to collect metrics
-declare -a ALL_RRS=()
-declare -a ALL_P1=()
-declare -a ALL_P3=()
-declare -a ALL_HIT3=()
-declare -a ALL_HIT5=()
-declare -a ALL_MARGINS=()
-declare -a ALL_LATENCIES=()
-
-run_corpus() {
-    local corpus_path="$1"
-    local corpus_name
-    corpus_name=$(basename "$corpus_path")
-
-    local snapshot="${corpus_path}/snapshot.json"
-    local queries="${corpus_path}/queries.json"
-
-    if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then
-        if [[ -f "${corpus_path}/cases.json" ]] || [[ -f "${corpus_path}/scenarios.json" ]]; then
-            return
-        fi
-        echo "  Skipping ${corpus_name}: missing files"
-        return
-    fi
-
-    echo ""
-    echo "=== Corpus: ${corpus_name} ==="
-
-    local count
-    count=$(jq length "$queries")
-
-    for i in $(seq 0 $((count - 1))); do
-        local id query relevant_refs partial_refs difficulty tags
-
-        id=$(jq -r ".[$i].id" "$queries")
-        query=$(jq -r ".[$i].query" "$queries")
-        relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries")
-        partial_refs=$(jq -c ".[$i].partially_relevant_refs // []" "$queries")
-        difficulty=$(jq -r ".[$i].difficulty // \"medium\"" "$queries")
-        tags=$(jq -c ".[$i].tags // []" "$queries")
-
-        # Run query and measure time
-        local start_ns end_ns duration_ms result
-        start_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))')
-
-        if ! result=$("${SEMANTIC}" find "${query}" \
-            --snapshot "${snapshot}" \
-            --strategy "${STRATEGY}" \
-            --threshold "${THRESHOLD}" \
-            --top-k "${TOP_K}" \
-            --lexical-weight "${LEXICAL_WEIGHT}" \
-            --embedding-weight "${EMBEDDING_WEIGHT}" \
-            --format json 2>&1); then
-            echo "  [${id}] ERROR: semantic find failed for query: ${query}" >&2
-            echo "${result}" >&2
-            exit 1
-        fi
-
-        if ! echo "$result" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then
-            echo "  [${id}] ERROR: semantic find returned invalid JSON" >&2
-            echo "${result}" >&2
-            exit 1
-        fi
-
-        end_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))')
-        duration_ms=$(( (end_ns - start_ns) / 1000 ))
-
-        # Extract results
-        local matches best_ref best_score
-        matches=$(echo "$result" | jq -c '[.matches[].ref]')
-        best_ref=$(echo "$result" | jq -r '.best_ref // ""')
-        best_score=$(echo "$result" | jq -r '.best_score // 0')
-
-        # Calculate Reciprocal Rank
-        local rr=0
-        for rank in $(seq 1 ${TOP_K}); do
-            local ref_at_rank
-            ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                rr=$(echo "scale=4; 1 / ${rank}" | bc)
-                break
-            fi
-        done
-
-        # Calculate P@1
-        local p1=0
-        if echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-            p1=1
-        elif echo "$partial_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-            p1=0.5
-        fi
-
-        # Calculate P@3 (count relevant in top 3, partials count as 0.5)
-        local relevant_in_top3=0
-        local partial_in_top3=0
-        local hit_at_3=0
-        local hit_at_5=0
-        local best_relevant_rank="null"
-        for rank in 1 2 3 4 5; do
-            local ref_at_rank
-            ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                if [[ "$best_relevant_rank" == "null" ]]; then
-                    best_relevant_rank=$rank
-                fi
-                if [[ $rank -le 3 ]]; then
-                    relevant_in_top3=$((relevant_in_top3 + 1))
-                    hit_at_3=1
-                fi
-                hit_at_5=1
-            elif [[ $rank -le 3 ]]; then
-                if echo "$partial_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                    partial_in_top3=$((partial_in_top3 + 1))
-                fi
-            fi
-        done
-        local p3
-        p3=$(echo "scale=4; (${relevant_in_top3} + ${partial_in_top3} * 0.5) / 3" | bc)
-
-        # Calculate best_relevant_score, best_wrong_score, and margin
-        local best_relevant_score=0
-        local best_wrong_score=0
-        local num_matches
-        num_matches=$(echo "$result" | jq '.matches | length')
-        for idx in $(seq 0 $((num_matches - 1))); do
-            local ref_at_idx score_at_idx
-            ref_at_idx=$(echo "$result" | jq -r ".matches[$idx].ref // \"\"")
-            score_at_idx=$(echo "$result" | jq -r ".matches[$idx].score // 0")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then
-                if (( $(echo "$score_at_idx > $best_relevant_score" | bc -l) )); then
-                    best_relevant_score=$score_at_idx
-                fi
-            elif echo "$partial_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then
-                : # partials don't count as wrong
-            else
-                if (( $(echo "$score_at_idx > $best_wrong_score" | bc -l) )); then
-                    best_wrong_score=$score_at_idx
-                fi
-            fi
-        done
-        local margin
-        margin=$(echo "scale=4; $best_relevant_score - $best_wrong_score" | bc)
-
-        # Collect metrics
-        ALL_RRS+=("$rr")
-        ALL_P1+=("$p1")
-        ALL_P3+=("$p3")
-        ALL_HIT3+=("$hit_at_3")
-        ALL_HIT5+=("$hit_at_5")
-        ALL_MARGINS+=("$margin")
-        ALL_LATENCIES+=("$duration_ms")
-
-        # Status indicator
-        local status="MISS"
-        if (( $(echo "$p1 >= 1" | bc -l) )); then
-            status="HIT "
-        elif (( $(echo "$p1 >= 0.5" | bc -l) )); then
-            status="PART"
-        fi
-
-        printf "  [%s] %s | RR=%.2f P@1=%.1f P@3=%.2f | %dms | %s\n" \
-            "$id" "$status" "$rr" "$p1" "$p3" "$duration_ms" "$query"
-
-        # Record to report
-        local result_json
-        result_json=$(jq -n \
-            --arg id "$id" \
-            --arg query "$query" \
-            --arg corpus "$corpus_name" \
-            --arg difficulty "$difficulty" \
-            --argjson tags "$tags" \
-            --arg best_ref "$best_ref" \
-            --argjson best_score "$best_score" \
-            --argjson matches "$matches" \
-            --argjson relevant "$relevant_refs" \
-            --argjson rr "$rr" \
-            --argjson p1 "$p1" \
-            --argjson p3 "$p3" \
-            --argjson hit_at_3 "$hit_at_3" \
-            --argjson hit_at_5 "$hit_at_5" \
-            --argjson best_relevant_rank "$best_relevant_rank" \
-            --argjson best_relevant_score "$best_relevant_score" \
-            --argjson best_wrong_score "$best_wrong_score" \
-            --argjson margin "$margin" \
-            --argjson latency "$duration_ms" \
-            '{
-                id: $id, query: $query, corpus: $corpus,
-                difficulty: $difficulty, tags: $tags,
-                best_ref: $best_ref, best_score: $best_score,
-                matches: $matches, relevant_refs: $relevant,
-                rr: $rr, p_at_1: $p1, p_at_3: $p3,
-                hit_at_3: $hit_at_3, hit_at_5: $hit_at_5,
-                best_relevant_rank: $best_relevant_rank,
-                best_relevant_score: $best_relevant_score,
-                best_wrong_score: $best_wrong_score,
-                margin: $margin,
-                latency_ms: $latency
-            }')
-
-        # Append to report
-        local tmp
-        tmp=$(mktemp)
-        jq --argjson r "$result_json" '.results += [$r]' "$REPORT_FILE" > "$tmp"
-        mv "$tmp" "$REPORT_FILE"
-    done
-}
-
-# Run benchmarks
-if [[ -n "${SPECIFIC_CORPUS}" ]]; then
-    run_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}"
-else
-    for corpus in "${CORPUS_DIR}"/*/; do
-        [[ -d "$corpus" ]] || continue
-        run_corpus "$corpus"
-    done
-fi
-
-# Calculate aggregate metrics
-echo ""
-echo "Calculating aggregate metrics..."
-
-TOTAL=${#ALL_RRS[@]}
-if [[ $TOTAL -eq 0 ]]; then
-    echo "No results to aggregate"
-    exit 1
-fi
-
-# MRR
-MRR=$(printf '%s\n' "${ALL_RRS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# P@1
-P1=$(printf '%s\n' "${ALL_P1[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# P@3
-P3=$(printf '%s\n' "${ALL_P3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Hit@3
-HIT3=$(printf '%s\n' "${ALL_HIT3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Hit@5
-HIT5=$(printf '%s\n' "${ALL_HIT5[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Average margin
-AVG_MARGIN=$(printf '%s\n' "${ALL_MARGINS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Latency percentiles
-SORTED_LAT=($(printf '%s\n' "${ALL_LATENCIES[@]}" | sort -n))
-P50_IDX=$(( TOTAL * 50 / 100 ))
-P95_IDX=$(( TOTAL * 95 / 100 ))
-P99_IDX=$(( TOTAL * 99 / 100 ))
-LAT_P50=${SORTED_LAT[$P50_IDX]:-0}
-LAT_P95=${SORTED_LAT[$P95_IDX]:-0}
-LAT_P99=${SORTED_LAT[$P99_IDX]:-0}
-LAT_AVG=$(printf '%s\n' "${ALL_LATENCIES[@]}" | awk '{s+=$1} END {printf "%.0f", s/NR}')
-
-# Update report with aggregates
-tmp=$(mktemp)
-jq \
-    --argjson total "$TOTAL" \
-    --argjson mrr "$MRR" \
-    --argjson p1 "$P1" \
-    --argjson p3 "$P3" \
-    --argjson hit3 "$HIT3" \
-    --argjson hit5 "$HIT5" \
-    --argjson avg_margin "$AVG_MARGIN" \
-    --argjson lat_avg "$LAT_AVG" \
-    --argjson lat_p50 "$LAT_P50" \
-    --argjson lat_p95 "$LAT_P95" \
-    --argjson lat_p99 "$LAT_P99" \
-    '.metrics = {
-        total: $total,
-        mrr: $mrr,
-        p_at_1: $p1,
-        p_at_3: $p3,
-        hit_at_3: $hit3,
-        hit_at_5: $hit5,
-        avg_margin: $avg_margin,
-        latency_avg_ms: $lat_avg,
-        latency_p50_ms: $lat_p50,
-        latency_p95_ms: $lat_p95,
-        latency_p99_ms: $lat_p99
-    }' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-difficulty breakdown
-tmp=$(mktemp)
-jq '.metrics.by_difficulty = (
-    .results | group_by(.difficulty) | map({
-        key: .[0].difficulty,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    }) | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-corpus breakdown
-tmp=$(mktemp)
-jq '.metrics.by_corpus = (
-    .results | group_by(.corpus) | map({
-        key: .[0].corpus,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    }) | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-tag breakdown
-tmp=$(mktemp)
-jq '.metrics.by_tag = (
-    [.results[] | {tags: .tags, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}]
-    | [.[] | .tags[] as $tag | {tag: $tag, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}]
-    | group_by(.tag)
-    | map({
-        key: .[0].tag,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    })
-    | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Generate summary
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-cat > "${SUMMARY_FILE}" << EOF
-# Semantic Matching Benchmark Results
-
-## Configuration
-
-| Field | Value |
-|-------|-------|
-| Timestamp | $(jq -r '.benchmark.timestamp' "$REPORT_FILE") |
-| Strategy | ${STRATEGY} |
-| Lexical Weight | ${LEXICAL_WEIGHT} |
-| Embedding Weight | ${EMBEDDING_WEIGHT} |
-| Top-K | ${TOP_K} |
-| Total Queries | ${TOTAL} |
-
-## Ranking Metrics
-
-| Metric | Value | Description |
-|--------|-------|-------------|
-| **MRR** | **${MRR}** | Mean Reciprocal Rank |
-| **P@1** | **${P1}** | Precision at rank 1 |
-| **P@3** | **${P3}** | Precision at rank 3 |
-| **Hit@3** | **${HIT3}** | Any relevant in top 3 |
-| **Hit@5** | **${HIT5}** | Any relevant in top 5 |
-| **Avg Margin** | **${AVG_MARGIN}** | best_relevant - best_wrong |
-
-## Latency
-
-| Percentile | Value |
-|------------|-------|
-| Average | ${LAT_AVG} ms |
-| P50 | ${LAT_P50} ms |
-| P95 | ${LAT_P95} ms |
-| P99 | ${LAT_P99} ms |
-
-## By Difficulty
-
-| Difficulty | Count | MRR | P@1 | Hit@3 | Margin |
-|------------|-------|-----|-----|-------|--------|
-$(jq -r '.metrics.by_difficulty | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE")
-
-## By Corpus
-
-| Corpus | Count | MRR | P@1 | Hit@3 | Margin |
-|--------|-------|-----|-----|-------|--------|
-$(jq -r '.metrics.by_corpus | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE")
-
-## Misses (P@1 = 0)
-
-| ID | Query | Got | Expected |
-|----|-------|-----|----------|
-$(jq -r '.results[] | select(.p_at_1 == 0) | "| \(.id) | \(.query) | \(.best_ref) | \(.relevant_refs | join(",")) |"' "$REPORT_FILE")
-
-EOF
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "================================================"
-echo "  CORPUS BENCHMARK RESULTS"
-echo "================================================"
-echo "  Strategy:    ${STRATEGY}"
-echo "  Weights:     lexical=${LEXICAL_WEIGHT} embedding=${EMBEDDING_WEIGHT}"
-echo "  Queries:     ${TOTAL}"
-echo "  MRR:         ${MRR}"
-echo "  P@1:         ${P1}"
-echo "  P@3:         ${P3}"
-echo "  Hit@3:       ${HIT3}"
-echo "  Hit@5:       ${HIT5}"
-echo "  Avg Margin:  ${AVG_MARGIN}"
-echo "  Latency P50: ${LAT_P50} ms"
-echo "  Latency P95: ${LAT_P95} ms"
-echo "================================================"
-echo ""
-echo "Report:  ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh
deleted file mode 100755
index 5c759dc..0000000
--- a/tests/benchmark/scripts/run-full-benchmark.sh
+++ /dev/null
@@ -1,317 +0,0 @@
-#!/bin/bash
-#
-# Full semantic benchmark: Find + Recovery + Classification
-#
-# Produces a composite score for overall system health.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read defaults from config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
-TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
-LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
-EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary with recovery support
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/full_benchmark_${TIMESTAMP}.json"
-
-has_role_keyword() {
-    local query="$1"
-    echo "$query" | grep -Eiq '(^|[^[:alnum:]])(button|input|link|textbox|checkbox|radio|select|option|tab|menu|form|search)([^[:alnum:]]|$)'
-}
-
-enrich_recovery_query() {
-    local query="$1"
-    local role="$2"
-
-    if [[ -z "$query" || -z "$role" ]]; then
-        printf '%s' "$query"
-        return
-    fi
-    if has_role_keyword "$query"; then
-        printf '%s' "$query"
-        return
-    fi
-    printf '%s %s' "$query" "$role"
-}
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    '{
-        timestamp: $ts,
-        find: { total: 0, mrr: 0, p_at_1: 0, latency_p50: 0 },
-        recovery: { total: 0, recovered: 0, rate: 0 },
-        classification: { total: 0, correct: 0, accuracy: 0 },
-        composite: { score: 0, grade: "" }
-    }' > "${REPORT_FILE}"
-
-echo ""
-echo "=============================================="
-echo "  PHASE 1: FIND BENCHMARK"
-echo "=============================================="
-
-# Run corpus benchmark and capture metrics
-FIND_OUTPUT=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" 2>&1)
-echo "$FIND_OUTPUT"
-
-# Extract metrics from the corpus report rather than the human-readable output.
-FIND_REPORT=$(echo "$FIND_OUTPUT" | awk '/^Report:/ {print $2}' | tail -1)
-if [[ -z "${FIND_REPORT}" ]] || [[ ! -f "${FIND_REPORT}" ]]; then
-    echo "error: could not locate corpus benchmark report" >&2
-    exit 1
-fi
-FIND_MRR=$(jq -r '.metrics.mrr' "$FIND_REPORT")
-FIND_P1=$(jq -r '.metrics.p_at_1' "$FIND_REPORT")
-FIND_TOTAL=$(jq -r '.metrics.total' "$FIND_REPORT")
-FIND_LAT=$(jq -r '.metrics.latency_p50_ms' "$FIND_REPORT")
-
-# Rebuild semantic binary (corpus benchmark deletes it)
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-echo ""
-echo "=============================================="
-echo "  PHASE 2: RECOVERY BENCHMARK"
-echo "=============================================="
-
-SCENARIOS_FILE="${CORPUS_DIR}/recovery-scenarios/scenarios.json"
-RECOVERY_TOTAL=0
-RECOVERY_SUCCESS=0
-
-if [[ -f "$SCENARIOS_FILE" ]]; then
-    SCENARIO_COUNT=$(jq length "$SCENARIOS_FILE")
-
-    for i in $(seq 0 $((SCENARIO_COUNT - 1))); do
-        ID=$(jq -r ".[$i].id" "$SCENARIOS_FILE")
-        NAME=$(jq -r ".[$i].name" "$SCENARIOS_FILE")
-        RAW_QUERY=$(jq -r ".[$i].original_query" "$SCENARIOS_FILE")
-        ORIGINAL_REF=$(jq -r ".[$i].original_ref // empty" "$SCENARIOS_FILE")
-        ORIGINAL_ROLE=$(jq -r ".[$i].before[]? | select(.ref == \"$ORIGINAL_REF\") | .role // empty" "$SCENARIOS_FILE")
-        QUERY=$(enrich_recovery_query "$RAW_QUERY" "$ORIGINAL_ROLE")
-        EXPECTED=$(jq -r ".[$i].expected_ref // empty" "$SCENARIOS_FILE")
-        EXPECTED_ALT=$(jq -r ".[$i].expected_alt // [] | join(\",\")" "$SCENARIOS_FILE")
-        EXPECT_NO_MATCH=$(jq -r ".[$i].expect_no_match // false" "$SCENARIOS_FILE")
-
-        # Write after snapshot to temp file
-        AFTER_FILE=$(mktemp)
-        jq ".[$i].after" "$SCENARIOS_FILE" > "$AFTER_FILE"
-
-        # Run semantic find on after snapshot with the same minimum score
-        # enforced by DefaultRecoveryConfig in the recovery engine.
-        if ! RESULT=$("${SEMANTIC}" find "$QUERY" --snapshot "$AFTER_FILE" --format json --threshold 0.52 2>&1); then
-            echo "  [$ID] ERROR: semantic find failed during recovery benchmark" >&2
-            echo "$RESULT" >&2
-            rm -f "$AFTER_FILE"
-            exit 1
-        fi
-        if ! echo "$RESULT" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then
-            echo "  [$ID] ERROR: semantic find returned invalid JSON during recovery benchmark" >&2
-            echo "$RESULT" >&2
-            rm -f "$AFTER_FILE"
-            exit 1
-        fi
-        BEST_REF=$(echo "$RESULT" | jq -r '.best_ref // ""')
-
-        rm -f "$AFTER_FILE"
-
-        RECOVERY_TOTAL=$((RECOVERY_TOTAL + 1))
-        STATUS="FAIL"
-
-        if [[ "$EXPECT_NO_MATCH" == "true" ]]; then
-            if [[ -z "$BEST_REF" ]] || [[ "$BEST_REF" == "null" ]]; then
-                STATUS="PASS"
-                RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-            fi
-        elif [[ "$BEST_REF" == "$EXPECTED" ]]; then
-            STATUS="PASS"
-            RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-        elif [[ -n "$EXPECTED_ALT" ]] && echo ",$EXPECTED_ALT," | grep -q ",$BEST_REF,"; then
-            STATUS="PASS"
-            RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-        fi
-
-        printf "  [%s] %s | %s | got=%s want=%s\n" "$ID" "$STATUS" "$NAME" "$BEST_REF" "$EXPECTED"
-    done
-fi
-
-RECOVERY_RATE=0
-if [[ $RECOVERY_TOTAL -gt 0 ]]; then
-    RECOVERY_RATE=$(echo "scale=4; $RECOVERY_SUCCESS / $RECOVERY_TOTAL" | bc)
-fi
-
-echo ""
-echo "  Recovery: $RECOVERY_SUCCESS / $RECOVERY_TOTAL = $RECOVERY_RATE"
-
-echo ""
-echo "=============================================="
-echo "  PHASE 3: CLASSIFICATION BENCHMARK"
-echo "=============================================="
-
-CLASS_FILE="${CORPUS_DIR}/classification/cases.json"
-CLASS_TOTAL=0
-CLASS_CORRECT=0
-
-if [[ -f "$CLASS_FILE" ]]; then
-    CLASS_COUNT=$(jq length "$CLASS_FILE")
-
-    for i in $(seq 0 $((CLASS_COUNT - 1))); do
-        ID=$(jq -r ".[$i].id" "$CLASS_FILE")
-        ERROR=$(jq -r ".[$i].error" "$CLASS_FILE")
-        EXPECTED=$(jq -r ".[$i].expected_type" "$CLASS_FILE")
-
-        # Run semantic classify (extract just the type, first word)
-        if ! RESULT=$("${SEMANTIC}" classify "$ERROR" 2>&1); then
-            echo "  [$ID] ERROR: semantic classify failed" >&2
-            echo "$RESULT" >&2
-            exit 1
-        fi
-        GOT=$(echo "$RESULT" | awk '{print $1}')
-
-        CLASS_TOTAL=$((CLASS_TOTAL + 1))
-        STATUS="FAIL"
-
-        if [[ "$GOT" == "$EXPECTED" ]]; then
-            STATUS="PASS"
-            CLASS_CORRECT=$((CLASS_CORRECT + 1))
-        fi
-
-        printf "  [%s] %s | \"%s\" → %s (want %s)\n" "$ID" "$STATUS" "${ERROR:0:40}" "$GOT" "$EXPECTED"
-    done
-fi
-
-CLASS_ACCURACY=0
-if [[ $CLASS_TOTAL -gt 0 ]]; then
-    CLASS_ACCURACY=$(echo "scale=4; $CLASS_CORRECT / $CLASS_TOTAL" | bc)
-fi
-
-echo ""
-echo "  Classification: $CLASS_CORRECT / $CLASS_TOTAL = $CLASS_ACCURACY"
-
-echo ""
-echo "=============================================="
-echo "  COMPOSITE SCORE"
-echo "=============================================="
-
-# Calculate composite score with weights:
-#   Find P@1:      40%
-#   Find MRR:      20%
-#   Recovery Rate: 25%
-#   Classification: 15%
-
-COMPOSITE=$(echo "scale=4; \
-    ($FIND_P1 * 0.40) + \
-    ($FIND_MRR * 0.20) + \
-    ($RECOVERY_RATE * 0.25) + \
-    ($CLASS_ACCURACY * 0.15)" | bc)
-COMPOSITE=$(awk -v value="$COMPOSITE" 'BEGIN { printf "%.4f", value }')
-
-# Assign grade
-GRADE="F"
-if (( $(echo "$COMPOSITE >= 0.95" | bc -l) )); then GRADE="A+"
-elif (( $(echo "$COMPOSITE >= 0.90" | bc -l) )); then GRADE="A"
-elif (( $(echo "$COMPOSITE >= 0.85" | bc -l) )); then GRADE="B+"
-elif (( $(echo "$COMPOSITE >= 0.80" | bc -l) )); then GRADE="B"
-elif (( $(echo "$COMPOSITE >= 0.75" | bc -l) )); then GRADE="C+"
-elif (( $(echo "$COMPOSITE >= 0.70" | bc -l) )); then GRADE="C"
-elif (( $(echo "$COMPOSITE >= 0.60" | bc -l) )); then GRADE="D"
-fi
-
-# Update report
-TMP=$(mktemp)
-jq \
-    --argjson find_total "${FIND_TOTAL:-0}" \
-    --argjson find_mrr "${FIND_MRR:-0}" \
-    --argjson find_p1 "${FIND_P1:-0}" \
-    --argjson find_lat "${FIND_LAT:-0}" \
-    --argjson rec_total "$RECOVERY_TOTAL" \
-    --argjson rec_success "$RECOVERY_SUCCESS" \
-    --argjson rec_rate "$RECOVERY_RATE" \
-    --argjson class_total "$CLASS_TOTAL" \
-    --argjson class_correct "$CLASS_CORRECT" \
-    --argjson class_acc "$CLASS_ACCURACY" \
-    --argjson composite "$COMPOSITE" \
-    --arg grade "$GRADE" \
-    '.find = { total: $find_total, mrr: $find_mrr, p_at_1: $find_p1, latency_p50: $find_lat } |
-     .recovery = { total: $rec_total, recovered: $rec_success, rate: $rec_rate } |
-     .classification = { total: $class_total, correct: $class_correct, accuracy: $class_acc } |
-     .composite = { score: $composite, grade: $grade }' \
-    "$REPORT_FILE" > "$TMP"
-mv "$TMP" "$REPORT_FILE"
-
-# Generate summary
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-cat > "$SUMMARY_FILE" << EOF
-# Semantic Benchmark Report
-
-## Composite Score: ${COMPOSITE} (${GRADE})
-
-| Component | Weight | Score | Weighted |
-|-----------|--------|-------|----------|
-| Find P@1 | 40% | ${FIND_P1:-0} | $(echo "scale=3; ${FIND_P1:-0} * 0.40" | bc) |
-| Find MRR | 20% | ${FIND_MRR:-0} | $(echo "scale=3; ${FIND_MRR:-0} * 0.20" | bc) |
-| Recovery | 25% | ${RECOVERY_RATE} | $(echo "scale=3; ${RECOVERY_RATE} * 0.25" | bc) |
-| Classification | 15% | ${CLASS_ACCURACY} | $(echo "scale=3; ${CLASS_ACCURACY} * 0.15" | bc) |
-
-## Find Performance
-- Queries: ${FIND_TOTAL:-0}
-- MRR: ${FIND_MRR:-0}
-- P@1: ${FIND_P1:-0}
-- Latency P50: ${FIND_LAT:-0} ms
-
-## Recovery Performance
-- Scenarios: ${RECOVERY_TOTAL}
-- Recovered: ${RECOVERY_SUCCESS}
-- Rate: ${RECOVERY_RATE}
-
-## Classification Performance
-- Cases: ${CLASS_TOTAL}
-- Correct: ${CLASS_CORRECT}
-- Accuracy: ${CLASS_ACCURACY}
-
-## Grade Scale
-| Grade | Score |
-|-------|-------|
-| A+ | >= 0.95 |
-| A | >= 0.90 |
-| B+ | >= 0.85 |
-| B | >= 0.80 |
-| C+ | >= 0.75 |
-| C | >= 0.70 |
-| D | >= 0.60 |
-| F | < 0.60 |
-EOF
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "  ┌─────────────────────────────────────────┐"
-echo "  │  COMPOSITE SCORE: ${COMPOSITE}  GRADE: ${GRADE}      │"
-echo "  ├─────────────────────────────────────────┤"
-echo "  │  Find P@1:       ${FIND_P1:-0}  (40%)            │"
-echo "  │  Find MRR:       ${FIND_MRR:-0}  (20%)            │"
-echo "  │  Recovery:       ${RECOVERY_RATE}  (25%)            │"
-echo "  │  Classification: ${CLASS_ACCURACY}  (15%)            │"
-echo "  └─────────────────────────────────────────┘"
-echo ""
-echo "Report: ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/run-recovery-benchmark.sh b/tests/benchmark/scripts/run-recovery-benchmark.sh
deleted file mode 100755
index 93fc88a..0000000
--- a/tests/benchmark/scripts/run-recovery-benchmark.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-#
-# Recovery Engine Benchmark
-#
-# Exercises RecoveryEngine directly using before/after snapshots
-# and intent cache entries from recovery scenarios.
-#
-# Usage:
-#   ./run-recovery-benchmark.sh
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-mkdir -p "${RESULTS_DIR}"
-
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/recovery_benchmark_${TIMESTAMP}.txt"
-
-echo "=== Recovery Engine Benchmark ==="
-echo ""
-
-cd "${BENCHMARK_DIR}/../.."
-
-# Run the Go test that exercises RecoveryEngine with scenarios
-echo "Running recovery scenarios..."
-echo ""
-
-go test -v -run TestRecoveryBenchmark_Scenarios ./recovery/ 2>&1 | tee "$REPORT_FILE"
-
-# Also run the Go benchmark for performance
-echo ""
-echo "Running performance benchmark..."
-go test -bench=BenchmarkRecoveryEngine_Scenarios -benchmem ./recovery/ 2>&1 | tee -a "$REPORT_FILE"
-
-echo ""
-echo "================================================"
-echo "  RECOVERY BENCHMARK COMPLETE"
-echo "================================================"
-echo "Report: $REPORT_FILE"
diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh
deleted file mode 100755
index 011b1b2..0000000
--- a/tests/benchmark/scripts/tune-weights.sh
+++ /dev/null
@@ -1,167 +0,0 @@
-#!/bin/bash
-#
-# Grid-search combined matcher lexical/embedding weights against the corpus.
-#
-# Usage:
-#   ./tune-weights.sh [--corpus <dir>] [--step <n>] [--output <dir>]
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read defaults from config (used for threshold/top_k in grid runs)
-if [[ -f "$CONFIG_FILE" ]]; then
-    THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
-    TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
-else
-    THRESHOLD=0.01
-    TOP_K=5
-fi
-
-SPECIFIC_CORPUS=""
-STEP="0.1"
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
-        --step) STEP="$2"; shift 2 ;;
-        --output) RESULTS_DIR="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${RESULTS_DIR}"
-
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/tuning_weights_${TIMESTAMP}.json"
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg step "${STEP}" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            type: "weight-tuning",
-            strategy: "combined",
-            step: ($step | tonumber)
-        },
-        results: [],
-        best: null
-    }' > "${REPORT_FILE}"
-
-weights=$(awk -v step="${STEP}" 'BEGIN {
-    if (step <= 0 || step > 1) {
-        exit 1
-    }
-    for (w = 0; w <= 1.000001; w += step) {
-        printf "%.4f\n", w
-    }
-}')
-
-if [[ -z "${weights}" ]]; then
-    echo "Invalid step: ${STEP}" >&2
-    exit 1
-fi
-
-echo "Weight tuning: step=${STEP}"
-echo ""
-printf "%-10s %-10s %-8s %-8s %-8s %-8s %-8s\n" "lexical" "embedding" "MRR" "P@1" "P@3" "P50" "report"
-
-while IFS= read -r lexical_weight; do
-    embedding_weight=$(awk -v w="${lexical_weight}" 'BEGIN { printf "%.4f", 1 - w }')
-
-    args=(
-        --strategy combined
-        --lexical-weight "${lexical_weight}"
-        --embedding-weight "${embedding_weight}"
-    )
-    if [[ -n "${SPECIFIC_CORPUS}" ]]; then
-        args+=(--corpus "${SPECIFIC_CORPUS}")
-    fi
-
-    if ! output=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" "${args[@]}" 2>&1); then
-        echo "$output" >&2
-        exit 1
-    fi
-
-    corpus_report=$(echo "$output" | awk '/^Report:/ {print $2}' | tail -1)
-    if [[ -z "${corpus_report}" || ! -f "${corpus_report}" ]]; then
-        echo "Could not find corpus report for lexical=${lexical_weight}" >&2
-        echo "$output" >&2
-        exit 1
-    fi
-
-    mrr=$(jq -r '.metrics.mrr' "$corpus_report")
-    p1=$(jq -r '.metrics.p_at_1' "$corpus_report")
-    p3=$(jq -r '.metrics.p_at_3' "$corpus_report")
-    p50=$(jq -r '.metrics.latency_p50_ms' "$corpus_report")
-    total=$(jq -r '.metrics.total' "$corpus_report")
-
-    printf "%-10s %-10s %-8s %-8s %-8s %-8s %s\n" \
-        "${lexical_weight}" "${embedding_weight}" "${mrr}" "${p1}" "${p3}" "${p50}" "$(basename "$corpus_report")"
-
-    result_json=$(jq -n \
-        --argjson lexical_weight "${lexical_weight}" \
-        --argjson embedding_weight "${embedding_weight}" \
-        --argjson total "${total}" \
-        --argjson mrr "${mrr}" \
-        --argjson p1 "${p1}" \
-        --argjson p3 "${p3}" \
-        --argjson p50 "${p50}" \
-        --arg report "${corpus_report}" \
-        '{
-            lexical_weight: $lexical_weight,
-            embedding_weight: $embedding_weight,
-            total: $total,
-            mrr: $mrr,
-            p_at_1: $p1,
-            p_at_3: $p3,
-            latency_p50_ms: $p50,
-            report: $report
-        }')
-
-    tmp=$(mktemp)
-    jq --argjson result "${result_json}" '.results += [$result]' "${REPORT_FILE}" > "$tmp"
-    mv "$tmp" "${REPORT_FILE}"
-done <<< "${weights}"
-
-tmp=$(mktemp)
-jq '
-    .best = (
-        .results
-        | sort_by(.p_at_1, .mrr, .p_at_3, -(.latency_p50_ms))
-        | last
-    )
-' "${REPORT_FILE}" > "$tmp"
-mv "$tmp" "${REPORT_FILE}"
-
-cat > "${SUMMARY_FILE}" << EOF
-# Combined Weight Tuning
-
-## Best
-
-| Field | Value |
-|-------|-------|
-| Lexical Weight | $(jq -r '.best.lexical_weight' "$REPORT_FILE") |
-| Embedding Weight | $(jq -r '.best.embedding_weight' "$REPORT_FILE") |
-| MRR | $(jq -r '.best.mrr' "$REPORT_FILE") |
-| P@1 | $(jq -r '.best.p_at_1' "$REPORT_FILE") |
-| P@3 | $(jq -r '.best.p_at_3' "$REPORT_FILE") |
-| Latency P50 | $(jq -r '.best.latency_p50_ms' "$REPORT_FILE") ms |
-
-## All Runs
-
-| Lexical | Embedding | MRR | P@1 | P@3 | P50 |
-|---------|-----------|-----|-----|-----|-----|
-$(jq -r '.results | sort_by(-.p_at_1, -.mrr, -.p_at_3, .latency_p50_ms)[] | "| \(.lexical_weight) | \(.embedding_weight) | \(.mrr) | \(.p_at_1) | \(.p_at_3) | \(.latency_p50_ms) ms |"' "$REPORT_FILE")
-EOF
-
-echo ""
-echo "Best weights:"
-jq '.best' "${REPORT_FILE}"
-echo ""
-echo "Report:  ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/update-baseline.sh b/tests/benchmark/scripts/update-baseline.sh
deleted file mode 100755
index ba93089..0000000
--- a/tests/benchmark/scripts/update-baseline.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-#
-# Update baseline after reviewing regressions.
-#
-# Usage:
-#   ./update-baseline.sh --accept [--baseline <file>]
-#
-# This re-runs the benchmark and overwrites the baseline file.
-# Use after reviewing check-baseline.sh output and confirming
-# the changes are intentional.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-BASELINES_DIR="${BENCHMARK_DIR}/baselines"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-
-# Parse args
-BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json"
-ACCEPT=false
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --accept) ACCEPT=true; shift ;;
-        --baseline) BASELINE_FILE="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-if [[ "$ACCEPT" != "true" ]]; then
-    echo "Usage: $0 --accept [--baseline <file>]"
-    echo ""
-    echo "This will overwrite the baseline. Run check-baseline.sh first"
-    echo "to review changes before accepting."
-    exit 1
-fi
-
-if [[ ! -f "$BASELINE_FILE" ]]; then
-    echo "Baseline not found: $BASELINE_FILE"
-    echo "Creating new baseline instead..."
-    exec "${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")"
-fi
-
-# Show what will change
-echo "Current baseline: ${BASELINE_FILE}"
-echo ""
-jq -r '"  MRR:   \(.metrics.mrr)\n  P@1:   \(.metrics.p_at_1)\n  Hit@3: \(.metrics.hit_at_3)"' "$BASELINE_FILE"
-echo ""
-echo "Running benchmark to generate new baseline..."
-echo ""
-
-# Backup old baseline
-BACKUP_FILE="${BASELINE_FILE%.json}_$(date +%Y%m%d_%H%M%S).backup.json"
-cp "$BASELINE_FILE" "$BACKUP_FILE"
-echo "Backed up old baseline to: $BACKUP_FILE"
-
-# Create new baseline (overwrites)
-"${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")"
-
-echo ""
-echo "Baseline updated. Old baseline backed up to:"
-echo "  $BACKUP_FILE"

From dfb7b022e358904d3bffadab3822ed3ebf3b472d Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 17:35:49 +0100
Subject: [PATCH 24/30] feat: move runtime baseline check to Go CLI

Add `semantic-bench runtime` command to check Go benchmark
performance against baseline. Remove last bash script and
the scripts/ directory.
---
 cmd/semantic-bench/main.go                    |  16 ++
 dev                                           |   2 +-
 internal/benchmark/commands.go                | 209 ++++++++++++++++++
 internal/benchmark/config.go                  |  14 ++
 .../scripts/check-runtime-baseline.sh         | 137 ------------
 5 files changed, 240 insertions(+), 138 deletions(-)
 delete mode 100755 tests/benchmark/scripts/check-runtime-baseline.sh

diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go
index 4866601..076d71a 100644
--- a/cmd/semantic-bench/main.go
+++ b/cmd/semantic-bench/main.go
@@ -21,6 +21,7 @@ Commands:
   baseline    Manage quality baselines (create, update)
   calibrate   Find optimal thresholds via precision/recall analysis
   tune        Grid-search lexical/embedding weights
+  runtime     Check Go benchmark performance against baseline
 
 Flags:
   -h, --help    Show help
@@ -54,6 +55,8 @@ func main() {
 		runCalibrate(args)
 	case "tune":
 		runTune(args)
+	case "runtime":
+		runRuntime(args)
 	case "-h", "--help", "help":
 		fmt.Print(usage)
 	default:
@@ -150,3 +153,16 @@ func runTune(args []string) {
 	}
 	benchmark.PrintTuneResult(result, cfg)
 }
+
+func runRuntime(args []string) {
+	cfg := benchmark.ParseRuntimeFlags(args)
+	result, err := benchmark.RunRuntime(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2) // the check itself could not run
+	}
+	benchmark.PrintRuntimeResult(result, cfg)
+	if result.Status == "fail" && cfg.FailOnRegression {
+		os.Exit(1) // regressions found and -fail-on-regression was set
+	}
+}
diff --git a/dev b/dev
index da0f70c..987e04c 100755
--- a/dev
+++ b/dev
@@ -197,7 +197,7 @@ run_calibrate() {
 
 run_runtime() {
   echo "  ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}"
-  bash tests/benchmark/scripts/check-runtime-baseline.sh "$@"
+  go run ./cmd/semantic-bench runtime "$@"
 }
 
 run_tune() {
diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go
index 7f37ed5..f537934 100644
--- a/internal/benchmark/commands.go
+++ b/internal/benchmark/commands.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"sort"
 	"strings"
@@ -871,3 +872,211 @@ func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
 	}
 	fmt.Println()
 }
+
+// Runtime baseline
+
+type RuntimeResult struct {
+	Status       string             `json:"status"`        // "pass", or "fail" when Regressions > 0
+	Benchmarks   []RuntimeBenchmark `json:"benchmarks"`    // current measurements with comparison fields filled in
+	Regressions  int                `json:"regressions"`   // benchmarks whose ns/op ratio exceeded the limit
+	BaselinePath string             `json:"baseline_path"` // baseline file compared against, or just created
+	Created      bool               `json:"created"`       // true when no baseline existed and this run wrote one
+}
+
+type RuntimeBenchmark struct {
+	Name       string  `json:"name"`                  // benchmark name parsed from the go test output
+	NsOp       float64 `json:"ns_op"`                 // measured ns/op
+	BytesOp    int     `json:"bytes_op"`              // measured B/op
+	AllocsOp   int     `json:"allocs_op"`             // measured allocs/op
+	BaselineNs float64 `json:"baseline_ns,omitempty"` // baseline ns/op; set only when a baseline entry exists
+	Ratio      float64 `json:"ratio,omitempty"`       // NsOp / BaselineNs; set only when a baseline entry exists
+	Status     string  `json:"status"`                // "ok", "warning", "regression" or "new"; empty before comparison
+}
+
+type runtimeBaseline struct {
+	Timestamp  string             `json:"timestamp"`  // UTC save time in RFC 3339 format
+	Benchmarks []RuntimeBenchmark `json:"benchmarks"` // measurements recorded when the baseline was saved
+}
+
+// RunRuntime runs the engine benchmarks and compares their ns/op against the
+// saved runtime baseline. When no baseline file exists, it writes one from
+// the current run and returns with Created set instead of comparing.
+func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
+	baselinePath := filepath.Join(FindBenchmarkRoot(), "baselines", "runtime.json")
+	benchmarks, err := runGoBenchmarks()
+	if err != nil {
+		return nil, err
+	}
+
+	result := &RuntimeResult{
+		Status:       "pass",
+		Benchmarks:   benchmarks,
+		BaselinePath: baselinePath,
+	}
+
+	if _, err := os.Stat(baselinePath); os.IsNotExist(err) {
+		if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil {
+			return nil, err
+		}
+		result.Created = true
+		return result, nil
+	}
+
+	baseline, err := loadRuntimeBaseline(baselinePath)
+	if err != nil {
+		return nil, err
+	}
+	baselineNs := make(map[string]float64, len(baseline.Benchmarks))
+	for _, b := range baseline.Benchmarks {
+		baselineNs[b.Name] = b.NsOp
+	}
+
+	const maxRatio, warnRatio = 1.25, 1.1
+	for i := range result.Benchmarks {
+		b := &result.Benchmarks[i]
+		base := baselineNs[b.Name]
+		if base <= 0 {
+			// No usable baseline (missing or zero ns/op): report as new rather than divide by zero.
+			b.Status = "new"
+			continue
+		}
+		b.BaselineNs, b.Ratio = base, b.NsOp/base
+		switch {
+		case b.Ratio > maxRatio:
+			b.Status = "regression"
+			result.Regressions++
+		case b.Ratio > warnRatio:
+			b.Status = "warning"
+		default:
+			b.Status = "ok"
+		}
+	}
+
+	if result.Regressions > 0 {
+		result.Status = "fail"
+	}
+	return result, nil
+}
+
+// runGoBenchmarks runs `go test -bench=. -benchmem` over ./internal/engine/...
+// and returns the benchmark lines parsed from the combined output.
+func runGoBenchmarks() ([]RuntimeBenchmark, error) {
+	cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...")
+	// Run from the project root, two directories above the benchmark root.
+	cmd.Dir = filepath.Join(FindBenchmarkRoot(), "..", "..")
+
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("go test failed: %w\n%s", err, out)
+	}
+	return parseBenchOutput(string(out)), nil
+}
+
+// parseBenchOutput extracts one RuntimeBenchmark per "Benchmark..." line of
+// `go test -bench -benchmem` output.
+//
+// The trailing "-N" GOMAXPROCS suffix is removed from each name, whatever N
+// is, so a baseline recorded on one machine still matches results from a
+// machine with a different CPU count. Lines without a positive ns/op value
+// are skipped.
+func parseBenchOutput(output string) []RuntimeBenchmark {
+	var results []RuntimeBenchmark
+
+	for _, line := range strings.Split(output, "\n") {
+		if !strings.HasPrefix(line, "Benchmark") {
+			continue
+		}
+		fields := strings.Fields(line)
+		if len(fields) < 3 {
+			continue
+		}
+
+		// Strip "-<digits>" only: "BenchmarkFoo-4" becomes "BenchmarkFoo".
+		name := fields[0]
+		if base := strings.TrimRight(name, "0123456789"); base != name && strings.HasSuffix(base, "-") {
+			name = strings.TrimSuffix(base, "-")
+		}
+
+		// Metrics are printed as "<value> <unit>" pairs, so each value is the
+		// field just before its unit. Scan errors are ignored; an unparsed value stays zero.
+		bench := RuntimeBenchmark{Name: name}
+		for i := 1; i < len(fields); i++ {
+			switch fields[i] {
+			case "ns/op":
+				fmt.Sscanf(fields[i-1], "%f", &bench.NsOp)
+			case "B/op":
+				fmt.Sscanf(fields[i-1], "%d", &bench.BytesOp)
+			case "allocs/op":
+				fmt.Sscanf(fields[i-1], "%d", &bench.AllocsOp)
+			}
+		}
+
+		if bench.NsOp > 0 {
+			results = append(results, bench)
+		}
+	}
+
+	return results
+}
+
+// saveRuntimeBaseline writes benchmarks to path as timestamped JSON, creating the directory if missing.
+func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error {
+	baseline := runtimeBaseline{Timestamp: time.Now().UTC().Format(time.RFC3339), Benchmarks: benchmarks}
+	data, err := json.MarshalIndent(baseline, "", "  ")
+	if err != nil {
+		return err
+	}
+	if err = os.MkdirAll(filepath.Dir(path), 0755); err == nil {
+		err = os.WriteFile(path, data, 0644)
+	}
+	return err
+}
+
+// loadRuntimeBaseline reads and decodes the JSON runtime baseline stored at path.
+func loadRuntimeBaseline(path string) (*runtimeBaseline, error) {
+	baseline := new(runtimeBaseline)
+	raw, err := os.ReadFile(path)
+	if err == nil {
+		err = json.Unmarshal(raw, baseline)
+	}
+	if err != nil {
+		return nil, err
+	}
+	return baseline, nil
+}
+
+// PrintRuntimeResult prints the runtime check to stdout: the baseline-created
+// notice, or one colored line per benchmark followed by a regression summary.
+func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) {
+	if result.Created {
+		fmt.Printf("\n  Created runtime baseline: %s\n", result.BaselinePath)
+		fmt.Printf("  Benchmarks: %d\n\n", len(result.Benchmarks))
+		return
+	}
+
+	fmt.Printf("\n  Runtime Baseline Check\n\n")
+	for _, b := range result.Benchmarks {
+		color, label := "", ""
+		switch b.Status {
+		case "regression":
+			color, label = "\033[31m", "REGRESSION"
+		case "warning":
+			color, label = "\033[33m", "WARNING"
+		case "ok":
+			color, label = "\033[32m", "OK"
+		case "new":
+			color, label = "\033[33m", "NEW"
+		}
+		// Pad the bare label first; ANSI escape bytes would otherwise use up the %-10s width.
+		status := fmt.Sprintf("%s%-10s\033[0m", color, label)
+		if b.BaselineNs > 0 {
+			fmt.Printf("  %s %s: %.0f -> %.0f ns/op (%.2fx)\n", status, b.Name, b.BaselineNs, b.NsOp, b.Ratio)
+		} else {
+			fmt.Printf("  %s %s: %.0f ns/op\n", status, b.Name, b.NsOp)
+		}
+	}
+
+	if result.Regressions > 0 {
+		fmt.Printf("\n  \033[31mRegressions: %d\033[0m\n\n", result.Regressions)
+	} else {
+		fmt.Printf("\n  \033[32mNo regressions\033[0m\n\n")
+	}
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index eb2fe57..83e3f5c 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -118,6 +118,11 @@ type TuneConfig struct {
 	Verbose bool
 }
 
+type RuntimeConfig struct {
+	FailOnRegression bool // set by -fail-on-regression; the CLI then exits 1 when the check fails
+	Verbose          bool // set by -verbose
+}
+
 func FindBenchmarkRoot() string {
 	cwd, _ := os.Getwd()
 	for d := cwd; d != "/"; d = filepath.Dir(d) {
@@ -304,3 +309,12 @@ func ParseTuneFlags(args []string) TuneConfig {
 	fs.Parse(args)
 	return cfg
 }
+
+func ParseRuntimeFlags(args []string) RuntimeConfig {
+	fs := flag.NewFlagSet("runtime", flag.ExitOnError)
+	cfg := RuntimeConfig{}
+	fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args) // error ignored: with flag.ExitOnError a bad flag exits the process inside Parse
+	return cfg
+}
diff --git a/tests/benchmark/scripts/check-runtime-baseline.sh b/tests/benchmark/scripts/check-runtime-baseline.sh
deleted file mode 100755
index 75bc4fc..0000000
--- a/tests/benchmark/scripts/check-runtime-baseline.sh
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/bin/bash
-#
-# Check Go benchmark results against runtime baseline.
-#
-# Usage:
-#   ./check-runtime-baseline.sh [--fail-on-regression]
-#
-# Runs Go benchmarks and compares against saved baseline.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-BASELINES_DIR="${BENCHMARK_DIR}/baselines"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-PROJECT_ROOT="${BENCHMARK_DIR}/../.."
-
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-NC='\033[0m'
-
-# Read tolerances from config
-if [[ -f "$CONFIG_FILE" ]]; then
-    MAX_NS_RATIO=$(jq -r '.baseline.runtime.max_ns_op_regression_ratio // 1.25' "$CONFIG_FILE")
-    MAX_ALLOC_RATIO=$(jq -r '.baseline.runtime.max_alloc_regression_ratio // 1.25' "$CONFIG_FILE")
-else
-    MAX_NS_RATIO=1.25
-    MAX_ALLOC_RATIO=1.25
-fi
-
-# Parse args
-FAIL_ON_REGRESSION=false
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${RESULTS_DIR}"
-mkdir -p "${BASELINES_DIR}"
-
-BASELINE_FILE="${BASELINES_DIR}/runtime.json"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/runtime_${TIMESTAMP}.json"
-
-echo "Running Go benchmarks..."
-echo ""
-
-# Run benchmarks
-BENCH_OUTPUT=$(mktemp)
-(cd "$PROJECT_ROOT" && go test -bench=. -benchmem ./internal/engine/... 2>&1) | tee "$BENCH_OUTPUT"
-
-# Parse benchmark output into JSON
-echo ""
-echo "Parsing results..."
-
-jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" '{timestamp: $ts, benchmarks: []}' > "$REPORT_FILE"
-
-while IFS= read -r line; do
-    if [[ "$line" =~ ^Benchmark ]]; then
-        # Parse: BenchmarkName-N  iterations  ns/op  bytes/op  allocs/op
-        name=$(echo "$line" | awk '{print $1}' | sed 's/-[0-9]*$//')
-        ns_op=$(echo "$line" | grep -oE '[0-9.]+ ns/op' | awk '{print $1}' || echo "0")
-        bytes_op=$(echo "$line" | grep -oE '[0-9]+ B/op' | awk '{print $1}' || echo "0")
-        allocs_op=$(echo "$line" | grep -oE '[0-9]+ allocs/op' | awk '{print $1}' || echo "0")
-
-        if [[ -n "$ns_op" ]] && [[ "$ns_op" != "0" ]]; then
-            tmp=$(mktemp)
-            jq --arg name "$name" \
-               --argjson ns "$ns_op" \
-               --argjson bytes "${bytes_op:-0}" \
-               --argjson allocs "${allocs_op:-0}" \
-               '.benchmarks += [{name: $name, ns_op: $ns, bytes_op: $bytes, allocs_op: $allocs}]' \
-               "$REPORT_FILE" > "$tmp"
-            mv "$tmp" "$REPORT_FILE"
-        fi
-    fi
-done < "$BENCH_OUTPUT"
-
-rm -f "$BENCH_OUTPUT"
-
-# If no baseline exists, create one
-if [[ ! -f "$BASELINE_FILE" ]]; then
-    echo ""
-    echo "No runtime baseline found. Creating initial baseline..."
-    cp "$REPORT_FILE" "$BASELINE_FILE"
-    echo "Baseline saved to: $BASELINE_FILE"
-    exit 0
-fi
-
-# Compare against baseline
-echo ""
-echo "=== Comparing against baseline ==="
-echo ""
-
-REGRESSIONS=0
-
-for name in $(jq -r '.benchmarks[].name' "$REPORT_FILE"); do
-    baseline_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$BASELINE_FILE")
-    current_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$REPORT_FILE")
-
-    baseline_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$BASELINE_FILE")
-    current_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$REPORT_FILE")
-
-    if [[ "$baseline_ns" == "0" ]] || [[ "$baseline_ns" == "null" ]]; then
-        echo -e "${YELLOW}NEW${NC} $name: ${current_ns} ns/op"
-        continue
-    fi
-
-    ratio=$(echo "scale=4; $current_ns / $baseline_ns" | bc)
-
-    if (( $(echo "$ratio > $MAX_NS_RATIO" | bc -l) )); then
-        echo -e "${RED}REGRESSION${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x, max: ${MAX_NS_RATIO}x)"
-        REGRESSIONS=$((REGRESSIONS + 1))
-    elif (( $(echo "$ratio > 1.1" | bc -l) )); then
-        echo -e "${YELLOW}WARNING${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)"
-    else
-        echo -e "${GREEN}OK${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)"
-    fi
-done
-
-echo ""
-echo "================================================"
-if [[ $REGRESSIONS -gt 0 ]]; then
-    echo -e "${RED}RUNTIME REGRESSIONS: $REGRESSIONS${NC}"
-    if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
-        exit 1
-    fi
-else
-    echo -e "${GREEN}NO RUNTIME REGRESSIONS${NC}"
-fi
-echo "================================================"
-echo ""
-echo "Report: ${REPORT_FILE}"

From 513d2813a1a7cea3cb274c5adeaf2ed0b17edb13 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 17:39:48 +0100
Subject: [PATCH 25/30] chore: ignore generated baseline files

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9a58d8e..419dfaa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,4 +23,4 @@ cover.out
 tests/e2e/results/*.txt
 tests/benchmark/results/*.json
 tests/benchmark/results/*.md
-tests/benchmark/baselines/*.backup.json
\ No newline at end of file
+tests/benchmark/baselines/*.json
\ No newline at end of file

From f7a1c8f9f8e236f776dad808e3a056482fe52475 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 17:59:19 +0100
Subject: [PATCH 26/30] chore: simplify dev tool and update SKILL.md

- Remove redundant ./dev loop (same as ./dev bench)
- Add cmd/semantic-bench to architecture docs
- Simplify benchmark improvement loop section
---
 README.md                    |  2 +-
 dev                          |  7 ----
 scripts/check-docs-links.sh  | 62 ++++++++++++++++++++++++++++++++++++
 skills/semantic-dev/SKILL.md | 54 +++++++------------------------
 4 files changed, 74 insertions(+), 51 deletions(-)
 create mode 100755 scripts/check-docs-links.sh

diff --git a/README.md b/README.md
index 57e3053..83fb48e 100644
--- a/README.md
+++ b/README.md
@@ -204,7 +204,7 @@ The library uses only the Go standard library. No external dependencies, no mode
 
 ## Design Trade-offs
 
-See [docs/DESIGN.md](docs/DESIGN.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.
+See [docs/architecture/design-decisions.md](docs/architecture/design-decisions.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.
 
 ## Origin
 
diff --git a/dev b/dev
index 987e04c..5d8c88d 100755
--- a/dev
+++ b/dev
@@ -33,7 +33,6 @@ commands=(
   "runtime:⏱️:Check runtime baseline"
   "tune:🎛️:Tune combined weights"
   "e2e:🐳:Run E2E tests (Docker)"
-  "loop:🔄:Benchmark loop (bench → compare → report)"
 )
 
 show_help() {
@@ -214,11 +213,6 @@ run_e2e() {
   bash scripts/e2e.sh
 }
 
-run_loop() {
-  echo "  ${ACCENT}${BOLD}🔄 Benchmark Loop${NC}"
-  go run ./cmd/semantic-bench check -verbose "$@"
-}
-
 case "${1:-help}" in
   pr)        run_pr ;;
   doctor)    exec bash scripts/doctor.sh ;;
@@ -258,6 +252,5 @@ case "${1:-help}" in
   runtime)   shift; run_runtime "$@" ;;
   tune)      shift; run_tune "$@" ;;
   e2e)       run_e2e ;;
-  loop)      run_loop ;;
   help|*)    show_help ;;
 esac
diff --git a/scripts/check-docs-links.sh b/scripts/check-docs-links.sh
new file mode 100755
index 0000000..90a8738
--- /dev/null
+++ b/scripts/check-docs-links.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Check for broken documentation links
+#
+# Usage:
+#   ./scripts/check-docs-links.sh
+#
+set -uo pipefail
+
+cd "$(dirname "$0")/.."
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NC='\033[0m'
+
+ERRORS=0
+
+echo "Checking documentation links..."
+echo ""
+
+# Find all markdown files and check links
+while IFS= read -r file; do
+    dir=$(dirname "$file")
+
+    # Extract markdown links: [text](path)
+    while IFS= read -r link; do
+        # Skip URLs and anchors
+        if [[ "$link" =~ ^https?:// ]] || [[ "$link" =~ ^mailto: ]] || [[ "$link" =~ ^# ]]; then
+            continue
+        fi
+
+        # Remove anchor from link
+        link_path="${link%%#*}"
+
+        # Skip empty paths
+        if [[ -z "$link_path" ]]; then
+            continue
+        fi
+
+        # Resolve the link target. A leading "/" means "relative to the
+        # repository root", which is the working directory after the cd
+        # above; anything else is relative to the linking file's directory.
+        if [[ "$link_path" =~ ^/ ]]; then
+            target=".$link_path"
+        else
+            target="$dir/$link_path"
+        fi
+
+        # Check if target exists
+        if [[ ! -e "$target" ]]; then
+            echo -e "${RED}BROKEN:${NC} $file -> $link"
+            ERRORS=$((ERRORS + 1))
+        fi
+    done < <(grep -oE '\]\([^)]+\)' "$file" 2>/dev/null | sed 's/\](//' | sed 's/)//')
+done < <(find . -name "*.md" -not -path "./.git/*" -not -path "./node_modules/*")
+
+echo ""
+if [[ $ERRORS -eq 0 ]]; then
+    echo -e "${GREEN}✓${NC} All documentation links valid"
+    exit 0
+else
+    echo -e "${RED}Found $ERRORS broken link(s)${NC}"
+    exit 1
+fi
diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index 7cbb684..2bea9dd 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -65,6 +65,7 @@ recovery/                  Public subpackage
   failure.go                 FailureType classification
 
 cmd/semantic/main.go       CLI tool (find, match, classify)
+cmd/semantic-bench/        Benchmark CLI (check, baseline, calibrate, tune, runtime)
 ```
 
 ## Key Design Decisions
@@ -92,57 +93,24 @@ cmd/semantic/main.go       CLI tool (find, match, classify)
 
 ## Benchmark Improvement Loop
 
-When implementing changes that affect matching quality, follow this loop:
-
-### Step 1: Ensure baseline exists
-
-```bash
-./dev baseline
-```
-
-Creates `tests/benchmark/baselines/combined.json` if missing.
-
-### Step 2: Implement change
-
-Make one focused improvement at a time.
-
-### Step 3: Run benchmark loop
+When implementing changes that affect matching quality:
 
 ```bash
-./dev loop
+./dev baseline          # create baseline (first time only)
+# ... make changes ...
+./dev bench             # run benchmark, compare to baseline
+./dev baseline update   # accept new baseline (if improved)
 ```
 
-Shows comparison table with deltas:
-- **Green (+)** = improved
-- **Red (-)** = regressed  
-- **Gray** = unchanged
-
-### Step 4: Evaluate and decide
-
-| Result | Action |
-|--------|--------|
-| All metrics improved/unchanged | `./dev baseline update` |
-| Mixed (some up, some down) | Investigate tradeoff |
-| Key metrics regressed | Fix before merging |
-
-### Step 5: Iterate
-
-Repeat steps 2-4. Each `baseline update` sets new goalpost.
-
-### Key metrics
-
+**Key metrics:**
 - **MRR** — Mean Reciprocal Rank (higher = finds correct element faster)
 - **P@1** — Precision at 1 (is top result correct?)
 - **Hit@3** — Any correct result in top 3?
-- **Margin** — Score gap between best correct and best wrong
-
-### Adding test cases
-
-When a query should work better:
 
-1. Add to `tests/benchmark/corpus/*/queries.json` or `cases/*.json`
-2. Run `./dev lint corpus`
-3. Run `./dev loop` — benchmark will show regression until fixed
+**Adding test cases:**
+1. Add to `tests/benchmark/corpus/*/queries.json`
+2. Run `./dev lint corpus` to validate
+3. Run `./dev bench` — shows regression until fixed
 
 ## Public API Surface
 

From 26ddb72003a29a6c91786c14595b34b449e16c14 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 18:04:03 +0100
Subject: [PATCH 27/30] refactor: split benchmark commands.go into separate
 files

- types.go: shared result types
- check.go: RunCheck, PrintCheckResult
- compare.go: RunCompare, PrintCompareResult
- lint.go: RunLint, PrintLintResult
- catalog.go: RunCatalog, PrintCatalogResult
- baseline.go: baseline management
- calibrate.go: threshold calibration
- tune.go: weight tuning
- runtime.go: Go benchmark performance
---
 internal/benchmark/baseline.go  |  110 ++++
 internal/benchmark/calibrate.go |  175 +++++
 internal/benchmark/catalog.go   |   75 +++
 internal/benchmark/check.go     |  237 +++++++
 internal/benchmark/commands.go  | 1082 -------------------------------
 internal/benchmark/compare.go   |   78 +++
 internal/benchmark/lint.go      |   68 ++
 internal/benchmark/runtime.go   |  217 +++++++
 internal/benchmark/tune.go      |   90 +++
 internal/benchmark/types.go     |   67 ++
 10 files changed, 1117 insertions(+), 1082 deletions(-)
 create mode 100644 internal/benchmark/baseline.go
 create mode 100644 internal/benchmark/calibrate.go
 create mode 100644 internal/benchmark/catalog.go
 create mode 100644 internal/benchmark/check.go
 delete mode 100644 internal/benchmark/commands.go
 create mode 100644 internal/benchmark/compare.go
 create mode 100644 internal/benchmark/lint.go
 create mode 100644 internal/benchmark/runtime.go
 create mode 100644 internal/benchmark/tune.go
 create mode 100644 internal/benchmark/types.go

diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go
new file mode 100644
index 0000000..de2a371
--- /dev/null
+++ b/internal/benchmark/baseline.go
@@ -0,0 +1,129 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// BaselineResult reports what a baseline command did: the action performed,
+// the baseline file path, the freshly measured overall metrics and, for
+// updates, the metrics of the baseline that was replaced.
+type BaselineResult struct {
+	Action   string          `json:"action"`
+	Path     string          `json:"path"`
+	Metrics  OverallMetrics  `json:"metrics"`
+	Previous *OverallMetrics `json:"previous,omitempty"`
+}
+
+// RunBaseline creates or updates the baseline file baselines/<cfg.Name>.json
+// under the benchmark root, creating the baselines directory if needed.
+// cfg.Action must be "create" or "update"; an update requires cfg.Accept.
+func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) {
+	root := FindBenchmarkRoot()
+	baselinesDir := filepath.Join(root, "baselines")
+	if err := os.MkdirAll(baselinesDir, 0755); err != nil {
+		return nil, err
+	}
+
+	baselinePath := filepath.Join(baselinesDir, cfg.Name+".json")
+
+	switch cfg.Action {
+	case "create":
+		return createBaseline(root, baselinePath, cfg)
+	case "update":
+		if !cfg.Accept {
+			return nil, fmt.Errorf("use --accept to confirm baseline update")
+		}
+		return updateBaseline(root, baselinePath, cfg)
+	default:
+		return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action)
+	}
+}
+
+// createBaseline runs the corpus benchmark with the fixed "combined"
+// configuration below and writes the resulting report, as indented JSON, to
+// baselinePath, overwriting any existing file.
+func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Mode:            "library",
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	data, err := json.MarshalIndent(report, "", "  ")
+	if err != nil {
+		return nil, err
+	}
+	if err := os.WriteFile(baselinePath, data, 0644); err != nil {
+		return nil, err
+	}
+
+	return &BaselineResult{
+		Action:  "create",
+		Path:    baselinePath,
+		Metrics: report.Metrics.Overall,
+	}, nil
+}
+
+// updateBaseline replaces the baseline at baselinePath with a fresh run. If a
+// baseline already exists it is first copied to a timestamped
+// "<name>_<YYYYMMDD_HHMMSS>.backup.json" file next to it, and its overall
+// metrics (when the file parses) are returned in Previous.
+func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	var previous *OverallMetrics
+	if data, err := os.ReadFile(baselinePath); err == nil {
+		var old Report
+		if json.Unmarshal(data, &old) == nil {
+			previous = &old.Metrics.Overall
+		}
+		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
+		// Refuse to overwrite the baseline when the backup cannot be written;
+		// otherwise the previous baseline would be lost.
+		if err := os.WriteFile(backupPath, data, 0644); err != nil {
+			return nil, fmt.Errorf("back up baseline: %w", err)
+		}
+	}
+
+	result, err := createBaseline(root, baselinePath, cfg)
+	if err != nil {
+		return nil, err
+	}
+	result.Action = "update"
+	result.Previous = previous
+	return result, nil
+}
+
+// PrintBaselineResult prints the new baseline metrics and, when available,
+// the metrics of the baseline it replaced.
+func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) {
+	fmt.Printf("\n  Baseline %sd: %s\n\n", result.Action, result.Path)
+	fmt.Printf("  MRR:    %.4f\n", result.Metrics.MRR)
+	fmt.Printf("  P@1:    %.4f\n", result.Metrics.PAt1)
+	fmt.Printf("  Hit@3:  %.4f\n", result.Metrics.HitAt3)
+
+	if result.Previous != nil {
+		fmt.Printf("\n  Previous:\n")
+		fmt.Printf("    MRR:    %.4f\n", result.Previous.MRR)
+		fmt.Printf("    P@1:    %.4f\n", result.Previous.PAt1)
+		fmt.Printf("    Hit@3:  %.4f\n", result.Previous.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go
new file mode 100644
index 0000000..9c9fa33
--- /dev/null
+++ b/internal/benchmark/calibrate.go
@@ -0,0 +1,200 @@
+package benchmark
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/pinchtab/semantic"
+)
+
+// CalibrateResult holds, for every tested threshold (keyed by the threshold
+// formatted with two decimals), the confusion counts and derived rates,
+// together with the recommended thresholds and the number of cases tested.
+type CalibrateResult struct {
+	ByThreshold     map[string]ThresholdMetrics `json:"by_threshold"`
+	Recommendations CalibrateRecommendations    `json:"recommendations"`
+	TotalCases      int                         `json:"total_cases"`
+}
+
+// ThresholdMetrics is the confusion matrix measured at one threshold and the
+// rates derived from it.
+type ThresholdMetrics struct {
+	TP        int     `json:"tp"`
+	FP        int     `json:"fp"`
+	FN        int     `json:"fn"`
+	TN        int     `json:"tn"`
+	Recall    float64 `json:"recall"`
+	Precision float64 `json:"precision"`
+	FPR       float64 `json:"false_positive_rate"`
+	F1        float64 `json:"f1"`
+}
+
+// CalibrateRecommendations carries the thresholds picked by RunCalibrate: the
+// one with the best F1 (DefaultThreshold) and the one with the best precision
+// among thresholds reaching a recall of at least 0.85 (RecoveryThreshold).
+type CalibrateRecommendations struct {
+	DefaultThreshold  float64 `json:"default_threshold"`
+	RecoveryThreshold float64 `json:"recovery_threshold"`
+	BestF1            float64 `json:"best_f1"`
+}
+
+// RunCalibrate evaluates each threshold in cfg.Thresholds against the queries
+// of the loaded corpora (only the corpus named by cfg.Corpus when set) using
+// the combined matcher, and recommends a default and a recovery threshold.
+//
+// Only the top match of each query is scored. For a query that expects no
+// match, any match is a false positive and no match a true negative. For a
+// query with relevant refs, a relevant top match is a true positive, another
+// top match a false positive and no match a false negative. Queries with
+// neither are not counted.
+func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &CalibrateResult{
+		ByThreshold: make(map[string]ThresholdMetrics),
+	}
+
+	type testCase struct {
+		query  Query
+		corpus *Corpus
+	}
+
+	var cases []testCase
+	for i := range ds.Corpora {
+		corpus := &ds.Corpora[i]
+		if cfg.Corpus != "" && corpus.ID != cfg.Corpus {
+			continue
+		}
+		for _, q := range corpus.Queries {
+			cases = append(cases, testCase{query: q, corpus: corpus})
+		}
+	}
+	result.TotalCases = len(cases)
+
+	if cfg.Verbose {
+		fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases))
+	}
+
+	runCfg := RunConfig{
+		Strategy:        "combined",
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+	}
+	matcher := createMatcher(runCfg)
+	ctx := context.Background()
+
+	var bestF1, bestF1Threshold float64
+	var bestRecallThreshold float64
+	var bestRecallWithPrecision float64
+
+	for _, threshold := range cfg.Thresholds {
+		tp, fp, fn, tn := 0, 0, 0, 0
+
+		for _, tc := range cases {
+			findResult, err := matcher.Find(ctx, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
+				Threshold: threshold,
+				TopK:      5,
+			})
+
+			// A failed lookup is scored as "no match"; findResult is not
+			// inspected in that case.
+			hasMatch := err == nil && len(findResult.Matches) > 0
+			topRef := ""
+			if hasMatch {
+				topRef = findResult.Matches[0].Ref
+			}
+
+			if tc.query.ExpectNoMatch {
+				if hasMatch {
+					fp++
+				} else {
+					tn++
+				}
+			} else if len(tc.query.RelevantRefs) > 0 {
+				if !hasMatch {
+					fn++
+				} else if contains(tc.query.RelevantRefs, topRef) {
+					tp++
+				} else {
+					fp++
+				}
+			}
+		}
+
+		totalPos := tp + fn
+		totalNeg := tn + fp
+
+		var recall, precision, fpr, f1 float64
+		if totalPos > 0 {
+			recall = float64(tp) / float64(totalPos)
+		}
+		if tp+fp > 0 {
+			precision = float64(tp) / float64(tp+fp)
+		}
+		if totalNeg > 0 {
+			fpr = float64(fp) / float64(totalNeg)
+		}
+		if precision+recall > 0 {
+			f1 = 2 * precision * recall / (precision + recall)
+		}
+
+		key := fmt.Sprintf("%.2f", threshold)
+		result.ByThreshold[key] = ThresholdMetrics{
+			TP: tp, FP: fp, FN: fn, TN: tn,
+			Recall: recall, Precision: precision, FPR: fpr, F1: f1,
+		}
+
+		if f1 > bestF1 {
+			bestF1 = f1
+			bestF1Threshold = threshold
+		}
+		if recall >= 0.85 && precision > bestRecallWithPrecision {
+			bestRecallWithPrecision = precision
+			bestRecallThreshold = threshold
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n",
+				threshold, tp, fp, fn, tn, recall, precision, f1)
+		}
+	}
+
+	// Still zero, i.e. nothing was selected above: use the first threshold.
+	if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 {
+		bestRecallThreshold = cfg.Thresholds[0]
+	}
+
+	result.Recommendations = CalibrateRecommendations{
+		DefaultThreshold:  bestF1Threshold,
+		RecoveryThreshold: bestRecallThreshold,
+		BestF1:            bestF1,
+	}
+
+	return result, nil
+}
+
+// contains reports whether ref is an element of refs.
+func contains(refs []string, ref string) bool {
+	for _, r := range refs {
+		if r == ref {
+			return true
+		}
+	}
+	return false
+}
+
+// PrintCalibrateResult prints the number of cases and thresholds tested and
+// the recommended default and recovery thresholds.
+func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) {
+	fmt.Printf("\n  Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold))
+
+	fmt.Printf("  Recommendations:\n")
+	fmt.Printf("    Default (best F1):   %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1)
+	fmt.Printf("    Recovery (recall):   %.2f\n", result.Recommendations.RecoveryThreshold)
+	fmt.Println()
+}
diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go
new file mode 100644
index 0000000..b4c4ec1
--- /dev/null
+++ b/internal/benchmark/catalog.go
@@ -0,0 +1,92 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"sort"
+)
+
+// RunCatalog loads the benchmark dataset and summarizes it: one entry per
+// corpus (ID, query count, sorted distinct tags) plus overall query counts
+// grouped by tag and by difficulty.
+func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &CatalogResult{
+		ByTag:        make(map[string]int),
+		ByDifficulty: make(map[string]int),
+	}
+
+	for _, c := range ds.Corpora {
+		tags := make(map[string]bool)
+		for _, q := range c.Queries {
+			result.TotalQueries++
+			result.ByDifficulty[q.Difficulty]++
+			for _, t := range q.Tags {
+				tags[t] = true
+				result.ByTag[t]++
+			}
+		}
+		var tagList []string
+		for t := range tags {
+			tagList = append(tagList, t)
+		}
+		sort.Strings(tagList)
+		result.Corpora = append(result.Corpora, CorpusSummary{
+			ID:      c.ID,
+			Queries: len(c.Queries),
+			Tags:    tagList,
+		})
+	}
+
+	return result, nil
+}
+
+// PrintCatalogResult writes result to stdout, as indented JSON when
+// cfg.Format is "json" and as a text table otherwise. cfg.By selects an
+// optional breakdown ("difficulty" or "tag"), printed in sorted key order so
+// the output is stable between runs.
+func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n  Corpora: %d\n", len(result.Corpora))
+	fmt.Printf("  Total Queries: %d\n\n", result.TotalQueries)
+
+	fmt.Printf("  %-30s %8s\n", "Corpus", "Queries")
+	fmt.Printf("  %-30s %8s\n", "------", "-------")
+	for _, c := range result.Corpora {
+		fmt.Printf("  %-30s %8d\n", c.ID, c.Queries)
+	}
+
+	// Go randomizes map iteration order, so collect and sort the keys first.
+	sortedKeys := func(counts map[string]int) []string {
+		keys := make([]string, 0, len(counts))
+		for k := range counts {
+			keys = append(keys, k)
+		}
+		sort.Strings(keys)
+		return keys
+	}
+
+	switch cfg.By {
+	case "difficulty":
+		fmt.Printf("\n  By Difficulty:\n")
+		for _, d := range sortedKeys(result.ByDifficulty) {
+			fmt.Printf("    %-10s %4d\n", d, result.ByDifficulty[d])
+		}
+	case "tag":
+		fmt.Printf("\n  By Tag:\n")
+		for _, t := range sortedKeys(result.ByTag) {
+			fmt.Printf("    %-20s %4d\n", t, result.ByTag[t])
+		}
+	}
+	fmt.Printf("\n")
+}
diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
new file mode 100644
index 0000000..81171bb
--- /dev/null
+++ b/internal/benchmark/check.go
@@ -0,0 +1,281 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// RunCheck runs the corpus benchmark with the profile selected by cfg.Profile,
+// lists every missed query, compares the overall metrics with the baseline
+// report when one can be read, and writes a JSON report plus a markdown
+// summary into cfg.OutputDir.
+//
+// The status is "fail" only when cfg.FailOnReg is set and P@1 or MRR dropped
+// by more than 0.02 against the baseline; otherwise it is "pass".
+func RunCheck(cfg CheckConfig) (*CheckResult, error) {
+	root := FindBenchmarkRoot()
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	// A config that cannot be loaded is not fatal: the defaults below apply.
+	benchCfg, _ := LoadConfig(root)
+	profile := Profile{
+		Strategy:  "combined",
+		Threshold: 0.01,
+		TopK:      5,
+		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
+	}
+	if benchCfg != nil {
+		profile = ResolveProfile(benchCfg, cfg.Profile)
+	}
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        profile.Strategy,
+		Threshold:       profile.Threshold,
+		TopK:            profile.TopK,
+		LexicalWeight:   profile.Weights.Lexical,
+		EmbeddingWeight: profile.Weights.Embedding,
+		Profile:         cfg.Profile,
+		Mode:            "library",
+		Verbose:         cfg.Verbose,
+		Explain:         cfg.Explain,
+		OutputDir:       cfg.OutputDir,
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	result := &CheckResult{
+		Status: "pass",
+		Report: report,
+	}
+	result.Summary.PAt1 = report.Metrics.Overall.PAt1
+	result.Summary.MRR = report.Metrics.Overall.MRR
+	result.Summary.HitAt3 = report.Metrics.Overall.HitAt3
+	result.Summary.Total = report.Metrics.Overall.Total
+
+	// Every missed query is listed, with a command to reproduce it.
+	for _, r := range report.Results {
+		if r.Status == "miss" {
+			result.TopRegs = append(result.TopRegs, Regression{
+				ID:           r.ID,
+				Corpus:       r.Corpus,
+				Query:        r.Query,
+				Expected:     r.Expected.RelevantRefs,
+				CurrentRef:   r.Actual.BestRef,
+				Reason:       "miss",
+				DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID),
+			})
+		}
+	}
+	result.Summary.Regressions = len(result.TopRegs)
+
+	// Compare with the baseline when one exists.
+	baselinePath := cfg.BaselinePath
+	if baselinePath == "" {
+		baselinePath = filepath.Join(root, "baselines", "combined.json")
+	}
+	if _, err := os.Stat(baselinePath); err == nil {
+		baseline, err := loadReport(baselinePath)
+		if err != nil {
+			// Say so: an unreadable baseline means no regression gate.
+			fmt.Fprintf(os.Stderr, "warning: ignoring baseline %s: %v\n", baselinePath, err)
+		} else {
+			result.Delta = &MetricsDelta{
+				PAt1:   report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+				MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+			}
+			if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) {
+				result.Status = "fail"
+			}
+		}
+	}
+
+	// Artifacts are best-effort: a failed write is reported on stderr and the
+	// matching artifact path stays empty, but the check result still stands.
+	if cfg.OutputDir != "" {
+		if err := os.MkdirAll(cfg.OutputDir, 0755); err != nil {
+			fmt.Fprintf(os.Stderr, "warning: create output dir %s: %v\n", cfg.OutputDir, err)
+		}
+	}
+	ts := time.Now().Format("20060102_150405")
+	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
+	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
+
+	reportJSON, err := json.MarshalIndent(report, "", "  ")
+	if err == nil {
+		err = os.WriteFile(reportPath, reportJSON, 0644)
+	}
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "warning: write report %s: %v\n", reportPath, err)
+	} else {
+		result.Artifacts.ReportJSON = reportPath
+	}
+
+	summaryMD := generateSummaryMD(report, result)
+	if err := os.WriteFile(summaryPath, []byte(summaryMD), 0644); err != nil {
+		fmt.Fprintf(os.Stderr, "warning: write summary %s: %v\n", summaryPath, err)
+	} else {
+		result.Artifacts.SummaryMD = summaryPath
+	}
+
+	return result, nil
+}
+
+// RunBenchmark loads the dataset from the benchmark root and runs the corpus
+// benchmark with cfg.
+func RunBenchmark(cfg RunConfig) (*Report, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, err
+	}
+	return RunCorpusBenchmark(ds, cfg)
+}
+
+// loadReport reads and decodes the JSON report stored at path.
+func loadReport(path string) (*Report, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var r Report
+	if err := json.Unmarshal(data, &r); err != nil {
+		return nil, err
+	}
+	return &r, nil
+}
+
+// generateSummaryMD renders the markdown summary of a check: the overall
+// metrics, the delta from the baseline when result.Delta is set, and a table
+// of at most the first 10 missed queries.
+func generateSummaryMD(report *Report, result *CheckResult) string {
+	// maxMissRows caps the number of rows in the "Misses" table.
+	const maxMissRows = 10
+
+	var sb strings.Builder
+
+	sb.WriteString("# Benchmark Summary\n\n")
+	fmt.Fprintf(&sb, "Generated: %s\n\n", report.Run.Timestamp)
+
+	sb.WriteString("## Overall Metrics\n\n")
+	sb.WriteString("| Metric | Value |\n")
+	sb.WriteString("|--------|-------|\n")
+	fmt.Fprintf(&sb, "| Total | %d |\n", report.Metrics.Overall.Total)
+	fmt.Fprintf(&sb, "| MRR | %.4f |\n", report.Metrics.Overall.MRR)
+	fmt.Fprintf(&sb, "| P@1 | %.4f |\n", report.Metrics.Overall.PAt1)
+	fmt.Fprintf(&sb, "| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3)
+	fmt.Fprintf(&sb, "| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin)
+
+	if result.Delta != nil {
+		sb.WriteString("\n## Delta from Baseline\n\n")
+		sb.WriteString("| Metric | Delta |\n")
+		sb.WriteString("|--------|-------|\n")
+		fmt.Fprintf(&sb, "| P@1 | %+.4f |\n", result.Delta.PAt1)
+		fmt.Fprintf(&sb, "| MRR | %+.4f |\n", result.Delta.MRR)
+		fmt.Fprintf(&sb, "| Hit@3 | %+.4f |\n", result.Delta.HitAt3)
+	}
+
+	if len(result.TopRegs) > 0 {
+		sb.WriteString("\n## Misses\n\n")
+		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
+		sb.WriteString("|----|--------|-------|-----|----------|\n")
+		for i, r := range result.TopRegs {
+			// Cap by row index; testing the slice length here emptied the table.
+			if i >= maxMissRows {
+				break
+			}
+			fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n",
+				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))
+		}
+	}
+
+	return sb.String()
+}
+
+// PrintCheckResult prints result to stdout: as indented JSON when cfg.Format
+// is "json", otherwise as a colored pass/fail line followed by the summary
+// metrics, the baseline delta when present, and the artifact paths.
+func PrintCheckResult(result *CheckResult, cfg CheckConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m Benchmark passed\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Benchmark failed\n")
+	}
+	fmt.Printf("\n")
+
+	fmt.Printf("  %-12s %8.4f\n", "MRR", result.Summary.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", result.Summary.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", result.Summary.Total)
+	fmt.Printf("  %-12s %8d\n", "Misses", result.Summary.Regressions)
+
+	if result.Delta != nil {
+		fmt.Printf("\n  Delta from baseline:\n")
+		printDelta("P@1", result.Delta.PAt1)
+		printDelta("MRR", result.Delta.MRR)
+		printDelta("Hit@3", result.Delta.HitAt3)
+	}
+
+	fmt.Printf("\n  Artifacts:\n")
+	fmt.Printf("    Report:  %s\n", result.Artifacts.ReportJSON)
+	fmt.Printf("    Summary: %s\n", result.Artifacts.SummaryMD)
+	fmt.Printf("\n")
+}
+
+// printDelta prints one delta line: green with a "+" sign above +0.001, red
+// below -0.001 and uncolored in between.
+func printDelta(name string, delta float64) {
+	color := "\033[0m"
+	sign := ""
+	if delta > 0.001 {
+		color = "\033[32m"
+		sign = "+"
+	} else if delta < -0.001 {
+		color = "\033[31m"
+	}
+	fmt.Printf("    %s%-8s %s%.4f\033[0m\n", color, name, sign, delta)
+}
+
+// PrintRunResult prints the overall metrics of report and, when cfg.Verbose
+// is set, one HIT/MISS/PART line per query.
+func PrintRunResult(report *Report, cfg RunConfig) {
+	fmt.Printf("\n")
+	fmt.Printf("  %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", report.Metrics.Overall.Total)
+	fmt.Printf("\n")
+
+	if cfg.Verbose {
+		for _, r := range report.Results {
+			status := "\033[32mHIT \033[0m"
+			switch r.Status {
+			case "miss":
+				status = "\033[31mMISS\033[0m"
+			case "partial":
+				status = "\033[33mPART\033[0m"
+			}
+			fmt.Printf("  [%s] %s | %s | got=%s score=%.3f\n",
+				r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore)
+		}
+	}
+}
diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go
deleted file mode 100644
index f537934..0000000
--- a/internal/benchmark/commands.go
+++ /dev/null
@@ -1,1082 +0,0 @@
-package benchmark
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"sort"
-	"strings"
-	"time"
-
-	"github.com/pinchtab/semantic"
-)
-
-type CheckResult struct {
-	Status    string        `json:"status"`
-	Summary   CheckSummary  `json:"summary"`
-	Delta     *MetricsDelta `json:"delta,omitempty"`
-	TopRegs   []Regression  `json:"top_regressions,omitempty"`
-	Artifacts Artifacts     `json:"artifacts"`
-	Report    *Report       `json:"-"`
-}
-
-type CheckSummary struct {
-	PAt1        float64 `json:"p_at_1"`
-	MRR         float64 `json:"mrr"`
-	HitAt3      float64 `json:"hit_at_3"`
-	Total       int     `json:"total"`
-	Regressions int     `json:"regressions"`
-	Warnings    int     `json:"warnings"`
-}
-
-type MetricsDelta struct {
-	PAt1   float64 `json:"p_at_1"`
-	MRR    float64 `json:"mrr"`
-	HitAt3 float64 `json:"hit_at_3"`
-}
-
-type Regression struct {
-	ID           string   `json:"id"`
-	Corpus       string   `json:"corpus"`
-	Query        string   `json:"query"`
-	Expected     []string `json:"expected"`
-	BaselineRef  string   `json:"baseline_ref,omitempty"`
-	CurrentRef   string   `json:"current_ref"`
-	Reason       string   `json:"reason"`
-	DebugCommand string   `json:"debug_command"`
-}
-
-type Artifacts struct {
-	ReportJSON string `json:"report_json"`
-	SummaryMD  string `json:"summary_md"`
-}
-
-type CompareResult struct {
-	Status       string       `json:"status"`
-	Delta        MetricsDelta `json:"delta"`
-	Regressions  []Regression `json:"regressions"`
-	Improvements []string     `json:"improvements"`
-}
-
-type LintResult struct {
-	Errors   int      `json:"errors"`
-	Warnings int      `json:"warnings"`
-	Messages []string `json:"messages"`
-}
-
-type CatalogResult struct {
-	Corpora      []CorpusSummary `json:"corpora"`
-	TotalQueries int             `json:"total_queries"`
-	ByTag        map[string]int  `json:"by_tag,omitempty"`
-	ByDifficulty map[string]int  `json:"by_difficulty,omitempty"`
-}
-
-type CorpusSummary struct {
-	ID      string   `json:"id"`
-	Queries int      `json:"queries"`
-	Tags    []string `json:"tags"`
-}
-
-func RunCheck(cfg CheckConfig) (*CheckResult, error) {
-	root := FindBenchmarkRoot()
-
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, fmt.Errorf("load dataset: %w", err)
-	}
-
-	benchCfg, _ := LoadConfig(root)
-	profile := Profile{
-		Strategy:  "combined",
-		Threshold: 0.01,
-		TopK:      5,
-		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
-	}
-	if benchCfg != nil {
-		profile = ResolveProfile(benchCfg, cfg.Profile)
-	}
-
-	runCfg := RunConfig{
-		Suite:           "corpus",
-		Strategy:        profile.Strategy,
-		Threshold:       profile.Threshold,
-		TopK:            profile.TopK,
-		LexicalWeight:   profile.Weights.Lexical,
-		EmbeddingWeight: profile.Weights.Embedding,
-		Profile:         cfg.Profile,
-		Mode:            "library",
-		Verbose:         cfg.Verbose,
-		Explain:         cfg.Explain,
-		OutputDir:       cfg.OutputDir,
-	}
-
-	report, err := RunCorpusBenchmark(ds, runCfg)
-	if err != nil {
-		return nil, fmt.Errorf("run benchmark: %w", err)
-	}
-
-	result := &CheckResult{
-		Status: "pass",
-		Report: report,
-	}
-	result.Summary.PAt1 = report.Metrics.Overall.PAt1
-	result.Summary.MRR = report.Metrics.Overall.MRR
-	result.Summary.HitAt3 = report.Metrics.Overall.HitAt3
-	result.Summary.Total = report.Metrics.Overall.Total
-
-	// Count misses
-	for _, r := range report.Results {
-		if r.Status == "miss" {
-			result.TopRegs = append(result.TopRegs, Regression{
-				ID:           r.ID,
-				Corpus:       r.Corpus,
-				Query:        r.Query,
-				Expected:     r.Expected.RelevantRefs,
-				CurrentRef:   r.Actual.BestRef,
-				Reason:       "miss",
-				DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID),
-			})
-		}
-	}
-	result.Summary.Regressions = len(result.TopRegs)
-
-	// Compare to baseline if exists
-	baselinePath := cfg.BaselinePath
-	if baselinePath == "" {
-		baselinePath = filepath.Join(root, "baselines", "combined.json")
-	}
-	if _, err := os.Stat(baselinePath); err == nil {
-		baseline, err := loadReport(baselinePath)
-		if err == nil {
-			result.Delta = &MetricsDelta{
-				PAt1:   report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
-				MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
-				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
-			}
-			if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) {
-				result.Status = "fail"
-			}
-		}
-	}
-
-	// Write artifacts
-	os.MkdirAll(cfg.OutputDir, 0755)
-	ts := time.Now().Format("20060102_150405")
-	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
-	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
-
-	reportJSON, _ := json.MarshalIndent(report, "", "  ")
-	os.WriteFile(reportPath, reportJSON, 0644)
-
-	summaryMD := generateSummaryMD(report, result)
-	os.WriteFile(summaryPath, []byte(summaryMD), 0644)
-
-	result.Artifacts.ReportJSON = reportPath
-	result.Artifacts.SummaryMD = summaryPath
-
-	return result, nil
-}
-
-func RunBenchmark(cfg RunConfig) (*Report, error) {
-	root := FindBenchmarkRoot()
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, err
-	}
-	return RunCorpusBenchmark(ds, cfg)
-}
-
-func RunCompare(cfg CompareConfig) (*CompareResult, error) {
-	baseline, err := loadReport(cfg.BaselinePath)
-	if err != nil {
-		return nil, fmt.Errorf("load baseline: %w", err)
-	}
-	current, err := loadReport(cfg.CurrentPath)
-	if err != nil {
-		return nil, fmt.Errorf("load current: %w", err)
-	}
-
-	result := &CompareResult{
-		Status: "pass",
-		Delta: MetricsDelta{
-			PAt1:   current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
-			MRR:    current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
-			HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
-		},
-	}
-
-	if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 {
-		result.Status = "fail"
-	}
-
-	// Find regressions
-	baselineResults := make(map[string]QueryResult)
-	for _, r := range baseline.Results {
-		baselineResults[r.ID] = r
-	}
-	for _, r := range current.Results {
-		if base, ok := baselineResults[r.ID]; ok {
-			if base.Status == "hit" && r.Status != "hit" {
-				result.Regressions = append(result.Regressions, Regression{
-					ID:          r.ID,
-					Corpus:      r.Corpus,
-					Query:       r.Query,
-					BaselineRef: base.Actual.BestRef,
-					CurrentRef:  r.Actual.BestRef,
-					Reason:      fmt.Sprintf("%s -> %s", base.Status, r.Status),
-				})
-			}
-		}
-	}
-
-	return result, nil
-}
-
-func RunLint(cfg LintConfig) (*LintResult, error) {
-	root := FindBenchmarkRoot()
-	result := &LintResult{}
-
-	ds, err := LoadDataset(root)
-	if err != nil {
-		result.Errors++
-		result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err))
-		return result, nil
-	}
-
-	// Check for duplicate IDs
-	ids := make(map[string]string)
-	for _, c := range ds.Corpora {
-		for _, q := range c.Queries {
-			if existing, ok := ids[q.ID]; ok {
-				result.Errors++
-				result.Messages = append(result.Messages,
-					fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing))
-			} else {
-				ids[q.ID] = c.ID
-			}
-		}
-	}
-
-	// Check refs exist
-	for _, c := range ds.Corpora {
-		refs := make(map[string]bool)
-		for _, d := range c.Snapshot {
-			refs[d.Ref] = true
-		}
-		for _, q := range c.Queries {
-			for _, r := range q.RelevantRefs {
-				if !refs[r] {
-					result.Errors++
-					result.Messages = append(result.Messages,
-						fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r))
-				}
-			}
-		}
-	}
-
-	// Check difficulty values
-	validDiff := map[string]bool{"easy": true, "medium": true, "hard": true}
-	for _, c := range ds.Corpora {
-		for _, q := range c.Queries {
-			if q.Difficulty != "" && !validDiff[q.Difficulty] {
-				result.Errors++
-				result.Messages = append(result.Messages,
-					fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID))
-			}
-		}
-	}
-
-	if result.Errors == 0 && result.Warnings == 0 {
-		result.Messages = append(result.Messages, "All checks passed")
-	}
-
-	return result, nil
-}
-
-func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) {
-	root := FindBenchmarkRoot()
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, err
-	}
-
-	result := &CatalogResult{
-		ByTag:        make(map[string]int),
-		ByDifficulty: make(map[string]int),
-	}
-
-	for _, c := range ds.Corpora {
-		tags := make(map[string]bool)
-		for _, q := range c.Queries {
-			result.TotalQueries++
-			result.ByDifficulty[q.Difficulty]++
-			for _, t := range q.Tags {
-				tags[t] = true
-				result.ByTag[t]++
-			}
-		}
-		var tagList []string
-		for t := range tags {
-			tagList = append(tagList, t)
-		}
-		sort.Strings(tagList)
-		result.Corpora = append(result.Corpora, CorpusSummary{
-			ID:      c.ID,
-			Queries: len(c.Queries),
-			Tags:    tagList,
-		})
-	}
-
-	return result, nil
-}
-
-func loadReport(path string) (*Report, error) {
-	data, err := os.ReadFile(path)
-	if err != nil {
-		return nil, err
-	}
-	var r Report
-	if err := json.Unmarshal(data, &r); err != nil {
-		return nil, err
-	}
-	return &r, nil
-}
-
-func generateSummaryMD(report *Report, result *CheckResult) string {
-	var sb strings.Builder
-
-	sb.WriteString("# Benchmark Summary\n\n")
-	sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp))
-
-	sb.WriteString("## Overall Metrics\n\n")
-	sb.WriteString("| Metric | Value |\n")
-	sb.WriteString("|--------|-------|\n")
-	sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total))
-	sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR))
-	sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1))
-	sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3))
-	sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin))
-
-	if result.Delta != nil {
-		sb.WriteString("\n## Delta from Baseline\n\n")
-		sb.WriteString("| Metric | Delta |\n")
-		sb.WriteString("|--------|-------|\n")
-		sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1))
-		sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR))
-		sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3))
-	}
-
-	if len(result.TopRegs) > 0 {
-		sb.WriteString("\n## Misses\n\n")
-		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
-		sb.WriteString("|----|--------|-------|-----|----------|\n")
-		for _, r := range result.TopRegs {
-			if len(result.TopRegs) > 10 {
-				break
-			}
-			sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n",
-				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")))
-		}
-	}
-
-	return sb.String()
-}
-
-func PrintCheckResult(result *CheckResult, cfg CheckConfig) {
-	if cfg.Format == "json" {
-		data, _ := json.MarshalIndent(result, "", "  ")
-		fmt.Println(string(data))
-		return
-	}
-
-	fmt.Printf("\n")
-	if result.Status == "pass" {
-		fmt.Printf("  \033[32m✓\033[0m Benchmark passed\n")
-	} else {
-		fmt.Printf("  \033[31m✗\033[0m Benchmark failed\n")
-	}
-	fmt.Printf("\n")
-
-	fmt.Printf("  %-12s %8.4f\n", "MRR", result.Summary.MRR)
-	fmt.Printf("  %-12s %8.4f\n", "P@1", result.Summary.PAt1)
-	fmt.Printf("  %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3)
-	fmt.Printf("  %-12s %8d\n", "Total", result.Summary.Total)
-	fmt.Printf("  %-12s %8d\n", "Misses", result.Summary.Regressions)
-
-	if result.Delta != nil {
-		fmt.Printf("\n  Delta from baseline:\n")
-		printDelta("P@1", result.Delta.PAt1)
-		printDelta("MRR", result.Delta.MRR)
-		printDelta("Hit@3", result.Delta.HitAt3)
-	}
-
-	fmt.Printf("\n  Artifacts:\n")
-	fmt.Printf("    Report:  %s\n", result.Artifacts.ReportJSON)
-	fmt.Printf("    Summary: %s\n", result.Artifacts.SummaryMD)
-	fmt.Printf("\n")
-}
-
-func printDelta(name string, delta float64) {
-	color := "\033[0m"
-	sign := ""
-	if delta > 0.001 {
-		color = "\033[32m"
-		sign = "+"
-	} else if delta < -0.001 {
-		color = "\033[31m"
-	}
-	fmt.Printf("    %s%-8s %s%.4f\033[0m\n", color, name, sign, delta)
-}
-
-func PrintRunResult(report *Report, cfg RunConfig) {
-	fmt.Printf("\n")
-	fmt.Printf("  %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR)
-	fmt.Printf("  %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1)
-	fmt.Printf("  %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3)
-	fmt.Printf("  %-12s %8d\n", "Total", report.Metrics.Overall.Total)
-	fmt.Printf("\n")
-
-	if cfg.Verbose {
-		for _, r := range report.Results {
-			status := "\033[32mHIT \033[0m"
-			switch r.Status {
-			case "miss":
-				status = "\033[31mMISS\033[0m"
-			case "partial":
-				status = "\033[33mPART\033[0m"
-			}
-			fmt.Printf("  [%s] %s | %s | got=%s score=%.3f\n",
-				r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore)
-		}
-	}
-}
-
-func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
-	fmt.Printf("\n")
-	if result.Status == "pass" {
-		fmt.Printf("  \033[32m✓\033[0m No regression\n")
-	} else {
-		fmt.Printf("  \033[31m✗\033[0m Regression detected\n")
-	}
-	fmt.Printf("\n")
-	printDelta("P@1", result.Delta.PAt1)
-	printDelta("MRR", result.Delta.MRR)
-	printDelta("Hit@3", result.Delta.HitAt3)
-
-	if len(result.Regressions) > 0 {
-		fmt.Printf("\n  Regressions:\n")
-		for _, r := range result.Regressions {
-			fmt.Printf("    %s: %s (%s)\n", r.ID, r.Reason, r.Query)
-		}
-	}
-	fmt.Printf("\n")
-}
-
-func PrintLintResult(result *LintResult, cfg LintConfig) {
-	for _, msg := range result.Messages {
-		fmt.Println(msg)
-	}
-	fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings)
-}
-
-func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
-	if cfg.Format == "json" {
-		data, _ := json.MarshalIndent(result, "", "  ")
-		fmt.Println(string(data))
-		return
-	}
-
-	fmt.Printf("\n  Corpora: %d\n", len(result.Corpora))
-	fmt.Printf("  Total Queries: %d\n\n", result.TotalQueries)
-
-	fmt.Printf("  %-30s %8s\n", "Corpus", "Queries")
-	fmt.Printf("  %-30s %8s\n", "------", "-------")
-	for _, c := range result.Corpora {
-		fmt.Printf("  %-30s %8d\n", c.ID, c.Queries)
-	}
-
-	switch cfg.By {
-	case "difficulty":
-		fmt.Printf("\n  By Difficulty:\n")
-		for d, n := range result.ByDifficulty {
-			fmt.Printf("    %-10s %4d\n", d, n)
-		}
-	case "tag":
-		fmt.Printf("\n  By Tag:\n")
-		for t, n := range result.ByTag {
-			fmt.Printf("    %-20s %4d\n", t, n)
-		}
-	}
-	fmt.Printf("\n")
-}
-
-// Baseline management
-
-type BaselineResult struct {
-	Action   string         `json:"action"`
-	Path     string         `json:"path"`
-	Metrics  OverallMetrics `json:"metrics"`
-	Previous *OverallMetrics `json:"previous,omitempty"`
-}
-
-func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) {
-	root := FindBenchmarkRoot()
-	baselinesDir := filepath.Join(root, "baselines")
-	if err := os.MkdirAll(baselinesDir, 0755); err != nil {
-		return nil, err
-	}
-
-	baselinePath := filepath.Join(baselinesDir, cfg.Name+".json")
-
-	switch cfg.Action {
-	case "create":
-		return createBaseline(root, baselinePath, cfg)
-	case "update":
-		if !cfg.Accept {
-			return nil, fmt.Errorf("use --accept to confirm baseline update")
-		}
-		return updateBaseline(root, baselinePath, cfg)
-	default:
-		return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action)
-	}
-}
-
-func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, fmt.Errorf("load dataset: %w", err)
-	}
-
-	runCfg := RunConfig{
-		Suite:           "corpus",
-		Strategy:        "combined",
-		Threshold:       0.01,
-		TopK:            5,
-		LexicalWeight:   0.6,
-		EmbeddingWeight: 0.4,
-		Mode:            "library",
-	}
-
-	report, err := RunCorpusBenchmark(ds, runCfg)
-	if err != nil {
-		return nil, fmt.Errorf("run benchmark: %w", err)
-	}
-
-	data, err := json.MarshalIndent(report, "", "  ")
-	if err != nil {
-		return nil, err
-	}
-	if err := os.WriteFile(baselinePath, data, 0644); err != nil {
-		return nil, err
-	}
-
-	return &BaselineResult{
-		Action:  "create",
-		Path:    baselinePath,
-		Metrics: report.Metrics.Overall,
-	}, nil
-}
-
-func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
-	var previous *OverallMetrics
-	if data, err := os.ReadFile(baselinePath); err == nil {
-		var old Report
-		if json.Unmarshal(data, &old) == nil {
-			previous = &old.Metrics.Overall
-		}
-		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
-		os.WriteFile(backupPath, data, 0644)
-	}
-
-	result, err := createBaseline(root, baselinePath, cfg)
-	if err != nil {
-		return nil, err
-	}
-	result.Action = "update"
-	result.Previous = previous
-	return result, nil
-}
-
-func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) {
-	fmt.Printf("\n  Baseline %sd: %s\n\n", result.Action, result.Path)
-	fmt.Printf("  MRR:    %.4f\n", result.Metrics.MRR)
-	fmt.Printf("  P@1:    %.4f\n", result.Metrics.PAt1)
-	fmt.Printf("  Hit@3:  %.4f\n", result.Metrics.HitAt3)
-
-	if result.Previous != nil {
-		fmt.Printf("\n  Previous:\n")
-		fmt.Printf("    MRR:    %.4f\n", result.Previous.MRR)
-		fmt.Printf("    P@1:    %.4f\n", result.Previous.PAt1)
-		fmt.Printf("    Hit@3:  %.4f\n", result.Previous.HitAt3)
-	}
-	fmt.Println()
-}
-
-// Threshold calibration
-
-type CalibrateResult struct {
-	ByThreshold     map[string]ThresholdMetrics `json:"by_threshold"`
-	Recommendations CalibrateRecommendations    `json:"recommendations"`
-	TotalCases      int                         `json:"total_cases"`
-}
-
-type ThresholdMetrics struct {
-	TP        int     `json:"tp"`
-	FP        int     `json:"fp"`
-	FN        int     `json:"fn"`
-	TN        int     `json:"tn"`
-	Recall    float64 `json:"recall"`
-	Precision float64 `json:"precision"`
-	FPR       float64 `json:"false_positive_rate"`
-	F1        float64 `json:"f1"`
-}
-
-type CalibrateRecommendations struct {
-	DefaultThreshold  float64 `json:"default_threshold"`
-	RecoveryThreshold float64 `json:"recovery_threshold"`
-	BestF1            float64 `json:"best_f1"`
-}
-
-func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
-	root := FindBenchmarkRoot()
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, fmt.Errorf("load dataset: %w", err)
-	}
-
-	result := &CalibrateResult{
-		ByThreshold: make(map[string]ThresholdMetrics),
-	}
-
-	type testCase struct {
-		query         Query
-		corpus        *Corpus
-	}
-
-	var cases []testCase
-	for i := range ds.Corpora {
-		corpus := &ds.Corpora[i]
-		if cfg.Corpus != "" && corpus.ID != cfg.Corpus {
-			continue
-		}
-		for _, q := range corpus.Queries {
-			cases = append(cases, testCase{query: q, corpus: corpus})
-		}
-	}
-	result.TotalCases = len(cases)
-
-	if cfg.Verbose {
-		fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases))
-	}
-
-	runCfg := RunConfig{
-		Strategy:        "combined",
-		TopK:            5,
-		LexicalWeight:   0.6,
-		EmbeddingWeight: 0.4,
-	}
-	matcher := createMatcher(runCfg)
-
-	var bestF1, bestF1Threshold float64
-	var bestRecallThreshold float64
-	var bestRecallWithPrecision float64
-
-	for _, threshold := range cfg.Thresholds {
-		tp, fp, fn, tn := 0, 0, 0, 0
-
-		for _, tc := range cases {
-			findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
-				Threshold: threshold,
-				TopK:      5,
-			})
-
-			hasMatch := len(findResult.Matches) > 0
-			topRef := ""
-			if hasMatch {
-				topRef = findResult.Matches[0].Ref
-			}
-
-			if tc.query.ExpectNoMatch {
-				if hasMatch {
-					fp++
-				} else {
-					tn++
-				}
-			} else if len(tc.query.RelevantRefs) > 0 {
-				if !hasMatch {
-					fn++
-				} else if contains(tc.query.RelevantRefs, topRef) {
-					tp++
-				} else {
-					fp++
-				}
-			}
-		}
-
-		totalPos := tp + fn
-		totalNeg := tn + fp
-
-		var recall, precision, fpr, f1 float64
-		if totalPos > 0 {
-			recall = float64(tp) / float64(totalPos)
-		}
-		if tp+fp > 0 {
-			precision = float64(tp) / float64(tp+fp)
-		}
-		if totalNeg > 0 {
-			fpr = float64(fp) / float64(totalNeg)
-		}
-		if precision+recall > 0 {
-			f1 = 2 * precision * recall / (precision + recall)
-		}
-
-		key := fmt.Sprintf("%.2f", threshold)
-		result.ByThreshold[key] = ThresholdMetrics{
-			TP: tp, FP: fp, FN: fn, TN: tn,
-			Recall: recall, Precision: precision, FPR: fpr, F1: f1,
-		}
-
-		if f1 > bestF1 {
-			bestF1 = f1
-			bestF1Threshold = threshold
-		}
-		if recall >= 0.85 && precision > bestRecallWithPrecision {
-			bestRecallWithPrecision = precision
-			bestRecallThreshold = threshold
-		}
-
-		if cfg.Verbose {
-			fmt.Printf("  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n",
-				threshold, tp, fp, fn, tn, recall, precision, f1)
-		}
-	}
-
-	if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 {
-		bestRecallThreshold = cfg.Thresholds[0]
-	}
-
-	result.Recommendations = CalibrateRecommendations{
-		DefaultThreshold:  bestF1Threshold,
-		RecoveryThreshold: bestRecallThreshold,
-		BestF1:            bestF1,
-	}
-
-	return result, nil
-}
-
-func contains(refs []string, ref string) bool {
-	for _, r := range refs {
-		if r == ref {
-			return true
-		}
-	}
-	return false
-}
-
-func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) {
-	fmt.Printf("\n  Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold))
-
-	fmt.Printf("  Recommendations:\n")
-	fmt.Printf("    Default (best F1):   %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1)
-	fmt.Printf("    Recovery (recall):   %.2f\n", result.Recommendations.RecoveryThreshold)
-	fmt.Println()
-}
-
-// Weight tuning
-
-type TuneResult struct {
-	Results []TuneRun `json:"results"`
-	Best    *TuneRun  `json:"best"`
-}
-
-type TuneRun struct {
-	LexicalWeight   float64 `json:"lexical_weight"`
-	EmbeddingWeight float64 `json:"embedding_weight"`
-	MRR             float64 `json:"mrr"`
-	PAt1            float64 `json:"p_at_1"`
-	HitAt3          float64 `json:"hit_at_3"`
-}
-
-func RunTune(cfg TuneConfig) (*TuneResult, error) {
-	root := FindBenchmarkRoot()
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, fmt.Errorf("load dataset: %w", err)
-	}
-
-	result := &TuneResult{}
-
-	if cfg.Verbose {
-		fmt.Printf("  %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3")
-	}
-
-	for w := 0.0; w <= 1.0001; w += cfg.Step {
-		lexW := w
-		embW := 1.0 - w
-
-		runCfg := RunConfig{
-			Suite:           "corpus",
-			Strategy:        "combined",
-			Threshold:       0.01,
-			TopK:            5,
-			LexicalWeight:   lexW,
-			EmbeddingWeight: embW,
-			Mode:            "library",
-		}
-
-		if cfg.Corpus != "" {
-			runCfg.Corpus = cfg.Corpus
-		}
-
-		report, err := RunCorpusBenchmark(ds, runCfg)
-		if err != nil {
-			return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err)
-		}
-
-		run := TuneRun{
-			LexicalWeight:   lexW,
-			EmbeddingWeight: embW,
-			MRR:             report.Metrics.Overall.MRR,
-			PAt1:            report.Metrics.Overall.PAt1,
-			HitAt3:          report.Metrics.Overall.HitAt3,
-		}
-		result.Results = append(result.Results, run)
-
-		if result.Best == nil || run.PAt1 > result.Best.PAt1 ||
-			(run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) {
-			best := run
-			result.Best = &best
-		}
-
-		if cfg.Verbose {
-			fmt.Printf("  %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n",
-				lexW, embW, run.MRR, run.PAt1, run.HitAt3)
-		}
-	}
-
-	return result, nil
-}
-
-func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
-	fmt.Printf("\n  Tested %d weight combinations\n\n", len(result.Results))
-
-	if result.Best != nil {
-		fmt.Printf("  Best weights:\n")
-		fmt.Printf("    Lexical:   %.2f\n", result.Best.LexicalWeight)
-		fmt.Printf("    Embedding: %.2f\n", result.Best.EmbeddingWeight)
-		fmt.Printf("    MRR:       %.4f\n", result.Best.MRR)
-		fmt.Printf("    P@1:       %.4f\n", result.Best.PAt1)
-		fmt.Printf("    Hit@3:     %.4f\n", result.Best.HitAt3)
-	}
-	fmt.Println()
-}
-
-// Runtime baseline
-
-type RuntimeResult struct {
-	Status      string                     `json:"status"`
-	Benchmarks  []RuntimeBenchmark         `json:"benchmarks"`
-	Regressions int                        `json:"regressions"`
-	BaselinePath string                    `json:"baseline_path"`
-	Created     bool                       `json:"created"`
-}
-
-type RuntimeBenchmark struct {
-	Name       string  `json:"name"`
-	NsOp       float64 `json:"ns_op"`
-	BytesOp    int     `json:"bytes_op"`
-	AllocsOp   int     `json:"allocs_op"`
-	BaselineNs float64 `json:"baseline_ns,omitempty"`
-	Ratio      float64 `json:"ratio,omitempty"`
-	Status     string  `json:"status"`
-}
-
-type runtimeBaseline struct {
-	Timestamp  string             `json:"timestamp"`
-	Benchmarks []RuntimeBenchmark `json:"benchmarks"`
-}
-
-func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
-	root := FindBenchmarkRoot()
-	baselinePath := filepath.Join(root, "baselines", "runtime.json")
-
-	benchmarks, err := runGoBenchmarks()
-	if err != nil {
-		return nil, err
-	}
-
-	result := &RuntimeResult{
-		Status:       "pass",
-		Benchmarks:   benchmarks,
-		BaselinePath: baselinePath,
-	}
-
-	if _, err := os.Stat(baselinePath); os.IsNotExist(err) {
-		if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil {
-			return nil, err
-		}
-		result.Created = true
-		return result, nil
-	}
-
-	baseline, err := loadRuntimeBaseline(baselinePath)
-	if err != nil {
-		return nil, err
-	}
-
-	baselineMap := make(map[string]RuntimeBenchmark)
-	for _, b := range baseline.Benchmarks {
-		baselineMap[b.Name] = b
-	}
-
-	maxRatio := 1.25
-	for i, b := range result.Benchmarks {
-		if base, ok := baselineMap[b.Name]; ok {
-			ratio := b.NsOp / base.NsOp
-			result.Benchmarks[i].BaselineNs = base.NsOp
-			result.Benchmarks[i].Ratio = ratio
-
-			if ratio > maxRatio {
-				result.Benchmarks[i].Status = "regression"
-				result.Regressions++
-			} else if ratio > 1.1 {
-				result.Benchmarks[i].Status = "warning"
-			} else {
-				result.Benchmarks[i].Status = "ok"
-			}
-		} else {
-			result.Benchmarks[i].Status = "new"
-		}
-	}
-
-	if result.Regressions > 0 {
-		result.Status = "fail"
-	}
-
-	return result, nil
-}
-
-func runGoBenchmarks() ([]RuntimeBenchmark, error) {
-	root := FindBenchmarkRoot()
-	projectRoot := filepath.Join(root, "..", "..")
-
-	cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...")
-	cmd.Dir = projectRoot
-	output, err := cmd.CombinedOutput()
-	if err != nil {
-		return nil, fmt.Errorf("go test failed: %w\n%s", err, output)
-	}
-
-	return parseBenchOutput(string(output)), nil
-}
-
-func parseBenchOutput(output string) []RuntimeBenchmark {
-	var results []RuntimeBenchmark
-	lines := strings.Split(output, "\n")
-
-	for _, line := range lines {
-		if !strings.HasPrefix(line, "Benchmark") {
-			continue
-		}
-
-		fields := strings.Fields(line)
-		if len(fields) < 3 {
-			continue
-		}
-
-		name := strings.TrimSuffix(fields[0], "-8")
-		name = strings.TrimSuffix(name, "-10")
-		name = strings.TrimSuffix(name, "-12")
-		name = strings.TrimSuffix(name, "-16")
-
-		var nsOp float64
-		var bytesOp, allocsOp int
-
-		for i, f := range fields {
-			if f == "ns/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%f", &nsOp)
-			}
-			if f == "B/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%d", &bytesOp)
-			}
-			if f == "allocs/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%d", &allocsOp)
-			}
-		}
-
-		if nsOp > 0 {
-			results = append(results, RuntimeBenchmark{
-				Name:     name,
-				NsOp:     nsOp,
-				BytesOp:  bytesOp,
-				AllocsOp: allocsOp,
-			})
-		}
-	}
-
-	return results
-}
-
-func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error {
-	baseline := runtimeBaseline{
-		Timestamp:  time.Now().UTC().Format(time.RFC3339),
-		Benchmarks: benchmarks,
-	}
-	data, err := json.MarshalIndent(baseline, "", "  ")
-	if err != nil {
-		return err
-	}
-	return os.WriteFile(path, data, 0644)
-}
-
-func loadRuntimeBaseline(path string) (*runtimeBaseline, error) {
-	data, err := os.ReadFile(path)
-	if err != nil {
-		return nil, err
-	}
-	var baseline runtimeBaseline
-	if err := json.Unmarshal(data, &baseline); err != nil {
-		return nil, err
-	}
-	return &baseline, nil
-}
-
-func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) {
-	if result.Created {
-		fmt.Printf("\n  Created runtime baseline: %s\n", result.BaselinePath)
-		fmt.Printf("  Benchmarks: %d\n\n", len(result.Benchmarks))
-		return
-	}
-
-	fmt.Printf("\n  Runtime Baseline Check\n\n")
-
-	for _, b := range result.Benchmarks {
-		var status string
-		switch b.Status {
-		case "regression":
-			status = "\033[31mREGRESSION\033[0m"
-		case "warning":
-			status = "\033[33mWARNING\033[0m"
-		case "ok":
-			status = "\033[32mOK\033[0m"
-		case "new":
-			status = "\033[33mNEW\033[0m"
-		}
-
-		if b.BaselineNs > 0 {
-			fmt.Printf("  %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n",
-				status, b.Name, b.BaselineNs, b.NsOp, b.Ratio)
-		} else {
-			fmt.Printf("  %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp)
-		}
-	}
-
-	fmt.Println()
-	if result.Regressions > 0 {
-		fmt.Printf("  \033[31mRegressions: %d\033[0m\n\n", result.Regressions)
-	} else {
-		fmt.Printf("  \033[32mNo regressions\033[0m\n\n")
-	}
-}
diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go
new file mode 100644
index 0000000..2b0a3d5
--- /dev/null
+++ b/internal/benchmark/compare.go
@@ -0,0 +1,78 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+)
+
+func RunCompare(cfg CompareConfig) (*CompareResult, error) {
+	baseline, err := loadReport(cfg.BaselinePath)
+	if err != nil {
+		return nil, fmt.Errorf("load baseline: %w", err)
+	}
+	current, err := loadReport(cfg.CurrentPath)
+	if err != nil {
+		return nil, fmt.Errorf("load current: %w", err)
+	}
+
+	result := &CompareResult{
+		Status: "pass",
+		Delta: MetricsDelta{
+			PAt1:   current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+			MRR:    current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+			HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+		},
+	}
+
+	if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 {
+		result.Status = "fail"
+	}
+
+	baselineResults := make(map[string]QueryResult)
+	for _, r := range baseline.Results {
+		baselineResults[r.ID] = r
+	}
+	for _, r := range current.Results {
+		if base, ok := baselineResults[r.ID]; ok {
+			if base.Status == "hit" && r.Status != "hit" {
+				result.Regressions = append(result.Regressions, Regression{
+					ID:          r.ID,
+					Corpus:      r.Corpus,
+					Query:       r.Query,
+					BaselineRef: base.Actual.BestRef,
+					CurrentRef:  r.Actual.BestRef,
+					Reason:      fmt.Sprintf("%s -> %s", base.Status, r.Status),
+				})
+			}
+		}
+	}
+
+	return result, nil
+}
+
+func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m No regression\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Regression detected\n")
+	}
+	fmt.Printf("\n")
+	printDelta("P@1", result.Delta.PAt1)
+	printDelta("MRR", result.Delta.MRR)
+	printDelta("Hit@3", result.Delta.HitAt3)
+
+	if len(result.Regressions) > 0 {
+		fmt.Printf("\n  Regressions:\n")
+		for _, r := range result.Regressions {
+			fmt.Printf("    %s: %s (%s)\n", r.ID, r.Reason, r.Query)
+		}
+	}
+	fmt.Printf("\n")
+}
diff --git a/internal/benchmark/lint.go b/internal/benchmark/lint.go
new file mode 100644
index 0000000..20565ce
--- /dev/null
+++ b/internal/benchmark/lint.go
@@ -0,0 +1,68 @@
+package benchmark
+
+import "fmt"
+
+func RunLint(cfg LintConfig) (*LintResult, error) {
+	root := FindBenchmarkRoot()
+	result := &LintResult{}
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		result.Errors++
+		result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err))
+		return result, nil
+	}
+
+	ids := make(map[string]string)
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if existing, ok := ids[q.ID]; ok {
+				result.Errors++
+				result.Messages = append(result.Messages,
+					fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing))
+			} else {
+				ids[q.ID] = c.ID
+			}
+		}
+	}
+
+	for _, c := range ds.Corpora {
+		refs := make(map[string]bool)
+		for _, d := range c.Snapshot {
+			refs[d.Ref] = true
+		}
+		for _, q := range c.Queries {
+			for _, r := range q.RelevantRefs {
+				if !refs[r] {
+					result.Errors++
+					result.Messages = append(result.Messages,
+						fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r))
+				}
+			}
+		}
+	}
+
+	validDiff := map[string]bool{"easy": true, "medium": true, "hard": true}
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if q.Difficulty != "" && !validDiff[q.Difficulty] {
+				result.Errors++
+				result.Messages = append(result.Messages,
+					fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID))
+			}
+		}
+	}
+
+	if result.Errors == 0 && result.Warnings == 0 {
+		result.Messages = append(result.Messages, "All checks passed")
+	}
+
+	return result, nil
+}
+
+func PrintLintResult(result *LintResult, cfg LintConfig) {
+	for _, msg := range result.Messages {
+		fmt.Println(msg)
+	}
+	fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings)
+}
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
new file mode 100644
index 0000000..8e28dcb
--- /dev/null
+++ b/internal/benchmark/runtime.go
@@ -0,0 +1,217 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+type RuntimeResult struct {
+	Status       string             `json:"status"`
+	Benchmarks   []RuntimeBenchmark `json:"benchmarks"`
+	Regressions  int                `json:"regressions"`
+	BaselinePath string             `json:"baseline_path"`
+	Created      bool               `json:"created"`
+}
+
+type RuntimeBenchmark struct {
+	Name       string  `json:"name"`
+	NsOp       float64 `json:"ns_op"`
+	BytesOp    int     `json:"bytes_op"`
+	AllocsOp   int     `json:"allocs_op"`
+	BaselineNs float64 `json:"baseline_ns,omitempty"`
+	Ratio      float64 `json:"ratio,omitempty"`
+	Status     string  `json:"status"`
+}
+
+type runtimeBaseline struct {
+	Timestamp  string             `json:"timestamp"`
+	Benchmarks []RuntimeBenchmark `json:"benchmarks"`
+}
+
+func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
+	root := FindBenchmarkRoot()
+	baselinePath := filepath.Join(root, "baselines", "runtime.json")
+
+	benchmarks, err := runGoBenchmarks()
+	if err != nil {
+		return nil, err
+	}
+
+	result := &RuntimeResult{
+		Status:       "pass",
+		Benchmarks:   benchmarks,
+		BaselinePath: baselinePath,
+	}
+
+	if _, err := os.Stat(baselinePath); os.IsNotExist(err) {
+		if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil {
+			return nil, err
+		}
+		result.Created = true
+		return result, nil
+	}
+
+	baseline, err := loadRuntimeBaseline(baselinePath)
+	if err != nil {
+		return nil, err
+	}
+
+	baselineMap := make(map[string]RuntimeBenchmark)
+	for _, b := range baseline.Benchmarks {
+		baselineMap[b.Name] = b
+	}
+
+	maxRatio := 1.25
+	for i, b := range result.Benchmarks {
+		if base, ok := baselineMap[b.Name]; ok {
+			ratio := b.NsOp / base.NsOp
+			result.Benchmarks[i].BaselineNs = base.NsOp
+			result.Benchmarks[i].Ratio = ratio
+
+			if ratio > maxRatio {
+				result.Benchmarks[i].Status = "regression"
+				result.Regressions++
+			} else if ratio > 1.1 {
+				result.Benchmarks[i].Status = "warning"
+			} else {
+				result.Benchmarks[i].Status = "ok"
+			}
+		} else {
+			result.Benchmarks[i].Status = "new"
+		}
+	}
+
+	if result.Regressions > 0 {
+		result.Status = "fail"
+	}
+
+	return result, nil
+}
+
+func runGoBenchmarks() ([]RuntimeBenchmark, error) {
+	root := FindBenchmarkRoot()
+	projectRoot := filepath.Join(root, "..", "..")
+
+	cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...")
+	cmd.Dir = projectRoot
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("go test failed: %w\n%s", err, output)
+	}
+
+	return parseBenchOutput(string(output)), nil
+}
+
+func parseBenchOutput(output string) []RuntimeBenchmark {
+	var results []RuntimeBenchmark
+	lines := strings.Split(output, "\n")
+
+	for _, line := range lines {
+		if !strings.HasPrefix(line, "Benchmark") {
+			continue
+		}
+
+		fields := strings.Fields(line)
+		if len(fields) < 3 {
+			continue
+		}
+
+		name := strings.TrimSuffix(fields[0], "-8")
+		name = strings.TrimSuffix(name, "-10")
+		name = strings.TrimSuffix(name, "-12")
+		name = strings.TrimSuffix(name, "-16")
+
+		var nsOp float64
+		var bytesOp, allocsOp int
+
+		for i, f := range fields {
+			if f == "ns/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%f", &nsOp)
+			}
+			if f == "B/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%d", &bytesOp)
+			}
+			if f == "allocs/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%d", &allocsOp)
+			}
+		}
+
+		if nsOp > 0 {
+			results = append(results, RuntimeBenchmark{
+				Name:     name,
+				NsOp:     nsOp,
+				BytesOp:  bytesOp,
+				AllocsOp: allocsOp,
+			})
+		}
+	}
+
+	return results
+}
+
+func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error {
+	baseline := runtimeBaseline{
+		Timestamp:  time.Now().UTC().Format(time.RFC3339),
+		Benchmarks: benchmarks,
+	}
+	data, err := json.MarshalIndent(baseline, "", "  ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(path, data, 0644)
+}
+
+func loadRuntimeBaseline(path string) (*runtimeBaseline, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var baseline runtimeBaseline
+	if err := json.Unmarshal(data, &baseline); err != nil {
+		return nil, err
+	}
+	return &baseline, nil
+}
+
+func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) {
+	if result.Created {
+		fmt.Printf("\n  Created runtime baseline: %s\n", result.BaselinePath)
+		fmt.Printf("  Benchmarks: %d\n\n", len(result.Benchmarks))
+		return
+	}
+
+	fmt.Printf("\n  Runtime Baseline Check\n\n")
+
+	for _, b := range result.Benchmarks {
+		var status string
+		switch b.Status {
+		case "regression":
+			status = "\033[31mREGRESSION\033[0m"
+		case "warning":
+			status = "\033[33mWARNING\033[0m"
+		case "ok":
+			status = "\033[32mOK\033[0m"
+		case "new":
+			status = "\033[33mNEW\033[0m"
+		}
+
+		if b.BaselineNs > 0 {
+			fmt.Printf("  %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n",
+				status, b.Name, b.BaselineNs, b.NsOp, b.Ratio)
+		} else {
+			fmt.Printf("  %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp)
+		}
+	}
+
+	fmt.Println()
+	if result.Regressions > 0 {
+		fmt.Printf("  \033[31mRegressions: %d\033[0m\n\n", result.Regressions)
+	} else {
+		fmt.Printf("  \033[32mNo regressions\033[0m\n\n")
+	}
+}
diff --git a/internal/benchmark/tune.go b/internal/benchmark/tune.go
new file mode 100644
index 0000000..7db259b
--- /dev/null
+++ b/internal/benchmark/tune.go
@@ -0,0 +1,90 @@
+package benchmark
+
+import "fmt"
+
+type TuneResult struct {
+	Results []TuneRun `json:"results"` // every run RunTune completed, in sweep order
+	Best    *TuneRun  `json:"best"`    // run with the highest P@1 (ties: higher MRR); nil if no run was recorded
+}
+
+type TuneRun struct {
+	LexicalWeight   float64 `json:"lexical_weight"`   // lexical weight used for this run
+	EmbeddingWeight float64 `json:"embedding_weight"` // RunTune sets this to 1 - LexicalWeight
+	MRR             float64 `json:"mrr"`              // report.Metrics.Overall.MRR for this run
+	PAt1            float64 `json:"p_at_1"`           // report.Metrics.Overall.PAt1 for this run
+	HitAt3          float64 `json:"hit_at_3"`         // report.Metrics.Overall.HitAt3 for this run
+}
+
+// RunTune benchmarks lexical/embedding weight splits from 0/1 towards 1/0 in
+// cfg.Step increments and records the best run (highest P@1, then MRR).
+// cfg.Step must be > 0: a zero or negative step would never finish the sweep.
+func RunTune(cfg TuneConfig) (*TuneResult, error) {
+	if cfg.Step <= 0 {
+		return nil, fmt.Errorf("invalid step %v: must be greater than 0", cfg.Step)
+	}
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &TuneResult{}
+	if cfg.Verbose {
+		fmt.Printf("  %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3")
+	}
+
+	for w := 0.0; w <= 1.0001; w += cfg.Step {
+		lexW := w
+		embW := 1.0 - w
+		runCfg := RunConfig{
+			Suite:           "corpus",
+			Strategy:        "combined",
+			Threshold:       0.01,
+			TopK:            5,
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			Mode:            "library",
+			Corpus:          cfg.Corpus,
+		}
+		report, err := RunCorpusBenchmark(ds, runCfg)
+		if err != nil {
+			return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err)
+		}
+
+		run := TuneRun{
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			MRR:             report.Metrics.Overall.MRR,
+			PAt1:            report.Metrics.Overall.PAt1,
+			HitAt3:          report.Metrics.Overall.HitAt3,
+		}
+		result.Results = append(result.Results, run)
+
+		if result.Best == nil || run.PAt1 > result.Best.PAt1 ||
+			(run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) {
+			best := run // copy so Best does not point at the per-iteration variable
+			result.Best = &best
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n",
+				lexW, embW, run.MRR, run.PAt1, run.HitAt3)
+		}
+	}
+
+	return result, nil
+}
+
+// PrintTuneResult writes the sweep size and, when set, the best run to stdout.
+func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
+	fmt.Printf("\n  Tested %d weight combinations\n\n", len(result.Results))
+	if best := result.Best; best != nil {
+		fmt.Printf("  Best weights:\n")
+		fmt.Printf("    Lexical:   %.2f\n", best.LexicalWeight)
+		fmt.Printf("    Embedding: %.2f\n", best.EmbeddingWeight)
+		fmt.Printf("    MRR:       %.4f\n", best.MRR)
+		fmt.Printf("    P@1:       %.4f\n", best.PAt1)
+		fmt.Printf("    Hit@3:     %.4f\n", best.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/types.go b/internal/benchmark/types.go
new file mode 100644
index 0000000..916978a
--- /dev/null
+++ b/internal/benchmark/types.go
@@ -0,0 +1,67 @@
+package benchmark
+
+type CheckResult struct {
+	Status    string        `json:"status"`
+	Summary   CheckSummary  `json:"summary"`
+	Delta     *MetricsDelta `json:"delta,omitempty"`
+	TopRegs   []Regression  `json:"top_regressions,omitempty"`
+	Artifacts Artifacts     `json:"artifacts"`
+	Report    *Report       `json:"-"`
+}
+
+type CheckSummary struct {
+	PAt1        float64 `json:"p_at_1"`
+	MRR         float64 `json:"mrr"`
+	HitAt3      float64 `json:"hit_at_3"`
+	Total       int     `json:"total"`
+	Regressions int     `json:"regressions"`
+	Warnings    int     `json:"warnings"`
+}
+
+type MetricsDelta struct {
+	PAt1   float64 `json:"p_at_1"`
+	MRR    float64 `json:"mrr"`
+	HitAt3 float64 `json:"hit_at_3"`
+}
+
+type Regression struct {
+	ID           string   `json:"id"`
+	Corpus       string   `json:"corpus"`
+	Query        string   `json:"query"`
+	Expected     []string `json:"expected"`
+	BaselineRef  string   `json:"baseline_ref,omitempty"`
+	CurrentRef   string   `json:"current_ref"`
+	Reason       string   `json:"reason"`
+	DebugCommand string   `json:"debug_command"`
+}
+
+type Artifacts struct {
+	ReportJSON string `json:"report_json"`
+	SummaryMD  string `json:"summary_md"`
+}
+
+type CompareResult struct {
+	Status       string       `json:"status"`
+	Delta        MetricsDelta `json:"delta"`
+	Regressions  []Regression `json:"regressions"`
+	Improvements []string     `json:"improvements"`
+}
+
+type LintResult struct {
+	Errors   int      `json:"errors"`
+	Warnings int      `json:"warnings"`
+	Messages []string `json:"messages"`
+}
+
+type CatalogResult struct {
+	Corpora      []CorpusSummary `json:"corpora"`
+	TotalQueries int             `json:"total_queries"`
+	ByTag        map[string]int  `json:"by_tag,omitempty"`
+	ByDifficulty map[string]int  `json:"by_difficulty,omitempty"`
+}
+
+type CorpusSummary struct {
+	ID      string   `json:"id"`
+	Queries int      `json:"queries"`
+	Tags    []string `json:"tags"`
+}

From 03c7a6e6831ec72a340d4316aa8c5fe082ea7890 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 18:50:28 +0100
Subject: [PATCH 28/30] fix: resolve golangci-lint errors in benchmark package

- Fix unchecked error returns (errcheck)
- Convert if-else chains to switch statements (gocritic)
- Use context.Background() instead of nil context (staticcheck)
- Replace WriteString(fmt.Sprintf) with fmt.Fprintf (staticcheck)
---
 dev                             | 14 +++++++--
 internal/benchmark/baseline.go  |  2 +-
 internal/benchmark/calibrate.go | 28 +++++++++---------
 internal/benchmark/check.go     | 28 +++++++++---------
 internal/benchmark/config.go    | 50 ++++++++++++++++-----------------
 internal/benchmark/dataset.go   |  8 +++---
 internal/benchmark/runner.go    | 50 +++++++++++++++++----------------
 internal/benchmark/runtime.go   | 13 +++++----
 recovery/benchmark_test.go      |  2 +-
 9 files changed, 103 insertions(+), 92 deletions(-)

diff --git a/dev b/dev
index 5d8c88d..11d53d9 100755
--- a/dev
+++ b/dev
@@ -128,9 +128,19 @@ run_check() {
   if [ -n "$unformatted" ]; then
     echo "  ${ERROR}✗${NC} Unformatted files:"
     echo "$unformatted"
-    exit 1
+    echo ""
+    printf "  Fix formatting now? (Y/n) "
+    read -r answer
+    if [ "$answer" != "n" ] && [ "$answer" != "N" ]; then
+      gofmt -w .
+      echo "  ${SUCCESS}✓${NC} Format (fixed)"
+    else
+      echo "  ${MUTED}Run: gofmt -w .${NC}"
+      exit 1
+    fi
+  else
+    echo "  ${SUCCESS}✓${NC} Format"
   fi
-  echo "  ${SUCCESS}✓${NC} Format"
 
   echo "  ${MUTED}2/4 Vet${NC}"
   go vet ./...
diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go
index de2a371..07cc418 100644
--- a/internal/benchmark/baseline.go
+++ b/internal/benchmark/baseline.go
@@ -82,7 +82,7 @@ func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*Baseline
 			previous = &old.Metrics.Overall
 		}
 		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
-		os.WriteFile(backupPath, data, 0644)
+		_ = os.WriteFile(backupPath, data, 0644)
 	}
 
 	result, err := createBaseline(root, baselinePath, cfg)
diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go
index 9c9fa33..48ec06e 100644
--- a/internal/benchmark/calibrate.go
+++ b/internal/benchmark/calibrate.go
@@ -1,6 +1,7 @@
 package benchmark
 
 import (
+	"context"
 	"fmt"
 
 	"github.com/pinchtab/semantic"
@@ -77,7 +78,7 @@ func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
 		tp, fp, fn, tn := 0, 0, 0, 0
 
 		for _, tc := range cases {
-			findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
+			findResult, _ := matcher.Find(context.Background(), tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
 				Threshold: threshold,
 				TopK:      5,
 			})
@@ -88,20 +89,17 @@ func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
 				topRef = findResult.Matches[0].Ref
 			}
 
-			if tc.query.ExpectNoMatch {
-				if hasMatch {
-					fp++
-				} else {
-					tn++
-				}
-			} else if len(tc.query.RelevantRefs) > 0 {
-				if !hasMatch {
-					fn++
-				} else if contains(tc.query.RelevantRefs, topRef) {
-					tp++
-				} else {
-					fp++
-				}
+			switch {
+			case tc.query.ExpectNoMatch && hasMatch:
+				fp++
+			case tc.query.ExpectNoMatch && !hasMatch:
+				tn++
+			case len(tc.query.RelevantRefs) > 0 && !hasMatch:
+				fn++
+			case len(tc.query.RelevantRefs) > 0 && contains(tc.query.RelevantRefs, topRef):
+				tp++
+			case len(tc.query.RelevantRefs) > 0:
+				fp++
 			}
 		}
 
diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
index 81171bb..e2ceedc 100644
--- a/internal/benchmark/check.go
+++ b/internal/benchmark/check.go
@@ -89,16 +89,16 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 		}
 	}
 
-	os.MkdirAll(cfg.OutputDir, 0755)
+	_ = os.MkdirAll(cfg.OutputDir, 0755)
 	ts := time.Now().Format("20060102_150405")
 	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
 	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
 
 	reportJSON, _ := json.MarshalIndent(report, "", "  ")
-	os.WriteFile(reportPath, reportJSON, 0644)
+	_ = os.WriteFile(reportPath, reportJSON, 0644)
 
 	summaryMD := generateSummaryMD(report, result)
-	os.WriteFile(summaryPath, []byte(summaryMD), 0644)
+	_ = os.WriteFile(summaryPath, []byte(summaryMD), 0644)
 
 	result.Artifacts.ReportJSON = reportPath
 	result.Artifacts.SummaryMD = summaryPath
@@ -131,24 +131,24 @@ func generateSummaryMD(report *Report, result *CheckResult) string {
 	var sb strings.Builder
 
 	sb.WriteString("# Benchmark Summary\n\n")
-	sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp))
+	fmt.Fprintf(&sb, "Generated: %s\n\n", report.Run.Timestamp)
 
 	sb.WriteString("## Overall Metrics\n\n")
 	sb.WriteString("| Metric | Value |\n")
 	sb.WriteString("|--------|-------|\n")
-	sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total))
-	sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR))
-	sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1))
-	sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3))
-	sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin))
+	fmt.Fprintf(&sb, "| Total | %d |\n", report.Metrics.Overall.Total)
+	fmt.Fprintf(&sb, "| MRR | %.4f |\n", report.Metrics.Overall.MRR)
+	fmt.Fprintf(&sb, "| P@1 | %.4f |\n", report.Metrics.Overall.PAt1)
+	fmt.Fprintf(&sb, "| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3)
+	fmt.Fprintf(&sb, "| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin)
 
 	if result.Delta != nil {
 		sb.WriteString("\n## Delta from Baseline\n\n")
 		sb.WriteString("| Metric | Delta |\n")
 		sb.WriteString("|--------|-------|\n")
-		sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1))
-		sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR))
-		sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3))
+		fmt.Fprintf(&sb, "| P@1 | %+.4f |\n", result.Delta.PAt1)
+		fmt.Fprintf(&sb, "| MRR | %+.4f |\n", result.Delta.MRR)
+		fmt.Fprintf(&sb, "| Hit@3 | %+.4f |\n", result.Delta.HitAt3)
 	}
 
 	if len(result.TopRegs) > 0 {
@@ -159,8 +159,8 @@ func generateSummaryMD(report *Report, result *CheckResult) string {
 			if len(result.TopRegs) > 10 {
 				break
 			}
-			sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n",
-				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")))
+			fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n",
+				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))
 		}
 	}
 
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index 83e3f5c..e41fe1c 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -19,16 +19,16 @@ type DefaultsConfig struct {
 }
 
 type Profile struct {
-	Strategy   string   `json:"strategy"`
-	Threshold  float64  `json:"threshold"`
-	TopK       int      `json:"top_k"`
-	Weights    Weights  `json:"weights"`
-	Suites     []string `json:"suites"`
-	Mode       string   `json:"mode"`
-	Inherits   string   `json:"inherits"`
-	Verbose    bool     `json:"verbose"`
-	Explain    bool     `json:"explain"`
-	FailOnReg  bool     `json:"fail_on_regression"`
+	Strategy  string   `json:"strategy"`
+	Threshold float64  `json:"threshold"`
+	TopK      int      `json:"top_k"`
+	Weights   Weights  `json:"weights"`
+	Suites    []string `json:"suites"`
+	Mode      string   `json:"mode"`
+	Inherits  string   `json:"inherits"`
+	Verbose   bool     `json:"verbose"`
+	Explain   bool     `json:"explain"`
+	FailOnReg bool     `json:"fail_on_regression"`
 }
 
 type Weights struct {
@@ -42,16 +42,16 @@ type BaselineConfig struct {
 }
 
 type BaselineQuality struct {
-	MaxOverallPAt1Drop  float64 `json:"max_overall_p_at_1_drop"`
-	MaxOverallMRRDrop   float64 `json:"max_overall_mrr_drop"`
+	MaxOverallPAt1Drop   float64 `json:"max_overall_p_at_1_drop"`
+	MaxOverallMRRDrop    float64 `json:"max_overall_mrr_drop"`
 	MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"`
-	MaxCorpusPAt1Drop   float64 `json:"max_corpus_p_at_1_drop"`
-	MaxTagPAt1Drop      float64 `json:"max_tag_p_at_1_drop"`
+	MaxCorpusPAt1Drop    float64 `json:"max_corpus_p_at_1_drop"`
+	MaxTagPAt1Drop       float64 `json:"max_tag_p_at_1_drop"`
 }
 
 type BaselineRuntime struct {
-	MaxNsOpRegressionRatio   float64 `json:"max_ns_op_regression_ratio"`
-	MaxAllocRegressionRatio  float64 `json:"max_alloc_regression_ratio"`
+	MaxNsOpRegressionRatio  float64 `json:"max_ns_op_regression_ratio"`
+	MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"`
 }
 
 type CheckConfig struct {
@@ -200,7 +200,7 @@ func ParseCheckFlags(args []string) CheckConfig {
 	fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details")
 	fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -231,7 +231,7 @@ func ParseRunFlags(args []string) RunConfig {
 	fs.BoolVar(&cfg.Explain, "explain", false, "include explanations")
 	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
 	fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -244,7 +244,7 @@ func ParseCompareFlags(args []string) CompareConfig {
 	fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)")
 	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -255,7 +255,7 @@ func ParseLintFlags(args []string) LintConfig {
 	}
 	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -266,7 +266,7 @@ func ParseCatalogFlags(args []string) CatalogConfig {
 	}
 	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)")
 	fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -279,7 +279,7 @@ func ParseBaselineFlags(args []string) BaselineCmdConfig {
 	fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name")
 	fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 
 	if len(fs.Args()) > 0 {
 		cfg.Action = fs.Args()[0]
@@ -294,7 +294,7 @@ func ParseCalibrateFlags(args []string) CalibrateConfig {
 	}
 	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -306,7 +306,7 @@ func ParseTuneFlags(args []string) TuneConfig {
 	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against")
 	fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -315,6 +315,6 @@ func ParseRuntimeFlags(args []string) RuntimeConfig {
 	cfg := RuntimeConfig{}
 	fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
diff --git a/internal/benchmark/dataset.go b/internal/benchmark/dataset.go
index 555b503..86c5014 100644
--- a/internal/benchmark/dataset.go
+++ b/internal/benchmark/dataset.go
@@ -25,10 +25,10 @@ type Query struct {
 }
 
 type Corpus struct {
-	ID        string
-	Path      string
-	Snapshot  []semantic.ElementDescriptor
-	Queries   []Query
+	ID       string
+	Path     string
+	Snapshot []semantic.ElementDescriptor
+	Queries  []Query
 }
 
 type Dataset struct {
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
index 391cc0a..f5b3a7d 100644
--- a/internal/benchmark/runner.go
+++ b/internal/benchmark/runner.go
@@ -8,14 +8,14 @@ import (
 )
 
 type QueryResult struct {
-	ID       string   `json:"id"`
-	Corpus   string   `json:"corpus"`
-	Query    string   `json:"query"`
-	Difficulty string `json:"difficulty"`
-	Tags     []string `json:"tags"`
-	Intent   string   `json:"intent,omitempty"`
-	PageType string   `json:"page_type,omitempty"`
-	Expected struct {
+	ID         string   `json:"id"`
+	Corpus     string   `json:"corpus"`
+	Query      string   `json:"query"`
+	Difficulty string   `json:"difficulty"`
+	Tags       []string `json:"tags"`
+	Intent     string   `json:"intent,omitempty"`
+	PageType   string   `json:"page_type,omitempty"`
+	Expected   struct {
 		RelevantRefs          []string `json:"relevant_refs"`
 		PartiallyRelevantRefs []string `json:"partially_relevant_refs"`
 	} `json:"expected"`
@@ -36,7 +36,7 @@ type QueryResult struct {
 		Margin            float64 `json:"margin"`
 	} `json:"metrics"`
 	Latency struct {
-		LibraryMs int64 `json:"library_ms"`
+		LibraryMs int64  `json:"library_ms"`
 		CLIMs     *int64 `json:"cli_ms,omitempty"`
 	} `json:"latency"`
 	Status string `json:"status"`
@@ -60,10 +60,10 @@ type Report struct {
 		Command   string `json:"command"`
 	} `json:"run"`
 	Dataset struct {
-		Name         string `json:"name"`
-		Version      string `json:"version,omitempty"`
-		QueryCount   int    `json:"query_count"`
-		CorpusCount  int    `json:"corpus_count"`
+		Name        string `json:"name"`
+		Version     string `json:"version,omitempty"`
+		QueryCount  int    `json:"query_count"`
+		CorpusCount int    `json:"corpus_count"`
 	} `json:"dataset"`
 	Config struct {
 		Profile   string  `json:"profile"`
@@ -74,11 +74,11 @@ type Report struct {
 	} `json:"config"`
 	Status  string `json:"status"`
 	Metrics struct {
-		Overall    OverallMetrics           `json:"overall"`
-		Latency    LatencyMetrics           `json:"latency"`
-		ByCorpus   map[string]CorpusMetrics `json:"by_corpus"`
+		Overall      OverallMetrics           `json:"overall"`
+		Latency      LatencyMetrics           `json:"latency"`
+		ByCorpus     map[string]CorpusMetrics `json:"by_corpus"`
 		ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"`
-		ByTag      map[string]CorpusMetrics `json:"by_tag"`
+		ByTag        map[string]CorpusMetrics `json:"by_tag"`
 	} `json:"metrics"`
 	Results []QueryResult `json:"results"`
 }
@@ -243,7 +243,8 @@ func computeQueryMetrics(result *QueryResult, query Query) {
 		if i >= 5 {
 			break
 		}
-		if relevantSet[m.Ref] {
+		switch {
+		case relevantSet[m.Ref]:
 			if result.Metrics.BestRelevantRank == nil {
 				rank := i + 1
 				result.Metrics.BestRelevantRank = &rank
@@ -256,11 +257,11 @@ func computeQueryMetrics(result *QueryResult, query Query) {
 				result.Metrics.HitAt3 = 1
 			}
 			result.Metrics.HitAt5 = 1
-		} else if partialSet[m.Ref] {
+		case partialSet[m.Ref]:
 			if i < 3 {
 				partialInTop3++
 			}
-		} else {
+		default:
 			if m.Score > result.Metrics.BestWrongScore {
 				result.Metrics.BestWrongScore = m.Score
 			}
@@ -270,17 +271,18 @@ func computeQueryMetrics(result *QueryResult, query Query) {
 	result.Metrics.Margin = result.Metrics.BestRelevantScore - result.Metrics.BestWrongScore
 
 	// Status
-	if query.ExpectNoMatch {
+	switch {
+	case query.ExpectNoMatch:
 		if len(result.Actual.Matches) == 0 {
 			result.Status = "no_match_expected"
 		} else {
 			result.Status = "unexpected_match"
 		}
-	} else if result.Metrics.PAt1 >= 1.0 {
+	case result.Metrics.PAt1 >= 1.0:
 		result.Status = "hit"
-	} else if result.Metrics.PAt1 >= 0.5 {
+	case result.Metrics.PAt1 >= 0.5:
 		result.Status = "partial"
-	} else {
+	default:
 		result.Status = "miss"
 	}
 }
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
index 8e28dcb..e7622f1 100644
--- a/internal/benchmark/runtime.go
+++ b/internal/benchmark/runtime.go
@@ -73,12 +73,13 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
 			result.Benchmarks[i].BaselineNs = base.NsOp
 			result.Benchmarks[i].Ratio = ratio
 
-			if ratio > maxRatio {
+			switch {
+			case ratio > maxRatio:
 				result.Benchmarks[i].Status = "regression"
 				result.Regressions++
-			} else if ratio > 1.1 {
+			case ratio > 1.1:
 				result.Benchmarks[i].Status = "warning"
-			} else {
+			default:
 				result.Benchmarks[i].Status = "ok"
 			}
 		} else {
@@ -131,13 +132,13 @@ func parseBenchOutput(output string) []RuntimeBenchmark {
 
 		for i, f := range fields {
 			if f == "ns/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%f", &nsOp)
+				_, _ = fmt.Sscanf(fields[i-1], "%f", &nsOp)
 			}
 			if f == "B/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%d", &bytesOp)
+				_, _ = fmt.Sscanf(fields[i-1], "%d", &bytesOp)
 			}
 			if f == "allocs/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%d", &allocsOp)
+				_, _ = fmt.Sscanf(fields[i-1], "%d", &allocsOp)
 			}
 		}
 
diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go
index 9670a68..1261dd6 100644
--- a/recovery/benchmark_test.go
+++ b/recovery/benchmark_test.go
@@ -237,7 +237,7 @@ func runBenchmarkScenarioB(b *testing.B, matcher semantic.ElementMatcher, sc Ben
 
 	err := fmt.Errorf("could not find node with id %s", sc.OriginalRef)
 
-	re.AttemptWithClassification(
+	_, _, _ = re.AttemptWithClassification(
 		context.Background(),
 		"test-tab",
 		sc.OriginalRef,

From c3a85ab63a223d6de22774844311834b06dd3fa2 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 22:52:43 +0100
Subject: [PATCH 29/30] feat: config-driven thresholds, validation, and
 deterministic output

---
 internal/benchmark/catalog.go     |  19 ++-
 internal/benchmark/check.go       |  45 +++++-
 internal/benchmark/compare.go     |  11 ++
 internal/benchmark/config.go      | 239 ++++++++++++++++++++++++++++--
 internal/benchmark/config_test.go | 147 ++++++++++++++++++
 internal/benchmark/runner.go      |  84 ++++++++++-
 internal/benchmark/runtime.go     |  42 +++++-
 internal/engine/benchmark_test.go | 119 ++++++++++++++-
 8 files changed, 673 insertions(+), 33 deletions(-)
 create mode 100644 internal/benchmark/config_test.go

diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go
index b4c4ec1..69a3091 100644
--- a/internal/benchmark/catalog.go
+++ b/internal/benchmark/catalog.go
@@ -62,14 +62,25 @@ func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
 	switch cfg.By {
 	case "difficulty":
 		fmt.Printf("\n  By Difficulty:\n")
-		for d, n := range result.ByDifficulty {
-			fmt.Printf("    %-10s %4d\n", d, n)
+		diffs := sortedKeys(result.ByDifficulty)
+		for _, d := range diffs {
+			fmt.Printf("    %-10s %4d\n", d, result.ByDifficulty[d])
 		}
 	case "tag":
 		fmt.Printf("\n  By Tag:\n")
-		for t, n := range result.ByTag {
-			fmt.Printf("    %-20s %4d\n", t, n)
+		tags := sortedKeys(result.ByTag)
+		for _, t := range tags {
+			fmt.Printf("    %-20s %4d\n", t, result.ByTag[t])
 		}
 	}
 	fmt.Printf("\n")
 }
+
+func sortedKeys(m map[string]int) []string {
+	keys := make([]string, 0, len(m))
+	for k := range m {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	return keys
+}
diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
index e2ceedc..0528059 100644
--- a/internal/benchmark/check.go
+++ b/internal/benchmark/check.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"sort"
 	"strings"
 	"time"
 )
@@ -40,6 +41,7 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 		Verbose:         cfg.Verbose,
 		Explain:         cfg.Explain,
 		OutputDir:       cfg.OutputDir,
+		Quick:           cfg.Quick,
 	}
 
 	report, err := RunCorpusBenchmark(ds, runCfg)
@@ -71,10 +73,28 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 	}
 	result.Summary.Regressions = len(result.TopRegs)
 
+	// Determine baseline path from config
 	baselinePath := cfg.BaselinePath
 	if baselinePath == "" {
-		baselinePath = filepath.Join(root, "baselines", "combined.json")
+		if benchCfg != nil {
+			baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json")
+		} else {
+			baselinePath = filepath.Join(root, "baselines", "combined.json")
+		}
+	}
+
+	// Get quality thresholds from config
+	var thresholds BaselineQuality
+	if benchCfg != nil {
+		thresholds = benchCfg.QualityThresholds()
+	} else {
+		thresholds = BaselineQuality{
+			MaxOverallPAt1Drop:   0.02,
+			MaxOverallMRRDrop:    0.02,
+			MaxOverallHitAt3Drop: 0.02,
+		}
 	}
+
 	if _, err := os.Stat(baselinePath); err == nil {
 		baseline, err := loadReport(baselinePath)
 		if err == nil {
@@ -83,12 +103,24 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 				MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
 				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
 			}
-			if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) {
-				result.Status = "fail"
+			if cfg.FailOnReg {
+				if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop ||
+					result.Delta.MRR < -thresholds.MaxOverallMRRDrop ||
+					result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop {
+					result.Status = "fail"
+				}
 			}
 		}
 	}
 
+	// Sort regressions for deterministic output
+	sort.Slice(result.TopRegs, func(i, j int) bool {
+		if result.TopRegs[i].Corpus != result.TopRegs[j].Corpus {
+			return result.TopRegs[i].Corpus < result.TopRegs[j].Corpus
+		}
+		return result.TopRegs[i].ID < result.TopRegs[j].ID
+	})
+
 	_ = os.MkdirAll(cfg.OutputDir, 0755)
 	ts := time.Now().Format("20060102_150405")
 	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
@@ -155,13 +187,16 @@ func generateSummaryMD(report *Report, result *CheckResult) string {
 		sb.WriteString("\n## Misses\n\n")
 		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
 		sb.WriteString("|----|--------|-------|-----|----------|\n")
-		for _, r := range result.TopRegs {
-			if len(result.TopRegs) > 10 {
+		for i, r := range result.TopRegs {
+			if i >= 10 {
 				break
 			}
 			fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n",
 				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))
 		}
+		if len(result.TopRegs) > 10 {
+			fmt.Fprintf(&sb, "\n*Showing 10 of %d misses.*\n", len(result.TopRegs))
+		}
 	}
 
 	return sb.String()
diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go
index 2b0a3d5..f0e6ccf 100644
--- a/internal/benchmark/compare.go
+++ b/internal/benchmark/compare.go
@@ -3,6 +3,7 @@ package benchmark
 import (
 	"encoding/json"
 	"fmt"
+	"sort"
 )
 
 func RunCompare(cfg CompareConfig) (*CompareResult, error) {
@@ -70,9 +71,19 @@ func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
 
 	if len(result.Regressions) > 0 {
 		fmt.Printf("\n  Regressions:\n")
+		sortRegressions(result.Regressions)
 		for _, r := range result.Regressions {
 			fmt.Printf("    %s: %s (%s)\n", r.ID, r.Reason, r.Query)
 		}
 	}
 	fmt.Printf("\n")
 }
+
+func sortRegressions(regs []Regression) {
+	sort.Slice(regs, func(i, j int) bool {
+		if regs[i].Corpus != regs[j].Corpus {
+			return regs[i].Corpus < regs[j].Corpus
+		}
+		return regs[i].ID < regs[j].ID
+	})
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index e41fe1c..cd0bbec 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -2,20 +2,35 @@ package benchmark
 
 import (
 	"encoding/json"
+	"errors"
 	"flag"
+	"fmt"
 	"os"
 	"path/filepath"
 )
 
 type Config struct {
-	Version  string             `json:"version"`
-	Defaults DefaultsConfig     `json:"defaults"`
-	Profiles map[string]Profile `json:"profiles"`
-	Baseline BaselineConfig     `json:"baseline"`
+	Version      string             `json:"version"`
+	Defaults     DefaultsConfig     `json:"defaults"`
+	Profiles     map[string]Profile `json:"profiles"`
+	Baseline     BaselineConfig     `json:"baseline"`
+	Results      ResultsConfig      `json:"results"`
+	Strategies   []string           `json:"strategies"`
+	SnapshotsDir string             `json:"snapshots_dir"`
 }
 
 type DefaultsConfig struct {
-	Profile string `json:"profile"`
+	Profile   string  `json:"profile"`
+	Strategy  string  `json:"strategy"`
+	Threshold float64 `json:"threshold"`
+	TopK      int     `json:"top_k"`
+	Weights   Weights `json:"weights"`
+}
+
+type ResultsConfig struct {
+	Dir                  string `json:"dir"`
+	BaselinesDir         string `json:"baselines_dir"`
+	GeneratedFilesPolicy string `json:"generated_files_policy"`
 }
 
 type Profile struct {
@@ -42,16 +57,20 @@ type BaselineConfig struct {
 }
 
 type BaselineQuality struct {
-	MaxOverallPAt1Drop   float64 `json:"max_overall_p_at_1_drop"`
-	MaxOverallMRRDrop    float64 `json:"max_overall_mrr_drop"`
-	MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"`
-	MaxCorpusPAt1Drop    float64 `json:"max_corpus_p_at_1_drop"`
-	MaxTagPAt1Drop       float64 `json:"max_tag_p_at_1_drop"`
+	MaxOverallPAt1Drop    float64 `json:"max_overall_p_at_1_drop"`
+	MaxOverallMRRDrop     float64 `json:"max_overall_mrr_drop"`
+	MaxOverallHitAt3Drop  float64 `json:"max_overall_hit_at_3_drop"`
+	MaxCorpusPAt1Drop     float64 `json:"max_corpus_p_at_1_drop"`
+	MaxDifficultyPAt1Drop float64 `json:"max_difficulty_p_at_1_drop"`
+	MaxTagPAt1Drop        float64 `json:"max_tag_p_at_1_drop"`
+	MaxMarginDropReport   float64 `json:"max_margin_drop_report"`
 }
 
 type BaselineRuntime struct {
 	MaxNsOpRegressionRatio  float64 `json:"max_ns_op_regression_ratio"`
 	MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"`
+	MaxCorpusLatencyP50MS   int     `json:"max_corpus_latency_p50_ms"`
+	MaxCorpusLatencyP95MS   int     `json:"max_corpus_latency_p95_ms"`
 }
 
 type CheckConfig struct {
@@ -80,6 +99,7 @@ type RunConfig struct {
 	Explain         bool
 	OutputDir       string
 	ReportName      string
+	Quick           bool
 }
 
 type CompareConfig struct {
@@ -152,11 +172,28 @@ func LoadConfig(benchmarkRoot string) (*Config, error) {
 func ResolveProfile(cfg *Config, name string) Profile {
 	p, ok := cfg.Profiles[name]
 	if !ok {
+		// Use defaults from config, falling back to hardcoded values
+		strategy := cfg.Defaults.Strategy
+		if strategy == "" {
+			strategy = "combined"
+		}
+		threshold := cfg.Defaults.Threshold
+		if threshold == 0 {
+			threshold = 0.01
+		}
+		topK := cfg.Defaults.TopK
+		if topK == 0 {
+			topK = 5
+		}
+		weights := cfg.Defaults.Weights
+		if weights.Lexical == 0 && weights.Embedding == 0 {
+			weights = Weights{Lexical: 0.6, Embedding: 0.4}
+		}
 		return Profile{
-			Strategy:  "combined",
-			Threshold: 0.01,
-			TopK:      5,
-			Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
+			Strategy:  strategy,
+			Threshold: threshold,
+			TopK:      topK,
+			Weights:   weights,
 			Suites:    []string{"corpus"},
 			Mode:      "library",
 		}
@@ -185,6 +222,180 @@ func ResolveProfile(cfg *Config, name string) Profile {
 	return p
 }
 
+// projectRoot returns the project root, two directory levels above benchmarkRoot (e.g. <root>/tests/benchmark).
+func projectRoot(benchmarkRoot string) string {
+	return filepath.Dir(filepath.Dir(benchmarkRoot))
+}
+
+// ResultsDir returns Results.Dir (relative paths resolve against the project root), or benchmarkRoot/results when unset.
+func (c *Config) ResultsDir(benchmarkRoot string) string {
+	if c.Results.Dir != "" {
+		if filepath.IsAbs(c.Results.Dir) {
+			return c.Results.Dir
+		}
+		return filepath.Join(projectRoot(benchmarkRoot), c.Results.Dir)
+	}
+	return filepath.Join(benchmarkRoot, "results")
+}
+
+// BaselinesDir returns Results.BaselinesDir (relative paths resolve against the project root), or benchmarkRoot/baselines when unset.
+func (c *Config) BaselinesDir(benchmarkRoot string) string {
+	if c.Results.BaselinesDir != "" {
+		if filepath.IsAbs(c.Results.BaselinesDir) {
+			return c.Results.BaselinesDir
+		}
+		return filepath.Join(projectRoot(benchmarkRoot), c.Results.BaselinesDir)
+	}
+	return filepath.Join(benchmarkRoot, "baselines")
+}
+
+// QualityThresholds returns the configured quality thresholds, replacing each zero (unset) field with its built-in default.
+func (c *Config) QualityThresholds() BaselineQuality {
+	q := c.Baseline.Quality
+	if q.MaxOverallPAt1Drop == 0 {
+		q.MaxOverallPAt1Drop = 0.02
+	}
+	if q.MaxOverallMRRDrop == 0 {
+		q.MaxOverallMRRDrop = 0.02
+	}
+	if q.MaxOverallHitAt3Drop == 0 {
+		q.MaxOverallHitAt3Drop = 0.02
+	}
+	if q.MaxCorpusPAt1Drop == 0 {
+		q.MaxCorpusPAt1Drop = 0.08
+	}
+	if q.MaxDifficultyPAt1Drop == 0 {
+		q.MaxDifficultyPAt1Drop = 0.08
+	}
+	if q.MaxTagPAt1Drop == 0 {
+		q.MaxTagPAt1Drop = 0.08
+	}
+	if q.MaxMarginDropReport == 0 {
+		q.MaxMarginDropReport = 0.15
+	}
+	return q
+}
+
+// RuntimeThresholds returns the configured runtime thresholds, defaulting zero regression ratios to 1.25; latency limits are returned as configured.
+func (c *Config) RuntimeThresholds() BaselineRuntime {
+	r := c.Baseline.Runtime
+	if r.MaxNsOpRegressionRatio == 0 {
+		r.MaxNsOpRegressionRatio = 1.25
+	}
+	if r.MaxAllocRegressionRatio == 0 {
+		r.MaxAllocRegressionRatio = 1.25
+	}
+	return r
+}
+
+// ValidateConfig checks the config for errors and returns a descriptive error if invalid.
+func ValidateConfig(cfg *Config) error {
+	var errs []error
+
+	// Validate strategies
+	if len(cfg.Strategies) == 0 {
+		errs = append(errs, errors.New("strategies list is empty"))
+	} else {
+		validStrategies := make(map[string]bool)
+		for _, s := range cfg.Strategies {
+			validStrategies[s] = true
+		}
+		// Check default strategy is in list
+		if cfg.Defaults.Strategy != "" && !validStrategies[cfg.Defaults.Strategy] {
+			errs = append(errs, fmt.Errorf("default strategy %q not in strategies list", cfg.Defaults.Strategy))
+		}
+		// Check profile strategies
+		for name, p := range cfg.Profiles {
+			if p.Strategy != "" && !validStrategies[p.Strategy] {
+				errs = append(errs, fmt.Errorf("profile %q uses strategy %q not in strategies list", name, p.Strategy))
+			}
+		}
+	}
+
+	// Validate weights
+	if cfg.Defaults.Weights.Lexical < 0 {
+		errs = append(errs, errors.New("defaults.weights.lexical must be non-negative"))
+	}
+	if cfg.Defaults.Weights.Embedding < 0 {
+		errs = append(errs, errors.New("defaults.weights.embedding must be non-negative"))
+	}
+	if cfg.Defaults.Weights.Lexical == 0 && cfg.Defaults.Weights.Embedding == 0 {
+		errs = append(errs, errors.New("defaults.weights: lexical and embedding cannot both be zero"))
+	}
+
+	// Validate profile weights
+	for name, p := range cfg.Profiles {
+		if p.Weights.Lexical < 0 {
+			errs = append(errs, fmt.Errorf("profile %q: weights.lexical must be non-negative", name))
+		}
+		if p.Weights.Embedding < 0 {
+			errs = append(errs, fmt.Errorf("profile %q: weights.embedding must be non-negative", name))
+		}
+	}
+
+	// Validate quality thresholds (must be non-negative; zero means "use the default")
+	q := cfg.Baseline.Quality
+	if q.MaxOverallPAt1Drop < 0 {
+		errs = append(errs, errors.New("baseline.quality.max_overall_p_at_1_drop must be non-negative"))
+	}
+	if q.MaxOverallMRRDrop < 0 {
+		errs = append(errs, errors.New("baseline.quality.max_overall_mrr_drop must be non-negative"))
+	}
+	if q.MaxOverallHitAt3Drop < 0 {
+		errs = append(errs, errors.New("baseline.quality.max_overall_hit_at_3_drop must be non-negative"))
+	}
+
+	// Validate runtime thresholds (must be >= 1 when set; zero means "use the default")
+	r := cfg.Baseline.Runtime
+	if r.MaxNsOpRegressionRatio != 0 && r.MaxNsOpRegressionRatio < 1 {
+		errs = append(errs, errors.New("baseline.runtime.max_ns_op_regression_ratio must be >= 1"))
+	}
+	if r.MaxAllocRegressionRatio != 0 && r.MaxAllocRegressionRatio < 1 {
+		errs = append(errs, errors.New("baseline.runtime.max_alloc_regression_ratio must be >= 1"))
+	}
+
+	// Validate profile inheritance
+	if err := validateProfileInheritance(cfg); err != nil {
+		errs = append(errs, err)
+	}
+
+	if len(errs) == 0 {
+		return nil
+	}
+	if len(errs) == 1 {
+		return errs[0]
+	}
+	return fmt.Errorf("config has %d errors: %v", len(errs), errs)
+}
+
+// validateProfileInheritance returns the first missing Inherits reference or inheritance cycle it finds, or nil if there is none.
+func validateProfileInheritance(cfg *Config) error {
+	for name, p := range cfg.Profiles {
+		if p.Inherits == "" {
+			continue
+		}
+		// Check reference exists
+		if _, ok := cfg.Profiles[p.Inherits]; !ok {
+			return fmt.Errorf("profile %q inherits from non-existent profile %q", name, p.Inherits)
+		}
+		// Check for cycles
+		visited := map[string]bool{name: true}
+		current := p.Inherits
+		for current != "" {
+			if visited[current] {
+				return fmt.Errorf("profile inheritance cycle detected: %q -> %q", name, current)
+			}
+			visited[current] = true
+			if parent, ok := cfg.Profiles[current]; ok {
+				current = parent.Inherits
+			} else {
+				break
+			}
+		}
+	}
+	return nil
+}
+
 func ParseCheckFlags(args []string) CheckConfig {
 	fs := flag.NewFlagSet("check", flag.ExitOnError)
 	cfg := CheckConfig{
diff --git a/internal/benchmark/config_test.go b/internal/benchmark/config_test.go
new file mode 100644
index 0000000..2590556
--- /dev/null
+++ b/internal/benchmark/config_test.go
@@ -0,0 +1,147 @@
+package benchmark
+
+import "testing"
+
+func TestValidateConfig_Valid(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"lexical", "embedding", "combined"},
+		Defaults: DefaultsConfig{
+			Strategy: "combined",
+			Weights:  Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Quality: BaselineQuality{
+				MaxOverallPAt1Drop: 0.02,
+			},
+			Runtime: BaselineRuntime{
+				MaxNsOpRegressionRatio: 1.25,
+			},
+		},
+	}
+	if err := ValidateConfig(cfg); err != nil {
+		t.Errorf("expected valid config, got error: %v", err)
+	}
+}
+
+func TestValidateConfig_EmptyStrategies(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for empty strategies")
+	}
+}
+
+func TestValidateConfig_InvalidDefaultStrategy(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"lexical", "embedding"},
+		Defaults: DefaultsConfig{
+			Strategy: "combined",
+			Weights:  Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for invalid default strategy")
+	}
+}
+
+func TestValidateConfig_NegativeWeights(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: -0.5, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for negative weight")
+	}
+}
+
+func TestValidateConfig_BothWeightsZero(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0, Embedding: 0},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error when both weights are zero")
+	}
+}
+
+func TestValidateConfig_RuntimeRatioTooLow(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Runtime: BaselineRuntime{
+				MaxNsOpRegressionRatio: 0.5,
+			},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for runtime ratio < 1")
+	}
+}
+
+func TestValidateConfig_ProfileInheritsMissing(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Profiles: map[string]Profile{
+			"fast": {Inherits: "nonexistent"},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for missing inherited profile")
+	}
+}
+
+func TestValidateConfig_ProfileInheritanceCycle(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Profiles: map[string]Profile{
+			"a": {Inherits: "b"},
+			"b": {Inherits: "c"},
+			"c": {Inherits: "a"},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for inheritance cycle")
+	}
+}
+
+func TestValidateConfig_NegativeQualityThreshold(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Quality: BaselineQuality{
+				MaxOverallPAt1Drop: -0.02,
+			},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for negative quality threshold")
+	}
+}
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
index f5b3a7d..253a4c3 100644
--- a/internal/benchmark/runner.go
+++ b/internal/benchmark/runner.go
@@ -2,6 +2,8 @@ package benchmark
 
 import (
 	"context"
+	"os/exec"
+	"strings"
 	"time"
 
 	"github.com/pinchtab/semantic"
@@ -118,6 +120,7 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
 	report.Run.ID = time.Now().Format("20060102-150405") + "-" + cfg.Profile
 	report.Run.Timestamp = time.Now().UTC().Format(time.RFC3339)
 	report.Run.Tool = "semantic-bench"
+	report.Run.GitSHA, report.Run.GitDirty = getGitInfo()
 	report.Dataset.Name = "semantic-ui-matching-corpus"
 	report.Dataset.QueryCount = ds.QueryCount()
 	report.Dataset.CorpusCount = ds.CorpusCount()
@@ -138,7 +141,12 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
 			continue
 		}
 
-		for _, query := range corpus.Queries {
+		queries := corpus.Queries
+		if cfg.Quick {
+			queries = selectQuickSubset(corpus.Queries)
+		}
+
+		for _, query := range queries {
 			if cfg.QueryID != "" && query.ID != cfg.QueryID {
 				continue
 			}
@@ -153,6 +161,56 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
 	return report, nil
 }
 
+// selectQuickSubset returns a deterministic subset of queries for quick mode.
+// It selects at most 3 queries per corpus, preferring a mix of difficulties.
+func selectQuickSubset(queries []Query) []Query {
+	if len(queries) <= 3 {
+		return queries
+	}
+
+	// Group by difficulty
+	byDiff := make(map[string][]Query)
+	for _, q := range queries {
+		diff := q.Difficulty
+		if diff == "" {
+			diff = "medium"
+		}
+		byDiff[diff] = append(byDiff[diff], q)
+	}
+
+	// Select one from each difficulty level, up to 3 total
+	var selected []Query
+	for _, diff := range []string{"easy", "medium", "hard"} {
+		if qs, ok := byDiff[diff]; ok && len(qs) > 0 {
+			selected = append(selected, qs[0])
+			if len(selected) >= 3 {
+				break
+			}
+		}
+	}
+
+	// If we don't have 3 yet, fill from remaining
+	if len(selected) < 3 {
+		for _, q := range queries {
+			found := false
+			for _, s := range selected {
+				if s.ID == q.ID {
+					found = true
+					break
+				}
+			}
+			if !found {
+				selected = append(selected, q)
+				if len(selected) >= 3 {
+					break
+				}
+			}
+		}
+	}
+
+	return selected
+}
+
 func createMatcher(cfg RunConfig) semantic.ElementMatcher {
 	embedder := semantic.NewHashingEmbedder(128)
 	switch cfg.Strategy {
@@ -189,8 +247,11 @@ func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg R
 
 	start := time.Now()
 	findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{
-		Threshold: threshold,
-		TopK:      topK,
+		Threshold:       threshold,
+		TopK:            topK,
+		LexicalWeight:   cfg.LexicalWeight,
+		EmbeddingWeight: cfg.EmbeddingWeight,
+		Explain:         cfg.Explain,
 	})
 	result.Latency.LibraryMs = time.Since(start).Milliseconds()
 
@@ -384,3 +445,20 @@ func sortInt64(s []int64) {
 		}
 	}
 }
+
+func getGitInfo() (sha string, dirty bool) {
+	cmd := exec.Command("git", "rev-parse", "HEAD")
+	out, err := cmd.Output()
+	if err != nil {
+		return "", false
+	}
+	sha = strings.TrimSpace(string(out))
+
+	cmd = exec.Command("git", "status", "--porcelain")
+	out, err = cmd.Output()
+	if err != nil {
+		return sha, false
+	}
+	dirty = len(strings.TrimSpace(string(out))) > 0
+	return sha, dirty
+}
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
index e7622f1..6545913 100644
--- a/internal/benchmark/runtime.go
+++ b/internal/benchmark/runtime.go
@@ -35,7 +35,26 @@ type runtimeBaseline struct {
 
 func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
 	root := FindBenchmarkRoot()
-	baselinePath := filepath.Join(root, "baselines", "runtime.json")
+
+	// Load config for thresholds
+	benchCfg, _ := LoadConfig(root)
+	var thresholds BaselineRuntime
+	if benchCfg != nil {
+		thresholds = benchCfg.RuntimeThresholds()
+	} else {
+		thresholds = BaselineRuntime{
+			MaxNsOpRegressionRatio:  1.25,
+			MaxAllocRegressionRatio: 1.25,
+		}
+	}
+
+	// Determine baseline path from config
+	var baselinePath string
+	if benchCfg != nil {
+		baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "runtime.json")
+	} else {
+		baselinePath = filepath.Join(root, "baselines", "runtime.json")
+	}
 
 	benchmarks, err := runGoBenchmarks()
 	if err != nil {
@@ -66,18 +85,29 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
 		baselineMap[b.Name] = b
 	}
 
-	maxRatio := 1.25
+	// Warning threshold is halfway between 1.0 and max ratio
+	warnRatio := 1.0 + ((thresholds.MaxNsOpRegressionRatio - 1.0) / 2.0)
+
 	for i, b := range result.Benchmarks {
 		if base, ok := baselineMap[b.Name]; ok {
-			ratio := b.NsOp / base.NsOp
+			nsRatio := b.NsOp / base.NsOp
 			result.Benchmarks[i].BaselineNs = base.NsOp
-			result.Benchmarks[i].Ratio = ratio
+			result.Benchmarks[i].Ratio = nsRatio
+
+			// Compare allocations only when both the baseline and the current run report non-zero allocs/op
+			var allocRatio float64
+			if base.AllocsOp > 0 && b.AllocsOp > 0 {
+				allocRatio = float64(b.AllocsOp) / float64(base.AllocsOp)
+			}
 
 			switch {
-			case ratio > maxRatio:
+			case nsRatio > thresholds.MaxNsOpRegressionRatio:
+				result.Benchmarks[i].Status = "regression"
+				result.Regressions++
+			case allocRatio > thresholds.MaxAllocRegressionRatio:
 				result.Benchmarks[i].Status = "regression"
 				result.Regressions++
-			case ratio > 1.1:
+			case nsRatio > warnRatio:
 				result.Benchmarks[i].Status = "warning"
 			default:
 				result.Benchmarks[i].Status = "ok"
diff --git a/internal/engine/benchmark_test.go b/internal/engine/benchmark_test.go
index c37528c..0ebc2c6 100644
--- a/internal/engine/benchmark_test.go
+++ b/internal/engine/benchmark_test.go
@@ -2,9 +2,10 @@ package engine
 
 import (
 	"context"
-	"github.com/pinchtab/semantic/internal/types"
 	"strconv"
 	"testing"
+
+	"github.com/pinchtab/semantic/internal/types"
 )
 
 // benchElements returns a realistic set of elements for benchmarking.
@@ -244,3 +245,119 @@ func BenchmarkCombinedFind_Issue24_100Elements(b *testing.B) {
 		})
 	}
 }
+
+// Focused microbenchmarks for individual components
+
+func BenchmarkParseQueryContext(b *testing.B) {
+	queries := []string{
+		"sign in button",
+		"the first email textbox in the login form",
+		"button not submit near the checkout section",
+		"second item in the dropdown menu",
+	}
+	b.ReportAllocs()
+
+	for b.Loop() {
+		for _, q := range queries {
+			ParseQueryContext(q)
+		}
+	}
+}
+
+func BenchmarkParseQueryContext_Complex(b *testing.B) {
+	q := "the third blue submit button in the checkout form not disabled"
+	b.ReportAllocs()
+
+	for b.Loop() {
+		ParseQueryContext(q)
+	}
+}
+
+func BenchmarkRemoveStopwords(b *testing.B) {
+	tokenSets := [][]string{
+		{"click", "the", "sign", "in", "button"},
+		{"find", "the", "email", "address", "textbox"},
+		{"the", "first", "item", "in", "a", "dropdown", "menu"},
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, tokens := range tokenSets {
+			removeStopwords(tokens)
+		}
+	}
+}
+
+func BenchmarkScoreFusion(b *testing.B) {
+	// Benchmark the weighted lexical/embedding score fusion arithmetic
+	lexScores := make([]float64, 100)
+	embScores := make([]float64, 100)
+	for i := range lexScores {
+		lexScores[i] = float64(i) / 100.0
+		embScores[i] = float64(100-i) / 100.0
+	}
+	lexWeight, embWeight := 0.6, 0.4
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for j := range lexScores {
+			_ = lexWeight*lexScores[j] + embWeight*embScores[j]
+		}
+	}
+}
+
+func BenchmarkLexicalScore_Variants(b *testing.B) {
+	cases := []struct {
+		name  string
+		query string
+		desc  string
+	}{
+		{"exact", "Sign In", "button: Sign In"},
+		{"partial", "sign", "button: Sign In"},
+		{"synonym", "login", "button: Sign In"},
+		{"mismatch", "checkout", "button: Sign In"},
+		{"long_query", "click the sign in button on the login page", "button: Sign In"},
+	}
+	for _, tc := range cases {
+		b.Run(tc.name, func(b *testing.B) {
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				LexicalScore(tc.query, tc.desc)
+			}
+		})
+	}
+}
+
+func BenchmarkCombinedFind_WeightVariants(b *testing.B) {
+	elements := benchElements()
+	ctx := context.Background()
+
+	weights := []struct {
+		name string
+		lex  float64
+		emb  float64
+	}{
+		{"lex_only", 1.0, 0.0},
+		{"emb_only", 0.0, 1.0},
+		{"balanced", 0.5, 0.5},
+		{"lex_heavy", 0.8, 0.2},
+		{"emb_heavy", 0.2, 0.8},
+	}
+
+	for _, w := range weights {
+		b.Run(w.name, func(b *testing.B) {
+			m := NewCombinedMatcher(NewHashingEmbedder(128))
+			opts := types.FindOptions{
+				Threshold:       0.3,
+				TopK:            3,
+				LexicalWeight:   w.lex,
+				EmbeddingWeight: w.emb,
+			}
+			b.ReportAllocs()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				_, _ = m.Find(ctx, "sign in button", elements, opts)
+			}
+		})
+	}
+}

From f8c81366b84937d7f28745ffb6cd323451d5df88 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 23:05:00 +0100
Subject: [PATCH 30/30] feat: config-driven thresholds with validation and
 enforcement

---
 internal/benchmark/check.go   | 55 ++++++++++++++++++++---------------
 internal/benchmark/config.go  |  5 +++-
 internal/benchmark/runner.go  |  6 ++--
 internal/benchmark/runtime.go | 22 ++++----------
 4 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
index 0528059..88234f6 100644
--- a/internal/benchmark/check.go
+++ b/internal/benchmark/check.go
@@ -18,16 +18,11 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 		return nil, fmt.Errorf("load dataset: %w", err)
 	}
 
-	benchCfg, _ := LoadConfig(root)
-	profile := Profile{
-		Strategy:  "combined",
-		Threshold: 0.01,
-		TopK:      5,
-		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
-	}
-	if benchCfg != nil {
-		profile = ResolveProfile(benchCfg, cfg.Profile)
+	benchCfg, err := LoadConfig(root)
+	if err != nil {
+		return nil, fmt.Errorf("load config: %w", err)
 	}
+	profile := ResolveProfile(benchCfg, cfg.Profile)
 
 	runCfg := RunConfig{
 		Suite:           "corpus",
@@ -76,24 +71,11 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 	// Determine baseline path from config
 	baselinePath := cfg.BaselinePath
 	if baselinePath == "" {
-		if benchCfg != nil {
-			baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json")
-		} else {
-			baselinePath = filepath.Join(root, "baselines", "combined.json")
-		}
+		baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json")
 	}
 
 	// Get quality thresholds from config
-	var thresholds BaselineQuality
-	if benchCfg != nil {
-		thresholds = benchCfg.QualityThresholds()
-	} else {
-		thresholds = BaselineQuality{
-			MaxOverallPAt1Drop:   0.02,
-			MaxOverallMRRDrop:    0.02,
-			MaxOverallHitAt3Drop: 0.02,
-		}
-	}
+	thresholds := benchCfg.QualityThresholds()
 
 	if _, err := os.Stat(baselinePath); err == nil {
 		baseline, err := loadReport(baselinePath)
@@ -104,11 +86,36 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
 			}
 			if cfg.FailOnReg {
+				// Check overall thresholds
 				if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop ||
 					result.Delta.MRR < -thresholds.MaxOverallMRRDrop ||
 					result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop {
 					result.Status = "fail"
 				}
+				// Check corpus-level thresholds
+				for corpus, current := range report.Metrics.ByCorpus {
+					if base, ok := baseline.Metrics.ByCorpus[corpus]; ok {
+						if current.PAt1-base.PAt1 < -thresholds.MaxCorpusPAt1Drop {
+							result.Status = "fail"
+						}
+					}
+				}
+				// Check difficulty-level thresholds
+				for diff, current := range report.Metrics.ByDifficulty {
+					if base, ok := baseline.Metrics.ByDifficulty[diff]; ok {
+						if current.PAt1-base.PAt1 < -thresholds.MaxDifficultyPAt1Drop {
+							result.Status = "fail"
+						}
+					}
+				}
+				// Check tag-level thresholds
+				for tag, current := range report.Metrics.ByTag {
+					if base, ok := baseline.Metrics.ByTag[tag]; ok {
+						if current.PAt1-base.PAt1 < -thresholds.MaxTagPAt1Drop {
+							result.Status = "fail"
+						}
+					}
+				}
 			}
 		}
 	}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index cd0bbec..2d233e2 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -166,6 +166,9 @@ func LoadConfig(benchmarkRoot string) (*Config, error) {
 	if err := json.Unmarshal(data, &cfg); err != nil {
 		return nil, err
 	}
+	if err := ValidateConfig(&cfg); err != nil {
+		return nil, fmt.Errorf("invalid config: %w", err)
+	}
 	return &cfg, nil
 }
 
@@ -408,7 +411,7 @@ func ParseCheckFlags(args []string) CheckConfig {
 	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
 	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (text|json|github)")
 	fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression")
-	fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks")
+	fs.BoolVar(&cfg.Quick, "quick", false, "smoke mode: 3 queries per corpus (not representative)")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details")
 	fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations")
 	_ = fs.Parse(args)
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
index 253a4c3..6f00821 100644
--- a/internal/benchmark/runner.go
+++ b/internal/benchmark/runner.go
@@ -161,8 +161,10 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
 	return report, nil
 }
 
-// selectQuickSubset returns a deterministic subset of queries for quick mode.
-// It selects at most 3 queries per corpus, preferring a mix of difficulties.
+// selectQuickSubset returns a deterministic subset for smoke testing.
+// Selects up to 3 queries per corpus by difficulty. This is NOT representative
+// of full corpus coverage—edge-case tags may be missed. Use for fast iteration,
+// not for final regression checks.
 func selectQuickSubset(queries []Query) []Query {
 	if len(queries) <= 3 {
 		return queries
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
index 6545913..dd68f75 100644
--- a/internal/benchmark/runtime.go
+++ b/internal/benchmark/runtime.go
@@ -37,24 +37,12 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
 	root := FindBenchmarkRoot()
 
 	// Load config for thresholds
-	benchCfg, _ := LoadConfig(root)
-	var thresholds BaselineRuntime
-	if benchCfg != nil {
-		thresholds = benchCfg.RuntimeThresholds()
-	} else {
-		thresholds = BaselineRuntime{
-			MaxNsOpRegressionRatio:  1.25,
-			MaxAllocRegressionRatio: 1.25,
-		}
-	}
-
-	// Determine baseline path from config
-	var baselinePath string
-	if benchCfg != nil {
-		baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "runtime.json")
-	} else {
-		baselinePath = filepath.Join(root, "baselines", "runtime.json")
+	benchCfg, err := LoadConfig(root)
+	if err != nil {
+		return nil, fmt.Errorf("load config: %w", err)
 	}
+	thresholds := benchCfg.RuntimeThresholds()
+	baselinePath := filepath.Join(benchCfg.BaselinesDir(root), "runtime.json")
 
 	benchmarks, err := runGoBenchmarks()
 	if err != nil {