diff --git a/.gitignore b/.gitignore
index 2f3b5cc..419dfaa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Binary
 /semantic
+/semantic-bench
 tests/benchmark/semantic
 tests/e2e/semantic
 *.exe
@@ -21,4 +22,5 @@ cover.out
 .claude
 tests/e2e/results/*.txt
 tests/benchmark/results/*.json
-tests/benchmark/results/*.md
\ No newline at end of file
+tests/benchmark/results/*.md
+tests/benchmark/baselines/*.json
\ No newline at end of file
diff --git a/README.md b/README.md
index 57e3053..83fb48e 100644
--- a/README.md
+++ b/README.md
@@ -204,7 +204,7 @@ The library uses only the Go standard library. No external dependencies, no mode
 
 ## Design Trade-offs
 
-See [docs/DESIGN.md](docs/DESIGN.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.
+See [docs/architecture/design-decisions.md](docs/architecture/design-decisions.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.
 
 ## Origin
 
diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go
new file mode 100644
index 0000000..076d71a
--- /dev/null
+++ b/cmd/semantic-bench/main.go
@@ -0,0 +1,168 @@
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/pinchtab/semantic/internal/benchmark"
+)
+
+// usage is the top-level help text. main prints it for -h/--help/help and
+// appends it to the "unknown command" error.
+const usage = `semantic-bench - Benchmark runner for semantic matching
+
+Usage:
+  semantic-bench <command> [flags]
+
+Commands:
+  check       Run benchmark and compare against baseline (default)
+  run         Run benchmark suites
+  compare     Compare two reports
+  lint        Validate dataset
+  catalog     Print dataset inventory
+  baseline    Manage quality baselines (create, update)
+  calibrate   Find optimal thresholds via precision/recall analysis
+  tune        Grid-search lexical/embedding weights
+  runtime     Check Go benchmark performance against baseline
+
+Flags:
+  -h, --help    Show help
+
+Run 'semantic-bench <command> --help' for command-specific help.
+`
+
+// main dispatches to the subcommand named by the first CLI argument.
+//
+// "check" is the default command: it runs when no arguments are given and
+// also when the first argument is a flag rather than a command name (for
+// example "semantic-bench --verbose"), in which case every argument is
+// forwarded to it. Any other unrecognised word prints the usage text to
+// stderr and exits with status 2.
+func main() {
+	if len(os.Args) < 2 {
+		runCheck(os.Args[1:])
+		return
+	}
+
+	cmd := os.Args[1]
+	args := os.Args[2:]
+
+	switch cmd {
+	case "check":
+		runCheck(args)
+	case "run":
+		runRun(args)
+	case "compare":
+		runCompare(args)
+	case "lint":
+		runLint(args)
+	case "catalog":
+		runCatalog(args)
+	case "baseline":
+		runBaseline(args)
+	case "calibrate":
+		runCalibrate(args)
+	case "tune":
+		runTune(args)
+	case "runtime":
+		runRuntime(args)
+	case "-h", "--help", "help":
+		fmt.Print(usage)
+	default:
+		// A leading flag means the command name was omitted; honour the
+		// documented default instead of rejecting the flag as a command.
+		if len(cmd) > 0 && cmd[0] == '-' {
+			runCheck(os.Args[1:])
+			return
+		}
+		fmt.Fprintf(os.Stderr, "unknown command: %s\n\n%s", cmd, usage)
+		os.Exit(2)
+	}
+}
+
+// runCheck executes the "check" command. It exits with status 2 when the
+// check cannot run and with status 1 when the result status is "fail".
+func runCheck(args []string) {
+	opts := benchmark.ParseCheckFlags(args)
+
+	outcome, err := benchmark.RunCheck(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintCheckResult(outcome, opts)
+	if failed := outcome.Status == "fail"; failed {
+		os.Exit(1)
+	}
+}
+
+// runRun executes the "run" command and prints its report. It exits with
+// status 2 when the benchmark cannot run.
+func runRun(args []string) {
+	opts := benchmark.ParseRunFlags(args)
+
+	report, err := benchmark.RunBenchmark(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintRunResult(report, opts)
+}
+
+// runCompare executes the "compare" command. It exits with status 2 when the
+// comparison cannot run and with status 1 when the result status is "fail".
+func runCompare(args []string) {
+	opts := benchmark.ParseCompareFlags(args)
+
+	outcome, err := benchmark.RunCompare(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintCompareResult(outcome, opts)
+	if failed := outcome.Status == "fail"; failed {
+		os.Exit(1)
+	}
+}
+
+// runLint executes the "lint" command. It exits with status 2 when linting
+// cannot run and with status 1 when the result reports any errors.
+func runLint(args []string) {
+	opts := benchmark.ParseLintFlags(args)
+
+	outcome, err := benchmark.RunLint(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintLintResult(outcome, opts)
+	if hasErrors := outcome.Errors > 0; hasErrors {
+		os.Exit(1)
+	}
+}
+
+// runCatalog executes the "catalog" command and prints the inventory. It
+// exits with status 2 when the catalog cannot be built.
+func runCatalog(args []string) {
+	opts := benchmark.ParseCatalogFlags(args)
+
+	inventory, err := benchmark.RunCatalog(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintCatalogResult(inventory, opts)
+}
+
+// runBaseline executes the "baseline" command and prints its outcome. It
+// exits with status 2 when the baseline operation fails.
+func runBaseline(args []string) {
+	opts := benchmark.ParseBaselineFlags(args)
+
+	outcome, err := benchmark.RunBaseline(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintBaselineResult(outcome, opts)
+}
+
+// runCalibrate executes the "calibrate" command and prints its
+// recommendations. It exits with status 2 when calibration fails.
+func runCalibrate(args []string) {
+	opts := benchmark.ParseCalibrateFlags(args)
+
+	outcome, err := benchmark.RunCalibrate(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintCalibrateResult(outcome, opts)
+}
+
+// runTune executes the "tune" command and prints its outcome. It exits with
+// status 2 when tuning fails.
+func runTune(args []string) {
+	opts := benchmark.ParseTuneFlags(args)
+
+	outcome, err := benchmark.RunTune(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintTuneResult(outcome, opts)
+}
+
+// runRuntime executes the "runtime" command. It exits with status 2 when the
+// check cannot run, and with status 1 when the result status is "fail" and
+// the parsed flags ask to fail on regression.
+func runRuntime(args []string) {
+	opts := benchmark.ParseRuntimeFlags(args)
+
+	outcome, err := benchmark.RunRuntime(opts)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(2)
+	}
+
+	benchmark.PrintRuntimeResult(outcome, opts)
+	if !opts.FailOnRegression {
+		return
+	}
+	if outcome.Status == "fail" {
+		os.Exit(1)
+	}
+}
diff --git a/dev b/dev
index dc15e75..11d53d9 100755
--- a/dev
+++ b/dev
@@ -11,17 +11,27 @@ ERROR=$'\033[38;2;230;57;70m'
 NC=$'\033[0m'
 
 commands=(
+  "pr:🚀:Pre-PR checks (check + e2e + bench)"
   "doctor:🩺:Setup dev environment"
   "test:🧪:Run unit tests"
   "test verbose:🧪:Run unit tests (verbose)"
   "test race:🧪:Run unit tests with race detector"
   "coverage:📊:Run tests with coverage report"
   "lint:🔍:Run golangci-lint"
+  "lint corpus:🔍:Lint benchmark corpus"
+  "lint docs:🔍:Check documentation links"
   "fmt:✨:Format code"
   "vet:🔬:Run go vet"
   "check:✅:Run all checks (fmt + vet + lint + test)"
   "build:📦:Build CLI binary"
-  "bench:🏋:Run corpus benchmark suite"
+  "bench:🏋:Run corpus benchmark"
+  "bench full:🏋:Run full benchmark suite"
+  "baseline:📏:Create quality baseline"
+  "baseline check:📏:Check against baseline"
+  "baseline update:📏:Update baseline (--accept)"
+  "calibrate:🎯:Calibrate threshold recommendations"
+  "runtime:⏱️:Check runtime baseline"
+  "tune:🎛️:Tune combined weights"
   "e2e:🐳:Run E2E tests (Docker)"
 )
 
@@ -36,6 +46,36 @@ show_help() {
   echo ""
 }
 
+run_pr() {
+  echo "  ${ACCENT}${BOLD}🚀 Pre-PR checks${NC}"
+  echo ""
+
+  echo "  ${MUTED}1/4 All checks (fmt + vet + lint + test)${NC}"
+  run_check
+
+  echo ""
+  echo "  ${MUTED}2/4 E2E tests${NC}"
+  if [[ -f tests/e2e/run.sh ]]; then
+    go build -o /tmp/semantic ./cmd/semantic
+    PATH="/tmp:$PATH" bash tests/e2e/run.sh
+    echo "  ${SUCCESS}✓${NC} E2E passed"
+  else
+    echo "  ${MUTED}Skipped (no e2e/run.sh)${NC}"
+  fi
+
+  echo ""
+  echo "  ${MUTED}3/4 Lint corpus${NC}"
+  run_lint_corpus
+
+  echo ""
+  echo "  ${MUTED}4/4 Corpus benchmark${NC}"
+  run_bench > /dev/null 2>&1
+  echo "  ${SUCCESS}✓${NC} Benchmark complete"
+
+  echo ""
+  echo "  ${SUCCESS}${BOLD}🚀 Ready for PR${NC}"
+}
+
 run_test() {
   echo "  ${ACCENT}${BOLD}🧪 Running tests${NC}"
   go test ./... -count=1
@@ -88,9 +128,19 @@ run_check() {
   if [ -n "$unformatted" ]; then
     echo "  ${ERROR}✗${NC} Unformatted files:"
     echo "$unformatted"
-    exit 1
+    echo ""
+    printf "  Fix formatting now? (Y/n) "
+    read -r answer
+    if [ "$answer" != "n" ] && [ "$answer" != "N" ]; then
+      gofmt -w .
+      echo "  ${SUCCESS}✓${NC} Format (fixed)"
+    else
+      echo "  ${MUTED}Run: gofmt -w .${NC}"
+      exit 1
+    fi
+  else
+    echo "  ${SUCCESS}✓${NC} Format"
   fi
-  echo "  ${SUCCESS}✓${NC} Format"
 
   echo "  ${MUTED}2/4 Vet${NC}"
   go vet ./...
@@ -115,8 +165,53 @@ run_build() {
 }
 
 run_bench() {
-  echo "  ${ACCENT}${BOLD}⏱️  Running corpus benchmark suite${NC}"
-  bash tests/benchmark/scripts/run-corpus-benchmark.sh
+  echo "  ${ACCENT}${BOLD}🏋 Running corpus benchmark${NC}"
+  go run ./cmd/semantic-bench check "$@"
+}
+
+# Run the "run" command over every suite; extra arguments are forwarded.
+run_bench_full() {
+  echo "  ${ACCENT}${BOLD}🏋 Running full benchmark suite${NC}"
+  go run ./cmd/semantic-bench run -suite=all "$@"
+}
+
+# Run the "lint" command on the benchmark corpus; extra arguments are forwarded.
+run_lint_corpus() {
+  echo "  ${ACCENT}${BOLD}🔍 Linting benchmark corpus${NC}"
+  go run ./cmd/semantic-bench lint "$@"
+}
+
+# Run the documentation link checker script.
+run_lint_docs() {
+  echo "  ${ACCENT}${BOLD}🔍 Checking documentation links${NC}"
+  bash scripts/check-docs-links.sh
+}
+
+# Run "baseline create"; extra arguments are forwarded.
+run_baseline() {
+  echo "  ${ACCENT}${BOLD}📏 Creating quality baseline${NC}"
+  go run ./cmd/semantic-bench baseline create "$@"
+}
+
+# Run the "check" command (same invocation as run_bench, different banner).
+run_baseline_check() {
+  echo "  ${ACCENT}${BOLD}📏 Checking against baseline${NC}"
+  go run ./cmd/semantic-bench check "$@"
+}
+
+# Run "baseline update". --accept is always supplied here, so the command's
+# own confirmation requirement is satisfied automatically.
+run_baseline_update() {
+  echo "  ${ACCENT}${BOLD}📏 Updating baseline${NC}"
+  go run ./cmd/semantic-bench baseline update --accept "$@"
+}
+
+# Run the "calibrate" command in verbose mode; extra arguments are forwarded.
+run_calibrate() {
+  echo "  ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}"
+  go run ./cmd/semantic-bench calibrate -verbose "$@"
+}
+
+# Run the "runtime" command; extra arguments are forwarded.
+run_runtime() {
+  echo "  ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}"
+  go run ./cmd/semantic-bench runtime "$@"
+}
+
+# Run the "tune" command in verbose mode; extra arguments are forwarded.
+run_tune() {
+  echo "  ${ACCENT}${BOLD}🎛️ Tuning combined weights${NC}"
+  go run ./cmd/semantic-bench tune -verbose "$@"
 }
 
 run_e2e() {
@@ -129,6 +224,7 @@ run_e2e() {
 }
 
 case "${1:-help}" in
+  pr)        run_pr ;;
   doctor)    exec bash scripts/doctor.sh ;;
   test)
     case "${2:-}" in
@@ -138,12 +234,33 @@ case "${1:-help}" in
     esac
     ;;
   coverage)  run_coverage ;;
-  lint)      run_lint ;;
+  lint)
+    # "lint corpus" forwards any remaining arguments to the corpus linter.
+    case "${2:-}" in
+      corpus) shift 2; run_lint_corpus "$@" ;;
+      docs) run_lint_docs ;;
+      *) run_lint ;;
+    esac
+    ;;
   fmt)       run_fmt ;;
   vet)       run_vet ;;
   check)     run_check ;;
   build)     run_build ;;
-  bench|benchmark) run_bench ;;
+  bench|benchmark)
+    # Both forms forward the remaining arguments to the benchmark runner.
+    case "${2:-}" in
+      full) shift 2; run_bench_full "$@" ;;
+      *) shift; run_bench "$@" ;;
+    esac
+    ;;
+  baseline)
+    case "${2:-}" in
+      check) shift 2; run_baseline_check "$@" ;;
+      update) shift 2; run_baseline_update "$@" ;;
+      *) shift; run_baseline "$@" ;;
+    esac
+    ;;
+  calibrate) shift; run_calibrate "$@" ;;
+  runtime)   shift; run_runtime "$@" ;;
+  tune)      shift; run_tune "$@" ;;
   e2e)       run_e2e ;;
   help|*)    show_help ;;
 esac
diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go
new file mode 100644
index 0000000..07cc418
--- /dev/null
+++ b/internal/benchmark/baseline.go
@@ -0,0 +1,110 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// BaselineResult describes the outcome of a baseline create or update.
+type BaselineResult struct {
+	// Action is "create" or "update".
+	Action string `json:"action"`
+	// Path is the baseline file that was written.
+	Path string `json:"path"`
+	// Metrics are the overall metrics of the freshly written baseline.
+	Metrics OverallMetrics `json:"metrics"`
+	// Previous holds the overall metrics of the replaced baseline. It is set
+	// only by an update, and only when the old file could be read and decoded.
+	Previous *OverallMetrics `json:"previous,omitempty"`
+}
+
+// RunBaseline creates or updates the quality baseline named cfg.Name inside
+// the "baselines" directory of the benchmark root.
+//
+// cfg.Action must be "create" or "update"; "update" additionally requires
+// cfg.Accept. cfg.Name must be a plain file name, because it is joined into
+// the target path and must not be able to leave the baselines directory.
+// All validation happens before anything is created on disk.
+//
+// NOTE(review): RunCheck resolves its baseline directory through the loaded
+// config rather than the fixed "baselines" folder used here — confirm both
+// point at the same location.
+func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) {
+	switch cfg.Action {
+	case "create":
+	case "update":
+		if !cfg.Accept {
+			return nil, fmt.Errorf("use --accept to confirm baseline update")
+		}
+	default:
+		return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action)
+	}
+	if cfg.Name == "" || cfg.Name != filepath.Base(cfg.Name) {
+		return nil, fmt.Errorf("invalid baseline name %q: must be a file name without path separators", cfg.Name)
+	}
+
+	root := FindBenchmarkRoot()
+	baselinesDir := filepath.Join(root, "baselines")
+	if err := os.MkdirAll(baselinesDir, 0755); err != nil {
+		return nil, fmt.Errorf("create baselines dir: %w", err)
+	}
+	baselinePath := filepath.Join(baselinesDir, cfg.Name+".json")
+
+	if cfg.Action == "update" {
+		return updateBaseline(root, baselinePath, cfg)
+	}
+	return createBaseline(root, baselinePath, cfg)
+}
+
+// createBaseline runs the corpus benchmark with a fixed "combined" strategy
+// configuration and writes the full report as indented JSON to baselinePath,
+// overwriting any existing file. cfg is currently unused.
+//
+// Every failure is wrapped with the step that produced it.
+func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	// NOTE(review): these values are hard-coded, while RunCheck takes them
+	// from the resolved profile — confirm the two are meant to agree.
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Mode:            "library",
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	data, err := json.MarshalIndent(report, "", "  ")
+	if err != nil {
+		return nil, fmt.Errorf("encode baseline: %w", err)
+	}
+	if err := os.WriteFile(baselinePath, data, 0644); err != nil {
+		return nil, fmt.Errorf("write baseline %s: %w", baselinePath, err)
+	}
+
+	return &BaselineResult{
+		Action:  "create",
+		Path:    baselinePath,
+		Metrics: report.Metrics.Overall,
+	}, nil
+}
+
+// updateBaseline replaces the baseline at baselinePath with a freshly
+// generated one.
+//
+// When a baseline already exists, its bytes are first copied to a
+// timestamped "<name>_<YYYYMMDD_HHMMSS>.backup.json" file next to it, and its
+// overall metrics (if the file decodes) are reported as Previous. The update
+// is aborted if the existing file cannot be read or the backup cannot be
+// written, so the old baseline is never overwritten without a copy. A
+// missing baseline is not an error; the update then behaves like a create.
+func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	var previous *OverallMetrics
+
+	data, err := os.ReadFile(baselinePath)
+	switch {
+	case err == nil:
+		var old Report
+		// An undecodable old baseline is still backed up byte-for-byte; it
+		// just cannot contribute "previous" metrics.
+		if json.Unmarshal(data, &old) == nil {
+			previous = &old.Metrics.Overall
+		}
+		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
+		if err := os.WriteFile(backupPath, data, 0644); err != nil {
+			return nil, fmt.Errorf("back up baseline to %s: %w", backupPath, err)
+		}
+	case !os.IsNotExist(err):
+		return nil, fmt.Errorf("read existing baseline %s: %w", baselinePath, err)
+	}
+
+	result, err := createBaseline(root, baselinePath, cfg)
+	if err != nil {
+		return nil, err
+	}
+	result.Action = "update"
+	result.Previous = previous
+	return result, nil
+}
+
+// PrintBaselineResult writes a human-readable summary of a baseline
+// operation to stdout: the action and path, the new metrics and, when
+// present, the metrics of the replaced baseline. cfg is unused.
+func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) {
+	show := func(indent string, m OverallMetrics) {
+		fmt.Printf("%sMRR:    %.4f\n", indent, m.MRR)
+		fmt.Printf("%sP@1:    %.4f\n", indent, m.PAt1)
+		fmt.Printf("%sHit@3:  %.4f\n", indent, m.HitAt3)
+	}
+
+	// "create"/"update" + "d" reads as "created"/"updated".
+	fmt.Printf("\n  Baseline %sd: %s\n\n", result.Action, result.Path)
+	show("  ", result.Metrics)
+
+	if prev := result.Previous; prev != nil {
+		fmt.Printf("\n  Previous:\n")
+		show("    ", *prev)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go
new file mode 100644
index 0000000..48ec06e
--- /dev/null
+++ b/internal/benchmark/calibrate.go
@@ -0,0 +1,173 @@
+package benchmark
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/pinchtab/semantic"
+)
+
+// CalibrateResult is the outcome of a threshold calibration run.
+type CalibrateResult struct {
+	// ByThreshold maps each tested threshold, formatted with two decimals,
+	// to the metrics measured at that threshold.
+	ByThreshold map[string]ThresholdMetrics `json:"by_threshold"`
+	// Recommendations holds the thresholds picked from ByThreshold.
+	Recommendations CalibrateRecommendations `json:"recommendations"`
+	// TotalCases is the number of queries evaluated per threshold.
+	TotalCases int `json:"total_cases"`
+}
+
+// ThresholdMetrics holds the confusion-matrix counts and the rates derived
+// from them for a single threshold.
+type ThresholdMetrics struct {
+	TP        int     `json:"tp"`                  // top match is one of the relevant refs
+	FP        int     `json:"fp"`                  // a match where none was expected, or a wrong top match
+	FN        int     `json:"fn"`                  // relevant refs exist but nothing matched
+	TN        int     `json:"tn"`                  // no match expected and none returned
+	Recall    float64 `json:"recall"`              // TP / (TP + FN)
+	Precision float64 `json:"precision"`           // TP / (TP + FP)
+	FPR       float64 `json:"false_positive_rate"` // FP / (TN + FP)
+	F1        float64 `json:"f1"`                  // harmonic mean of precision and recall
+}
+
+// CalibrateRecommendations are the thresholds suggested by a calibration run.
+type CalibrateRecommendations struct {
+	// DefaultThreshold is the tested threshold with the highest F1.
+	DefaultThreshold float64 `json:"default_threshold"`
+	// RecoveryThreshold is the tested threshold with the best precision among
+	// those reaching at least 0.85 recall.
+	RecoveryThreshold float64 `json:"recovery_threshold"`
+	// BestF1 is the F1 score achieved at DefaultThreshold.
+	BestF1 float64 `json:"best_f1"`
+}
+
+// RunCalibrate evaluates every threshold in cfg.Thresholds against the
+// dataset's queries and recommends thresholds from the results.
+//
+// Only the corpus with ID cfg.Corpus is used when that field is set. Each
+// query is classified from its top match:
+//
+//   - no match expected: a match is a false positive, no match a true negative
+//   - relevant refs listed: no match is a false negative, a top match among
+//     the relevant refs a true positive, any other top match a false positive
+//
+// Queries that neither expect no match nor list relevant refs are counted in
+// TotalCases but fall into no bucket.
+//
+// The default recommendation is the threshold with the highest F1. The
+// recovery recommendation is the threshold with the best precision among
+// those with recall >= 0.85; if that selection ends up at zero, the first
+// configured threshold is used instead.
+//
+// A failing matcher call aborts calibration with an error instead of being
+// silently counted as "no match", which would skew the FN/TN counts.
+//
+// Result keys use two decimals, so thresholds that agree to two decimals
+// overwrite each other in ByThreshold.
+func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &CalibrateResult{
+		ByThreshold: make(map[string]ThresholdMetrics),
+	}
+
+	type testCase struct {
+		query  Query
+		corpus *Corpus
+	}
+
+	var cases []testCase
+	for i := range ds.Corpora {
+		corpus := &ds.Corpora[i]
+		if cfg.Corpus != "" && corpus.ID != cfg.Corpus {
+			continue
+		}
+		for _, q := range corpus.Queries {
+			cases = append(cases, testCase{query: q, corpus: corpus})
+		}
+	}
+	result.TotalCases = len(cases)
+
+	if cfg.Verbose {
+		fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases))
+	}
+
+	runCfg := RunConfig{
+		Strategy:        "combined",
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+	}
+	matcher := createMatcher(runCfg)
+	ctx := context.Background()
+
+	var bestF1, bestF1Threshold float64
+	var bestRecallThreshold float64
+	var bestRecallWithPrecision float64
+
+	for _, threshold := range cfg.Thresholds {
+		tp, fp, fn, tn := 0, 0, 0, 0
+
+		for _, tc := range cases {
+			findResult, err := matcher.Find(ctx, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
+				Threshold: threshold,
+				TopK:      runCfg.TopK,
+			})
+			if err != nil {
+				return nil, fmt.Errorf("find %q in corpus %v at threshold %.2f: %w",
+					tc.query.QueryText, tc.corpus.ID, threshold, err)
+			}
+
+			hasMatch := len(findResult.Matches) > 0
+			topRef := ""
+			if hasMatch {
+				topRef = findResult.Matches[0].Ref
+			}
+
+			switch {
+			case tc.query.ExpectNoMatch && hasMatch:
+				fp++
+			case tc.query.ExpectNoMatch && !hasMatch:
+				tn++
+			case len(tc.query.RelevantRefs) > 0 && !hasMatch:
+				fn++
+			case len(tc.query.RelevantRefs) > 0 && contains(tc.query.RelevantRefs, topRef):
+				tp++
+			case len(tc.query.RelevantRefs) > 0:
+				fp++
+			}
+		}
+
+		totalPos := tp + fn
+		totalNeg := tn + fp
+
+		// Each rate stays at zero when its denominator is zero.
+		var recall, precision, fpr, f1 float64
+		if totalPos > 0 {
+			recall = float64(tp) / float64(totalPos)
+		}
+		if tp+fp > 0 {
+			precision = float64(tp) / float64(tp+fp)
+		}
+		if totalNeg > 0 {
+			fpr = float64(fp) / float64(totalNeg)
+		}
+		if precision+recall > 0 {
+			f1 = 2 * precision * recall / (precision + recall)
+		}
+
+		key := fmt.Sprintf("%.2f", threshold)
+		result.ByThreshold[key] = ThresholdMetrics{
+			TP: tp, FP: fp, FN: fn, TN: tn,
+			Recall: recall, Precision: precision, FPR: fpr, F1: f1,
+		}
+
+		// Strict comparisons keep the earliest threshold on ties.
+		if f1 > bestF1 {
+			bestF1 = f1
+			bestF1Threshold = threshold
+		}
+		if recall >= 0.85 && precision > bestRecallWithPrecision {
+			bestRecallWithPrecision = precision
+			bestRecallThreshold = threshold
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n",
+				threshold, tp, fp, fn, tn, recall, precision, f1)
+		}
+	}
+
+	if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 {
+		bestRecallThreshold = cfg.Thresholds[0]
+	}
+
+	result.Recommendations = CalibrateRecommendations{
+		DefaultThreshold:  bestF1Threshold,
+		RecoveryThreshold: bestRecallThreshold,
+		BestF1:            bestF1,
+	}
+
+	return result, nil
+}
+
+// contains reports whether ref is one of refs.
+func contains(refs []string, ref string) bool {
+	for i := 0; i < len(refs); i++ {
+		if refs[i] == ref {
+			return true
+		}
+	}
+	return false
+}
+
+// PrintCalibrateResult writes the calibration summary to stdout: how many
+// cases and thresholds were tested, followed by the recommended thresholds.
+// cfg is unused.
+func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) {
+	rec := result.Recommendations
+	thresholdCount := len(result.ByThreshold)
+
+	fmt.Printf("\n  Tested %d cases across %d thresholds\n\n", result.TotalCases, thresholdCount)
+	fmt.Printf("  Recommendations:\n")
+	fmt.Printf("    Default (best F1):   %.2f (F1=%.3f)\n", rec.DefaultThreshold, rec.BestF1)
+	fmt.Printf("    Recovery (recall):   %.2f\n", rec.RecoveryThreshold)
+	fmt.Println()
+}
diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go
new file mode 100644
index 0000000..69a3091
--- /dev/null
+++ b/internal/benchmark/catalog.go
@@ -0,0 +1,86 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"sort"
+)
+
+func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, err
+	}
+
+	result := &CatalogResult{
+		ByTag:        make(map[string]int),
+		ByDifficulty: make(map[string]int),
+	}
+
+	for _, c := range ds.Corpora {
+		tags := make(map[string]bool)
+		for _, q := range c.Queries {
+			result.TotalQueries++
+			result.ByDifficulty[q.Difficulty]++
+			for _, t := range q.Tags {
+				tags[t] = true
+				result.ByTag[t]++
+			}
+		}
+		var tagList []string
+		for t := range tags {
+			tagList = append(tagList, t)
+		}
+		sort.Strings(tagList)
+		result.Corpora = append(result.Corpora, CorpusSummary{
+			ID:      c.ID,
+			Queries: len(c.Queries),
+			Tags:    tagList,
+		})
+	}
+
+	return result, nil
+}
+
+func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n  Corpora: %d\n", len(result.Corpora))
+	fmt.Printf("  Total Queries: %d\n\n", result.TotalQueries)
+
+	fmt.Printf("  %-30s %8s\n", "Corpus", "Queries")
+	fmt.Printf("  %-30s %8s\n", "------", "-------")
+	for _, c := range result.Corpora {
+		fmt.Printf("  %-30s %8d\n", c.ID, c.Queries)
+	}
+
+	switch cfg.By {
+	case "difficulty":
+		fmt.Printf("\n  By Difficulty:\n")
+		diffs := sortedKeys(result.ByDifficulty)
+		for _, d := range diffs {
+			fmt.Printf("    %-10s %4d\n", d, result.ByDifficulty[d])
+		}
+	case "tag":
+		fmt.Printf("\n  By Tag:\n")
+		tags := sortedKeys(result.ByTag)
+		for _, t := range tags {
+			fmt.Printf("    %-20s %4d\n", t, result.ByTag[t])
+		}
+	}
+	fmt.Printf("\n")
+}
+
+// sortedKeys returns the keys of m in ascending order. The result is never
+// nil, even for an empty or nil map.
+func sortedKeys(m map[string]int) []string {
+	keys := make([]string, len(m))
+	next := 0
+	for key := range m {
+		keys[next] = key
+		next++
+	}
+	sort.Strings(keys)
+	return keys
+}
diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
new file mode 100644
index 0000000..88234f6
--- /dev/null
+++ b/internal/benchmark/check.go
@@ -0,0 +1,279 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+)
+
+// RunCheck runs the corpus benchmark with the profile named by cfg.Profile,
+// compares the outcome with a stored baseline when one exists, and writes a
+// JSON report and a Markdown summary into cfg.OutputDir.
+//
+// Every query whose status is "miss" is listed in the result, sorted by
+// corpus and then ID. The baseline is cfg.BaselinePath or, when that is
+// empty, "combined.json" in the configured baselines directory. If that path
+// cannot be stat'ed the comparison is skipped and Delta stays nil. If it
+// exists but cannot be loaded, RunCheck returns an error rather than passing
+// without having compared anything.
+//
+// With cfg.FailOnReg set, Status becomes "fail" when an overall metric, or
+// the P@1 of any corpus, difficulty or tag present in both reports, drops by
+// more than the configured allowance.
+//
+// Writing the artifacts is best-effort: failures there do not fail the check.
+func RunCheck(cfg CheckConfig) (*CheckResult, error) {
+	root := FindBenchmarkRoot()
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	benchCfg, err := LoadConfig(root)
+	if err != nil {
+		return nil, fmt.Errorf("load config: %w", err)
+	}
+	profile := ResolveProfile(benchCfg, cfg.Profile)
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        profile.Strategy,
+		Threshold:       profile.Threshold,
+		TopK:            profile.TopK,
+		LexicalWeight:   profile.Weights.Lexical,
+		EmbeddingWeight: profile.Weights.Embedding,
+		Profile:         cfg.Profile,
+		Mode:            "library",
+		Verbose:         cfg.Verbose,
+		Explain:         cfg.Explain,
+		OutputDir:       cfg.OutputDir,
+		Quick:           cfg.Quick,
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	result := &CheckResult{
+		Status: "pass",
+		Report: report,
+	}
+	result.Summary.PAt1 = report.Metrics.Overall.PAt1
+	result.Summary.MRR = report.Metrics.Overall.MRR
+	result.Summary.HitAt3 = report.Metrics.Overall.HitAt3
+	result.Summary.Total = report.Metrics.Overall.Total
+
+	for _, r := range report.Results {
+		if r.Status == "miss" {
+			result.TopRegs = append(result.TopRegs, Regression{
+				ID:           r.ID,
+				Corpus:       r.Corpus,
+				Query:        r.Query,
+				Expected:     r.Expected.RelevantRefs,
+				CurrentRef:   r.Actual.BestRef,
+				Reason:       "miss",
+				DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID),
+			})
+		}
+	}
+	result.Summary.Regressions = len(result.TopRegs)
+
+	// Determine baseline path from config
+	baselinePath := cfg.BaselinePath
+	if baselinePath == "" {
+		baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json")
+	}
+
+	// Get quality thresholds from config
+	thresholds := benchCfg.QualityThresholds()
+
+	if _, statErr := os.Stat(baselinePath); statErr == nil {
+		// The baseline is present, so failing to load it must not be
+		// mistaken for "nothing to compare against".
+		baseline, err := loadReport(baselinePath)
+		if err != nil {
+			return nil, fmt.Errorf("load baseline %s: %w", baselinePath, err)
+		}
+		result.Delta = &MetricsDelta{
+			PAt1:   report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+			MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+			HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+		}
+		if cfg.FailOnReg {
+			// Check overall thresholds
+			if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop ||
+				result.Delta.MRR < -thresholds.MaxOverallMRRDrop ||
+				result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop {
+				result.Status = "fail"
+			}
+			// Check corpus-level thresholds
+			for corpus, current := range report.Metrics.ByCorpus {
+				if base, ok := baseline.Metrics.ByCorpus[corpus]; ok {
+					if current.PAt1-base.PAt1 < -thresholds.MaxCorpusPAt1Drop {
+						result.Status = "fail"
+					}
+				}
+			}
+			// Check difficulty-level thresholds
+			for diff, current := range report.Metrics.ByDifficulty {
+				if base, ok := baseline.Metrics.ByDifficulty[diff]; ok {
+					if current.PAt1-base.PAt1 < -thresholds.MaxDifficultyPAt1Drop {
+						result.Status = "fail"
+					}
+				}
+			}
+			// Check tag-level thresholds
+			for tag, current := range report.Metrics.ByTag {
+				if base, ok := baseline.Metrics.ByTag[tag]; ok {
+					if current.PAt1-base.PAt1 < -thresholds.MaxTagPAt1Drop {
+						result.Status = "fail"
+					}
+				}
+			}
+		}
+	}
+
+	// Sort regressions for deterministic output
+	sort.Slice(result.TopRegs, func(i, j int) bool {
+		if result.TopRegs[i].Corpus != result.TopRegs[j].Corpus {
+			return result.TopRegs[i].Corpus < result.TopRegs[j].Corpus
+		}
+		return result.TopRegs[i].ID < result.TopRegs[j].ID
+	})
+
+	// Artifacts are best-effort: the check result matters more than the
+	// files, so directory, encoding and write errors are deliberately
+	// ignored here.
+	_ = os.MkdirAll(cfg.OutputDir, 0755)
+	ts := time.Now().Format("20060102_150405")
+	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
+	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
+
+	reportJSON, _ := json.MarshalIndent(report, "", "  ")
+	_ = os.WriteFile(reportPath, reportJSON, 0644)
+
+	summaryMD := generateSummaryMD(report, result)
+	_ = os.WriteFile(summaryPath, []byte(summaryMD), 0644)
+
+	result.Artifacts.ReportJSON = reportPath
+	result.Artifacts.SummaryMD = summaryPath
+
+	return result, nil
+}
+
+// RunBenchmark loads the dataset from the benchmark root and runs the corpus
+// benchmark over it with cfg.
+func RunBenchmark(cfg RunConfig) (*Report, error) {
+	dataset, loadErr := LoadDataset(FindBenchmarkRoot())
+	if loadErr != nil {
+		return nil, loadErr
+	}
+	return RunCorpusBenchmark(dataset, cfg)
+}
+
+func loadReport(path string) (*Report, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var r Report
+	if err := json.Unmarshal(data, &r); err != nil {
+		return nil, err
+	}
+	return &r, nil
+}
+
+// generateSummaryMD renders the Markdown summary of a check: the overall
+// metrics, the delta from the baseline when one was compared, and a table of
+// at most ten misses.
+//
+// Text placed in the misses table is escaped so that a "|" or a line break
+// inside a query, ref or ID cannot split a cell or end the row early.
+func generateSummaryMD(report *Report, result *CheckResult) string {
+	cell := strings.NewReplacer("|", `\|`, "\r\n", " ", "\n", " ", "\r", " ")
+
+	var sb strings.Builder
+
+	sb.WriteString("# Benchmark Summary\n\n")
+	fmt.Fprintf(&sb, "Generated: %s\n\n", report.Run.Timestamp)
+
+	sb.WriteString("## Overall Metrics\n\n")
+	sb.WriteString("| Metric | Value |\n")
+	sb.WriteString("|--------|-------|\n")
+	fmt.Fprintf(&sb, "| Total | %d |\n", report.Metrics.Overall.Total)
+	fmt.Fprintf(&sb, "| MRR | %.4f |\n", report.Metrics.Overall.MRR)
+	fmt.Fprintf(&sb, "| P@1 | %.4f |\n", report.Metrics.Overall.PAt1)
+	fmt.Fprintf(&sb, "| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3)
+	fmt.Fprintf(&sb, "| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin)
+
+	if result.Delta != nil {
+		sb.WriteString("\n## Delta from Baseline\n\n")
+		sb.WriteString("| Metric | Delta |\n")
+		sb.WriteString("|--------|-------|\n")
+		fmt.Fprintf(&sb, "| P@1 | %+.4f |\n", result.Delta.PAt1)
+		fmt.Fprintf(&sb, "| MRR | %+.4f |\n", result.Delta.MRR)
+		fmt.Fprintf(&sb, "| Hit@3 | %+.4f |\n", result.Delta.HitAt3)
+	}
+
+	if len(result.TopRegs) > 0 {
+		sb.WriteString("\n## Misses\n\n")
+		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
+		sb.WriteString("|----|--------|-------|-----|----------|\n")
+		for i, r := range result.TopRegs {
+			if i >= 10 {
+				break
+			}
+			fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n",
+				cell.Replace(r.ID), cell.Replace(r.Corpus), cell.Replace(r.Query),
+				cell.Replace(r.CurrentRef), cell.Replace(strings.Join(r.Expected, ",")))
+		}
+		if len(result.TopRegs) > 10 {
+			fmt.Fprintf(&sb, "\n*Showing 10 of %d misses.*\n", len(result.TopRegs))
+		}
+	}
+
+	return sb.String()
+}
+
+func PrintCheckResult(result *CheckResult, cfg CheckConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m Benchmark passed\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Benchmark failed\n")
+	}
+	fmt.Printf("\n")
+
+	fmt.Printf("  %-12s %8.4f\n", "MRR", result.Summary.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", result.Summary.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", result.Summary.Total)
+	fmt.Printf("  %-12s %8d\n", "Misses", result.Summary.Regressions)
+
+	if result.Delta != nil {
+		fmt.Printf("\n  Delta from baseline:\n")
+		printDelta("P@1", result.Delta.PAt1)
+		printDelta("MRR", result.Delta.MRR)
+		printDelta("Hit@3", result.Delta.HitAt3)
+	}
+
+	fmt.Printf("\n  Artifacts:\n")
+	fmt.Printf("    Report:  %s\n", result.Artifacts.ReportJSON)
+	fmt.Printf("    Summary: %s\n", result.Artifacts.SummaryMD)
+	fmt.Printf("\n")
+}
+
+// printDelta prints one metric delta to stdout with four decimals. Deltas
+// above 0.001 are green with an explicit "+", deltas below -0.001 are red,
+// and anything in between uses the default colour.
+func printDelta(name string, delta float64) {
+	color, sign := "\033[0m", ""
+	switch {
+	case delta > 0.001:
+		color, sign = "\033[32m", "+"
+	case delta < -0.001:
+		color = "\033[31m"
+	}
+	fmt.Printf("    %s%-8s %s%.4f\033[0m\n", color, name, sign, delta)
+}
+
+// PrintRunResult writes the overall metrics of a benchmark run to stdout.
+// With cfg.Verbose it also prints one line per query showing its status, the
+// best ref and its score. Statuses other than "miss" and "partial" are shown
+// as a hit.
+func PrintRunResult(report *Report, cfg RunConfig) {
+	overall := report.Metrics.Overall
+
+	fmt.Printf("\n")
+	fmt.Printf("  %-12s %8.4f\n", "MRR", overall.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", overall.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", overall.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", overall.Total)
+	fmt.Printf("\n")
+
+	if !cfg.Verbose {
+		return
+	}
+
+	labels := map[string]string{
+		"miss":    "\033[31mMISS\033[0m",
+		"partial": "\033[33mPART\033[0m",
+	}
+	for _, r := range report.Results {
+		label, known := labels[r.Status]
+		if !known {
+			label = "\033[32mHIT \033[0m"
+		}
+		fmt.Printf("  [%s] %s | %s | got=%s score=%.3f\n",
+			r.ID, label, r.Query, r.Actual.BestRef, r.Actual.BestScore)
+	}
+}
diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go
new file mode 100644
index 0000000..f0e6ccf
--- /dev/null
+++ b/internal/benchmark/compare.go
@@ -0,0 +1,89 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"sort"
+)
+
+// RunCompare loads the reports at cfg.BaselinePath and cfg.CurrentPath and
+// compares them. The result carries the overall P@1, MRR and Hit@3 deltas
+// (current minus baseline) and every query that was a "hit" in the baseline
+// but is not in the current report. Status is "fail" when P@1 or MRR dropped
+// by more than 0.02.
+func RunCompare(cfg CompareConfig) (*CompareResult, error) {
+	baseline, err := loadReport(cfg.BaselinePath)
+	if err != nil {
+		return nil, fmt.Errorf("load baseline: %w", err)
+	}
+	current, err := loadReport(cfg.CurrentPath)
+	if err != nil {
+		return nil, fmt.Errorf("load current: %w", err)
+	}
+
+	before, after := baseline.Metrics.Overall, current.Metrics.Overall
+
+	result := &CompareResult{Status: "pass"}
+	result.Delta.PAt1 = after.PAt1 - before.PAt1
+	result.Delta.MRR = after.MRR - before.MRR
+	result.Delta.HitAt3 = after.HitAt3 - before.HitAt3
+
+	if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 {
+		result.Status = "fail"
+	}
+
+	// Index the baseline by query ID so each current result finds its
+	// counterpart directly.
+	byID := make(map[string]QueryResult)
+	for _, r := range baseline.Results {
+		byID[r.ID] = r
+	}
+
+	for _, cur := range current.Results {
+		base, known := byID[cur.ID]
+		if !known || base.Status != "hit" || cur.Status == "hit" {
+			continue
+		}
+		result.Regressions = append(result.Regressions, Regression{
+			ID:          cur.ID,
+			Corpus:      cur.Corpus,
+			Query:       cur.Query,
+			BaselineRef: base.Actual.BestRef,
+			CurrentRef:  cur.Actual.BestRef,
+			Reason:      fmt.Sprintf("%s -> %s", base.Status, cur.Status),
+		})
+	}
+
+	return result, nil
+}
+
+// PrintCompareResult writes the comparison to stdout. With cfg.Format "json"
+// it prints the indented JSON encoding and nothing else. Otherwise it prints
+// a coloured verdict, the three metric deltas and, when there are any, the
+// regressions sorted by corpus and ID. The regression slice is sorted in
+// place.
+func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	verdict := "  \033[31m✗\033[0m Regression detected\n"
+	if result.Status == "pass" {
+		verdict = "  \033[32m✓\033[0m No regression\n"
+	}
+	fmt.Print("\n", verdict, "\n")
+
+	printDelta("P@1", result.Delta.PAt1)
+	printDelta("MRR", result.Delta.MRR)
+	printDelta("Hit@3", result.Delta.HitAt3)
+
+	if regs := result.Regressions; len(regs) > 0 {
+		fmt.Printf("\n  Regressions:\n")
+		sortRegressions(regs)
+		for i := range regs {
+			fmt.Printf("    %s: %s (%s)\n", regs[i].ID, regs[i].Reason, regs[i].Query)
+		}
+	}
+	fmt.Printf("\n")
+}
+
+// sortRegressions orders regs in place by corpus, then by ID within the same
+// corpus.
+func sortRegressions(regs []Regression) {
+	sort.Slice(regs, func(a, b int) bool {
+		left, right := regs[a], regs[b]
+		if left.Corpus == right.Corpus {
+			return left.ID < right.ID
+		}
+		return left.Corpus < right.Corpus
+	})
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
new file mode 100644
index 0000000..2d233e2
--- /dev/null
+++ b/internal/benchmark/config.go
@@ -0,0 +1,534 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"errors"
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+type Config struct { // Config is the root document that LoadConfig decodes from config/benchmark.json.
+	Version      string             `json:"version"`
+	Defaults     DefaultsConfig     `json:"defaults"`
+	Profiles     map[string]Profile `json:"profiles"`
+	Baseline     BaselineConfig     `json:"baseline"`
+	Results      ResultsConfig      `json:"results"`
+	Strategies   []string           `json:"strategies"`
+	SnapshotsDir string             `json:"snapshots_dir"`
+}
+
+type DefaultsConfig struct { // DefaultsConfig holds the settings ResolveProfile uses for unknown profile names.
+	Profile   string  `json:"profile"`
+	Strategy  string  `json:"strategy"`
+	Threshold float64 `json:"threshold"`
+	TopK      int     `json:"top_k"`
+	Weights   Weights `json:"weights"`
+}
+
+type ResultsConfig struct { // ResultsConfig names the output directories resolved by ResultsDir and BaselinesDir.
+	Dir                  string `json:"dir"`
+	BaselinesDir         string `json:"baselines_dir"`
+	GeneratedFilesPolicy string `json:"generated_files_policy"`
+}
+
+type Profile struct { // Profile is one named settings bundle; ResolveProfile fills its zero-valued fields from Inherits.
+	Strategy  string   `json:"strategy"`
+	Threshold float64  `json:"threshold"`
+	TopK      int      `json:"top_k"`
+	Weights   Weights  `json:"weights"`
+	Suites    []string `json:"suites"`
+	Mode      string   `json:"mode"`
+	Inherits  string   `json:"inherits"`
+	Verbose   bool     `json:"verbose"`
+	Explain   bool     `json:"explain"`
+	FailOnReg bool     `json:"fail_on_regression"`
+}
+
+type Weights struct { // Weights pairs the lexical and embedding weights; ValidateConfig rejects negative values.
+	Lexical   float64 `json:"lexical"`
+	Embedding float64 `json:"embedding"`
+}
+
+type BaselineConfig struct { // BaselineConfig groups the quality and runtime limits read via QualityThresholds and RuntimeThresholds.
+	Quality BaselineQuality `json:"quality"`
+	Runtime BaselineRuntime `json:"runtime"`
+}
+
+type BaselineQuality struct { // BaselineQuality lists metric-drop limits; a zero field is replaced by a default in QualityThresholds.
+	MaxOverallPAt1Drop    float64 `json:"max_overall_p_at_1_drop"`
+	MaxOverallMRRDrop     float64 `json:"max_overall_mrr_drop"`
+	MaxOverallHitAt3Drop  float64 `json:"max_overall_hit_at_3_drop"`
+	MaxCorpusPAt1Drop     float64 `json:"max_corpus_p_at_1_drop"`
+	MaxDifficultyPAt1Drop float64 `json:"max_difficulty_p_at_1_drop"`
+	MaxTagPAt1Drop        float64 `json:"max_tag_p_at_1_drop"`
+	MaxMarginDropReport   float64 `json:"max_margin_drop_report"`
+}
+
+type BaselineRuntime struct { // BaselineRuntime lists runtime limits; zero ratios become 1.25 in RuntimeThresholds.
+	MaxNsOpRegressionRatio  float64 `json:"max_ns_op_regression_ratio"`
+	MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"`
+	MaxCorpusLatencyP50MS   int     `json:"max_corpus_latency_p50_ms"`
+	MaxCorpusLatencyP95MS   int     `json:"max_corpus_latency_p95_ms"`
+}
+
+type CheckConfig struct { // CheckConfig carries the options produced by ParseCheckFlags.
+	Profile      string
+	BaselinePath string
+	OutputDir    string
+	Format       string
+	FailOnReg    bool
+	Quick        bool
+	Verbose      bool
+	Explain      bool
+}
+
+type RunConfig struct { // RunConfig carries the options produced by ParseRunFlags; Quick has no flag there and must be set by the caller.
+	Suite           string
+	Corpus          string
+	QueryID         string
+	Strategy        string
+	Threshold       float64
+	TopK            int
+	LexicalWeight   float64
+	EmbeddingWeight float64
+	Profile         string
+	Mode            string
+	Verbose         bool
+	Explain         bool
+	OutputDir       string
+	ReportName      string
+	Quick           bool
+}
+
+type CompareConfig struct { // CompareConfig carries the options produced by ParseCompareFlags.
+	BaselinePath string
+	CurrentPath  string
+	Format       string
+	Verbose      bool
+}
+
+type LintConfig struct { // LintConfig carries the options produced by ParseLintFlags.
+	Format  string
+	Verbose bool
+}
+
+type CatalogConfig struct { // CatalogConfig carries the options produced by ParseCatalogFlags.
+	Format string
+	By     string
+}
+
+type BaselineCmdConfig struct { // BaselineCmdConfig carries the options produced by ParseBaselineFlags.
+	Action  string // "create" or "update"
+	Name    string
+	Accept  bool
+	Verbose bool
+}
+
+type CalibrateConfig struct { // CalibrateConfig carries the options produced by ParseCalibrateFlags; Thresholds has no flag.
+	Corpus     string
+	Thresholds []float64
+	Verbose    bool
+}
+
+type TuneConfig struct { // TuneConfig carries the options produced by ParseTuneFlags.
+	Corpus  string
+	Step    float64
+	Verbose bool
+}
+
+type RuntimeConfig struct { // RuntimeConfig carries the options produced by ParseRuntimeFlags.
+	FailOnRegression bool
+	Verbose          bool
+}
+
+// FindBenchmarkRoot returns <dir>/tests/benchmark for the nearest directory, from the working directory up to the filesystem root, that holds the benchmark config or a go.mod; if none does (or the working directory is unknown) it falls back to tests/benchmark under cwd.
+func FindBenchmarkRoot() string {
+	cwd, _ := os.Getwd()
+	for d, prev := cwd, ""; d != prev; d, prev = filepath.Dir(d), d { // ends at filepath.Dir's fixed point ("/", a drive root, or ".")
+		for _, marker := range []string{"tests/benchmark/config/benchmark.json", "go.mod"} {
+			if _, err := os.Stat(filepath.Join(d, marker)); err == nil {
+				return filepath.Join(d, "tests/benchmark")
+			}
+		}
+	}
+	return filepath.Join(cwd, "tests/benchmark")
+}
+
+// LoadConfig reads config/benchmark.json under benchmarkRoot, decodes it into a Config and
+// validates it. Read and decode errors are returned as-is; validation errors are wrapped.
+func LoadConfig(benchmarkRoot string) (*Config, error) {
+	raw, err := os.ReadFile(filepath.Join(benchmarkRoot, "config/benchmark.json"))
+	if err != nil {
+		return nil, err
+	}
+	cfg := new(Config)
+	if err = json.Unmarshal(raw, cfg); err != nil {
+		return nil, err
+	}
+	if err = ValidateConfig(cfg); err != nil {
+		return nil, fmt.Errorf("invalid config: %w", err)
+	}
+	return cfg, nil
+}
+
+// ResolveProfile returns the effective settings for the named profile. An unknown name yields
+// cfg.Defaults (hardcoded values replace zero fields) with suite "corpus" and mode "library".
+// With Inherits set, zero-valued fields come from the parent; the bool fields are not inherited.
+func ResolveProfile(cfg *Config, name string) Profile {
+	p, known := cfg.Profiles[name]
+	if !known {
+		fallback := Profile{
+			Strategy:  cfg.Defaults.Strategy,
+			Threshold: cfg.Defaults.Threshold,
+			TopK:      cfg.Defaults.TopK,
+			Weights:   cfg.Defaults.Weights,
+			Suites:    []string{"corpus"},
+			Mode:      "library",
+		}
+		if fallback.Strategy == "" {
+			fallback.Strategy = "combined"
+		}
+		if fallback.Threshold == 0 {
+			fallback.Threshold = 0.01
+		}
+		if fallback.TopK == 0 {
+			fallback.TopK = 5
+		}
+		if fallback.Weights.Lexical == 0 && fallback.Weights.Embedding == 0 {
+			fallback.Weights = Weights{Lexical: 0.6, Embedding: 0.4}
+		}
+		return fallback
+	}
+	if p.Inherits == "" {
+		return p
+	}
+	parent := ResolveProfile(cfg, p.Inherits)
+	if p.Strategy == "" {
+		p.Strategy = parent.Strategy
+	}
+	if p.Threshold == 0 {
+		p.Threshold = parent.Threshold
+	}
+	if p.TopK == 0 {
+		p.TopK = parent.TopK
+	}
+	if p.Weights.Lexical == 0 && p.Weights.Embedding == 0 {
+		p.Weights = parent.Weights
+	}
+	if len(p.Suites) == 0 {
+		p.Suites = parent.Suites
+	}
+	if p.Mode == "" {
+		p.Mode = parent.Mode
+	}
+	return p
+}
+
+// projectRoot returns the directory two levels above benchmarkRoot, i.e. the parent of tests/benchmark.
+func projectRoot(benchmarkRoot string) string {
+	return filepath.Dir(filepath.Dir(benchmarkRoot))
+}
+
+// ResultsDir returns Results.Dir (a relative value is resolved against the project root), or <benchmarkRoot>/results when it is unset.
+func (c *Config) ResultsDir(benchmarkRoot string) string {
+	dir := c.Results.Dir
+	if dir == "" {
+		return filepath.Join(benchmarkRoot, "results")
+	} else if !filepath.IsAbs(dir) {
+		return filepath.Join(projectRoot(benchmarkRoot), dir)
+	}
+	return dir
+}
+
+// BaselinesDir returns Results.BaselinesDir (a relative value is resolved against the project root), or <benchmarkRoot>/baselines when it is unset.
+func (c *Config) BaselinesDir(benchmarkRoot string) string {
+	dir := c.Results.BaselinesDir
+	if dir == "" {
+		return filepath.Join(benchmarkRoot, "baselines")
+	} else if !filepath.IsAbs(dir) {
+		return filepath.Join(projectRoot(benchmarkRoot), dir)
+	}
+	return dir
+}
+
+// QualityThresholds returns a copy of Baseline.Quality in which every zero (unset) limit is
+// replaced by its built-in default: 0.02 for the overall drops, 0.08 for the per-corpus,
+// per-difficulty and per-tag P@1 drops, and 0.15 for the margin-drop report. Because zero
+// means "unset", an explicit limit of 0 cannot be expressed.
+func (c *Config) QualityThresholds() BaselineQuality {
+	q := c.Baseline.Quality
+	// fallback stores def in *limit when the limit was left at zero.
+	fallback := func(limit *float64, def float64) {
+		if *limit == 0 {
+			*limit = def
+		}
+	}
+
+	// Overall metrics.
+	fallback(&q.MaxOverallPAt1Drop, 0.02)
+	fallback(&q.MaxOverallMRRDrop, 0.02)
+	fallback(&q.MaxOverallHitAt3Drop, 0.02)
+	// Per-slice P@1.
+	fallback(&q.MaxCorpusPAt1Drop, 0.08)
+	fallback(&q.MaxDifficultyPAt1Drop, 0.08)
+	fallback(&q.MaxTagPAt1Drop, 0.08)
+	// Margin reporting.
+	fallback(&q.MaxMarginDropReport, 0.15)
+
+	return q
+}
+
+// RuntimeThresholds returns a copy of Baseline.Runtime with 1.25 substituted for each zero
+// (unset) regression ratio; the latency limits are passed through unchanged.
+func (c *Config) RuntimeThresholds() BaselineRuntime {
+	rt := c.Baseline.Runtime
+	for _, ratio := range []*float64{&rt.MaxNsOpRegressionRatio, &rt.MaxAllocRegressionRatio} {
+		if *ratio == 0 {
+			*ratio = 1.25
+		}
+	}
+	return rt
+}
+
+// ValidateConfig checks strategies, weights, baseline thresholds and profile inheritance,
+// collecting every problem; it returns nil, the single error, or one error listing them all.
+func ValidateConfig(cfg *Config) error {
+	var errs []error
+
+	// Validate strategies: the list must be non-empty and contain every strategy referenced.
+	if len(cfg.Strategies) == 0 {
+		errs = append(errs, errors.New("strategies list is empty"))
+	} else {
+		validStrategies := make(map[string]bool)
+		for _, s := range cfg.Strategies {
+			validStrategies[s] = true
+		}
+		if cfg.Defaults.Strategy != "" && !validStrategies[cfg.Defaults.Strategy] {
+			errs = append(errs, fmt.Errorf("default strategy %q not in strategies list", cfg.Defaults.Strategy))
+		}
+		for name, p := range cfg.Profiles {
+			if p.Strategy != "" && !validStrategies[p.Strategy] {
+				errs = append(errs, fmt.Errorf("profile %q uses strategy %q not in strategies list", name, p.Strategy))
+			}
+		}
+	}
+
+	// Validate weights: defaults first, then every profile.
+	if cfg.Defaults.Weights.Lexical < 0 {
+		errs = append(errs, errors.New("defaults.weights.lexical must be non-negative"))
+	}
+	if cfg.Defaults.Weights.Embedding < 0 {
+		errs = append(errs, errors.New("defaults.weights.embedding must be non-negative"))
+	}
+	if cfg.Defaults.Weights.Lexical == 0 && cfg.Defaults.Weights.Embedding == 0 {
+		errs = append(errs, errors.New("defaults.weights: lexical and embedding cannot both be zero"))
+	}
+	for name, p := range cfg.Profiles {
+		if p.Weights.Lexical < 0 {
+			errs = append(errs, fmt.Errorf("profile %q: weights.lexical must be non-negative", name))
+		}
+		if p.Weights.Embedding < 0 {
+			errs = append(errs, fmt.Errorf("profile %q: weights.embedding must be non-negative", name))
+		}
+	}
+
+	// Validate quality thresholds: all seven limits must be non-negative (zero means unset).
+	q := cfg.Baseline.Quality
+	nonNegative := func(key string, limit float64) {
+		if limit < 0 {
+			errs = append(errs, fmt.Errorf("baseline.quality.%s must be non-negative", key))
+		}
+	}
+	nonNegative("max_overall_p_at_1_drop", q.MaxOverallPAt1Drop)
+	nonNegative("max_overall_mrr_drop", q.MaxOverallMRRDrop)
+	nonNegative("max_overall_hit_at_3_drop", q.MaxOverallHitAt3Drop)
+	nonNegative("max_corpus_p_at_1_drop", q.MaxCorpusPAt1Drop)
+	nonNegative("max_difficulty_p_at_1_drop", q.MaxDifficultyPAt1Drop)
+	nonNegative("max_tag_p_at_1_drop", q.MaxTagPAt1Drop)
+	nonNegative("max_margin_drop_report", q.MaxMarginDropReport)
+
+	// Validate runtime thresholds (must be >= 1)
+	r := cfg.Baseline.Runtime
+	if r.MaxNsOpRegressionRatio != 0 && r.MaxNsOpRegressionRatio < 1 {
+		errs = append(errs, errors.New("baseline.runtime.max_ns_op_regression_ratio must be >= 1"))
+	}
+	if r.MaxAllocRegressionRatio != 0 && r.MaxAllocRegressionRatio < 1 {
+		errs = append(errs, errors.New("baseline.runtime.max_alloc_regression_ratio must be >= 1"))
+	}
+
+	// Validate profile inheritance
+	if err := validateProfileInheritance(cfg); err != nil {
+		errs = append(errs, err)
+	}
+
+	if len(errs) == 0 {
+		return nil
+	}
+	if len(errs) == 1 {
+		return errs[0]
+	}
+	return fmt.Errorf("config has %d errors: %v", len(errs), errs)
+}
+
+// validateProfileInheritance checks for missing references and cycles.
+func validateProfileInheritance(cfg *Config) error {
+	// Profiles are visited in map iteration order; the first problem found is returned.
+	for name, p := range cfg.Profiles {
+		if p.Inherits == "" {
+			continue
+		}
+		if _, ok := cfg.Profiles[p.Inherits]; !ok {
+			return fmt.Errorf("profile %q inherits from non-existent profile %q", name, p.Inherits)
+		}
+		// Follow the chain upward, remembering every profile seen so far.
+		seen := map[string]bool{name: true}
+		for next := p.Inherits; next != ""; {
+			if seen[next] {
+				return fmt.Errorf("profile inheritance cycle detected: %q -> %q", name, next)
+			}
+			seen[next] = true
+			parent, ok := cfg.Profiles[next]
+			if !ok {
+				break
+			}
+			next = parent.Inherits
+		}
+	}
+	return nil
+}
+
+func ParseCheckFlags(args []string) CheckConfig { // ParseCheckFlags parses the "check" flags; the flag set uses flag.ExitOnError.
+	fs := flag.NewFlagSet("check", flag.ExitOnError)
+	cfg := CheckConfig{
+		Profile:   "default",
+		OutputDir: filepath.Join(FindBenchmarkRoot(), "results"),
+		Format:    "text",
+	}
+	fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile")
+	fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline file path")
+	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (text|json|github)")
+	fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression")
+	fs.BoolVar(&cfg.Quick, "quick", false, "smoke mode: 3 queries per corpus (not representative)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details")
+	fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+	return cfg
+}
+
+func ParseRunFlags(args []string) RunConfig { // ParseRunFlags parses the "run" flags; the flag set uses flag.ExitOnError.
+	fs := flag.NewFlagSet("run", flag.ExitOnError)
+	cfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Profile:         "default",
+		Mode:            "library",
+		OutputDir:       filepath.Join(FindBenchmarkRoot(), "results"),
+	}
+	fs.StringVar(&cfg.Suite, "suite", cfg.Suite, "suite to run (corpus|recovery|classification|runtime|all)")
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to run")
+	fs.StringVar(&cfg.QueryID, "query", "", "specific query ID to run")
+	fs.StringVar(&cfg.Strategy, "strategy", cfg.Strategy, "matching strategy")
+	fs.Float64Var(&cfg.Threshold, "threshold", cfg.Threshold, "score threshold")
+	fs.IntVar(&cfg.TopK, "top-k", cfg.TopK, "number of results")
+	fs.Float64Var(&cfg.LexicalWeight, "lexical-weight", cfg.LexicalWeight, "lexical weight")
+	fs.Float64Var(&cfg.EmbeddingWeight, "embedding-weight", cfg.EmbeddingWeight, "embedding weight")
+	fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile")
+	fs.StringVar(&cfg.Mode, "mode", cfg.Mode, "execution mode (cli|library|both)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.BoolVar(&cfg.Explain, "explain", false, "include explanations")
+	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
+	fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+	return cfg
+}
+
+func ParseCompareFlags(args []string) CompareConfig { // ParseCompareFlags parses the "compare" flags; the "(required)" paths are not enforced here.
+	fs := flag.NewFlagSet("compare", flag.ExitOnError)
+	cfg := CompareConfig{
+		Format: "text",
+	}
+	fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline report path (required)")
+	fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)")
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+	return cfg
+}
+
+func ParseLintFlags(args []string) LintConfig { // ParseLintFlags parses the "lint" flags; the flag set uses flag.ExitOnError.
+	fs := flag.NewFlagSet("lint", flag.ExitOnError)
+	cfg := LintConfig{
+		Format: "text",
+	}
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+	return cfg
+}
+
+func ParseCatalogFlags(args []string) CatalogConfig { // ParseCatalogFlags parses the "catalog" flags; the flag set uses flag.ExitOnError.
+	fs := flag.NewFlagSet("catalog", flag.ExitOnError)
+	cfg := CatalogConfig{
+		Format: "table",
+	}
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)")
+	fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+	return cfg
+}
+
+func ParseBaselineFlags(args []string) BaselineCmdConfig { // ParseBaselineFlags parses the "baseline" flags and an optional action argument.
+	fs := flag.NewFlagSet("baseline", flag.ExitOnError)
+	cfg := BaselineCmdConfig{
+		Action: "create",
+		Name:   "combined",
+	}
+	fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name")
+	fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+
+	if len(fs.Args()) > 0 { // the first non-flag argument replaces the default action; its value is not checked here
+		cfg.Action = fs.Args()[0]
+	}
+	return cfg
+}
+
+func ParseCalibrateFlags(args []string) CalibrateConfig { // ParseCalibrateFlags parses the "calibrate" flags; Thresholds is a fixed 0.05..0.60 sweep.
+	fs := flag.NewFlagSet("calibrate", flag.ExitOnError)
+	cfg := CalibrateConfig{
+		Thresholds: []float64{0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60},
+	}
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+	return cfg
+}
+
+func ParseTuneFlags(args []string) TuneConfig { // ParseTuneFlags parses the "tune" flags; the -step value is not range-checked here.
+	fs := flag.NewFlagSet("tune", flag.ExitOnError)
+	cfg := TuneConfig{
+		Step: 0.1,
+	}
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against")
+	fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+	return cfg
+}
+
+func ParseRuntimeFlags(args []string) RuntimeConfig { // ParseRuntimeFlags parses the "runtime" flags; the flag set uses flag.ExitOnError.
+	fs := flag.NewFlagSet("runtime", flag.ExitOnError)
+	cfg := RuntimeConfig{}
+	fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	_ = fs.Parse(args) // error ignored: with flag.ExitOnError, Parse exits instead of returning one
+	return cfg
+}
diff --git a/internal/benchmark/config_test.go b/internal/benchmark/config_test.go
new file mode 100644
index 0000000..2590556
--- /dev/null
+++ b/internal/benchmark/config_test.go
@@ -0,0 +1,147 @@
+package benchmark
+
+import "testing"
+
+func TestValidateConfig_Valid(t *testing.T) { // a consistent config must validate without error
+	cfg := &Config{
+		Strategies: []string{"lexical", "embedding", "combined"},
+		Defaults: DefaultsConfig{
+			Strategy: "combined",
+			Weights:  Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Quality: BaselineQuality{
+				MaxOverallPAt1Drop: 0.02,
+			},
+			Runtime: BaselineRuntime{
+				MaxNsOpRegressionRatio: 1.25,
+			},
+		},
+	}
+	if err := ValidateConfig(cfg); err != nil {
+		t.Errorf("expected valid config, got error: %v", err)
+	}
+}
+
+func TestValidateConfig_EmptyStrategies(t *testing.T) { // an empty strategies list must be rejected
+	cfg := &Config{
+		Strategies: []string{},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for empty strategies")
+	}
+}
+
+func TestValidateConfig_InvalidDefaultStrategy(t *testing.T) { // a default strategy missing from the list must be rejected
+	cfg := &Config{
+		Strategies: []string{"lexical", "embedding"},
+		Defaults: DefaultsConfig{
+			Strategy: "combined",
+			Weights:  Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for invalid default strategy")
+	}
+}
+
+func TestValidateConfig_NegativeWeights(t *testing.T) { // a negative default weight must be rejected
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: -0.5, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for negative weight")
+	}
+}
+
+func TestValidateConfig_BothWeightsZero(t *testing.T) { // default weights that are both zero must be rejected
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0, Embedding: 0},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error when both weights are zero")
+	}
+}
+
+func TestValidateConfig_RuntimeRatioTooLow(t *testing.T) { // a non-zero runtime ratio below 1 must be rejected
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Runtime: BaselineRuntime{
+				MaxNsOpRegressionRatio: 0.5,
+			},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for runtime ratio < 1")
+	}
+}
+
+func TestValidateConfig_ProfileInheritsMissing(t *testing.T) { // inheriting from an undefined profile must be rejected
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Profiles: map[string]Profile{
+			"fast": {Inherits: "nonexistent"},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for missing inherited profile")
+	}
+}
+
+func TestValidateConfig_ProfileInheritanceCycle(t *testing.T) { // a three-profile cycle a -> b -> c -> a must be rejected
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Profiles: map[string]Profile{
+			"a": {Inherits: "b"},
+			"b": {Inherits: "c"},
+			"c": {Inherits: "a"},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for inheritance cycle")
+	}
+}
+
+func TestValidateConfig_NegativeQualityThreshold(t *testing.T) { // a negative overall P@1 drop limit must be rejected
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Quality: BaselineQuality{
+				MaxOverallPAt1Drop: -0.02,
+			},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for negative quality threshold")
+	}
+}
diff --git a/internal/benchmark/dataset.go b/internal/benchmark/dataset.go
new file mode 100644
index 0000000..86c5014
--- /dev/null
+++ b/internal/benchmark/dataset.go
@@ -0,0 +1,117 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+
+	"github.com/pinchtab/semantic"
+)
+
+type Query struct { // Query is one entry of a corpus queries.json file.
+	ID                    string   `json:"id"`
+	QueryText             string   `json:"query"`
+	RelevantRefs          []string `json:"relevant_refs"`
+	PartiallyRelevantRefs []string `json:"partially_relevant_refs"`
+	Difficulty            string   `json:"difficulty"`
+	Tags                  []string `json:"tags"`
+	Intent                string   `json:"intent,omitempty"`
+	PageType              string   `json:"page_type,omitempty"`
+	Threshold             *float64 `json:"threshold,omitempty"`
+	TopK                  *int     `json:"top_k,omitempty"`
+	ExpectNoMatch         bool     `json:"expect_no_match,omitempty"`
+	MinScore              *float64 `json:"min_score,omitempty"`
+	Notes                 string   `json:"notes,omitempty"`
+}
+
+type Corpus struct { // Corpus is one corpus directory: its snapshot.json elements plus its queries.json queries.
+	ID       string
+	Path     string
+	Snapshot []semantic.ElementDescriptor
+	Queries  []Query
+}
+
+type Dataset struct { // Dataset is the set of corpora that LoadDataset found under Root.
+	Root    string
+	Corpora []Corpus
+}
+
+// LoadDataset loads every corpus found under <benchmarkRoot>/corpus. A subdirectory counts as a
+// corpus only when it holds both snapshot.json and queries.json; other entries are skipped.
+// The first corpus that fails to load aborts the whole load.
+func LoadDataset(benchmarkRoot string) (*Dataset, error) {
+	corpusDir := filepath.Join(benchmarkRoot, "corpus")
+	entries, err := os.ReadDir(corpusDir)
+	if err != nil {
+		return nil, err
+	}
+
+	// missing reports whether file is absent from dir.
+	missing := func(dir, file string) bool {
+		_, statErr := os.Stat(filepath.Join(dir, file))
+		return os.IsNotExist(statErr)
+	}
+
+	ds := &Dataset{Root: benchmarkRoot}
+	for _, entry := range entries {
+		if !entry.IsDir() {
+			continue
+		}
+
+		dir := filepath.Join(corpusDir, entry.Name())
+		if missing(dir, "snapshot.json") || missing(dir, "queries.json") {
+			continue
+		}
+		corpus, err := loadCorpus(entry.Name(), dir)
+		if err != nil {
+			return nil, err
+		}
+		ds.Corpora = append(ds.Corpora, *corpus)
+	}
+
+	return ds, nil
+}
+
+// loadCorpus reads snapshot.json and queries.json from the directory path and returns them
+// as a Corpus with the given id. Read and decode errors are returned unwrapped, so a JSON
+// error does not name the file it came from.
+func loadCorpus(id, path string) (*Corpus, error) {
+	corpus := &Corpus{
+		ID:   id,
+		Path: path,
+	}
+
+	// The snapshot is read and decoded first, so its errors win when both files are bad.
+	snapshotRaw, err := os.ReadFile(filepath.Join(path, "snapshot.json"))
+	if err != nil {
+		return nil, err
+	}
+
+	if err := json.Unmarshal(snapshotRaw, &corpus.Snapshot); err != nil {
+		return nil, err
+	}
+
+	// Then the queries that belong to that snapshot.
+	queriesRaw, err := os.ReadFile(filepath.Join(path, "queries.json"))
+	if err != nil {
+		return nil, err
+	}
+
+	if err := json.Unmarshal(queriesRaw, &corpus.Queries); err != nil {
+		return nil, err
+	}
+
+	return corpus, nil
+}
+
+// QueryCount returns the total number of queries across all corpora in the dataset.
+func (ds *Dataset) QueryCount() int {
+	total := 0
+	for i := range ds.Corpora {
+		total += len(ds.Corpora[i].Queries)
+	}
+	return total
+}
+
+func (ds *Dataset) CorpusCount() int { // CorpusCount returns the number of corpora in the dataset.
+	return len(ds.Corpora)
+}
diff --git a/internal/benchmark/lint.go b/internal/benchmark/lint.go
new file mode 100644
index 0000000..20565ce
--- /dev/null
+++ b/internal/benchmark/lint.go
@@ -0,0 +1,68 @@
+package benchmark
+
+import "fmt"
+
+// RunLint loads the dataset under FindBenchmarkRoot and checks that query IDs are unique across
+// corpora, relevant_refs exist in their snapshot, and difficulty is empty/easy/medium/hard.
+// Findings (including a load failure) go into the result; the error is always nil; cfg is unused.
+func RunLint(cfg LintConfig) (*LintResult, error) {
+	result := &LintResult{}
+	fail := func(format string, args ...interface{}) {
+		result.Errors++
+		result.Messages = append(result.Messages, fmt.Sprintf(format, args...))
+	}
+	ds, err := LoadDataset(FindBenchmarkRoot())
+	if err != nil {
+		fail("ERROR: failed to load dataset: %v", err)
+		return result, nil
+	}
+
+	// Pass 1: query IDs must be unique across the whole dataset.
+	firstSeen := make(map[string]string) // query ID -> corpus where it first appeared
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if owner, dup := firstSeen[q.ID]; dup {
+				fail("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, owner)
+				continue
+			}
+			firstSeen[q.ID] = c.ID
+		}
+	}
+
+	// Pass 2: every relevant ref must name an element of the corpus snapshot.
+	for _, c := range ds.Corpora {
+		known := make(map[string]bool, len(c.Snapshot))
+		for _, el := range c.Snapshot {
+			known[el.Ref] = true
+		}
+		for _, q := range c.Queries {
+			for _, ref := range q.RelevantRefs {
+				if !known[ref] {
+					fail("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, ref)
+				}
+			}
+		}
+	}
+
+	// Pass 3: difficulty, when present, must be a recognised level.
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			switch q.Difficulty {
+			case "", "easy", "medium", "hard":
+			default:
+				fail("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID)
+			}
+		}
+	}
+	if result.Errors == 0 && result.Warnings == 0 {
+		result.Messages = append(result.Messages, "All checks passed")
+	}
+	return result, nil
+}
+
+func PrintLintResult(result *LintResult, cfg LintConfig) { // PrintLintResult prints every message, then the totals; cfg is unused.
+	for _, msg := range result.Messages {
+		fmt.Println(msg)
+	}
+	fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings)
+}
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
new file mode 100644
index 0000000..6f00821
--- /dev/null
+++ b/internal/benchmark/runner.go
@@ -0,0 +1,466 @@
+package benchmark
+
+import (
+	"context"
+	"os/exec"
+	"strings"
+	"time"
+
+	"github.com/pinchtab/semantic"
+)
+
+type QueryResult struct { // QueryResult is the outcome of one query as built by runQuery: expectation, matches, metrics, latency.
+	ID         string   `json:"id"`
+	Corpus     string   `json:"corpus"`
+	Query      string   `json:"query"`
+	Difficulty string   `json:"difficulty"`
+	Tags       []string `json:"tags"`
+	Intent     string   `json:"intent,omitempty"`
+	PageType   string   `json:"page_type,omitempty"`
+	Expected   struct {
+		RelevantRefs          []string `json:"relevant_refs"`
+		PartiallyRelevantRefs []string `json:"partially_relevant_refs"`
+	} `json:"expected"`
+	Actual struct {
+		BestRef   string  `json:"best_ref"`
+		BestScore float64 `json:"best_score"`
+		Matches   []Match `json:"matches"`
+	} `json:"actual"`
+	Metrics struct {
+		RR                float64 `json:"rr"`
+		PAt1              float64 `json:"p_at_1"`
+		PAt3              float64 `json:"p_at_3"`
+		HitAt3            int     `json:"hit_at_3"`
+		HitAt5            int     `json:"hit_at_5"`
+		BestRelevantRank  *int    `json:"best_relevant_rank"`
+		BestRelevantScore float64 `json:"best_relevant_score"`
+		BestWrongScore    float64 `json:"best_wrong_score"`
+		Margin            float64 `json:"margin"`
+	} `json:"metrics"`
+	Latency struct {
+		LibraryMs int64  `json:"library_ms"`
+		CLIMs     *int64 `json:"cli_ms,omitempty"`
+	} `json:"latency"`
+	Status string `json:"status"`
+}
+
+type Match struct { // Match is one ranked candidate copied from the matcher's result by runQuery.
+	Ref   string  `json:"ref"`
+	Score float64 `json:"score"`
+	Role  string  `json:"role"`
+	Name  string  `json:"name"`
+}
+
+type Report struct { // Report is the JSON-serializable result of a benchmark run, as assembled by RunCorpusBenchmark.
+	SchemaVersion string `json:"schema_version"`
+	Run           struct {
+		ID        string `json:"id"`
+		Timestamp string `json:"timestamp"`
+		Tool      string `json:"tool"`
+		GitSHA    string `json:"git_sha,omitempty"`
+		GitDirty  bool   `json:"git_dirty,omitempty"`
+		Command   string `json:"command"`
+	} `json:"run"`
+	Dataset struct {
+		Name        string `json:"name"`
+		Version     string `json:"version,omitempty"`
+		QueryCount  int    `json:"query_count"`
+		CorpusCount int    `json:"corpus_count"`
+	} `json:"dataset"`
+	Config struct {
+		Profile   string  `json:"profile"`
+		Strategy  string  `json:"strategy"`
+		Threshold float64 `json:"threshold"`
+		TopK      int     `json:"top_k"`
+		Weights   Weights `json:"weights"`
+	} `json:"config"`
+	Status  string `json:"status"`
+	Metrics struct {
+		Overall      OverallMetrics           `json:"overall"`
+		Latency      LatencyMetrics           `json:"latency"`
+		ByCorpus     map[string]CorpusMetrics `json:"by_corpus"`
+		ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"`
+		ByTag        map[string]CorpusMetrics `json:"by_tag"`
+	} `json:"metrics"`
+	Results []QueryResult `json:"results"`
+}
+
+type OverallMetrics struct { // OverallMetrics holds the report-wide metric values stored under Report.Metrics.Overall.
+	Total     int     `json:"total"`
+	MRR       float64 `json:"mrr"`
+	PAt1      float64 `json:"p_at_1"`
+	PAt3      float64 `json:"p_at_3"`
+	HitAt3    float64 `json:"hit_at_3"`
+	HitAt5    float64 `json:"hit_at_5"`
+	AvgMargin float64 `json:"avg_margin"`
+}
+
+type LatencyMetrics struct { // LatencyMetrics holds latency percentiles in milliseconds; the CLI fields are omitted from JSON when nil.
+	LibraryP50Ms int64  `json:"library_p50_ms"`
+	LibraryP95Ms int64  `json:"library_p95_ms"`
+	CLIP50Ms     *int64 `json:"cli_p50_ms,omitempty"`
+	CLIP95Ms     *int64 `json:"cli_p95_ms,omitempty"`
+}
+
+type CorpusMetrics struct { // CorpusMetrics is the value type of the per-corpus, per-difficulty and per-tag maps in Report.Metrics.
+	Count     int     `json:"count"`
+	MRR       float64 `json:"mrr"`
+	PAt1      float64 `json:"p_at_1"`
+	HitAt3    float64 `json:"hit_at_3"`
+	AvgMargin float64 `json:"avg_margin"`
+}
+
+// RunCorpusBenchmark runs the selected queries of ds through the matcher for cfg.Strategy and
+// returns per-query results with aggregated metrics. Non-empty cfg.Corpus / cfg.QueryID
+// restrict the run and cfg.Quick samples each corpus; Dataset counts always describe all of ds.
+// The returned error is currently always nil.
+func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
+	matcher := createMatcher(cfg)
+	now := time.Now() // read the clock once so Run.ID and Run.Timestamp describe the same instant
+
+	report := &Report{SchemaVersion: "1.0.0", Status: "pass"}
+	report.Run.ID = now.Format("20060102-150405") + "-" + cfg.Profile
+	report.Run.Timestamp = now.UTC().Format(time.RFC3339)
+	report.Run.Tool = "semantic-bench"
+	report.Run.GitSHA, report.Run.GitDirty = getGitInfo()
+	report.Dataset.Name = "semantic-ui-matching-corpus"
+	report.Dataset.QueryCount = ds.QueryCount()
+	report.Dataset.CorpusCount = ds.CorpusCount()
+	report.Config.Profile = cfg.Profile
+	report.Config.Strategy = cfg.Strategy
+	report.Config.Threshold = cfg.Threshold
+	report.Config.TopK = cfg.TopK
+	report.Config.Weights = Weights{Lexical: cfg.LexicalWeight, Embedding: cfg.EmbeddingWeight}
+	report.Metrics.ByCorpus = make(map[string]CorpusMetrics)
+	report.Metrics.ByDifficulty = make(map[string]CorpusMetrics)
+	report.Metrics.ByTag = make(map[string]CorpusMetrics)
+
+	var allLatencies []int64
+	for _, corpus := range ds.Corpora {
+		if cfg.Corpus != "" && corpus.ID != cfg.Corpus {
+			continue
+		}
+
+		queries := corpus.Queries
+		if cfg.Quick {
+			queries = selectQuickSubset(corpus.Queries)
+		}
+
+		for _, query := range queries {
+			if cfg.QueryID != "" && query.ID != cfg.QueryID {
+				continue
+			}
+
+			result := runQuery(matcher, corpus, query, cfg)
+			report.Results = append(report.Results, result)
+			allLatencies = append(allLatencies, result.Latency.LibraryMs)
+		}
+	}
+
+	aggregateMetrics(report, allLatencies)
+	return report, nil
+}
+
+// selectQuickSubset picks a deterministic smoke-test sample of at most three queries: the
+// first query of each difficulty (easy, medium, hard; a blank difficulty counts as medium),
+// topped up in input order with queries whose ID is not already selected. Inputs of three
+// or fewer queries are returned unchanged. The sample is NOT representative of full corpus
+// coverage (edge-case tags may be missed), so use it for fast iteration only, never for
+// final regression checks.
+func selectQuickSubset(queries []Query) []Query {
+	const limit = 3
+	if len(queries) <= limit {
+		return queries
+	}
+
+	// Remember the first query seen for each difficulty level.
+	firstOf := make(map[string]*Query)
+	for i := range queries {
+		level := queries[i].Difficulty
+		if level == "" {
+			level = "medium"
+		}
+		if _, seen := firstOf[level]; !seen {
+			firstOf[level] = &queries[i]
+		}
+	}
+
+	var picked []Query
+	chosen := make(map[string]bool) // IDs already in picked
+	// take appends q to the sample and records its ID.
+	take := func(q Query) {
+		picked = append(picked, q)
+		chosen[q.ID] = true
+	}
+
+	// One query per difficulty, easiest first.
+	for _, level := range []string{"easy", "medium", "hard"} {
+		if q, ok := firstOf[level]; ok {
+			take(*q)
+		}
+	}
+
+	// Top up in input order, skipping IDs that were already taken.
+	for i := range queries {
+		if len(picked) >= limit {
+			break
+		}
+		if !chosen[queries[i].ID] {
+			take(queries[i])
+		}
+	}
+
+	return picked
+}
+
+// createMatcher returns the matcher named by cfg.Strategy ("lexical" or "embedding"); any other value yields the combined matcher.
+func createMatcher(cfg RunConfig) semantic.ElementMatcher {
+	embedder := semantic.NewHashingEmbedder(128)
+	if cfg.Strategy == "lexical" {
+		return semantic.NewLexicalMatcher()
+	}
+	if cfg.Strategy == "embedding" {
+		return semantic.NewEmbeddingMatcher(embedder)
+	}
+	return semantic.NewCombinedMatcher(embedder)
+}
+
+func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg RunConfig) QueryResult { // runQuery runs one query against the corpus snapshot, times it, and scores the matches.
+	result := QueryResult{
+		ID:         query.ID,
+		Corpus:     corpus.ID,
+		Query:      query.QueryText,
+		Difficulty: query.Difficulty,
+		Tags:       query.Tags,
+		Intent:     query.Intent,
+		PageType:   query.PageType,
+	}
+	result.Expected.RelevantRefs = query.RelevantRefs
+	result.Expected.PartiallyRelevantRefs = query.PartiallyRelevantRefs
+
+	threshold := cfg.Threshold // run-wide value; a per-query override below wins when present
+	if query.Threshold != nil {
+		threshold = *query.Threshold
+	}
+	topK := cfg.TopK
+	if query.TopK != nil {
+		topK = *query.TopK
+	}
+
+	start := time.Now()
+	findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{ // NOTE(review): the error from Find is discarded here; confirm that is intended
+		Threshold:       threshold,
+		TopK:            topK,
+		LexicalWeight:   cfg.LexicalWeight,
+		EmbeddingWeight: cfg.EmbeddingWeight,
+		Explain:         cfg.Explain,
+	})
+	result.Latency.LibraryMs = time.Since(start).Milliseconds() // whole milliseconds only
+
+	result.Actual.BestRef = findResult.BestRef
+	result.Actual.BestScore = findResult.BestScore
+	for _, m := range findResult.Matches {
+		result.Actual.Matches = append(result.Actual.Matches, Match{
+			Ref:   m.Ref,
+			Score: m.Score,
+			Role:  m.Role,
+			Name:  m.Name,
+		})
+	}
+
+	computeQueryMetrics(&result, query)
+	return result
+}
+
+// computeQueryMetrics fills result.Metrics and result.Status from the ranked
+// matches in result.Actual.Matches and the relevance labels of query. Fully
+// relevant refs score 1, partially relevant refs 0.5 where partial credit applies.
+func computeQueryMetrics(result *QueryResult, query Query) {
+	relevantSet := make(map[string]bool)
+	for _, r := range query.RelevantRefs {
+		relevantSet[r] = true
+	}
+	partialSet := make(map[string]bool)
+	for _, r := range query.PartiallyRelevantRefs {
+		partialSet[r] = true
+	}
+	// Reciprocal rank: 1/position of the first fully relevant match (not set when there is none).
+	for i, m := range result.Actual.Matches {
+		if relevantSet[m.Ref] {
+			result.Metrics.RR = 1.0 / float64(i+1)
+			break
+		}
+	}
+	// P@1: 1.0 when the top match is relevant, 0.5 when it is only partially relevant.
+	if len(result.Actual.Matches) > 0 {
+		if relevantSet[result.Actual.Matches[0].Ref] {
+			result.Metrics.PAt1 = 1.0
+		} else if partialSet[result.Actual.Matches[0].Ref] {
+			result.Metrics.PAt1 = 0.5
+		}
+	}
+	// P@3, Hit@3, Hit@5 and best relevant/wrong scores; only the first 5 matches are inspected.
+	relevantInTop3 := 0
+	partialInTop3 := 0
+	for i, m := range result.Actual.Matches {
+		if i >= 5 {
+			break
+		}
+		switch {
+		case relevantSet[m.Ref]:
+			if result.Metrics.BestRelevantRank == nil {
+				rank := i + 1
+				result.Metrics.BestRelevantRank = &rank
+			}
+			if result.Metrics.BestRelevantScore == 0 || m.Score > result.Metrics.BestRelevantScore {
+				result.Metrics.BestRelevantScore = m.Score
+			}
+			if i < 3 {
+				relevantInTop3++
+				result.Metrics.HitAt3 = 1
+			}
+			result.Metrics.HitAt5 = 1
+		case partialSet[m.Ref]:
+			if i < 3 {
+				partialInTop3++
+			}
+		default:
+			if m.Score > result.Metrics.BestWrongScore {
+				result.Metrics.BestWrongScore = m.Score
+			}
+		}
+	}
+	result.Metrics.PAt3 = (float64(relevantInTop3) + float64(partialInTop3)*0.5) / 3.0 // always divided by 3, even with fewer matches
+	result.Metrics.Margin = result.Metrics.BestRelevantScore - result.Metrics.BestWrongScore
+
+	// Status: expect-no-match queries are judged on emptiness, all others on P@1.
+	switch {
+	case query.ExpectNoMatch:
+		if len(result.Actual.Matches) == 0 {
+			result.Status = "no_match_expected"
+		} else {
+			result.Status = "unexpected_match"
+		}
+	case result.Metrics.PAt1 >= 1.0:
+		result.Status = "hit"
+	case result.Metrics.PAt1 >= 0.5:
+		result.Status = "partial"
+	default:
+		result.Status = "miss"
+	}
+}
+
+// aggregateMetrics fills report.Metrics with overall averages, per-corpus,
+// per-difficulty and per-tag breakdowns, and the P50/P95 of latencies.
+func aggregateMetrics(report *Report, latencies []int64) {
+	n := len(report.Results)
+	if n == 0 {
+		return // nothing to aggregate; report.Metrics is left untouched
+	}
+	report.Metrics.Overall.Total = n
+
+	var sumRR, sumP1, sumP3, sumHit3, sumHit5, sumMargin float64
+	corpusAgg := make(map[string]*aggregator)
+	diffAgg := make(map[string]*aggregator)
+	tagAgg := make(map[string]*aggregator)
+
+	for _, r := range report.Results {
+		sumRR += r.Metrics.RR
+		sumP1 += r.Metrics.PAt1
+		sumP3 += r.Metrics.PAt3
+		sumHit3 += float64(r.Metrics.HitAt3)
+		sumHit5 += float64(r.Metrics.HitAt5)
+		sumMargin += r.Metrics.Margin
+		addToAgg(corpusAgg, r.Corpus, r)
+		addToAgg(diffAgg, r.Difficulty, r) // an empty difficulty gets its own "" bucket
+		for _, t := range r.Tags {
+			addToAgg(tagAgg, t, r) // a result counts once for each of its tags
+		}
+	}
+
+	report.Metrics.Overall.MRR = sumRR / float64(n)
+	report.Metrics.Overall.PAt1 = sumP1 / float64(n)
+	report.Metrics.Overall.PAt3 = sumP3 / float64(n)
+	report.Metrics.Overall.HitAt3 = sumHit3 / float64(n)
+	report.Metrics.Overall.HitAt5 = sumHit5 / float64(n)
+	report.Metrics.Overall.AvgMargin = sumMargin / float64(n)
+
+	for k, a := range corpusAgg {
+		report.Metrics.ByCorpus[k] = a.toMetrics() // NOTE(review): assumes the By* maps are already initialised — confirm
+	}
+	for k, a := range diffAgg {
+		report.Metrics.ByDifficulty[k] = a.toMetrics()
+	}
+	for k, a := range tagAgg {
+		report.Metrics.ByTag[k] = a.toMetrics()
+	}
+
+	// Latency percentiles: index len*p/100 into a sorted copy, so latencies itself is not reordered.
+	if len(latencies) > 0 {
+		sorted := make([]int64, len(latencies))
+		copy(sorted, latencies)
+		sortInt64(sorted)
+		report.Metrics.Latency.LibraryP50Ms = sorted[len(sorted)*50/100]
+		report.Metrics.Latency.LibraryP95Ms = sorted[len(sorted)*95/100]
+	}
+}
+
+// aggregator accumulates metric sums for one breakdown bucket (corpus, difficulty or tag).
+type aggregator struct {
+	count     int
+	sumRR     float64
+	sumP1     float64
+	sumHit3   float64
+	sumMargin float64
+}
+
+// addToAgg folds r's metrics into the bucket stored under key, creating it on first use.
+func addToAgg(m map[string]*aggregator, key string, r QueryResult) {
+	bucket, ok := m[key]
+	if !ok {
+		bucket = &aggregator{}
+		m[key] = bucket
+	}
+	bucket.count++
+	bucket.sumRR += r.Metrics.RR
+	bucket.sumP1 += r.Metrics.PAt1
+	bucket.sumHit3 += float64(r.Metrics.HitAt3)
+	bucket.sumMargin += r.Metrics.Margin
+}
+
+// toMetrics converts the accumulated sums into per-bucket averages; an empty
+// aggregator yields the zero CorpusMetrics instead of dividing by zero.
+func (a *aggregator) toMetrics() CorpusMetrics {
+	if a.count == 0 {
+		return CorpusMetrics{}
+	}
+	n := float64(a.count)
+	return CorpusMetrics{Count: a.count, MRR: a.sumRR / n, PAt1: a.sumP1 / n, HitAt3: a.sumHit3 / n, AvgMargin: a.sumMargin / n}
+}
+
+// sortInt64 sorts s in place in ascending order. It is a simple quadratic
+// insertion sort, which is adequate for small inputs.
+func sortInt64(s []int64) {
+	for i := 1; i < len(s); i++ {
+		for j := i; j > 0 && s[j] < s[j-1]; j-- {
+			s[j], s[j-1] = s[j-1], s[j]
+		}
+	}
+}
+
+// getGitInfo returns the current HEAD commit SHA and whether the work tree has
+// uncommitted changes. A failing "git rev-parse" yields ("", false); a failing
+// "git status" yields the SHA with dirty=false.
+func getGitInfo() (sha string, dirty bool) {
+	head, err := exec.Command("git", "rev-parse", "HEAD").Output()
+	if err != nil {
+		return "", false
+	}
+	sha = strings.TrimSpace(string(head))
+
+	status, err := exec.Command("git", "status", "--porcelain").Output()
+	if err != nil {
+		return sha, false
+	}
+	return sha, strings.TrimSpace(string(status)) != ""
+}
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
new file mode 100644
index 0000000..dd68f75
--- /dev/null
+++ b/internal/benchmark/runtime.go
@@ -0,0 +1,236 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+type RuntimeResult struct {
+	Status       string             `json:"status"`        // "pass", or "fail" when Regressions > 0
+	Benchmarks   []RuntimeBenchmark `json:"benchmarks"`    // current run, annotated against the baseline
+	Regressions  int                `json:"regressions"`   // number of benchmarks marked "regression"
+	BaselinePath string             `json:"baseline_path"` // baseline file that was read or created
+	Created      bool               `json:"created"`       // true when no baseline existed and one was written
+}
+
+type RuntimeBenchmark struct {
+	Name       string  `json:"name"`                  // benchmark name as parsed (CPU-count suffix trimmed)
+	NsOp       float64 `json:"ns_op"`                 // measured ns/op
+	BytesOp    int     `json:"bytes_op"`              // measured B/op
+	AllocsOp   int     `json:"allocs_op"`             // measured allocs/op
+	BaselineNs float64 `json:"baseline_ns,omitempty"` // baseline ns/op; left 0 when there is no baseline entry
+	Ratio      float64 `json:"ratio,omitempty"`       // NsOp divided by BaselineNs
+	Status     string  `json:"status"`                // "ok", "warning", "regression" or "new"; empty if not compared
+}
+
+type runtimeBaseline struct {
+	Timestamp  string             `json:"timestamp"`  // UTC time the baseline was saved, RFC 3339
+	Benchmarks []RuntimeBenchmark `json:"benchmarks"` // results recorded as the reference
+}
+
+// RunRuntime runs the Go benchmarks and compares them with the runtime
+// baseline (runtime.json in the configured baselines directory). Without a
+// baseline, the current numbers are saved as the new baseline and Created is
+// set. Otherwise each benchmark gets its baseline ns/op, ratio and a status:
+// "regression" (ns/op or allocs/op ratio above the configured maximum),
+// "warning", "ok" or "new"; any regression makes the overall Status "fail".
+func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
+	root := FindBenchmarkRoot()
+
+	// Load config for thresholds
+	benchCfg, err := LoadConfig(root)
+	if err != nil {
+		return nil, fmt.Errorf("load config: %w", err)
+	}
+	thresholds := benchCfg.RuntimeThresholds()
+	baselinePath := filepath.Join(benchCfg.BaselinesDir(root), "runtime.json")
+
+	benchmarks, err := runGoBenchmarks()
+	if err != nil {
+		return nil, err
+	}
+	result := &RuntimeResult{Status: "pass", Benchmarks: benchmarks, BaselinePath: baselinePath}
+
+	if _, err := os.Stat(baselinePath); os.IsNotExist(err) {
+		if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil {
+			return nil, err
+		}
+		result.Created = true
+		return result, nil
+	}
+
+	baseline, err := loadRuntimeBaseline(baselinePath)
+	if err != nil {
+		return nil, err
+	}
+	baselineMap := make(map[string]RuntimeBenchmark, len(baseline.Benchmarks))
+	for _, b := range baseline.Benchmarks {
+		baselineMap[b.Name] = b
+	}
+
+	// Warning threshold is halfway between 1.0 and max ratio
+	warnRatio := 1.0 + ((thresholds.MaxNsOpRegressionRatio - 1.0) / 2.0)
+
+	for i := range result.Benchmarks {
+		cur := &result.Benchmarks[i]
+		base, ok := baselineMap[cur.Name]
+		// A missing or non-positive baseline cannot be compared: dividing by it
+		// would yield Inf/NaN, which encoding/json also refuses to encode.
+		if !ok || base.NsOp <= 0 {
+			cur.Status = "new"
+			continue
+		}
+		nsRatio := cur.NsOp / base.NsOp
+		cur.BaselineNs = base.NsOp
+		cur.Ratio = nsRatio
+
+		// Check allocation regression if baseline has allocation data
+		var allocRatio float64
+		if base.AllocsOp > 0 && cur.AllocsOp > 0 {
+			allocRatio = float64(cur.AllocsOp) / float64(base.AllocsOp)
+		}
+
+		switch {
+		case nsRatio > thresholds.MaxNsOpRegressionRatio, allocRatio > thresholds.MaxAllocRegressionRatio:
+			cur.Status = "regression"
+			result.Regressions++
+		case nsRatio > warnRatio:
+			cur.Status = "warning"
+		default:
+			cur.Status = "ok"
+		}
+	}
+
+	if result.Regressions > 0 {
+		result.Status = "fail"
+	}
+	return result, nil
+}
+
+// runGoBenchmarks runs the engine package benchmarks from the project root
+// (two levels above the benchmark root) and parses the results. "-run=^$"
+// matches no test, so only the benchmarks execute, not the unit-test suite.
+func runGoBenchmarks() ([]RuntimeBenchmark, error) {
+	cmd := exec.Command("go", "test", "-run=^$", "-bench=.", "-benchmem", "./internal/engine/...")
+	cmd.Dir = filepath.Join(FindBenchmarkRoot(), "..", "..")
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("go test failed: %w\n%s", err, output)
+	}
+
+	return parseBenchOutput(string(output)), nil
+}
+
+// parseBenchOutput extracts benchmark results from "go test -bench -benchmem"
+// output. Only lines starting with "Benchmark" that carry a positive ns/op
+// value are kept. The trailing "-<GOMAXPROCS>" suffix that the testing package
+// appends to benchmark names is removed for any CPU count, so results stay
+// comparable with a baseline recorded on a machine with a different core count.
+func parseBenchOutput(output string) []RuntimeBenchmark {
+	var results []RuntimeBenchmark
+
+	for _, line := range strings.Split(output, "\n") {
+		if !strings.HasPrefix(line, "Benchmark") {
+			continue
+		}
+
+		fields := strings.Fields(line)
+		if len(fields) < 3 {
+			continue
+		}
+
+		// Strip a trailing "-N" where N is a non-empty run of digits.
+		name := fields[0]
+		if dash := strings.LastIndexByte(name, '-'); dash >= 0 && dash < len(name)-1 &&
+			strings.Trim(name[dash+1:], "0123456789") == "" {
+			name = name[:dash]
+		}
+
+		// Each value precedes its unit; a failed parse leaves 0 (the line is dropped for ns/op).
+		var nsOp float64
+		var bytesOp, allocsOp int
+		for i := 1; i < len(fields); i++ {
+			switch fields[i] {
+			case "ns/op":
+				_, _ = fmt.Sscanf(fields[i-1], "%f", &nsOp)
+			case "B/op":
+				_, _ = fmt.Sscanf(fields[i-1], "%d", &bytesOp)
+			case "allocs/op":
+				_, _ = fmt.Sscanf(fields[i-1], "%d", &allocsOp)
+			}
+		}
+
+		if nsOp > 0 {
+			results = append(results, RuntimeBenchmark{Name: name, NsOp: nsOp, BytesOp: bytesOp, AllocsOp: allocsOp})
+		}
+	}
+
+	return results
+}
+
+// saveRuntimeBaseline writes benchmarks with a UTC timestamp as indented JSON to path, creating its directory if needed.
+func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error {
+	data, err := json.MarshalIndent(runtimeBaseline{Timestamp: time.Now().UTC().Format(time.RFC3339), Benchmarks: benchmarks}, "", "  ")
+	if err != nil {
+		return err
+	}
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return err
+	}
+	return os.WriteFile(path, data, 0644)
+}
+
+func loadRuntimeBaseline(path string) (*runtimeBaseline, error) { // reads and JSON-decodes the baseline file at path
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err // read errors are returned unwrapped
+	}
+	var baseline runtimeBaseline
+	if err := json.Unmarshal(data, &baseline); err != nil {
+		return nil, err // malformed JSON
+	}
+	return &baseline, nil
+}
+
+// PrintRuntimeResult prints result to stdout: the "baseline created" notice, or one
+// colour-coded line per benchmark followed by the regression count. cfg is unused.
+func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) {
+	if result.Created {
+		fmt.Printf("\n  Created runtime baseline: %s\n", result.BaselinePath)
+		fmt.Printf("  Benchmarks: %d\n\n", len(result.Benchmarks))
+		return
+	}
+
+	fmt.Printf("\n  Runtime Baseline Check\n\n")
+
+	for _, b := range result.Benchmarks {
+		// Pad the plain label first: escape codes would count towards %-10s and break alignment.
+		color := ""
+		switch b.Status {
+		case "regression":
+			color = "\033[31m"
+		case "warning", "new":
+			color = "\033[33m"
+		case "ok":
+			color = "\033[32m"
+		}
+		status := fmt.Sprintf("%s%-10s\033[0m", color, strings.ToUpper(b.Status))
+		if b.BaselineNs > 0 {
+			fmt.Printf("  %s %s: %.0f -> %.0f ns/op (%.2fx)\n", status, b.Name, b.BaselineNs, b.NsOp, b.Ratio)
+		} else {
+			fmt.Printf("  %s %s: %.0f ns/op\n", status, b.Name, b.NsOp)
+		}
+	}
+
+	fmt.Println()
+	if result.Regressions > 0 {
+		fmt.Printf("  \033[31mRegressions: %d\033[0m\n\n", result.Regressions)
+	} else {
+		fmt.Printf("  \033[32mNo regressions\033[0m\n\n")
+	}
+}
diff --git a/internal/benchmark/tune.go b/internal/benchmark/tune.go
new file mode 100644
index 0000000..7db259b
--- /dev/null
+++ b/internal/benchmark/tune.go
@@ -0,0 +1,90 @@
+package benchmark
+
+import "fmt"
+
+type TuneResult struct {
+	Results []TuneRun `json:"results"`
+	Best    *TuneRun  `json:"best"`
+}
+
+type TuneRun struct {
+	LexicalWeight   float64 `json:"lexical_weight"`
+	EmbeddingWeight float64 `json:"embedding_weight"`
+	MRR             float64 `json:"mrr"`
+	PAt1            float64 `json:"p_at_1"`
+	HitAt3          float64 `json:"hit_at_3"`
+}
+
+// RunTune grid-searches the lexical/embedding weight split on the corpus
+// benchmark: lexical runs from 0 to 1 in cfg.Step increments, embedding is
+// 1 - lexical. Best is the run with the highest P@1, ties broken by MRR.
+// A cfg.Step that is not positive is rejected: the sweep could not advance.
+func RunTune(cfg TuneConfig) (*TuneResult, error) {
+	if !(cfg.Step > 0) { // written this way so NaN is rejected too
+		return nil, fmt.Errorf("tune step must be positive, got %v", cfg.Step)
+	}
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &TuneResult{}
+	if cfg.Verbose {
+		fmt.Printf("  %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3")
+	}
+
+	// Derive weights from an integer index (no drift from repeated w += step) and clamp to 1.
+	steps := int(1.0001 / cfg.Step)
+	for i := 0; i <= steps; i++ {
+		lexW := min(float64(i)*cfg.Step, 1.0)
+		embW := 1.0 - lexW
+		runCfg := RunConfig{
+			Suite:           "corpus",
+			Corpus:          cfg.Corpus,
+			Strategy:        "combined",
+			Threshold:       0.01,
+			TopK:            5,
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			Mode:            "library",
+		}
+		report, err := RunCorpusBenchmark(ds, runCfg)
+		if err != nil {
+			return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err)
+		}
+
+		run := TuneRun{
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			MRR:             report.Metrics.Overall.MRR,
+			PAt1:            report.Metrics.Overall.PAt1,
+			HitAt3:          report.Metrics.Overall.HitAt3,
+		}
+		result.Results = append(result.Results, run)
+		if result.Best == nil || run.PAt1 > result.Best.PAt1 ||
+			(run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) {
+			best := run
+			result.Best = &best
+		}
+		if cfg.Verbose {
+			fmt.Printf("  %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n", lexW, embW, run.MRR, run.PAt1, run.HitAt3)
+		}
+	}
+
+	return result, nil
+}
+
+// PrintTuneResult prints the number of tested combinations and the best run, if any. cfg is unused.
+func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
+	fmt.Printf("\n  Tested %d weight combinations\n\n", len(result.Results))
+	if best := result.Best; best != nil {
+		fmt.Printf("  Best weights:\n")
+		fmt.Printf("    Lexical:   %.2f\n", best.LexicalWeight)
+		fmt.Printf("    Embedding: %.2f\n", best.EmbeddingWeight)
+		fmt.Printf("    MRR:       %.4f\n", best.MRR)
+		fmt.Printf("    P@1:       %.4f\n", best.PAt1)
+		fmt.Printf("    Hit@3:     %.4f\n", best.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/types.go b/internal/benchmark/types.go
new file mode 100644
index 0000000..916978a
--- /dev/null
+++ b/internal/benchmark/types.go
@@ -0,0 +1,67 @@
+package benchmark
+
+type CheckResult struct {
+	Status    string        `json:"status"`
+	Summary   CheckSummary  `json:"summary"`
+	Delta     *MetricsDelta `json:"delta,omitempty"`
+	TopRegs   []Regression  `json:"top_regressions,omitempty"`
+	Artifacts Artifacts     `json:"artifacts"`
+	Report    *Report       `json:"-"`
+}
+
+type CheckSummary struct {
+	PAt1        float64 `json:"p_at_1"`
+	MRR         float64 `json:"mrr"`
+	HitAt3      float64 `json:"hit_at_3"`
+	Total       int     `json:"total"`
+	Regressions int     `json:"regressions"`
+	Warnings    int     `json:"warnings"`
+}
+
+type MetricsDelta struct {
+	PAt1   float64 `json:"p_at_1"`
+	MRR    float64 `json:"mrr"`
+	HitAt3 float64 `json:"hit_at_3"`
+}
+
+type Regression struct {
+	ID           string   `json:"id"`
+	Corpus       string   `json:"corpus"`
+	Query        string   `json:"query"`
+	Expected     []string `json:"expected"`
+	BaselineRef  string   `json:"baseline_ref,omitempty"`
+	CurrentRef   string   `json:"current_ref"`
+	Reason       string   `json:"reason"`
+	DebugCommand string   `json:"debug_command"`
+}
+
+type Artifacts struct {
+	ReportJSON string `json:"report_json"`
+	SummaryMD  string `json:"summary_md"`
+}
+
+type CompareResult struct {
+	Status       string       `json:"status"`
+	Delta        MetricsDelta `json:"delta"`
+	Regressions  []Regression `json:"regressions"`
+	Improvements []string     `json:"improvements"`
+}
+
+type LintResult struct {
+	Errors   int      `json:"errors"`
+	Warnings int      `json:"warnings"`
+	Messages []string `json:"messages"`
+}
+
+type CatalogResult struct {
+	Corpora      []CorpusSummary `json:"corpora"`
+	TotalQueries int             `json:"total_queries"`
+	ByTag        map[string]int  `json:"by_tag,omitempty"`
+	ByDifficulty map[string]int  `json:"by_difficulty,omitempty"`
+}
+
+type CorpusSummary struct {
+	ID      string   `json:"id"`
+	Queries int      `json:"queries"`
+	Tags    []string `json:"tags"`
+}
diff --git a/internal/engine/benchmark_test.go b/internal/engine/benchmark_test.go
index c37528c..0ebc2c6 100644
--- a/internal/engine/benchmark_test.go
+++ b/internal/engine/benchmark_test.go
@@ -2,9 +2,10 @@ package engine
 
 import (
 	"context"
-	"github.com/pinchtab/semantic/internal/types"
 	"strconv"
 	"testing"
+
+	"github.com/pinchtab/semantic/internal/types"
 )
 
 // benchElements returns a realistic set of elements for benchmarking.
@@ -244,3 +245,119 @@ func BenchmarkCombinedFind_Issue24_100Elements(b *testing.B) {
 		})
 	}
 }
+
+// Focused microbenchmarks for individual components
+
+// BenchmarkParseQueryContext measures ParseQueryContext on four sample queries per iteration.
+func BenchmarkParseQueryContext(b *testing.B) {
+	queries := []string{
+		"sign in button",
+		"the first email textbox in the login form",
+		"button not submit near the checkout section",
+		"second item in the dropdown menu",
+	}
+	b.ReportAllocs()
+	for b.Loop() {
+		for _, q := range queries {
+			ParseQueryContext(q)
+		}
+	}
+}
+
+// BenchmarkParseQueryContext_Complex measures ParseQueryContext on one long, multi-clause query.
+func BenchmarkParseQueryContext_Complex(b *testing.B) {
+	q := "the third blue submit button in the checkout form not disabled"
+	b.ReportAllocs()
+	for b.Loop() {
+		ParseQueryContext(q)
+	}
+}
+
+// BenchmarkRemoveStopwords measures removeStopwords on three short token lists per iteration.
+func BenchmarkRemoveStopwords(b *testing.B) {
+	tokenSets := [][]string{
+		{"click", "the", "sign", "in", "button"},
+		{"find", "the", "email", "address", "textbox"},
+		{"the", "first", "item", "in", "a", "dropdown", "menu"},
+	}
+	b.ReportAllocs()
+	for b.Loop() {
+		for _, tokens := range tokenSets {
+			removeStopwords(tokens)
+		}
+	}
+}
+
+// BenchmarkScoreFusion measures weighted score fusion; b.Loop keeps the fuse calls from being optimised away.
+func BenchmarkScoreFusion(b *testing.B) {
+	lexScores := make([]float64, 100)
+	embScores := make([]float64, 100)
+	for i := range lexScores {
+		lexScores[i] = float64(i) / 100.0
+		embScores[i] = float64(100-i) / 100.0
+	}
+	lexWeight, embWeight := 0.6, 0.4
+	fuse := func(lex, emb float64) float64 { return lexWeight*lex + embWeight*emb }
+	b.ReportAllocs()
+	for b.Loop() {
+		for j := range lexScores {
+			fuse(lexScores[j], embScores[j])
+		}
+	}
+}
+
+// BenchmarkLexicalScore_Variants measures LexicalScore for several query shapes
+// (exact, partial, synonym, mismatch, long query) against one description.
+func BenchmarkLexicalScore_Variants(b *testing.B) {
+	cases := []struct {
+		name, query, desc string
+	}{
+		{"exact", "Sign In", "button: Sign In"},
+		{"partial", "sign", "button: Sign In"},
+		{"synonym", "login", "button: Sign In"},
+		{"mismatch", "checkout", "button: Sign In"},
+		{"long_query", "click the sign in button on the login page", "button: Sign In"},
+	}
+	for _, tc := range cases {
+		b.Run(tc.name, func(b *testing.B) {
+			b.ReportAllocs()
+			for b.Loop() {
+				LexicalScore(tc.query, tc.desc)
+			}
+		})
+	}
+}
+
+// BenchmarkCombinedFind_WeightVariants measures CombinedMatcher.Find for several
+// lexical/embedding weight splits; matcher setup happens before the timed loop.
+func BenchmarkCombinedFind_WeightVariants(b *testing.B) {
+	elements := benchElements()
+	ctx := context.Background()
+
+	weights := []struct {
+		name     string
+		lex, emb float64
+	}{
+		{"lex_only", 1.0, 0.0},
+		{"emb_only", 0.0, 1.0},
+		{"balanced", 0.5, 0.5},
+		{"lex_heavy", 0.8, 0.2},
+		{"emb_heavy", 0.2, 0.8},
+	}
+
+	for _, w := range weights {
+		b.Run(w.name, func(b *testing.B) {
+			m := NewCombinedMatcher(NewHashingEmbedder(128))
+			opts := types.FindOptions{
+				Threshold:       0.3,
+				TopK:            3,
+				LexicalWeight:   w.lex,
+				EmbeddingWeight: w.emb,
+			}
+			b.ReportAllocs()
+			for b.Loop() {
+				_, _ = m.Find(ctx, "sign in button", elements, opts)
+			}
+		})
+	}
+}
diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go
new file mode 100644
index 0000000..1261dd6
--- /dev/null
+++ b/recovery/benchmark_test.go
@@ -0,0 +1,250 @@
+package recovery
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"testing"
+	"time"
+
+	"github.com/pinchtab/semantic"
+)
+
+type BenchmarkScenario struct {
+	ID            string                       `json:"id"`
+	Name          string                       `json:"name"`
+	Description   string                       `json:"description"`
+	OriginalQuery string                       `json:"original_query"`
+	OriginalRef   string                       `json:"original_ref"`
+	Before        []semantic.ElementDescriptor `json:"before"`
+	After         []semantic.ElementDescriptor `json:"after"`
+	ExpectedRef   *string                      `json:"expected_ref"`
+	ExpectedAlt   []string                     `json:"expected_alt"`
+	ExpectNoMatch bool                         `json:"expect_no_match"`
+	Difficulty    string                       `json:"difficulty"`
+}
+
+// loadScenarios reads the recovery scenarios JSON fixture, located relative to
+// this source file, and fails the test immediately if it is missing or invalid.
+func loadScenarios(t *testing.T) []BenchmarkScenario {
+	t.Helper() // report failures at the caller's line
+	_, thisFile, _, _ := runtime.Caller(0)
+	repoRoot := filepath.Join(filepath.Dir(thisFile), "..")
+	scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json")
+	data, err := os.ReadFile(scenariosPath)
+	if err != nil {
+		t.Fatalf("failed to read scenarios: %v", err)
+	}
+	var scenarios []BenchmarkScenario
+	if err := json.Unmarshal(data, &scenarios); err != nil {
+		t.Fatalf("failed to parse scenarios: %v", err)
+	}
+	return scenarios
+}
+
+// TestRecoveryBenchmark_Scenarios runs every recovery scenario as a subtest
+// named after its ID and logs a pass/fail summary at the end.
+func TestRecoveryBenchmark_Scenarios(t *testing.T) {
+	scenarios := loadScenarios(t)
+	matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128))
+
+	passed, failed := 0, 0 // tallied by the subtests, which run sequentially
+	for _, sc := range scenarios {
+		t.Run(sc.ID, func(t *testing.T) {
+			result := runBenchmarkScenario(t, matcher, sc)
+			if result.pass {
+				passed++
+				t.Logf("PASS: recovered=%v got=%s expected=%s score=%.3f",
+					result.recovered, result.gotRef, result.expectedRef, result.score)
+			} else {
+				failed++
+				t.Errorf("FAIL: recovered=%v got=%s expected=%s score=%.3f error=%s",
+					result.recovered, result.gotRef, result.expectedRef, result.score, result.err)
+			}
+		})
+	}
+
+	t.Logf("Summary: %d passed, %d failed out of %d scenarios", passed, failed, len(scenarios))
+}
+
+type scenarioResult struct {
+	pass        bool
+	recovered   bool
+	gotRef      string
+	expectedRef string
+	score       float64
+	confidence  string
+	latencyMs   int64
+	err         string
+}
+
+// runBenchmarkScenario replays one recovery scenario: it seeds an intent cache
+// with the original query, feeds the engine a "could not find node" error for
+// sc.OriginalRef and reports whether recovery landed on an expected ref.
+func runBenchmarkScenario(t *testing.T, matcher semantic.ElementMatcher, sc BenchmarkScenario) scenarioResult {
+	result := scenarioResult{}
+	if sc.ExpectedRef != nil {
+		result.expectedRef = *sc.ExpectedRef
+	}
+	// Look up the descriptor OriginalRef had in sc.Before (zero value if absent).
+	var origDesc semantic.ElementDescriptor
+	for _, d := range sc.Before {
+		if d.Ref == sc.OriginalRef {
+			origDesc = d
+			break
+		}
+	}
+	// Seed the cache with the intent recorded for OriginalRef.
+	cache := NewIntentCache(100, 5*time.Minute)
+	cache.Store("test-tab", sc.OriginalRef, IntentEntry{
+		Query:      sc.OriginalQuery,
+		Descriptor: origDesc,
+		Score:      0.95,
+		Confidence: "high",
+		Strategy:   "combined",
+	})
+
+	re := NewRecoveryEngine(
+		DefaultRecoveryConfig(),
+		matcher,
+		cache,
+		func(_ context.Context, _ string) error { return nil },
+		func(_, ref string) (int64, bool) {
+			for i, d := range sc.After {
+				if d.Ref == ref {
+					return int64(1000 + i), true
+				}
+			}
+			return 0, false
+		},
+		func(_ string) []semantic.ElementDescriptor { return sc.After },
+	)
+
+	start := time.Now()
+	err := fmt.Errorf("could not find node with id %s", sc.OriginalRef)
+	if !re.ShouldAttempt(err, sc.OriginalRef) {
+		result.err = "ShouldAttempt returned false"
+		result.pass = sc.ExpectNoMatch
+		result.latencyMs = time.Since(start).Milliseconds()
+		return result
+	}
+
+	rr, _, recErr := re.AttemptWithClassification(
+		context.Background(),
+		"test-tab",
+		sc.OriginalRef,
+		"click",
+		ClassifyFailure(err),
+		func(_ context.Context, kind string, nodeID int64) (map[string]any, error) {
+			return map[string]any{"clicked": true}, nil
+		},
+	)
+
+	result.latencyMs = time.Since(start).Milliseconds()
+	result.recovered = rr.Recovered
+	result.gotRef = rr.NewRef
+	result.score = rr.Score
+	result.confidence = rr.Confidence
+
+	if recErr != nil {
+		result.err = recErr.Error()
+	}
+	// Verdict; a scenario with neither ExpectNoMatch nor ExpectedRef can never pass.
+	if sc.ExpectNoMatch {
+		result.pass = !rr.Recovered
+	} else if sc.ExpectedRef != nil {
+		if rr.NewRef == *sc.ExpectedRef {
+			result.pass = true
+		} else {
+			for _, alt := range sc.ExpectedAlt {
+				if rr.NewRef == alt {
+					result.pass = true
+					break
+				}
+			}
+		}
+	}
+
+	return result
+}
+
+// BenchmarkRecoveryEngine_Scenarios times one recovery attempt per scenario in each iteration.
+func BenchmarkRecoveryEngine_Scenarios(b *testing.B) {
+	scenarios := loadScenariosB(b)
+	matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128))
+	b.ResetTimer() // exclude fixture loading and matcher construction
+	for i := 0; i < b.N; i++ {
+		for _, sc := range scenarios {
+			runBenchmarkScenarioB(b, matcher, sc)
+		}
+	}
+}
+
+// loadScenariosB is the benchmark counterpart of loadScenarios: it reads the
+// recovery scenarios JSON fixture and aborts the benchmark if that fails.
+func loadScenariosB(b *testing.B) []BenchmarkScenario {
+	b.Helper() // report failures at the caller's line
+	_, thisFile, _, _ := runtime.Caller(0)
+	repoRoot := filepath.Join(filepath.Dir(thisFile), "..")
+	scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json")
+	data, err := os.ReadFile(scenariosPath)
+	if err != nil {
+		b.Fatalf("failed to read scenarios: %v", err)
+	}
+	var scenarios []BenchmarkScenario
+	if err := json.Unmarshal(data, &scenarios); err != nil {
+		b.Fatalf("failed to parse scenarios: %v", err)
+	}
+	return scenarios
+}
+
+// runBenchmarkScenarioB builds a cache and recovery engine for sc and runs one
+// recovery attempt, discarding the outcome; b is not used.
+func runBenchmarkScenarioB(b *testing.B, matcher semantic.ElementMatcher, sc BenchmarkScenario) {
+	var origDesc semantic.ElementDescriptor
+	for _, d := range sc.Before {
+		if d.Ref == sc.OriginalRef {
+			origDesc = d
+			break
+		}
+	}
+
+	cache := NewIntentCache(100, 5*time.Minute)
+	cache.Store("test-tab", sc.OriginalRef, IntentEntry{
+		Query:      sc.OriginalQuery,
+		Descriptor: origDesc,
+		Score:      0.95,
+		Confidence: "high",
+		Strategy:   "combined",
+	})
+
+	re := NewRecoveryEngine(
+		DefaultRecoveryConfig(),
+		matcher,
+		cache,
+		func(_ context.Context, _ string) error { return nil },
+		func(_, ref string) (int64, bool) {
+			for i, d := range sc.After {
+				if d.Ref == ref {
+					return int64(1000 + i), true
+				}
+			}
+			return 0, false
+		},
+		func(_ string) []semantic.ElementDescriptor { return sc.After },
+	)
+	err := fmt.Errorf("could not find node with id %s", sc.OriginalRef)
+	_, _, _ = re.AttemptWithClassification(
+		context.Background(),
+		"test-tab",
+		sc.OriginalRef,
+		"click",
+		ClassifyFailure(err),
+		func(_ context.Context, kind string, nodeID int64) (map[string]any, error) {
+			return map[string]any{"clicked": true}, nil
+		},
+	)
+}
diff --git a/scripts/check-docs-links.sh b/scripts/check-docs-links.sh
new file mode 100755
index 0000000..90a8738
--- /dev/null
+++ b/scripts/check-docs-links.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#
+# Check that relative links in Markdown files point to existing paths
+#
+# Usage:
+#   ./scripts/check-docs-links.sh
+#
+set -uo pipefail
+
+cd "$(dirname "$0")/.."
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NC='\033[0m'
+
+ERRORS=0
+
+echo "Checking documentation links..."
+echo ""
+
+# Find all markdown files and check links
+while IFS= read -r file; do
+    dir=$(dirname "$file")
+
+    # Extract markdown links: [text](path)
+    while IFS= read -r link; do
+        # Skip URLs and anchors
+        if [[ "$link" =~ ^https?:// ]] || [[ "$link" =~ ^mailto: ]] || [[ "$link" =~ ^# ]]; then
+            continue
+        fi
+
+        # Remove anchor from link
+        link_path="${link%%#*}"
+
+        # Skip empty paths
+        if [[ -z "$link_path" ]]; then
+            continue
+        fi
+
+        # Resolve the target: a leading "/" means the repository root (our cwd), not the filesystem root
+        if [[ "$link_path" =~ ^/ ]]; then
+            target=".$link_path"
+        else
+            target="$dir/$link_path"
+        fi
+
+        # Check if target exists
+        if [[ ! -e "$target" ]]; then
+            echo -e "${RED}BROKEN:${NC} $file -> $link"
+            ERRORS=$((ERRORS + 1))
+        fi
+    done < <(grep -oE '\]\([^)]+\)' "$file" 2>/dev/null | sed 's/\](//' | sed 's/)//')
+done < <(find . -name "*.md" -not -path "./.git/*" -not -path "./node_modules/*")
+
+echo ""
+if [[ $ERRORS -eq 0 ]]; then
+    echo -e "${GREEN}✓${NC} All documentation links valid"
+    exit 0
+else
+    echo -e "${RED}Found $ERRORS broken link(s)${NC}"
+    exit 1
+fi
diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index 84ade33..2bea9dd 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -5,32 +5,43 @@ description: Develop and contribute to the Semantic project. Use when working on
 
 # Semantic Development
 
-Semantic is a zero-dependency Go library for matching natural language queries against accessibility tree elements.
+Zero-dependency Go library for matching natural language queries against accessibility tree elements.
 
-## Project Location
+## Essential Commands
 
+**Before any PR:**
 ```bash
-cd ~/dev/semantic
+./dev pr                # runs: check + e2e + lint corpus + bench
 ```
 
-## Dev Commands
-
-All development commands run via `./dev`:
-
-| Command | Description |
-|---------|-------------|
-| `./dev doctor` | Setup dev environment |
-| `./dev test` | Run unit tests |
-| `./dev test verbose` | Run unit tests (verbose) |
-| `./dev test race` | Run unit tests with race detector |
-| `./dev coverage` | Run tests with coverage report |
-| `./dev lint` | Run golangci-lint |
-| `./dev fmt` | Format code |
-| `./dev vet` | Run go vet |
-| `./dev check` | All checks (fmt + vet + lint + test) |
-| `./dev build` | Build CLI binary |
-| `./dev bench` | Run corpus benchmark suite |
-| `./dev e2e` | Run E2E tests (Docker) |
+**During development:**
+```bash
+./dev test              # unit tests (fast)
+./dev check             # fmt + vet + lint + test race (full validation)
+./dev build             # build ./semantic CLI binary
+```
+
+**Quality regression checks:**
+```bash
+./dev baseline check    # compare quality against baseline
+./dev runtime           # compare performance against baseline
+```
+
+**When quality changes intentionally:**
+```bash
+./dev baseline update   # accept new quality baseline (after review)
+```
+
+## When to Use Each
+
+| Scenario | Command |
+|----------|---------|
+| Made code changes, want a quick sanity check | `./dev test` |
+| Ready to commit | `./dev check` |
+| Before opening PR | `./dev pr` |
+| Changed scoring/matching logic | `./dev baseline check` |
+| Performance-sensitive changes | `./dev runtime` |
+| Tuning weights | `./dev tune` then `./dev bench` |
 
 ## Architecture
 
@@ -54,6 +65,7 @@ recovery/                  Public subpackage
   failure.go                 FailureType classification
 
 cmd/semantic/main.go       CLI tool (find, match, classify)
+cmd/semantic-bench/        Benchmark CLI (check, baseline, calibrate, tune, runtime)
 ```
 
 ## Key Design Decisions
@@ -79,6 +91,27 @@ cmd/semantic/main.go       CLI tool (find, match, classify)
 
 4. **Pre-commit hook** runs gofmt + golangci-lint automatically on staged files.
 
+## Benchmark Improvement Loop
+
+When implementing changes that affect matching quality:
+
+```bash
+./dev baseline          # create baseline (first time only)
+# ... make changes ...
+./dev bench             # run benchmark, compare to baseline
+./dev baseline update   # accept new baseline (if improved)
+```
+
+**Key metrics:**
+- **MRR** — Mean Reciprocal Rank (higher = the correct element is ranked closer to the top)
+- **P@1** — Precision at 1 (is the top result correct?)
+- **Hit@3** — Hit rate at 3 (is any correct result in the top 3?)
+
+**Adding test cases:**
+1. Add to `tests/benchmark/corpus/*/queries.json`
+2. Run `./dev lint corpus` to validate
+3. Run `./dev bench` — a new case the matcher gets wrong shows up as a regression until the matcher is fixed
+
 ## Public API Surface
 
 Only these symbols are visible to consumers:
diff --git a/tests/benchmark/baselines/.gitkeep b/tests/benchmark/baselines/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/benchmark/config/benchmark.json b/tests/benchmark/config/benchmark.json
index 23b5661..7b06060 100644
--- a/tests/benchmark/config/benchmark.json
+++ b/tests/benchmark/config/benchmark.json
@@ -1,13 +1,35 @@
 {
-  "version": "1.0.0",
-  "strategies": ["lexical", "embedding", "combined"],
-  "default_strategy": "combined",
-  "default_threshold": 0.3,
-  "default_top_k": 3,
-  "metrics": {
-    "min_accuracy": 0.85,
-    "min_avg_score": 0.5,
-    "max_latency_ms": 100
+  "version": "1.1.0",
+  "defaults": {
+    "strategy": "combined",
+    "threshold": 0.01,
+    "top_k": 5,
+    "weights": {
+      "lexical": 0.6,
+      "embedding": 0.4
+    }
+  },
+  "baseline": {
+    "quality": {
+      "max_overall_p_at_1_drop": 0.02,
+      "max_overall_mrr_drop": 0.02,
+      "max_overall_hit_at_3_drop": 0.02,
+      "max_corpus_p_at_1_drop": 0.08,
+      "max_difficulty_p_at_1_drop": 0.08,
+      "max_margin_drop_report": 0.15
+    },
+    "runtime": {
+      "max_ns_op_regression_ratio": 1.25,
+      "max_alloc_regression_ratio": 1.25,
+      "max_corpus_latency_p50_ms": 75,
+      "max_corpus_latency_p95_ms": 200
+    }
   },
+  "results": {
+    "dir": "tests/benchmark/results",
+    "baselines_dir": "tests/benchmark/baselines",
+    "generated_files_policy": "warn"
+  },
+  "strategies": ["lexical", "embedding", "combined"],
   "snapshots_dir": "../e2e/assets/snapshots"
 }
diff --git a/tests/benchmark/scripts/finalize-report.sh b/tests/benchmark/scripts/finalize-report.sh
deleted file mode 100755
index 38d314f..0000000
--- a/tests/benchmark/scripts/finalize-report.sh
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/bin/bash
-#
-# Finalize benchmark report and generate summary
-#
-# Usage:
-#   ./finalize-report.sh <report_file>
-#
-set -euo pipefail
-
-if [[ $# -lt 1 ]]; then
-    echo "Usage: $0 <report_file>"
-    exit 1
-fi
-
-REPORT_FILE="$1"
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-# Calculate final metrics
-TMP_FILE=$(mktemp)
-jq '
-    .summary.accuracy = (if .summary.total > 0 then (.summary.passed / .summary.total * 10000 | floor / 100) else 0 end) |
-    .summary.avg_score = (if (.results | length) > 0 then ([.results[].score] | add / length | . * 1000 | floor / 1000) else 0 end) |
-    .summary.avg_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | add / length | floor) else 0 end) |
-    .summary.min_score = (if (.results | length) > 0 then ([.results[].score] | min) else 0 end) |
-    .summary.max_score = (if (.results | length) > 0 then ([.results[].score] | max) else 0 end) |
-    .summary.min_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | min) else 0 end) |
-    .summary.max_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | max) else 0 end)
-' "${REPORT_FILE}" > "${TMP_FILE}"
-mv "${TMP_FILE}" "${REPORT_FILE}"
-
-# Generate markdown summary
-TIMESTAMP=$(jq -r '.benchmark.timestamp' "${REPORT_FILE}")
-STRATEGY=$(jq -r '.benchmark.strategy' "${REPORT_FILE}")
-VERSION=$(jq -r '.benchmark.version' "${REPORT_FILE}")
-TOTAL=$(jq -r '.summary.total' "${REPORT_FILE}")
-PASSED=$(jq -r '.summary.passed' "${REPORT_FILE}")
-FAILED=$(jq -r '.summary.failed' "${REPORT_FILE}")
-SKIPPED=$(jq -r '.summary.skipped' "${REPORT_FILE}")
-ACCURACY=$(jq -r '.summary.accuracy' "${REPORT_FILE}")
-AVG_SCORE=$(jq -r '.summary.avg_score' "${REPORT_FILE}")
-AVG_LATENCY=$(jq -r '.summary.avg_latency_ms' "${REPORT_FILE}")
-MIN_SCORE=$(jq -r '.summary.min_score' "${REPORT_FILE}")
-MAX_SCORE=$(jq -r '.summary.max_score' "${REPORT_FILE}")
-MIN_LATENCY=$(jq -r '.summary.min_latency_ms' "${REPORT_FILE}")
-MAX_LATENCY=$(jq -r '.summary.max_latency_ms' "${REPORT_FILE}")
-
-cat > "${SUMMARY_FILE}" << EOF
-# Semantic Matching Benchmark Results
-
-## Benchmark Info
-
-| Field | Value |
-|-------|-------|
-| Timestamp | ${TIMESTAMP} |
-| Strategy | ${STRATEGY} |
-| Version | ${VERSION} |
-
-## Results Summary
-
-| Metric | Value |
-|--------|-------|
-| Total Cases | ${TOTAL} |
-| Passed | ${PASSED} |
-| Failed | ${FAILED} |
-| Skipped | ${SKIPPED} |
-| **Accuracy** | **${ACCURACY}%** |
-
-## Score Distribution
-
-| Metric | Value |
-|--------|-------|
-| Average Score | ${AVG_SCORE} |
-| Min Score | ${MIN_SCORE} |
-| Max Score | ${MAX_SCORE} |
-
-## Latency
-
-| Metric | Value |
-|--------|-------|
-| Average | ${AVG_LATENCY} ms |
-| Min | ${MIN_LATENCY} ms |
-| Max | ${MAX_LATENCY} ms |
-
-## Failed Cases
-
-EOF
-
-# Add failed cases
-jq -r '.results[] | select(.status == "fail") | "| \(.id) | \(.notes) |"' "${REPORT_FILE}" >> "${SUMMARY_FILE}"
-
-if [[ $(jq '[.results[] | select(.status == "fail")] | length' "${REPORT_FILE}") -eq 0 ]]; then
-    echo "_No failures_" >> "${SUMMARY_FILE}"
-else
-    # Add header
-    sed -i.bak '/## Failed Cases/a\
-| ID | Notes |\
-|-----|-------|' "${SUMMARY_FILE}"
-    rm -f "${SUMMARY_FILE}.bak"
-fi
-
-echo ""
-echo "================================================"
-echo "  BENCHMARK SUMMARY"
-echo "================================================"
-echo "  Strategy:  ${STRATEGY}"
-echo "  Total:     ${TOTAL}"
-echo "  Passed:    ${PASSED}"
-echo "  Failed:    ${FAILED}"
-echo "  Accuracy:  ${ACCURACY}%"
-echo "  Avg Score: ${AVG_SCORE}"
-echo "  Avg Latency: ${AVG_LATENCY} ms"
-echo "================================================"
-echo ""
-echo "Report: ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh
deleted file mode 100755
index 29f81b2..0000000
--- a/tests/benchmark/scripts/lint-corpus.sh
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-CASES_DIR="${BENCHMARK_DIR}/cases"
-SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
-
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-NC='\033[0m'
-
-ERRORS=0
-WARNINGS=0
-
-error() {
-    echo -e "${RED}ERROR:${NC} $1"
-    ((ERRORS++))
-}
-
-warn() {
-    echo -e "${YELLOW}WARN:${NC} $1"
-    ((WARNINGS++))
-}
-
-ok() {
-    echo -e "${GREEN}✓${NC} $1"
-}
-
-echo "=== Corpus Lint ==="
-echo ""
-
-# 1. Check for invalid JSON in all benchmark files
-echo "Checking JSON validity..."
-for f in "${CORPUS_DIR}"/*/*.json "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        if ! jq . "$f" >/dev/null 2>&1; then
-            error "Invalid JSON: $f"
-        fi
-    fi
-done
-
-# 2. Check for duplicate query IDs across corpus files
-echo "Checking for duplicate query IDs..."
-declare -A QUERY_IDS
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r id; do
-            if [[ -n "$id" && "$id" != "null" ]]; then
-                if [[ -n "${QUERY_IDS[$id]:-}" ]]; then
-                    error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})"
-                else
-                    QUERY_IDS[$id]="$f"
-                fi
-            fi
-        done < <(jq -r '.[].id // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# Also check cases files
-for f in "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r id; do
-            if [[ -n "$id" && "$id" != "null" ]]; then
-                if [[ -n "${QUERY_IDS[$id]:-}" ]]; then
-                    error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})"
-                else
-                    QUERY_IDS[$id]="$f"
-                fi
-            fi
-        done < <(jq -r '.[].id // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 3. Check for duplicate refs within snapshots
-echo "Checking for duplicate refs in snapshots..."
-for f in "${CORPUS_DIR}"/*/snapshot.json; do
-    if [[ -f "$f" ]]; then
-        dupes=$(jq -r '.[].ref' "$f" 2>/dev/null | sort | uniq -d)
-        if [[ -n "$dupes" ]]; then
-            error "Duplicate refs in $f: $dupes"
-        fi
-    fi
-done
-
-# 4. Check that relevant_refs exist in snapshot
-echo "Checking relevant_refs exist in snapshots..."
-for corpus_dir in "${CORPUS_DIR}"/*/; do
-    corpus_name=$(basename "$corpus_dir")
-    snapshot="${corpus_dir}snapshot.json"
-    queries="${corpus_dir}queries.json"
-
-    if [[ -f "$snapshot" && -f "$queries" ]]; then
-        # Get all refs from snapshot
-        refs=$(jq -r '.[].ref' "$snapshot" 2>/dev/null | sort | uniq)
-
-        # Check relevant_refs
-        while IFS= read -r ref; do
-            if [[ -n "$ref" && "$ref" != "null" ]]; then
-                if ! echo "$refs" | grep -qx "$ref"; then
-                    error "[$corpus_name] relevant_ref '$ref' not found in snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].relevant_refs[]? // empty' "$queries" 2>/dev/null)
-
-        # Check partially_relevant_refs
-        while IFS= read -r ref; do
-            if [[ -n "$ref" && "$ref" != "null" ]]; then
-                if ! echo "$refs" | grep -qx "$ref"; then
-                    error "[$corpus_name] partially_relevant_ref '$ref' not found in snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].partially_relevant_refs[]? // empty' "$queries" 2>/dev/null)
-    fi
-done
-
-# 5. Check for empty relevant_refs (except no-match cases)
-echo "Checking for empty relevant_refs..."
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        empty_relevant=$(jq -r '.[] | select(.relevant_refs | length == 0) | select(.partially_relevant_refs | length == 0) | select(.expect_no_match != true) | .id' "$f" 2>/dev/null)
-        for id in $empty_relevant; do
-            if [[ -n "$id" ]]; then
-                warn "Query '$id' in $f has empty relevant_refs"
-            fi
-        done
-    fi
-done
-
-# 6. Check difficulty values
-echo "Checking difficulty values..."
-VALID_DIFFICULTIES="easy medium hard"
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r line; do
-            id=$(echo "$line" | cut -d'|' -f1)
-            diff=$(echo "$line" | cut -d'|' -f2)
-            if [[ -n "$diff" && "$diff" != "null" ]]; then
-                if ! echo "$VALID_DIFFICULTIES" | grep -qw "$diff"; then
-                    error "Invalid difficulty '$diff' for query '$id' in $f"
-                fi
-            fi
-        done < <(jq -r '.[] | "\(.id)|\(.difficulty // "null")"' "$f" 2>/dev/null)
-    fi
-done
-
-# 7. Check for known tags (warn on unknown)
-echo "Checking tags..."
-KNOWN_TAGS="absent-control accessibility action action-synonym action-verb adversarial alertdialog all-stopwords auth basket-cart bulk-action button cell checkbox combobox compound context-exclusion conversational dashboard description descriptive dialog directional disambiguation domain-intent download-export duplicate-labels ecommerce empty-query empty-snapshot exact exact-match filter find-search generic-verb github guard icon implicit input interactive-boost keyboard-mash legal link literal-text login login-signin long-query lookup-search media menu menuitem missing-letter name-match natural-language navigation negative-context no-match noise-tokens nonsense option ordinal pagination parent-context partial position preferences-settings purchase-buy question-form radio register-create registration repeated-word row-context search searchbox section section-context signout-logout single-char social special-chars spinbutton stale-ref state switch synonym synonym-chain tab table textbox threshold toggle transposition typo vague-query visual weak-match wikipedia"
-for f in "${CORPUS_DIR}"/*/queries.json "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r tag; do
-            if [[ -n "$tag" && "$tag" != "null" ]]; then
-                if ! echo "$KNOWN_TAGS" | grep -qw "$tag"; then
-                    warn "Unknown tag '$tag' in $f"
-                fi
-            fi
-        done < <(jq -r '.[].tags[]? // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 8. Check case files reference existing snapshots
-echo "Checking case file snapshot references..."
-for f in "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r snapshot; do
-            if [[ -n "$snapshot" && "$snapshot" != "null" ]]; then
-                if [[ ! -f "${SNAPSHOTS_DIR}/${snapshot}" ]]; then
-                    error "Case file $f references missing snapshot: $snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].snapshot // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 9. Check for generated result files in source tree
-echo "Checking for generated result files..."
-if ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | grep -v '.gitkeep' | head -1 >/dev/null 2>&1; then
-    result_count=$(ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | wc -l | tr -d ' ')
-    warn "Found $result_count generated result files in tests/benchmark/results/ (should be gitignored)"
-fi
-
-echo ""
-echo "=== Summary ==="
-if [[ $ERRORS -eq 0 && $WARNINGS -eq 0 ]]; then
-    ok "All checks passed"
-    exit 0
-elif [[ $ERRORS -eq 0 ]]; then
-    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
-    exit 0
-else
-    echo -e "${RED}Errors: $ERRORS${NC}"
-    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
-    exit 1
-fi
diff --git a/tests/benchmark/scripts/record-result.sh b/tests/benchmark/scripts/record-result.sh
deleted file mode 100755
index 2288f7c..0000000
--- a/tests/benchmark/scripts/record-result.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-#
-# Record a benchmark result
-#
-# Usage:
-#   ./record-result.sh <report_file> <id> <pass|fail|skip> <score> <latency_ms> "notes"
-#
-set -euo pipefail
-
-if [[ $# -lt 5 ]]; then
-    echo "Usage: $0 <report_file> <id> <pass|fail|skip> <score> <latency_ms> [notes]"
-    exit 1
-fi
-
-REPORT_FILE="$1"
-ID="$2"
-STATUS="$3"
-SCORE="$4"
-LATENCY_MS="$5"
-NOTES="${6:-}"
-TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
-
-# Create result entry
-RESULT_JSON=$(jq -n \
-    --arg id "${ID}" \
-    --arg status "${STATUS}" \
-    --argjson score "${SCORE}" \
-    --argjson latency "${LATENCY_MS}" \
-    --arg notes "${NOTES}" \
-    --arg ts "${TIMESTAMP}" \
-    '{id: $id, status: $status, score: $score, latency_ms: $latency, notes: $notes, timestamp: $ts}')
-
-# Append to report
-TMP_FILE=$(mktemp)
-jq --argjson result "${RESULT_JSON}" \
-   --arg status "${STATUS}" \
-   '.results += [$result] |
-    .summary.total += 1 |
-    if $status == "pass" then .summary.passed += 1
-    elif $status == "fail" then .summary.failed += 1
-    else .summary.skipped += 1 end' \
-   "${REPORT_FILE}" > "${TMP_FILE}"
-
-mv "${TMP_FILE}" "${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh
deleted file mode 100755
index 4ce67d6..0000000
--- a/tests/benchmark/scripts/run-benchmark.sh
+++ /dev/null
@@ -1,217 +0,0 @@
-#!/bin/bash
-#
-# Run semantic matching benchmark
-#
-# Usage:
-#   ./run-benchmark.sh [--strategy <name>] [--cases <file>]
-#
-# Options:
-#   --strategy <name>   Strategy to benchmark (lexical, embedding, combined)
-#   --cases <file>      Specific case file to run (default: all)
-#   --output <dir>      Output directory (default: ../results)
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CASES_DIR="${BENCHMARK_DIR}/cases"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-# Parse args
-STRATEGY="combined"
-CASE_FILE=""
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --strategy) STRATEGY="$2"; shift 2 ;;
-        --cases) CASE_FILE="$2"; shift 2 ;;
-        --output) RESULTS_DIR="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-case "${STRATEGY}" in
-    lexical|embedding|combined) ;;
-    *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;;
-esac
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.json"
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg strategy "${STRATEGY}" \
-    --arg version "$(${SEMANTIC} --version 2>/dev/null || echo 'dev')" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            strategy: $strategy,
-            version: $version
-        },
-        results: [],
-        summary: {
-            total: 0,
-            passed: 0,
-            failed: 0,
-            skipped: 0,
-            accuracy: 0,
-            avg_score: 0,
-            avg_latency_ms: 0
-        }
-    }' > "${REPORT_FILE}"
-
-# Run cases
-score_at_least() {
-    local score="$1"
-    local min_score="$2"
-    awk -v score="${score}" -v min_score="${min_score}" 'BEGIN { exit (score + 0 >= min_score + 0) ? 0 : 1 }'
-}
-
-run_case() {
-    local case_file="$1"
-    local case_name
-    case_name=$(basename "$case_file" .json)
-
-    echo ""
-    echo "=== Running: ${case_name} ==="
-
-    local count
-    count=$(jq length "$case_file")
-
-    for i in $(seq 0 $((count - 1))); do
-        local id query snapshot expect_ref expect_ref_alt expect_no_match expect_no_crash expect_has_matches threshold min_score
-
-        id=$(jq -r ".[$i].id" "$case_file")
-        query=$(jq -r ".[$i].query" "$case_file")
-        snapshot=$(jq -r ".[$i].snapshot" "$case_file")
-        expect_ref=$(jq -r ".[$i].expect_ref // empty" "$case_file")
-        expect_ref_alt=$(jq -r ".[$i].expect_ref_alt // [] | join(\",\")" "$case_file")
-        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$case_file")
-        expect_no_crash=$(jq -r ".[$i].expect_no_crash // false" "$case_file")
-        expect_has_matches=$(jq -r ".[$i].expect_has_matches // false" "$case_file")
-        threshold=$(jq -r ".[$i].threshold // 0.3" "$case_file")
-        min_score=$(jq -r ".[$i].min_score // 0" "$case_file")
-
-        local snapshot_path="${SNAPSHOTS_DIR}/${snapshot}"
-        if [[ ! -f "${snapshot_path}" ]]; then
-            echo "  [${id}] SKIP: snapshot not found: ${snapshot}"
-            "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "skip" 0 0 "snapshot not found"
-            continue
-        fi
-
-        # Run query and measure time
-        local start_ms end_ms duration_ms result exit_code
-        start_ms=$(python3 -c 'import time; print(int(time.time() * 1000))')
-
-        set +e
-        result=$("${SEMANTIC}" find "${query}" \
-            --snapshot "${snapshot_path}" \
-            --strategy "${STRATEGY}" \
-            --threshold "${threshold}" \
-            --format json 2>&1)
-        exit_code=$?
-        set -e
-
-        end_ms=$(python3 -c 'import time; print(int(time.time() * 1000))')
-        duration_ms=$((end_ms - start_ms))
-
-        # Evaluate result
-        local status="fail"
-        local got_ref=""
-        local got_score=0
-        local notes=""
-
-        if [[ ${exit_code} -ne 0 ]]; then
-            if [[ "${expect_no_crash}" == "true" ]]; then
-                # Some crashes are expected (empty query, etc)
-                status="pass"
-                notes="exit ${exit_code} (expected)"
-            else
-                notes="exit ${exit_code}: ${result}"
-            fi
-        else
-            got_ref=$(echo "$result" | jq -r '.best_ref // empty')
-            got_score=$(echo "$result" | jq -r '.best_score // 0')
-            local match_count
-            match_count=$(echo "$result" | jq -r '.matches | length')
-
-            if [[ "${expect_no_match}" == "true" ]]; then
-                if [[ ${match_count} -eq 0 ]]; then
-                    status="pass"
-                    notes="no matches (expected)"
-                else
-                    notes="expected no matches, got ${match_count}"
-                fi
-            elif [[ "${expect_has_matches}" == "true" ]]; then
-                if [[ ${match_count} -gt 0 ]]; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="${match_count} matches, score=${got_score}"
-                    else
-                        notes="${match_count} matches, score=${got_score} below min_score=${min_score}"
-                    fi
-                else
-                    notes="expected matches, got 0"
-                fi
-            elif [[ -n "${expect_ref}" ]]; then
-                if [[ "${got_ref}" == "${expect_ref}" ]]; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="ref=${got_ref}, score=${got_score}"
-                    else
-                        notes="ref=${got_ref}, score=${got_score} below min_score=${min_score}"
-                    fi
-                elif [[ -n "${expect_ref_alt}" ]] && echo ",${expect_ref_alt}," | grep -q ",${got_ref},"; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="ref=${got_ref} (alt), score=${got_score}"
-                    else
-                        notes="ref=${got_ref} (alt), score=${got_score} below min_score=${min_score}"
-                    fi
-                else
-                    notes="got ${got_ref}, want ${expect_ref}"
-                fi
-            elif [[ "${expect_no_crash}" == "true" ]]; then
-                status="pass"
-                notes="no crash"
-            fi
-        fi
-
-        # Record result
-        "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "${status}" "${got_score}" "${duration_ms}" "${notes}"
-
-        if [[ "${status}" == "pass" ]]; then
-            echo "  [${id}] PASS: ${notes}"
-        else
-            echo "  [${id}] FAIL: ${notes}"
-        fi
-    done
-}
-
-# Find case files
-if [[ -n "${CASE_FILE}" ]]; then
-    run_case "${CASES_DIR}/${CASE_FILE}"
-else
-    for case_file in "${CASES_DIR}"/*.json; do
-        [[ -f "$case_file" ]] || continue
-        run_case "$case_file"
-    done
-fi
-
-# Finalize report
-"${SCRIPT_DIR}/finalize-report.sh" "${REPORT_FILE}"
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "Benchmark complete: ${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh
deleted file mode 100755
index b5579bf..0000000
--- a/tests/benchmark/scripts/run-corpus-benchmark.sh
+++ /dev/null
@@ -1,500 +0,0 @@
-#!/bin/bash
-#
-# Run semantic matching benchmark with ranking metrics
-#
-# Usage:
-#   ./run-corpus-benchmark.sh [--strategy <name>] [--corpus <dir>] [--lexical-weight <n>] [--embedding-weight <n>]
-#
-# Metrics:
-#   - MRR (Mean Reciprocal Rank)
-#   - P@1 (Precision at 1)
-#   - P@3 (Precision at 3)
-#   - Latency distribution (p50, p95, p99)
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-# Parse args
-STRATEGY="combined"
-SPECIFIC_CORPUS=""
-TOP_K=5
-LEXICAL_WEIGHT=0.6
-EMBEDDING_WEIGHT=0.4
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --strategy) STRATEGY="$2"; shift 2 ;;
-        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
-        --top-k) TOP_K="$2"; shift 2 ;;
-        --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;;
-        --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-case "${STRATEGY}" in
-    lexical|embedding|combined) ;;
-    *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;;
-esac
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json"
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg strategy "${STRATEGY}" \
-    --argjson top_k "${TOP_K}" \
-    --argjson lexical_weight "${LEXICAL_WEIGHT}" \
-    --argjson embedding_weight "${EMBEDDING_WEIGHT}" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            strategy: $strategy,
-            top_k: $top_k,
-            type: "corpus",
-            weights: {
-                lexical: $lexical_weight,
-                embedding: $embedding_weight
-            }
-        },
-        results: [],
-        metrics: {
-            total: 0,
-            mrr: 0,
-            p_at_1: 0,
-            p_at_3: 0,
-            latencies_ms: [],
-            by_difficulty: {},
-            by_tag: {}
-        }
-    }' > "${REPORT_FILE}"
-
-# Arrays to collect metrics
-declare -a ALL_RRS=()
-declare -a ALL_P1=()
-declare -a ALL_P3=()
-declare -a ALL_HIT3=()
-declare -a ALL_HIT5=()
-declare -a ALL_MARGINS=()
-declare -a ALL_LATENCIES=()
-
-run_corpus() {
-    local corpus_path="$1"
-    local corpus_name
-    corpus_name=$(basename "$corpus_path")
-
-    local snapshot="${corpus_path}/snapshot.json"
-    local queries="${corpus_path}/queries.json"
-
-    if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then
-        if [[ -f "${corpus_path}/cases.json" ]] || [[ -f "${corpus_path}/scenarios.json" ]]; then
-            return
-        fi
-        echo "  Skipping ${corpus_name}: missing files"
-        return
-    fi
-
-    echo ""
-    echo "=== Corpus: ${corpus_name} ==="
-
-    local count
-    count=$(jq length "$queries")
-
-    for i in $(seq 0 $((count - 1))); do
-        local id query relevant_refs partial_refs difficulty tags
-
-        id=$(jq -r ".[$i].id" "$queries")
-        query=$(jq -r ".[$i].query" "$queries")
-        relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries")
-        partial_refs=$(jq -c ".[$i].partially_relevant_refs // []" "$queries")
-        difficulty=$(jq -r ".[$i].difficulty // \"medium\"" "$queries")
-        tags=$(jq -c ".[$i].tags // []" "$queries")
-
-        # Run query and measure time
-        local start_ns end_ns duration_ms result
-        start_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))')
-
-        if ! result=$("${SEMANTIC}" find "${query}" \
-            --snapshot "${snapshot}" \
-            --strategy "${STRATEGY}" \
-            --threshold 0.01 \
-            --top-k "${TOP_K}" \
-            --lexical-weight "${LEXICAL_WEIGHT}" \
-            --embedding-weight "${EMBEDDING_WEIGHT}" \
-            --format json 2>&1); then
-            echo "  [${id}] ERROR: semantic find failed for query: ${query}" >&2
-            echo "${result}" >&2
-            exit 1
-        fi
-
-        if ! echo "$result" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then
-            echo "  [${id}] ERROR: semantic find returned invalid JSON" >&2
-            echo "${result}" >&2
-            exit 1
-        fi
-
-        end_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))')
-        duration_ms=$(( (end_ns - start_ns) / 1000 ))
-
-        # Extract results
-        local matches best_ref best_score
-        matches=$(echo "$result" | jq -c '[.matches[].ref]')
-        best_ref=$(echo "$result" | jq -r '.best_ref // ""')
-        best_score=$(echo "$result" | jq -r '.best_score // 0')
-
-        # Calculate Reciprocal Rank
-        local rr=0
-        for rank in $(seq 1 ${TOP_K}); do
-            local ref_at_rank
-            ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                rr=$(echo "scale=4; 1 / ${rank}" | bc)
-                break
-            fi
-        done
-
-        # Calculate P@1
-        local p1=0
-        if echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-            p1=1
-        elif echo "$partial_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-            p1=0.5
-        fi
-
-        # Calculate P@3 (count relevant in top 3, partials count as 0.5)
-        local relevant_in_top3=0
-        local partial_in_top3=0
-        local hit_at_3=0
-        local hit_at_5=0
-        local best_relevant_rank="null"
-        for rank in 1 2 3 4 5; do
-            local ref_at_rank
-            ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                if [[ "$best_relevant_rank" == "null" ]]; then
-                    best_relevant_rank=$rank
-                fi
-                if [[ $rank -le 3 ]]; then
-                    relevant_in_top3=$((relevant_in_top3 + 1))
-                    hit_at_3=1
-                fi
-                hit_at_5=1
-            elif [[ $rank -le 3 ]]; then
-                if echo "$partial_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                    partial_in_top3=$((partial_in_top3 + 1))
-                fi
-            fi
-        done
-        local p3
-        p3=$(echo "scale=4; (${relevant_in_top3} + ${partial_in_top3} * 0.5) / 3" | bc)
-
-        # Calculate best_relevant_score, best_wrong_score, and margin
-        local best_relevant_score=0
-        local best_wrong_score=0
-        local num_matches
-        num_matches=$(echo "$result" | jq '.matches | length')
-        for idx in $(seq 0 $((num_matches - 1))); do
-            local ref_at_idx score_at_idx
-            ref_at_idx=$(echo "$result" | jq -r ".matches[$idx].ref // \"\"")
-            score_at_idx=$(echo "$result" | jq -r ".matches[$idx].score // 0")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then
-                if (( $(echo "$score_at_idx > $best_relevant_score" | bc -l) )); then
-                    best_relevant_score=$score_at_idx
-                fi
-            elif echo "$partial_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then
-                : # partials don't count as wrong
-            else
-                if (( $(echo "$score_at_idx > $best_wrong_score" | bc -l) )); then
-                    best_wrong_score=$score_at_idx
-                fi
-            fi
-        done
-        local margin
-        margin=$(echo "scale=4; $best_relevant_score - $best_wrong_score" | bc)
-
-        # Collect metrics
-        ALL_RRS+=("$rr")
-        ALL_P1+=("$p1")
-        ALL_P3+=("$p3")
-        ALL_HIT3+=("$hit_at_3")
-        ALL_HIT5+=("$hit_at_5")
-        ALL_MARGINS+=("$margin")
-        ALL_LATENCIES+=("$duration_ms")
-
-        # Status indicator
-        local status="MISS"
-        if (( $(echo "$p1 >= 1" | bc -l) )); then
-            status="HIT "
-        elif (( $(echo "$p1 >= 0.5" | bc -l) )); then
-            status="PART"
-        fi
-
-        printf "  [%s] %s | RR=%.2f P@1=%.1f P@3=%.2f | %dms | %s\n" \
-            "$id" "$status" "$rr" "$p1" "$p3" "$duration_ms" "$query"
-
-        # Record to report
-        local result_json
-        result_json=$(jq -n \
-            --arg id "$id" \
-            --arg query "$query" \
-            --arg corpus "$corpus_name" \
-            --arg difficulty "$difficulty" \
-            --argjson tags "$tags" \
-            --arg best_ref "$best_ref" \
-            --argjson best_score "$best_score" \
-            --argjson matches "$matches" \
-            --argjson relevant "$relevant_refs" \
-            --argjson rr "$rr" \
-            --argjson p1 "$p1" \
-            --argjson p3 "$p3" \
-            --argjson hit_at_3 "$hit_at_3" \
-            --argjson hit_at_5 "$hit_at_5" \
-            --argjson best_relevant_rank "$best_relevant_rank" \
-            --argjson best_relevant_score "$best_relevant_score" \
-            --argjson best_wrong_score "$best_wrong_score" \
-            --argjson margin "$margin" \
-            --argjson latency "$duration_ms" \
-            '{
-                id: $id, query: $query, corpus: $corpus,
-                difficulty: $difficulty, tags: $tags,
-                best_ref: $best_ref, best_score: $best_score,
-                matches: $matches, relevant_refs: $relevant,
-                rr: $rr, p_at_1: $p1, p_at_3: $p3,
-                hit_at_3: $hit_at_3, hit_at_5: $hit_at_5,
-                best_relevant_rank: $best_relevant_rank,
-                best_relevant_score: $best_relevant_score,
-                best_wrong_score: $best_wrong_score,
-                margin: $margin,
-                latency_ms: $latency
-            }')
-
-        # Append to report
-        local tmp
-        tmp=$(mktemp)
-        jq --argjson r "$result_json" '.results += [$r]' "$REPORT_FILE" > "$tmp"
-        mv "$tmp" "$REPORT_FILE"
-    done
-}
-
-# Run benchmarks
-if [[ -n "${SPECIFIC_CORPUS}" ]]; then
-    run_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}"
-else
-    for corpus in "${CORPUS_DIR}"/*/; do
-        [[ -d "$corpus" ]] || continue
-        run_corpus "$corpus"
-    done
-fi
-
-# Calculate aggregate metrics
-echo ""
-echo "Calculating aggregate metrics..."
-
-TOTAL=${#ALL_RRS[@]}
-if [[ $TOTAL -eq 0 ]]; then
-    echo "No results to aggregate"
-    exit 1
-fi
-
-# MRR
-MRR=$(printf '%s\n' "${ALL_RRS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# P@1
-P1=$(printf '%s\n' "${ALL_P1[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# P@3
-P3=$(printf '%s\n' "${ALL_P3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Hit@3
-HIT3=$(printf '%s\n' "${ALL_HIT3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Hit@5
-HIT5=$(printf '%s\n' "${ALL_HIT5[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Average margin
-AVG_MARGIN=$(printf '%s\n' "${ALL_MARGINS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Latency percentiles
-SORTED_LAT=($(printf '%s\n' "${ALL_LATENCIES[@]}" | sort -n))
-P50_IDX=$(( TOTAL * 50 / 100 ))
-P95_IDX=$(( TOTAL * 95 / 100 ))
-P99_IDX=$(( TOTAL * 99 / 100 ))
-LAT_P50=${SORTED_LAT[$P50_IDX]:-0}
-LAT_P95=${SORTED_LAT[$P95_IDX]:-0}
-LAT_P99=${SORTED_LAT[$P99_IDX]:-0}
-LAT_AVG=$(printf '%s\n' "${ALL_LATENCIES[@]}" | awk '{s+=$1} END {printf "%.0f", s/NR}')
-
-# Update report with aggregates
-tmp=$(mktemp)
-jq \
-    --argjson total "$TOTAL" \
-    --argjson mrr "$MRR" \
-    --argjson p1 "$P1" \
-    --argjson p3 "$P3" \
-    --argjson hit3 "$HIT3" \
-    --argjson hit5 "$HIT5" \
-    --argjson avg_margin "$AVG_MARGIN" \
-    --argjson lat_avg "$LAT_AVG" \
-    --argjson lat_p50 "$LAT_P50" \
-    --argjson lat_p95 "$LAT_P95" \
-    --argjson lat_p99 "$LAT_P99" \
-    '.metrics = {
-        total: $total,
-        mrr: $mrr,
-        p_at_1: $p1,
-        p_at_3: $p3,
-        hit_at_3: $hit3,
-        hit_at_5: $hit5,
-        avg_margin: $avg_margin,
-        latency_avg_ms: $lat_avg,
-        latency_p50_ms: $lat_p50,
-        latency_p95_ms: $lat_p95,
-        latency_p99_ms: $lat_p99
-    }' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-difficulty breakdown
-tmp=$(mktemp)
-jq '.metrics.by_difficulty = (
-    .results | group_by(.difficulty) | map({
-        key: .[0].difficulty,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    }) | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-corpus breakdown
-tmp=$(mktemp)
-jq '.metrics.by_corpus = (
-    .results | group_by(.corpus) | map({
-        key: .[0].corpus,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    }) | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-tag breakdown
-tmp=$(mktemp)
-jq '.metrics.by_tag = (
-    [.results[] | {tags: .tags, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}]
-    | [.[] | .tags[] as $tag | {tag: $tag, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}]
-    | group_by(.tag)
-    | map({
-        key: .[0].tag,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    })
-    | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Generate summary
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-cat > "${SUMMARY_FILE}" << EOF
-# Semantic Matching Benchmark Results
-
-## Configuration
-
-| Field | Value |
-|-------|-------|
-| Timestamp | $(jq -r '.benchmark.timestamp' "$REPORT_FILE") |
-| Strategy | ${STRATEGY} |
-| Lexical Weight | ${LEXICAL_WEIGHT} |
-| Embedding Weight | ${EMBEDDING_WEIGHT} |
-| Top-K | ${TOP_K} |
-| Total Queries | ${TOTAL} |
-
-## Ranking Metrics
-
-| Metric | Value | Description |
-|--------|-------|-------------|
-| **MRR** | **${MRR}** | Mean Reciprocal Rank |
-| **P@1** | **${P1}** | Precision at rank 1 |
-| **P@3** | **${P3}** | Precision at rank 3 |
-| **Hit@3** | **${HIT3}** | Any relevant in top 3 |
-| **Hit@5** | **${HIT5}** | Any relevant in top 5 |
-| **Avg Margin** | **${AVG_MARGIN}** | best_relevant - best_wrong |
-
-## Latency
-
-| Percentile | Value |
-|------------|-------|
-| Average | ${LAT_AVG} ms |
-| P50 | ${LAT_P50} ms |
-| P95 | ${LAT_P95} ms |
-| P99 | ${LAT_P99} ms |
-
-## By Difficulty
-
-| Difficulty | Count | MRR | P@1 | Hit@3 | Margin |
-|------------|-------|-----|-----|-------|--------|
-$(jq -r '.metrics.by_difficulty | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE")
-
-## By Corpus
-
-| Corpus | Count | MRR | P@1 | Hit@3 | Margin |
-|--------|-------|-----|-----|-------|--------|
-$(jq -r '.metrics.by_corpus | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE")
-
-## Misses (P@1 = 0)
-
-| ID | Query | Got | Expected |
-|----|-------|-----|----------|
-$(jq -r '.results[] | select(.p_at_1 == 0) | "| \(.id) | \(.query) | \(.best_ref) | \(.relevant_refs | join(",")) |"' "$REPORT_FILE")
-
-EOF
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "================================================"
-echo "  CORPUS BENCHMARK RESULTS"
-echo "================================================"
-echo "  Strategy:    ${STRATEGY}"
-echo "  Weights:     lexical=${LEXICAL_WEIGHT} embedding=${EMBEDDING_WEIGHT}"
-echo "  Queries:     ${TOTAL}"
-echo "  MRR:         ${MRR}"
-echo "  P@1:         ${P1}"
-echo "  P@3:         ${P3}"
-echo "  Hit@3:       ${HIT3}"
-echo "  Hit@5:       ${HIT5}"
-echo "  Avg Margin:  ${AVG_MARGIN}"
-echo "  Latency P50: ${LAT_P50} ms"
-echo "  Latency P95: ${LAT_P95} ms"
-echo "================================================"
-echo ""
-echo "Report:  ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh
deleted file mode 100755
index eadaad7..0000000
--- a/tests/benchmark/scripts/run-full-benchmark.sh
+++ /dev/null
@@ -1,304 +0,0 @@
-#!/bin/bash
-#
-# Full semantic benchmark: Find + Recovery + Classification
-#
-# Produces a composite score for overall system health.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary with recovery support
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/full_benchmark_${TIMESTAMP}.json"
-
-has_role_keyword() {
-    local query="$1"
-    echo "$query" | grep -Eiq '(^|[^[:alnum:]])(button|input|link|textbox|checkbox|radio|select|option|tab|menu|form|search)([^[:alnum:]]|$)'
-}
-
-enrich_recovery_query() {
-    local query="$1"
-    local role="$2"
-
-    if [[ -z "$query" || -z "$role" ]]; then
-        printf '%s' "$query"
-        return
-    fi
-    if has_role_keyword "$query"; then
-        printf '%s' "$query"
-        return
-    fi
-    printf '%s %s' "$query" "$role"
-}
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    '{
-        timestamp: $ts,
-        find: { total: 0, mrr: 0, p_at_1: 0, latency_p50: 0 },
-        recovery: { total: 0, recovered: 0, rate: 0 },
-        classification: { total: 0, correct: 0, accuracy: 0 },
-        composite: { score: 0, grade: "" }
-    }' > "${REPORT_FILE}"
-
-echo ""
-echo "=============================================="
-echo "  PHASE 1: FIND BENCHMARK"
-echo "=============================================="
-
-# Run corpus benchmark and capture metrics
-FIND_OUTPUT=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" 2>&1)
-echo "$FIND_OUTPUT"
-
-# Extract metrics from the corpus report rather than the human-readable output.
-FIND_REPORT=$(echo "$FIND_OUTPUT" | awk '/^Report:/ {print $2}' | tail -1)
-if [[ -z "${FIND_REPORT}" ]] || [[ ! -f "${FIND_REPORT}" ]]; then
-    echo "error: could not locate corpus benchmark report" >&2
-    exit 1
-fi
-FIND_MRR=$(jq -r '.metrics.mrr' "$FIND_REPORT")
-FIND_P1=$(jq -r '.metrics.p_at_1' "$FIND_REPORT")
-FIND_TOTAL=$(jq -r '.metrics.total' "$FIND_REPORT")
-FIND_LAT=$(jq -r '.metrics.latency_p50_ms' "$FIND_REPORT")
-
-# Rebuild semantic binary (corpus benchmark deletes it)
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-echo ""
-echo "=============================================="
-echo "  PHASE 2: RECOVERY BENCHMARK"
-echo "=============================================="
-
-SCENARIOS_FILE="${CORPUS_DIR}/recovery-scenarios/scenarios.json"
-RECOVERY_TOTAL=0
-RECOVERY_SUCCESS=0
-
-if [[ -f "$SCENARIOS_FILE" ]]; then
-    SCENARIO_COUNT=$(jq length "$SCENARIOS_FILE")
-
-    for i in $(seq 0 $((SCENARIO_COUNT - 1))); do
-        ID=$(jq -r ".[$i].id" "$SCENARIOS_FILE")
-        NAME=$(jq -r ".[$i].name" "$SCENARIOS_FILE")
-        RAW_QUERY=$(jq -r ".[$i].original_query" "$SCENARIOS_FILE")
-        ORIGINAL_REF=$(jq -r ".[$i].original_ref // empty" "$SCENARIOS_FILE")
-        ORIGINAL_ROLE=$(jq -r ".[$i].before[]? | select(.ref == \"$ORIGINAL_REF\") | .role // empty" "$SCENARIOS_FILE")
-        QUERY=$(enrich_recovery_query "$RAW_QUERY" "$ORIGINAL_ROLE")
-        EXPECTED=$(jq -r ".[$i].expected_ref // empty" "$SCENARIOS_FILE")
-        EXPECTED_ALT=$(jq -r ".[$i].expected_alt // [] | join(\",\")" "$SCENARIOS_FILE")
-        EXPECT_NO_MATCH=$(jq -r ".[$i].expect_no_match // false" "$SCENARIOS_FILE")
-
-        # Write after snapshot to temp file
-        AFTER_FILE=$(mktemp)
-        jq ".[$i].after" "$SCENARIOS_FILE" > "$AFTER_FILE"
-
-        # Run semantic find on after snapshot with the same minimum score
-        # enforced by DefaultRecoveryConfig in the recovery engine.
-        if ! RESULT=$("${SEMANTIC}" find "$QUERY" --snapshot "$AFTER_FILE" --format json --threshold 0.52 2>&1); then
-            echo "  [$ID] ERROR: semantic find failed during recovery benchmark" >&2
-            echo "$RESULT" >&2
-            rm -f "$AFTER_FILE"
-            exit 1
-        fi
-        if ! echo "$RESULT" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then
-            echo "  [$ID] ERROR: semantic find returned invalid JSON during recovery benchmark" >&2
-            echo "$RESULT" >&2
-            rm -f "$AFTER_FILE"
-            exit 1
-        fi
-        BEST_REF=$(echo "$RESULT" | jq -r '.best_ref // ""')
-
-        rm -f "$AFTER_FILE"
-
-        RECOVERY_TOTAL=$((RECOVERY_TOTAL + 1))
-        STATUS="FAIL"
-
-        if [[ "$EXPECT_NO_MATCH" == "true" ]]; then
-            if [[ -z "$BEST_REF" ]] || [[ "$BEST_REF" == "null" ]]; then
-                STATUS="PASS"
-                RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-            fi
-        elif [[ "$BEST_REF" == "$EXPECTED" ]]; then
-            STATUS="PASS"
-            RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-        elif [[ -n "$EXPECTED_ALT" ]] && echo ",$EXPECTED_ALT," | grep -q ",$BEST_REF,"; then
-            STATUS="PASS"
-            RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-        fi
-
-        printf "  [%s] %s | %s | got=%s want=%s\n" "$ID" "$STATUS" "$NAME" "$BEST_REF" "$EXPECTED"
-    done
-fi
-
-RECOVERY_RATE=0
-if [[ $RECOVERY_TOTAL -gt 0 ]]; then
-    RECOVERY_RATE=$(echo "scale=4; $RECOVERY_SUCCESS / $RECOVERY_TOTAL" | bc)
-fi
-
-echo ""
-echo "  Recovery: $RECOVERY_SUCCESS / $RECOVERY_TOTAL = $RECOVERY_RATE"
-
-echo ""
-echo "=============================================="
-echo "  PHASE 3: CLASSIFICATION BENCHMARK"
-echo "=============================================="
-
-CLASS_FILE="${CORPUS_DIR}/classification/cases.json"
-CLASS_TOTAL=0
-CLASS_CORRECT=0
-
-if [[ -f "$CLASS_FILE" ]]; then
-    CLASS_COUNT=$(jq length "$CLASS_FILE")
-
-    for i in $(seq 0 $((CLASS_COUNT - 1))); do
-        ID=$(jq -r ".[$i].id" "$CLASS_FILE")
-        ERROR=$(jq -r ".[$i].error" "$CLASS_FILE")
-        EXPECTED=$(jq -r ".[$i].expected_type" "$CLASS_FILE")
-
-        # Run semantic classify (extract just the type, first word)
-        if ! RESULT=$("${SEMANTIC}" classify "$ERROR" 2>&1); then
-            echo "  [$ID] ERROR: semantic classify failed" >&2
-            echo "$RESULT" >&2
-            exit 1
-        fi
-        GOT=$(echo "$RESULT" | awk '{print $1}')
-
-        CLASS_TOTAL=$((CLASS_TOTAL + 1))
-        STATUS="FAIL"
-
-        if [[ "$GOT" == "$EXPECTED" ]]; then
-            STATUS="PASS"
-            CLASS_CORRECT=$((CLASS_CORRECT + 1))
-        fi
-
-        printf "  [%s] %s | \"%s\" → %s (want %s)\n" "$ID" "$STATUS" "${ERROR:0:40}" "$GOT" "$EXPECTED"
-    done
-fi
-
-CLASS_ACCURACY=0
-if [[ $CLASS_TOTAL -gt 0 ]]; then
-    CLASS_ACCURACY=$(echo "scale=4; $CLASS_CORRECT / $CLASS_TOTAL" | bc)
-fi
-
-echo ""
-echo "  Classification: $CLASS_CORRECT / $CLASS_TOTAL = $CLASS_ACCURACY"
-
-echo ""
-echo "=============================================="
-echo "  COMPOSITE SCORE"
-echo "=============================================="
-
-# Calculate composite score with weights:
-#   Find P@1:      40%
-#   Find MRR:      20%
-#   Recovery Rate: 25%
-#   Classification: 15%
-
-COMPOSITE=$(echo "scale=4; \
-    ($FIND_P1 * 0.40) + \
-    ($FIND_MRR * 0.20) + \
-    ($RECOVERY_RATE * 0.25) + \
-    ($CLASS_ACCURACY * 0.15)" | bc)
-COMPOSITE=$(awk -v value="$COMPOSITE" 'BEGIN { printf "%.4f", value }')
-
-# Assign grade
-GRADE="F"
-if (( $(echo "$COMPOSITE >= 0.95" | bc -l) )); then GRADE="A+"
-elif (( $(echo "$COMPOSITE >= 0.90" | bc -l) )); then GRADE="A"
-elif (( $(echo "$COMPOSITE >= 0.85" | bc -l) )); then GRADE="B+"
-elif (( $(echo "$COMPOSITE >= 0.80" | bc -l) )); then GRADE="B"
-elif (( $(echo "$COMPOSITE >= 0.75" | bc -l) )); then GRADE="C+"
-elif (( $(echo "$COMPOSITE >= 0.70" | bc -l) )); then GRADE="C"
-elif (( $(echo "$COMPOSITE >= 0.60" | bc -l) )); then GRADE="D"
-fi
-
-# Update report
-TMP=$(mktemp)
-jq \
-    --argjson find_total "${FIND_TOTAL:-0}" \
-    --argjson find_mrr "${FIND_MRR:-0}" \
-    --argjson find_p1 "${FIND_P1:-0}" \
-    --argjson find_lat "${FIND_LAT:-0}" \
-    --argjson rec_total "$RECOVERY_TOTAL" \
-    --argjson rec_success "$RECOVERY_SUCCESS" \
-    --argjson rec_rate "$RECOVERY_RATE" \
-    --argjson class_total "$CLASS_TOTAL" \
-    --argjson class_correct "$CLASS_CORRECT" \
-    --argjson class_acc "$CLASS_ACCURACY" \
-    --argjson composite "$COMPOSITE" \
-    --arg grade "$GRADE" \
-    '.find = { total: $find_total, mrr: $find_mrr, p_at_1: $find_p1, latency_p50: $find_lat } |
-     .recovery = { total: $rec_total, recovered: $rec_success, rate: $rec_rate } |
-     .classification = { total: $class_total, correct: $class_correct, accuracy: $class_acc } |
-     .composite = { score: $composite, grade: $grade }' \
-    "$REPORT_FILE" > "$TMP"
-mv "$TMP" "$REPORT_FILE"
-
-# Generate summary
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-cat > "$SUMMARY_FILE" << EOF
-# Semantic Benchmark Report
-
-## Composite Score: ${COMPOSITE} (${GRADE})
-
-| Component | Weight | Score | Weighted |
-|-----------|--------|-------|----------|
-| Find P@1 | 40% | ${FIND_P1:-0} | $(echo "scale=3; ${FIND_P1:-0} * 0.40" | bc) |
-| Find MRR | 20% | ${FIND_MRR:-0} | $(echo "scale=3; ${FIND_MRR:-0} * 0.20" | bc) |
-| Recovery | 25% | ${RECOVERY_RATE} | $(echo "scale=3; ${RECOVERY_RATE} * 0.25" | bc) |
-| Classification | 15% | ${CLASS_ACCURACY} | $(echo "scale=3; ${CLASS_ACCURACY} * 0.15" | bc) |
-
-## Find Performance
-- Queries: ${FIND_TOTAL:-0}
-- MRR: ${FIND_MRR:-0}
-- P@1: ${FIND_P1:-0}
-- Latency P50: ${FIND_LAT:-0} ms
-
-## Recovery Performance
-- Scenarios: ${RECOVERY_TOTAL}
-- Recovered: ${RECOVERY_SUCCESS}
-- Rate: ${RECOVERY_RATE}
-
-## Classification Performance
-- Cases: ${CLASS_TOTAL}
-- Correct: ${CLASS_CORRECT}
-- Accuracy: ${CLASS_ACCURACY}
-
-## Grade Scale
-| Grade | Score |
-|-------|-------|
-| A+ | >= 0.95 |
-| A | >= 0.90 |
-| B+ | >= 0.85 |
-| B | >= 0.80 |
-| C+ | >= 0.75 |
-| C | >= 0.70 |
-| D | >= 0.60 |
-| F | < 0.60 |
-EOF
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "  ┌─────────────────────────────────────────┐"
-echo "  │  COMPOSITE SCORE: ${COMPOSITE}  GRADE: ${GRADE}      │"
-echo "  ├─────────────────────────────────────────┤"
-echo "  │  Find P@1:       ${FIND_P1:-0}  (40%)            │"
-echo "  │  Find MRR:       ${FIND_MRR:-0}  (20%)            │"
-echo "  │  Recovery:       ${RECOVERY_RATE}  (25%)            │"
-echo "  │  Classification: ${CLASS_ACCURACY}  (15%)            │"
-echo "  └─────────────────────────────────────────┘"
-echo ""
-echo "Report: ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh
deleted file mode 100755
index ef61d88..0000000
--- a/tests/benchmark/scripts/tune-weights.sh
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/bin/bash
-#
-# Grid-search combined matcher lexical/embedding weights against the corpus.
-#
-# Usage:
-#   ./tune-weights.sh [--corpus <dir>] [--step <n>] [--output <dir>]
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-SPECIFIC_CORPUS=""
-STEP="0.1"
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
-        --step) STEP="$2"; shift 2 ;;
-        --output) RESULTS_DIR="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${RESULTS_DIR}"
-
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/tuning_weights_${TIMESTAMP}.json"
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg step "${STEP}" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            type: "weight-tuning",
-            strategy: "combined",
-            step: ($step | tonumber)
-        },
-        results: [],
-        best: null
-    }' > "${REPORT_FILE}"
-
-weights=$(awk -v step="${STEP}" 'BEGIN {
-    if (step <= 0 || step > 1) {
-        exit 1
-    }
-    for (w = 0; w <= 1.000001; w += step) {
-        printf "%.4f\n", w
-    }
-}')
-
-if [[ -z "${weights}" ]]; then
-    echo "Invalid step: ${STEP}" >&2
-    exit 1
-fi
-
-echo "Weight tuning: step=${STEP}"
-echo ""
-printf "%-10s %-10s %-8s %-8s %-8s %-8s %-8s\n" "lexical" "embedding" "MRR" "P@1" "P@3" "P50" "report"
-
-while IFS= read -r lexical_weight; do
-    embedding_weight=$(awk -v w="${lexical_weight}" 'BEGIN { printf "%.4f", 1 - w }')
-
-    args=(
-        --strategy combined
-        --lexical-weight "${lexical_weight}"
-        --embedding-weight "${embedding_weight}"
-    )
-    if [[ -n "${SPECIFIC_CORPUS}" ]]; then
-        args+=(--corpus "${SPECIFIC_CORPUS}")
-    fi
-
-    if ! output=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" "${args[@]}" 2>&1); then
-        echo "$output" >&2
-        exit 1
-    fi
-
-    corpus_report=$(echo "$output" | awk '/^Report:/ {print $2}' | tail -1)
-    if [[ -z "${corpus_report}" || ! -f "${corpus_report}" ]]; then
-        echo "Could not find corpus report for lexical=${lexical_weight}" >&2
-        echo "$output" >&2
-        exit 1
-    fi
-
-    mrr=$(jq -r '.metrics.mrr' "$corpus_report")
-    p1=$(jq -r '.metrics.p_at_1' "$corpus_report")
-    p3=$(jq -r '.metrics.p_at_3' "$corpus_report")
-    p50=$(jq -r '.metrics.latency_p50_ms' "$corpus_report")
-    total=$(jq -r '.metrics.total' "$corpus_report")
-
-    printf "%-10s %-10s %-8s %-8s %-8s %-8s %s\n" \
-        "${lexical_weight}" "${embedding_weight}" "${mrr}" "${p1}" "${p3}" "${p50}" "$(basename "$corpus_report")"
-
-    result_json=$(jq -n \
-        --argjson lexical_weight "${lexical_weight}" \
-        --argjson embedding_weight "${embedding_weight}" \
-        --argjson total "${total}" \
-        --argjson mrr "${mrr}" \
-        --argjson p1 "${p1}" \
-        --argjson p3 "${p3}" \
-        --argjson p50 "${p50}" \
-        --arg report "${corpus_report}" \
-        '{
-            lexical_weight: $lexical_weight,
-            embedding_weight: $embedding_weight,
-            total: $total,
-            mrr: $mrr,
-            p_at_1: $p1,
-            p_at_3: $p3,
-            latency_p50_ms: $p50,
-            report: $report
-        }')
-
-    tmp=$(mktemp)
-    jq --argjson result "${result_json}" '.results += [$result]' "${REPORT_FILE}" > "$tmp"
-    mv "$tmp" "${REPORT_FILE}"
-done <<< "${weights}"
-
-tmp=$(mktemp)
-jq '
-    .best = (
-        .results
-        | sort_by(.p_at_1, .mrr, .p_at_3, -(.latency_p50_ms))
-        | last
-    )
-' "${REPORT_FILE}" > "$tmp"
-mv "$tmp" "${REPORT_FILE}"
-
-cat > "${SUMMARY_FILE}" << EOF
-# Combined Weight Tuning
-
-## Best
-
-| Field | Value |
-|-------|-------|
-| Lexical Weight | $(jq -r '.best.lexical_weight' "$REPORT_FILE") |
-| Embedding Weight | $(jq -r '.best.embedding_weight' "$REPORT_FILE") |
-| MRR | $(jq -r '.best.mrr' "$REPORT_FILE") |
-| P@1 | $(jq -r '.best.p_at_1' "$REPORT_FILE") |
-| P@3 | $(jq -r '.best.p_at_3' "$REPORT_FILE") |
-| Latency P50 | $(jq -r '.best.latency_p50_ms' "$REPORT_FILE") ms |
-
-## All Runs
-
-| Lexical | Embedding | MRR | P@1 | P@3 | P50 |
-|---------|-----------|-----|-----|-----|-----|
-$(jq -r '.results | sort_by(-.p_at_1, -.mrr, -.p_at_3, .latency_p50_ms)[] | "| \(.lexical_weight) | \(.embedding_weight) | \(.mrr) | \(.p_at_1) | \(.p_at_3) | \(.latency_p50_ms) ms |"' "$REPORT_FILE")
-EOF
-
-echo ""
-echo "Best weights:"
-jq '.best' "${REPORT_FILE}"
-echo ""
-echo "Report:  ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"