From 5c33b3622dae2c13953da58b387f669d0ddf0ec5 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 11:03:34 +0100
Subject: [PATCH 01/14] chore: expand benchmark corpus and add tuning tools

Add baseline management scripts (create/check/update), threshold
calibration, and runtime baseline tracking. Centralize benchmark
config. Update dev tool with ./dev pr command for pre-PR validation.
---
 .gitignore                                    |   3 +-
 dev                                           | 110 ++++++++-
 skills/semantic-dev/SKILL.md                  |  36 +--
 tests/benchmark/baselines/.gitkeep            |   0
 tests/benchmark/config/benchmark.json         |  40 +++-
 .../benchmark/scripts/calibrate-thresholds.sh | 208 ++++++++++++++++++
 tests/benchmark/scripts/check-baseline.sh     | 140 ++++++++++++
 .../scripts/check-runtime-baseline.sh         | 137 ++++++++++++
 tests/benchmark/scripts/create-baseline.sh    |  86 ++++++++
 tests/benchmark/scripts/lint-corpus.sh        |   4 +-
 tests/benchmark/scripts/run-benchmark.sh      |  13 +-
 .../benchmark/scripts/run-corpus-benchmark.sh |  26 ++-
 tests/benchmark/scripts/run-full-benchmark.sh |  13 ++
 tests/benchmark/scripts/tune-weights.sh       |  10 +
 tests/benchmark/scripts/update-baseline.sh    |  70 ++++++
 15 files changed, 855 insertions(+), 41 deletions(-)
 create mode 100644 tests/benchmark/baselines/.gitkeep
 create mode 100755 tests/benchmark/scripts/calibrate-thresholds.sh
 create mode 100755 tests/benchmark/scripts/check-baseline.sh
 create mode 100755 tests/benchmark/scripts/check-runtime-baseline.sh
 create mode 100755 tests/benchmark/scripts/create-baseline.sh
 create mode 100755 tests/benchmark/scripts/update-baseline.sh

diff --git a/.gitignore b/.gitignore
index 2f3b5cc..8a46978 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,5 @@ cover.out
 .claude
 tests/e2e/results/*.txt
 tests/benchmark/results/*.json
-tests/benchmark/results/*.md
\ No newline at end of file
+tests/benchmark/results/*.md
+tests/benchmark/baselines/*.backup.json
\ No newline at end of file
diff --git a/dev b/dev
index dc15e75..215b566 100755
--- a/dev
+++ b/dev
@@ -11,17 +11,26 @@ ERROR=$'\033[38;2;230;57;70m'
 NC=$'\033[0m'
 
 commands=(
+  "pr:🚀:Pre-PR checks (check + e2e + bench)"
   "doctor:🩺:Setup dev environment"
   "test:🧪:Run unit tests"
   "test verbose:🧪:Run unit tests (verbose)"
   "test race:🧪:Run unit tests with race detector"
   "coverage:📊:Run tests with coverage report"
   "lint:🔍:Run golangci-lint"
+  "lint corpus:🔍:Lint benchmark corpus"
   "fmt:✨:Format code"
   "vet:🔬:Run go vet"
   "check:✅:Run all checks (fmt + vet + lint + test)"
   "build:📦:Build CLI binary"
-  "bench:🏋:Run corpus benchmark suite"
+  "bench:🏋:Run corpus benchmark"
+  "bench full:🏋:Run full benchmark suite"
+  "baseline:📏:Create quality baseline"
+  "baseline check:📏:Check against baseline"
+  "baseline update:📏:Update baseline (--accept)"
+  "calibrate:🎯:Calibrate threshold recommendations"
+  "runtime:⏱️:Check runtime baseline"
+  "tune:🎛️:Tune combined weights"
   "e2e:🐳:Run E2E tests (Docker)"
 )
 
@@ -36,6 +45,36 @@ show_help() {
   echo ""
 }
 
+run_pr() {
+  echo "  ${ACCENT}${BOLD}🚀 Pre-PR checks${NC}"
+  echo ""
+
+  echo "  ${MUTED}1/4 All checks (fmt + vet + lint + test)${NC}"
+  run_check
+
+  echo ""
+  echo "  ${MUTED}2/4 E2E tests${NC}"
+  if [[ -f tests/e2e/run.sh ]]; then
+    go build -o /tmp/semantic ./cmd/semantic
+    PATH="/tmp:$PATH" bash tests/e2e/run.sh
+    echo "  ${SUCCESS}✓${NC} E2E passed"
+  else
+    echo "  ${MUTED}Skipped (no e2e/run.sh)${NC}"
+  fi
+
+  echo ""
+  echo "  ${MUTED}3/4 Lint corpus${NC}"
+  run_lint_corpus
+
+  echo ""
+  echo "  ${MUTED}4/4 Corpus benchmark${NC}"
+  run_bench > /dev/null 2>&1
+  echo "  ${SUCCESS}✓${NC} Benchmark complete"
+
+  echo ""
+  echo "  ${SUCCESS}${BOLD}🚀 Ready for PR${NC}"
+}
+
 run_test() {
   echo "  ${ACCENT}${BOLD}🧪 Running tests${NC}"
   go test ./... -count=1
@@ -115,8 +154,48 @@ run_build() {
 }
 
 run_bench() {
-  echo "  ${ACCENT}${BOLD}⏱️  Running corpus benchmark suite${NC}"
-  bash tests/benchmark/scripts/run-corpus-benchmark.sh
+  echo "  ${ACCENT}${BOLD}🏋 Running corpus benchmark${NC}"
+  bash tests/benchmark/scripts/run-corpus-benchmark.sh "$@"
+}
+
+run_bench_full() {
+  echo "  ${ACCENT}${BOLD}🏋 Running full benchmark suite${NC}"
+  bash tests/benchmark/scripts/run-full-benchmark.sh
+}
+
+run_lint_corpus() {
+  echo "  ${ACCENT}${BOLD}🔍 Linting benchmark corpus${NC}"
+  bash tests/benchmark/scripts/lint-corpus.sh
+}
+
+run_baseline() {
+  echo "  ${ACCENT}${BOLD}📏 Creating quality baseline${NC}"
+  bash tests/benchmark/scripts/create-baseline.sh "$@"
+}
+
+run_baseline_check() {
+  echo "  ${ACCENT}${BOLD}📏 Checking against baseline${NC}"
+  bash tests/benchmark/scripts/check-baseline.sh "$@"
+}
+
+run_baseline_update() {
+  echo "  ${ACCENT}${BOLD}📏 Updating baseline${NC}"
+  bash tests/benchmark/scripts/update-baseline.sh --accept "$@"
+}
+
+run_calibrate() {
+  echo "  ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}"
+  bash tests/benchmark/scripts/calibrate-thresholds.sh "$@"
+}
+
+run_runtime() {
+  echo "  ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}"
+  bash tests/benchmark/scripts/check-runtime-baseline.sh "$@"
+}
+
+run_tune() {
+  echo "  ${ACCENT}${BOLD}🎛️ Tuning combined weights${NC}"
+  bash tests/benchmark/scripts/tune-weights.sh "$@"
 }
 
 run_e2e() {
@@ -129,6 +208,7 @@ run_e2e() {
 }
 
 case "${1:-help}" in
+  pr)        run_pr ;;
   doctor)    exec bash scripts/doctor.sh ;;
   test)
     case "${2:-}" in
@@ -138,12 +218,32 @@ case "${1:-help}" in
     esac
     ;;
   coverage)  run_coverage ;;
-  lint)      run_lint ;;
+  lint)
+    case "${2:-}" in
+      corpus) run_lint_corpus ;;
+      *) run_lint ;;
+    esac
+    ;;
   fmt)       run_fmt ;;
   vet)       run_vet ;;
   check)     run_check ;;
   build)     run_build ;;
-  bench|benchmark) run_bench ;;
+  bench|benchmark)
+    case "${2:-}" in
+      full) run_bench_full ;;
+      *) shift; run_bench "$@" ;;
+    esac
+    ;;
+  baseline)
+    case "${2:-}" in
+      check) shift 2; run_baseline_check "$@" ;;
+      update) shift 2; run_baseline_update "$@" ;;
+      *) shift; run_baseline "$@" ;;
+    esac
+    ;;
+  calibrate) shift; run_calibrate "$@" ;;
+  runtime)   shift; run_runtime "$@" ;;
+  tune)      shift; run_tune "$@" ;;
   e2e)       run_e2e ;;
   help|*)    show_help ;;
 esac
diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index 84ade33..b813297 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -15,22 +15,26 @@ cd ~/dev/semantic
 
 ## Dev Commands
 
-All development commands run via `./dev`:
-
-| Command | Description |
-|---------|-------------|
-| `./dev doctor` | Setup dev environment |
-| `./dev test` | Run unit tests |
-| `./dev test verbose` | Run unit tests (verbose) |
-| `./dev test race` | Run unit tests with race detector |
-| `./dev coverage` | Run tests with coverage report |
-| `./dev lint` | Run golangci-lint |
-| `./dev fmt` | Format code |
-| `./dev vet` | Run go vet |
-| `./dev check` | All checks (fmt + vet + lint + test) |
-| `./dev build` | Build CLI binary |
-| `./dev bench` | Run corpus benchmark suite |
-| `./dev e2e` | Run E2E tests (Docker) |
+```bash
+# Before opening a PR (runs all checks + e2e + benchmark)
+./dev pr
+
+# Quick iteration
+./dev test              # unit tests
+./dev check             # fmt + vet + lint + test race
+
+# Benchmarking
+./dev bench             # corpus benchmark
+./dev baseline          # create baseline (first time)
+./dev baseline check    # check for regressions
+
+# Other
+./dev build             # build ./semantic binary
+./dev e2e               # e2e tests (Docker)
+./dev lint corpus       # validate benchmark data
+./dev calibrate         # find optimal thresholds
+./dev tune              # grid-search weights
+```
 
 ## Architecture
 
diff --git a/tests/benchmark/baselines/.gitkeep b/tests/benchmark/baselines/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/benchmark/config/benchmark.json b/tests/benchmark/config/benchmark.json
index 23b5661..7b06060 100644
--- a/tests/benchmark/config/benchmark.json
+++ b/tests/benchmark/config/benchmark.json
@@ -1,13 +1,35 @@
 {
-  "version": "1.0.0",
-  "strategies": ["lexical", "embedding", "combined"],
-  "default_strategy": "combined",
-  "default_threshold": 0.3,
-  "default_top_k": 3,
-  "metrics": {
-    "min_accuracy": 0.85,
-    "min_avg_score": 0.5,
-    "max_latency_ms": 100
+  "version": "1.1.0",
+  "defaults": {
+    "strategy": "combined",
+    "threshold": 0.01,
+    "top_k": 5,
+    "weights": {
+      "lexical": 0.6,
+      "embedding": 0.4
+    }
+  },
+  "baseline": {
+    "quality": {
+      "max_overall_p_at_1_drop": 0.02,
+      "max_overall_mrr_drop": 0.02,
+      "max_overall_hit_at_3_drop": 0.02,
+      "max_corpus_p_at_1_drop": 0.08,
+      "max_difficulty_p_at_1_drop": 0.08,
+      "max_margin_drop_report": 0.15
+    },
+    "runtime": {
+      "max_ns_op_regression_ratio": 1.25,
+      "max_alloc_regression_ratio": 1.25,
+      "max_corpus_latency_p50_ms": 75,
+      "max_corpus_latency_p95_ms": 200
+    }
   },
+  "results": {
+    "dir": "tests/benchmark/results",
+    "baselines_dir": "tests/benchmark/baselines",
+    "generated_files_policy": "warn"
+  },
+  "strategies": ["lexical", "embedding", "combined"],
   "snapshots_dir": "../e2e/assets/snapshots"
 }
diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh
new file mode 100755
index 0000000..ef5603d
--- /dev/null
+++ b/tests/benchmark/scripts/calibrate-thresholds.sh
@@ -0,0 +1,208 @@
+#!/bin/bash
+#
+# Calibrate threshold recommendations for find and recovery.
+#
+# Usage:
+#   ./calibrate-thresholds.sh [--corpus <dir>]
+#
+# Reports recall/precision/false-positive-rate by threshold.
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+CORPUS_DIR="${BENCHMARK_DIR}/corpus"
+RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read config
+if [[ -f "$CONFIG_FILE" ]]; then
+    STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+    LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
+    EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
+else
+    STRATEGY="combined"
+    LEXICAL_WEIGHT=0.6
+    EMBEDDING_WEIGHT=0.4
+fi
+
+SPECIFIC_CORPUS=""
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+mkdir -p "${RESULTS_DIR}"
+
+# Build semantic binary
+echo "Building semantic..."
+(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
+
+SEMANTIC="${BENCHMARK_DIR}/semantic"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json"
+
+# Thresholds to test
+THRESHOLDS=(0.01 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.60 0.70 0.80 0.90)
+
+echo "Testing ${#THRESHOLDS[@]} thresholds: ${THRESHOLDS[*]}"
+echo ""
+
+# Initialize report
+jq -n \
+    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+    --arg strategy "${STRATEGY}" \
+    '{
+        timestamp: $ts,
+        strategy: $strategy,
+        thresholds: [],
+        recommendations: {}
+    }' > "${REPORT_FILE}"
+
+# Collect results for each threshold
+for thresh in "${THRESHOLDS[@]}"; do
+    echo "Testing threshold: ${thresh}"
+
+    total=0
+    true_positives=0
+    false_positives=0
+    false_negatives=0
+
+    for corpus in "${CORPUS_DIR}"/*/; do
+        [[ -d "$corpus" ]] || continue
+
+        if [[ -n "$SPECIFIC_CORPUS" ]] && [[ "$(basename "$corpus")" != "$SPECIFIC_CORPUS" ]]; then
+            continue
+        fi
+
+        snapshot="${corpus}/snapshot.json"
+        queries="${corpus}/queries.json"
+
+        [[ -f "$snapshot" ]] && [[ -f "$queries" ]] || continue
+
+        count=$(jq length "$queries")
+
+        for i in $(seq 0 $((count - 1))); do
+            query=$(jq -r ".[$i].query" "$queries")
+            relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries")
+
+            result=$("${SEMANTIC}" find "${query}" \
+                --snapshot "${snapshot}" \
+                --strategy "${STRATEGY}" \
+                --threshold "${thresh}" \
+                --top-k 5 \
+                --lexical-weight "${LEXICAL_WEIGHT}" \
+                --embedding-weight "${EMBEDDING_WEIGHT}" \
+                --format json 2>/dev/null) || continue
+
+            best_ref=$(echo "$result" | jq -r '.best_ref // ""')
+            num_matches=$(echo "$result" | jq '.matches | length')
+
+            total=$((total + 1))
+
+            # Check if best match is relevant
+            if [[ -n "$best_ref" ]] && echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
+                true_positives=$((true_positives + 1))
+            elif [[ -n "$best_ref" ]] && [[ "$num_matches" -gt 0 ]]; then
+                false_positives=$((false_positives + 1))
+            fi
+
+            # If no match but there should be one
+            if [[ -z "$best_ref" ]] || [[ "$num_matches" -eq 0 ]]; then
+                rel_count=$(echo "$relevant_refs" | jq 'length')
+                if [[ "$rel_count" -gt 0 ]]; then
+                    false_negatives=$((false_negatives + 1))
+                fi
+            fi
+        done
+    done
+
+    # Calculate metrics
+    if [[ $total -eq 0 ]]; then
+        echo "  No queries processed"
+        continue
+    fi
+
+    precision=0
+    recall=0
+    fpr=0
+
+    if [[ $((true_positives + false_positives)) -gt 0 ]]; then
+        precision=$(echo "scale=4; $true_positives / ($true_positives + $false_positives)" | bc)
+    fi
+
+    if [[ $((true_positives + false_negatives)) -gt 0 ]]; then
+        recall=$(echo "scale=4; $true_positives / ($true_positives + $false_negatives)" | bc)
+    fi
+
+    if [[ $((false_positives + true_positives)) -gt 0 ]]; then
+        fpr=$(echo "scale=4; $false_positives / $total" | bc)
+    fi
+
+    f1=0
+    if (( $(echo "$precision + $recall > 0" | bc -l) )); then
+        f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc)
+    fi
+
+    printf "  Precision: %.3f | Recall: %.3f | FPR: %.3f | F1: %.3f\n" "$precision" "$recall" "$fpr" "$f1"
+
+    # Append to report
+    tmp=$(mktemp)
+    jq --argjson thresh "$thresh" \
+       --argjson total "$total" \
+       --argjson tp "$true_positives" \
+       --argjson fp "$false_positives" \
+       --argjson fn "$false_negatives" \
+       --argjson precision "$precision" \
+       --argjson recall "$recall" \
+       --argjson fpr "$fpr" \
+       --argjson f1 "$f1" \
+       '.thresholds += [{
+           threshold: $thresh,
+           total: $total,
+           true_positives: $tp,
+           false_positives: $fp,
+           false_negatives: $fn,
+           precision: $precision,
+           recall: $recall,
+           false_positive_rate: $fpr,
+           f1: $f1
+       }]' "$REPORT_FILE" > "$tmp"
+    mv "$tmp" "$REPORT_FILE"
+done
+
+# Calculate recommendations
+echo ""
+echo "Calculating recommendations..."
+
+# Best F1 for general find
+BEST_FIND=$(jq -r '[.thresholds[] | select(.f1 > 0)] | max_by(.f1) | .threshold // 0.3' "$REPORT_FILE")
+
+# Best recall with precision > 0.8 for recovery (prioritize not missing)
+BEST_RECOVERY=$(jq -r '[.thresholds[] | select(.precision >= 0.7)] | max_by(.recall) | .threshold // 0.2' "$REPORT_FILE")
+
+# Update recommendations
+tmp=$(mktemp)
+jq --argjson find "$BEST_FIND" \
+   --argjson recovery "$BEST_RECOVERY" \
+   '.recommendations = {
+       find: $find,
+       recovery: $recovery,
+       note: "find optimizes F1; recovery optimizes recall with precision >= 0.7"
+   }' "$REPORT_FILE" > "$tmp"
+mv "$tmp" "$REPORT_FILE"
+
+# Cleanup
+rm -f "${BENCHMARK_DIR}/semantic"
+
+echo ""
+echo "================================================"
+echo "  THRESHOLD CALIBRATION RESULTS"
+echo "================================================"
+echo "  Recommended for Find:     ${BEST_FIND}"
+echo "  Recommended for Recovery: ${BEST_RECOVERY}"
+echo "================================================"
+echo ""
+echo "Report: ${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/check-baseline.sh b/tests/benchmark/scripts/check-baseline.sh
new file mode 100755
index 0000000..f6e95ae
--- /dev/null
+++ b/tests/benchmark/scripts/check-baseline.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+#
+# Check current benchmark results against a baseline.
+#
+# Usage:
+#   ./check-baseline.sh [--baseline <file>] [--fail-on-regression]
+#
+# Exit codes:
+#   0 - No regressions detected
+#   1 - Regressions detected (if --fail-on-regression)
+#   2 - Error (missing files, invalid config)
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+BASELINES_DIR="${BENCHMARK_DIR}/baselines"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+NC='\033[0m'
+
+# Read config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 2
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+MAX_P1_DROP=$(jq -r '.baseline.quality.max_overall_p_at_1_drop // 0.02' "$CONFIG_FILE")
+MAX_MRR_DROP=$(jq -r '.baseline.quality.max_overall_mrr_drop // 0.02' "$CONFIG_FILE")
+MAX_HIT3_DROP=$(jq -r '.baseline.quality.max_overall_hit_at_3_drop // 0.02' "$CONFIG_FILE")
+MAX_CORPUS_P1_DROP=$(jq -r '.baseline.quality.max_corpus_p_at_1_drop // 0.08' "$CONFIG_FILE")
+MAX_MARGIN_DROP=$(jq -r '.baseline.quality.max_margin_drop_report // 0.15' "$CONFIG_FILE")
+
+# Parse args
+BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json"
+FAIL_ON_REGRESSION=false
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --baseline) BASELINE_FILE="$2"; shift 2 ;;
+        --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;;
+        *) echo "Unknown option: $1"; exit 2 ;;
+    esac
+done
+
+if [[ ! -f "$BASELINE_FILE" ]]; then
+    echo "ERROR: Baseline not found: $BASELINE_FILE" >&2
+    echo "Run ./create-baseline.sh first" >&2
+    exit 2
+fi
+
+echo "Checking against baseline: ${BASELINE_FILE}"
+echo "Tolerances: P@1=${MAX_P1_DROP}, MRR=${MAX_MRR_DROP}, Hit@3=${MAX_HIT3_DROP}"
+echo ""
+
+# Run current benchmark
+TEMP_DIR=$(mktemp -d)
+trap 'rm -rf "$TEMP_DIR"' EXIT
+
+"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" > "${TEMP_DIR}/output.log" 2>&1
+
+# Find the latest report
+LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1)
+
+if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then
+    echo "ERROR: Could not find benchmark report" >&2
+    exit 2
+fi
+
+# Compare metrics
+REGRESSIONS=0
+WARNINGS=0
+
+compare_metric() {
+    local name="$1"
+    local baseline_val="$2"
+    local current_val="$3"
+    local max_drop="$4"
+
+    local diff
+    diff=$(echo "scale=4; $current_val - $baseline_val" | bc)
+    local drop
+    drop=$(echo "scale=4; $baseline_val - $current_val" | bc)
+
+    if (( $(echo "$drop > $max_drop" | bc -l) )); then
+        echo -e "${RED}REGRESSION${NC} $name: $baseline_val -> $current_val (drop: $drop, max: $max_drop)"
+        REGRESSIONS=$((REGRESSIONS + 1))
+    elif (( $(echo "$drop > 0" | bc -l) )); then
+        echo -e "${YELLOW}WARNING${NC} $name: $baseline_val -> $current_val (drop: $drop)"
+        WARNINGS=$((WARNINGS + 1))
+    else
+        echo -e "${GREEN}OK${NC} $name: $baseline_val -> $current_val (${diff:0:6})"
+    fi
+}
+
+echo "=== Overall Metrics ==="
+echo ""
+
+BASELINE_MRR=$(jq -r '.metrics.mrr' "$BASELINE_FILE")
+CURRENT_MRR=$(jq -r '.metrics.mrr' "$LATEST_REPORT")
+compare_metric "MRR" "$BASELINE_MRR" "$CURRENT_MRR" "$MAX_MRR_DROP"
+
+BASELINE_P1=$(jq -r '.metrics.p_at_1' "$BASELINE_FILE")
+CURRENT_P1=$(jq -r '.metrics.p_at_1' "$LATEST_REPORT")
+compare_metric "P@1" "$BASELINE_P1" "$CURRENT_P1" "$MAX_P1_DROP"
+
+BASELINE_HIT3=$(jq -r '.metrics.hit_at_3' "$BASELINE_FILE")
+CURRENT_HIT3=$(jq -r '.metrics.hit_at_3' "$LATEST_REPORT")
+compare_metric "Hit@3" "$BASELINE_HIT3" "$CURRENT_HIT3" "$MAX_HIT3_DROP"
+
+BASELINE_MARGIN=$(jq -r '.metrics.avg_margin' "$BASELINE_FILE")
+CURRENT_MARGIN=$(jq -r '.metrics.avg_margin' "$LATEST_REPORT")
+compare_metric "Margin" "$BASELINE_MARGIN" "$CURRENT_MARGIN" "$MAX_MARGIN_DROP"
+
+echo ""
+echo "=== Per-Corpus ==="
+echo ""
+
+for corpus in $(jq -r '.by_corpus | keys[]' "$BASELINE_FILE"); do
+    BASELINE_CORPUS_P1=$(jq -r ".by_corpus[\"$corpus\"].p_at_1 // 0" "$BASELINE_FILE")
+    CURRENT_CORPUS_P1=$(jq -r ".metrics.by_corpus[\"$corpus\"].p_at_1 // 0" "$LATEST_REPORT")
+    compare_metric "$corpus P@1" "$BASELINE_CORPUS_P1" "$CURRENT_CORPUS_P1" "$MAX_CORPUS_P1_DROP"
+done
+
+echo ""
+echo "================================================"
+if [[ $REGRESSIONS -gt 0 ]]; then
+    echo -e "${RED}REGRESSIONS: $REGRESSIONS${NC}"
+    if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
+        exit 1
+    fi
+elif [[ $WARNINGS -gt 0 ]]; then
+    echo -e "${YELLOW}WARNINGS: $WARNINGS (no regressions)${NC}"
+else
+    echo -e "${GREEN}ALL CHECKS PASSED${NC}"
+fi
+echo "================================================"
diff --git a/tests/benchmark/scripts/check-runtime-baseline.sh b/tests/benchmark/scripts/check-runtime-baseline.sh
new file mode 100755
index 0000000..75bc4fc
--- /dev/null
+++ b/tests/benchmark/scripts/check-runtime-baseline.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+#
+# Check Go benchmark results against runtime baseline.
+#
+# Usage:
+#   ./check-runtime-baseline.sh [--fail-on-regression]
+#
+# Runs Go benchmarks and compares against saved baseline.
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+BASELINES_DIR="${BENCHMARK_DIR}/baselines"
+RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+PROJECT_ROOT="${BENCHMARK_DIR}/../.."
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+NC='\033[0m'
+
+# Read tolerances from config
+if [[ -f "$CONFIG_FILE" ]]; then
+    MAX_NS_RATIO=$(jq -r '.baseline.runtime.max_ns_op_regression_ratio // 1.25' "$CONFIG_FILE")
+    MAX_ALLOC_RATIO=$(jq -r '.baseline.runtime.max_alloc_regression_ratio // 1.25' "$CONFIG_FILE")
+else
+    MAX_NS_RATIO=1.25
+    MAX_ALLOC_RATIO=1.25
+fi
+
+# Parse args
+FAIL_ON_REGRESSION=false
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+mkdir -p "${RESULTS_DIR}"
+mkdir -p "${BASELINES_DIR}"
+
+BASELINE_FILE="${BASELINES_DIR}/runtime.json"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+REPORT_FILE="${RESULTS_DIR}/runtime_${TIMESTAMP}.json"
+
+echo "Running Go benchmarks..."
+echo ""
+
+# Run benchmarks
+BENCH_OUTPUT=$(mktemp)
+(cd "$PROJECT_ROOT" && go test -bench=. -benchmem ./internal/engine/... 2>&1) | tee "$BENCH_OUTPUT"
+
+# Parse benchmark output into JSON
+echo ""
+echo "Parsing results..."
+
+jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" '{timestamp: $ts, benchmarks: []}' > "$REPORT_FILE"
+
+while IFS= read -r line; do
+    if [[ "$line" =~ ^Benchmark ]]; then
+        # Parse: BenchmarkName-N  iterations  ns/op  bytes/op  allocs/op
+        name=$(echo "$line" | awk '{print $1}' | sed 's/-[0-9]*$//')
+        ns_op=$(echo "$line" | grep -oE '[0-9.]+ ns/op' | awk '{print $1}' || echo "0")
+        bytes_op=$(echo "$line" | grep -oE '[0-9]+ B/op' | awk '{print $1}' || echo "0")
+        allocs_op=$(echo "$line" | grep -oE '[0-9]+ allocs/op' | awk '{print $1}' || echo "0")
+
+        if [[ -n "$ns_op" ]] && [[ "$ns_op" != "0" ]]; then
+            tmp=$(mktemp)
+            jq --arg name "$name" \
+               --argjson ns "$ns_op" \
+               --argjson bytes "${bytes_op:-0}" \
+               --argjson allocs "${allocs_op:-0}" \
+               '.benchmarks += [{name: $name, ns_op: $ns, bytes_op: $bytes, allocs_op: $allocs}]' \
+               "$REPORT_FILE" > "$tmp"
+            mv "$tmp" "$REPORT_FILE"
+        fi
+    fi
+done < "$BENCH_OUTPUT"
+
+rm -f "$BENCH_OUTPUT"
+
+# If no baseline exists, create one
+if [[ ! -f "$BASELINE_FILE" ]]; then
+    echo ""
+    echo "No runtime baseline found. Creating initial baseline..."
+    cp "$REPORT_FILE" "$BASELINE_FILE"
+    echo "Baseline saved to: $BASELINE_FILE"
+    exit 0
+fi
+
+# Compare against baseline
+echo ""
+echo "=== Comparing against baseline ==="
+echo ""
+
+REGRESSIONS=0
+
+for name in $(jq -r '.benchmarks[].name' "$REPORT_FILE"); do
+    baseline_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$BASELINE_FILE")
+    current_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$REPORT_FILE")
+
+    baseline_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$BASELINE_FILE")
+    current_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$REPORT_FILE")
+
+    if [[ "$baseline_ns" == "0" ]] || [[ "$baseline_ns" == "null" ]]; then
+        echo -e "${YELLOW}NEW${NC} $name: ${current_ns} ns/op"
+        continue
+    fi
+
+    ratio=$(echo "scale=4; $current_ns / $baseline_ns" | bc)
+
+    if (( $(echo "$ratio > $MAX_NS_RATIO" | bc -l) )); then
+        echo -e "${RED}REGRESSION${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x, max: ${MAX_NS_RATIO}x)"
+        REGRESSIONS=$((REGRESSIONS + 1))
+    elif (( $(echo "$ratio > 1.1" | bc -l) )); then
+        echo -e "${YELLOW}WARNING${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)"
+    else
+        echo -e "${GREEN}OK${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)"
+    fi
+done
+
+echo ""
+echo "================================================"
+if [[ $REGRESSIONS -gt 0 ]]; then
+    echo -e "${RED}RUNTIME REGRESSIONS: $REGRESSIONS${NC}"
+    if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
+        exit 1
+    fi
+else
+    echo -e "${GREEN}NO RUNTIME REGRESSIONS${NC}"
+fi
+echo "================================================"
+echo ""
+echo "Report: ${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/create-baseline.sh b/tests/benchmark/scripts/create-baseline.sh
new file mode 100755
index 0000000..cd4696a
--- /dev/null
+++ b/tests/benchmark/scripts/create-baseline.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+#
+# Create a quality baseline from current corpus benchmark results.
+#
+# Usage:
+#   ./create-baseline.sh [--name <name>]
+#
+# This runs run-corpus-benchmark.sh and saves the results as a baseline.
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+BASELINES_DIR="${BENCHMARK_DIR}/baselines"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read defaults from config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+
+# Parse args
+BASELINE_NAME="${STRATEGY}"
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --name) BASELINE_NAME="$2"; shift 2 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+mkdir -p "${BASELINES_DIR}"
+
+BASELINE_FILE="${BASELINES_DIR}/${BASELINE_NAME}.json"
+
+echo "Creating baseline: ${BASELINE_NAME}"
+echo "Strategy: ${STRATEGY}"
+echo ""
+
+# Run corpus benchmark
+TEMP_DIR=$(mktemp -d)
+trap 'rm -rf "$TEMP_DIR"' EXIT
+
+"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" 2>&1 | tee "${TEMP_DIR}/output.log"
+
+# Find the latest report
+LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1)
+
+if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then
+    echo "ERROR: Could not find benchmark report" >&2
+    exit 1
+fi
+
+# Extract baseline data
+jq '{
+    created_at: .benchmark.timestamp,
+    strategy: .benchmark.strategy,
+    threshold: .benchmark.threshold,
+    top_k: .benchmark.top_k,
+    weights: .benchmark.weights,
+    metrics: {
+        total: .metrics.total,
+        mrr: .metrics.mrr,
+        p_at_1: .metrics.p_at_1,
+        p_at_3: .metrics.p_at_3,
+        hit_at_3: .metrics.hit_at_3,
+        hit_at_5: .metrics.hit_at_5,
+        avg_margin: .metrics.avg_margin,
+        latency_p50_ms: .metrics.latency_p50_ms,
+        latency_p95_ms: .metrics.latency_p95_ms
+    },
+    by_difficulty: .metrics.by_difficulty,
+    by_corpus: .metrics.by_corpus,
+    per_query: [.results[] | {id, corpus, difficulty, p_at_1, rr, margin}]
+}' "$LATEST_REPORT" > "$BASELINE_FILE"
+
+echo ""
+echo "================================================"
+echo "  BASELINE CREATED"
+echo "================================================"
+echo "  File: ${BASELINE_FILE}"
+echo ""
+jq -r '"  MRR:     \(.metrics.mrr)\n  P@1:     \(.metrics.p_at_1)\n  Hit@3:   \(.metrics.hit_at_3)\n  Margin:  \(.metrics.avg_margin)"' "$BASELINE_FILE"
+echo "================================================"
diff --git a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh
index 29f81b2..783e546 100755
--- a/tests/benchmark/scripts/lint-corpus.sh
+++ b/tests/benchmark/scripts/lint-corpus.sh
@@ -17,12 +17,12 @@ WARNINGS=0
 
 error() {
     echo -e "${RED}ERROR:${NC} $1"
-    ((ERRORS++))
+    ERRORS=$((ERRORS + 1))
 }
 
 warn() {
     echo -e "${YELLOW}WARN:${NC} $1"
-    ((WARNINGS++))
+    WARNINGS=$((WARNINGS + 1))
 }
 
 ok() {
diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh
index 4ce67d6..29c8a22 100755
--- a/tests/benchmark/scripts/run-benchmark.sh
+++ b/tests/benchmark/scripts/run-benchmark.sh
@@ -19,9 +19,18 @@ CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
 SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
 RESULTS_DIR="${BENCHMARK_DIR}/results"
 
-# Parse args
-STRATEGY="combined"
+# Read defaults from config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
+TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
 CASE_FILE=""
+
+# Parse args (override config)
 while [[ $# -gt 0 ]]; do
     case "$1" in
         --strategy) STRATEGY="$2"; shift 2 ;;
diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh
index b5579bf..53216af 100755
--- a/tests/benchmark/scripts/run-corpus-benchmark.sh
+++ b/tests/benchmark/scripts/run-corpus-benchmark.sh
@@ -17,17 +17,27 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCHMARK_DIR="${SCRIPT_DIR}/.."
 CORPUS_DIR="${BENCHMARK_DIR}/corpus"
 RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
 
-# Parse args
-STRATEGY="combined"
+# Read defaults from config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
+TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
+LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
+EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
 SPECIFIC_CORPUS=""
-TOP_K=5
-LEXICAL_WEIGHT=0.6
-EMBEDDING_WEIGHT=0.4
+
+# Parse args (override config)
 while [[ $# -gt 0 ]]; do
     case "$1" in
         --strategy) STRATEGY="$2"; shift 2 ;;
         --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
+        --threshold) THRESHOLD="$2"; shift 2 ;;
         --top-k) TOP_K="$2"; shift 2 ;;
         --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;;
         --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;;
@@ -54,15 +64,19 @@ REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json"
 jq -n \
     --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
     --arg strategy "${STRATEGY}" \
+    --argjson threshold "${THRESHOLD}" \
     --argjson top_k "${TOP_K}" \
     --argjson lexical_weight "${LEXICAL_WEIGHT}" \
     --argjson embedding_weight "${EMBEDDING_WEIGHT}" \
+    --arg config_file "${CONFIG_FILE}" \
     '{
         benchmark: {
             timestamp: $ts,
             strategy: $strategy,
+            threshold: $threshold,
             top_k: $top_k,
             type: "corpus",
+            config_source: $config_file,
             weights: {
                 lexical: $lexical_weight,
                 embedding: $embedding_weight
@@ -128,7 +142,7 @@ run_corpus() {
         if ! result=$("${SEMANTIC}" find "${query}" \
             --snapshot "${snapshot}" \
             --strategy "${STRATEGY}" \
-            --threshold 0.01 \
+            --threshold "${THRESHOLD}" \
             --top-k "${TOP_K}" \
             --lexical-weight "${LEXICAL_WEIGHT}" \
             --embedding-weight "${EMBEDDING_WEIGHT}" \
diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh
index eadaad7..5c759dc 100755
--- a/tests/benchmark/scripts/run-full-benchmark.sh
+++ b/tests/benchmark/scripts/run-full-benchmark.sh
@@ -10,6 +10,19 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCHMARK_DIR="${SCRIPT_DIR}/.."
 CORPUS_DIR="${BENCHMARK_DIR}/corpus"
 RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read defaults from config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
+TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
+LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
+EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
 
 mkdir -p "${RESULTS_DIR}"
 
diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh
index ef61d88..011b1b2 100755
--- a/tests/benchmark/scripts/tune-weights.sh
+++ b/tests/benchmark/scripts/tune-weights.sh
@@ -10,6 +10,16 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCHMARK_DIR="${SCRIPT_DIR}/.."
 RESULTS_DIR="${BENCHMARK_DIR}/results"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read defaults from config (used for threshold/top_k in grid runs)
+if [[ -f "$CONFIG_FILE" ]]; then
+    THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
+    TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
+else
+    THRESHOLD=0.01
+    TOP_K=5
+fi
 
 SPECIFIC_CORPUS=""
 STEP="0.1"
diff --git a/tests/benchmark/scripts/update-baseline.sh b/tests/benchmark/scripts/update-baseline.sh
new file mode 100755
index 0000000..ba93089
--- /dev/null
+++ b/tests/benchmark/scripts/update-baseline.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Update baseline after reviewing regressions.
+#
+# Usage:
+#   ./update-baseline.sh --accept [--baseline <file>]
+#
+# This re-runs the benchmark and overwrites the baseline file.
+# Use after reviewing check-baseline.sh output and confirming
+# the changes are intentional.
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+BASELINES_DIR="${BENCHMARK_DIR}/baselines"
+CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
+
+# Read config
+if [[ ! -f "$CONFIG_FILE" ]]; then
+    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
+    exit 1
+fi
+
+STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
+
+# Parse args
+BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json"
+ACCEPT=false
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --accept) ACCEPT=true; shift ;;
+        --baseline) BASELINE_FILE="$2"; shift 2 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+
+if [[ "$ACCEPT" != "true" ]]; then
+    echo "Usage: $0 --accept [--baseline <file>]"
+    echo ""
+    echo "This will overwrite the baseline. Run check-baseline.sh first"
+    echo "to review changes before accepting."
+    exit 1
+fi
+
+if [[ ! -f "$BASELINE_FILE" ]]; then
+    echo "Baseline not found: $BASELINE_FILE"
+    echo "Creating new baseline instead..."
+    exec "${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")"
+fi
+
+# Show what will change
+echo "Current baseline: ${BASELINE_FILE}"
+echo ""
+jq -r '"  MRR:   \(.metrics.mrr)\n  P@1:   \(.metrics.p_at_1)\n  Hit@3: \(.metrics.hit_at_3)"' "$BASELINE_FILE"
+echo ""
+echo "Running benchmark to generate new baseline..."
+echo ""
+
+# Backup old baseline
+BACKUP_FILE="${BASELINE_FILE%.json}_$(date +%Y%m%d_%H%M%S).backup.json"
+cp "$BASELINE_FILE" "$BACKUP_FILE"
+echo "Backed up old baseline to: $BACKUP_FILE"
+
+# Create new baseline (overwrites)
+"${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")"
+
+echo ""
+echo "Baseline updated. Old baseline backed up to:"
+echo "  $BACKUP_FILE"

From a5a3d55d0473326c7d944b59c03b50aca7072de3 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 11:04:00 +0100
Subject: [PATCH 02/14] docs: improve SKILL.md for LLM usage

Add scenario-based command table to help LLM assistants pick
the right dev command for each situation.
---
 skills/semantic-dev/SKILL.md | 51 ++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index b813297..16e70b4 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -5,37 +5,44 @@ description: Develop and contribute to the Semantic project. Use when working on
 
 # Semantic Development
 
-Semantic is a zero-dependency Go library for matching natural language queries against accessibility tree elements.
+Zero-dependency Go library for matching natural language queries against accessibility tree elements.
 
-## Project Location
+## Essential Commands
 
+**Before any PR:**
 ```bash
-cd ~/dev/semantic
+./dev pr                # runs: check + e2e + lint corpus + bench
 ```
 
-## Dev Commands
+**During development:**
+```bash
+./dev test              # unit tests (fast)
+./dev check             # fmt + vet + lint + test race (full validation)
+./dev build             # build ./semantic CLI binary
+```
 
+**Quality regression checks:**
 ```bash
-# Before opening a PR (runs all checks + e2e + benchmark)
-./dev pr
-
-# Quick iteration
-./dev test              # unit tests
-./dev check             # fmt + vet + lint + test race
-
-# Benchmarking
-./dev bench             # corpus benchmark
-./dev baseline          # create baseline (first time)
-./dev baseline check    # check for regressions
-
-# Other
-./dev build             # build ./semantic binary
-./dev e2e               # e2e tests (Docker)
-./dev lint corpus       # validate benchmark data
-./dev calibrate         # find optimal thresholds
-./dev tune              # grid-search weights
+./dev baseline check    # compare quality against baseline
+./dev runtime           # compare performance against baseline
 ```
 
+**When quality changes intentionally:**
+```bash
+./dev baseline update   # accept new quality baseline (after review)
+```
+
+## When to Use Each
+
+| Scenario | Command |
+|----------|---------|
+| Made code changes, quick sanity | `./dev test` |
+| Ready to commit | `./dev check` |
+| Before opening PR | `./dev pr` |
+| Changed scoring/matching logic | `./dev baseline check` |
+| Performance-sensitive changes | `./dev runtime` |
+| Tuning weights | `./dev tune` then `./dev bench` |
+
 ## Architecture
 
 ```

From 93eee1ee9024ec6dec08d8892ee27df4ced70716 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 15:37:42 +0100
Subject: [PATCH 03/14] refactor: use Go CLI instead of bash scripts in dev
 tool

Replace bash implementations of bench, lint corpus, and loop commands
with calls to go run ./cmd/semantic-bench. Removes ~100 lines of
duplicate bash logic.
---
 dev                                           |  20 +-
 recovery/benchmark_test.go                    | 250 ++++++++++++
 skills/semantic-dev/SKILL.md                  |  54 +++
 .../benchmark/scripts/calibrate-thresholds.sh | 368 ++++++++++++------
 .../scripts/run-recovery-benchmark.sh         |  42 ++
 5 files changed, 613 insertions(+), 121 deletions(-)
 create mode 100644 recovery/benchmark_test.go
 create mode 100755 tests/benchmark/scripts/run-recovery-benchmark.sh

diff --git a/dev b/dev
index 215b566..a7f6247 100755
--- a/dev
+++ b/dev
@@ -19,6 +19,7 @@ commands=(
   "coverage:📊:Run tests with coverage report"
   "lint:🔍:Run golangci-lint"
   "lint corpus:🔍:Lint benchmark corpus"
+  "lint docs:🔍:Check documentation links"
   "fmt:✨:Format code"
   "vet:🔬:Run go vet"
   "check:✅:Run all checks (fmt + vet + lint + test)"
@@ -32,6 +33,7 @@ commands=(
   "runtime:⏱️:Check runtime baseline"
   "tune:🎛️:Tune combined weights"
   "e2e:🐳:Run E2E tests (Docker)"
+  "loop:🔄:Benchmark loop (bench → compare → report)"
 )
 
 show_help() {
@@ -155,17 +157,22 @@ run_build() {
 
 run_bench() {
   echo "  ${ACCENT}${BOLD}🏋 Running corpus benchmark${NC}"
-  bash tests/benchmark/scripts/run-corpus-benchmark.sh "$@"
+  go run ./cmd/semantic-bench check "$@"
 }
 
 run_bench_full() {
   echo "  ${ACCENT}${BOLD}🏋 Running full benchmark suite${NC}"
-  bash tests/benchmark/scripts/run-full-benchmark.sh
+  go run ./cmd/semantic-bench run -suite=all "$@"
 }
 
 run_lint_corpus() {
   echo "  ${ACCENT}${BOLD}🔍 Linting benchmark corpus${NC}"
-  bash tests/benchmark/scripts/lint-corpus.sh
+  go run ./cmd/semantic-bench lint "$@"
+}
+
+run_lint_docs() {
+  echo "  ${ACCENT}${BOLD}🔍 Checking documentation links${NC}"
+  bash scripts/check-docs-links.sh
 }
 
 run_baseline() {
@@ -207,6 +214,11 @@ run_e2e() {
   bash scripts/e2e.sh
 }
 
+run_loop() {
+  echo "  ${ACCENT}${BOLD}🔄 Benchmark Loop${NC}"
+  go run ./cmd/semantic-bench check -verbose "$@"
+}
+
 case "${1:-help}" in
   pr)        run_pr ;;
   doctor)    exec bash scripts/doctor.sh ;;
@@ -221,6 +233,7 @@ case "${1:-help}" in
   lint)
     case "${2:-}" in
       corpus) run_lint_corpus ;;
+      docs) run_lint_docs ;;
       *) run_lint ;;
     esac
     ;;
@@ -245,5 +258,6 @@ case "${1:-help}" in
   runtime)   shift; run_runtime "$@" ;;
   tune)      shift; run_tune "$@" ;;
   e2e)       run_e2e ;;
+  loop)      run_loop ;;
   help|*)    show_help ;;
 esac
diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go
new file mode 100644
index 0000000..9670a68
--- /dev/null
+++ b/recovery/benchmark_test.go
@@ -0,0 +1,250 @@
+package recovery
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"testing"
+	"time"
+
+	"github.com/pinchtab/semantic"
+)
+
+type BenchmarkScenario struct {
+	ID            string                       `json:"id"`
+	Name          string                       `json:"name"`
+	Description   string                       `json:"description"`
+	OriginalQuery string                       `json:"original_query"`
+	OriginalRef   string                       `json:"original_ref"`
+	Before        []semantic.ElementDescriptor `json:"before"`
+	After         []semantic.ElementDescriptor `json:"after"`
+	ExpectedRef   *string                      `json:"expected_ref"`
+	ExpectedAlt   []string                     `json:"expected_alt"`
+	ExpectNoMatch bool                         `json:"expect_no_match"`
+	Difficulty    string                       `json:"difficulty"`
+}
+
+func loadScenarios(t *testing.T) []BenchmarkScenario {
+	_, thisFile, _, _ := runtime.Caller(0)
+	repoRoot := filepath.Join(filepath.Dir(thisFile), "..")
+	scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json")
+
+	data, err := os.ReadFile(scenariosPath)
+	if err != nil {
+		t.Fatalf("failed to read scenarios: %v", err)
+	}
+
+	var scenarios []BenchmarkScenario
+	if err := json.Unmarshal(data, &scenarios); err != nil {
+		t.Fatalf("failed to parse scenarios: %v", err)
+	}
+
+	return scenarios
+}
+
+func TestRecoveryBenchmark_Scenarios(t *testing.T) {
+	scenarios := loadScenarios(t)
+	matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128))
+
+	passed, failed := 0, 0
+
+	for _, sc := range scenarios {
+		t.Run(sc.ID, func(t *testing.T) {
+			result := runBenchmarkScenario(t, matcher, sc)
+
+			if result.pass {
+				passed++
+				t.Logf("PASS: recovered=%v got=%s expected=%s score=%.3f",
+					result.recovered, result.gotRef, result.expectedRef, result.score)
+			} else {
+				failed++
+				t.Errorf("FAIL: recovered=%v got=%s expected=%s score=%.3f error=%s",
+					result.recovered, result.gotRef, result.expectedRef, result.score, result.err)
+			}
+		})
+	}
+
+	t.Logf("Summary: %d passed, %d failed out of %d scenarios", passed, failed, len(scenarios))
+}
+
+type scenarioResult struct {
+	pass        bool
+	recovered   bool
+	gotRef      string
+	expectedRef string
+	score       float64
+	confidence  string
+	latencyMs   int64
+	err         string
+}
+
+func runBenchmarkScenario(t *testing.T, matcher semantic.ElementMatcher, sc BenchmarkScenario) scenarioResult {
+	result := scenarioResult{}
+
+	if sc.ExpectedRef != nil {
+		result.expectedRef = *sc.ExpectedRef
+	}
+
+	var origDesc semantic.ElementDescriptor
+	for _, d := range sc.Before {
+		if d.Ref == sc.OriginalRef {
+			origDesc = d
+			break
+		}
+	}
+
+	cache := NewIntentCache(100, 5*time.Minute)
+	cache.Store("test-tab", sc.OriginalRef, IntentEntry{
+		Query:      sc.OriginalQuery,
+		Descriptor: origDesc,
+		Score:      0.95,
+		Confidence: "high",
+		Strategy:   "combined",
+	})
+
+	re := NewRecoveryEngine(
+		DefaultRecoveryConfig(),
+		matcher,
+		cache,
+		func(_ context.Context, _ string) error { return nil },
+		func(_, ref string) (int64, bool) {
+			for i, d := range sc.After {
+				if d.Ref == ref {
+					return int64(1000 + i), true
+				}
+			}
+			return 0, false
+		},
+		func(_ string) []semantic.ElementDescriptor { return sc.After },
+	)
+
+	start := time.Now()
+
+	err := fmt.Errorf("could not find node with id %s", sc.OriginalRef)
+
+	if !re.ShouldAttempt(err, sc.OriginalRef) {
+		result.err = "ShouldAttempt returned false"
+		result.pass = sc.ExpectNoMatch
+		result.latencyMs = time.Since(start).Milliseconds()
+		return result
+	}
+
+	rr, _, recErr := re.AttemptWithClassification(
+		context.Background(),
+		"test-tab",
+		sc.OriginalRef,
+		"click",
+		ClassifyFailure(err),
+		func(_ context.Context, kind string, nodeID int64) (map[string]any, error) {
+			return map[string]any{"clicked": true}, nil
+		},
+	)
+
+	result.latencyMs = time.Since(start).Milliseconds()
+	result.recovered = rr.Recovered
+	result.gotRef = rr.NewRef
+	result.score = rr.Score
+	result.confidence = rr.Confidence
+
+	if recErr != nil {
+		result.err = recErr.Error()
+	}
+
+	if sc.ExpectNoMatch {
+		result.pass = !rr.Recovered
+	} else if sc.ExpectedRef != nil {
+		if rr.NewRef == *sc.ExpectedRef {
+			result.pass = true
+		} else {
+			for _, alt := range sc.ExpectedAlt {
+				if rr.NewRef == alt {
+					result.pass = true
+					break
+				}
+			}
+		}
+	}
+
+	return result
+}
+
+func BenchmarkRecoveryEngine_Scenarios(b *testing.B) {
+	scenarios := loadScenariosB(b)
+	matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128))
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, sc := range scenarios {
+			runBenchmarkScenarioB(b, matcher, sc)
+		}
+	}
+}
+
+func loadScenariosB(b *testing.B) []BenchmarkScenario {
+	_, thisFile, _, _ := runtime.Caller(0)
+	repoRoot := filepath.Join(filepath.Dir(thisFile), "..")
+	scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json")
+
+	data, err := os.ReadFile(scenariosPath)
+	if err != nil {
+		b.Fatalf("failed to read scenarios: %v", err)
+	}
+
+	var scenarios []BenchmarkScenario
+	if err := json.Unmarshal(data, &scenarios); err != nil {
+		b.Fatalf("failed to parse scenarios: %v", err)
+	}
+
+	return scenarios
+}
+
+func runBenchmarkScenarioB(b *testing.B, matcher semantic.ElementMatcher, sc BenchmarkScenario) {
+	var origDesc semantic.ElementDescriptor
+	for _, d := range sc.Before {
+		if d.Ref == sc.OriginalRef {
+			origDesc = d
+			break
+		}
+	}
+
+	cache := NewIntentCache(100, 5*time.Minute)
+	cache.Store("test-tab", sc.OriginalRef, IntentEntry{
+		Query:      sc.OriginalQuery,
+		Descriptor: origDesc,
+		Score:      0.95,
+		Confidence: "high",
+		Strategy:   "combined",
+	})
+
+	re := NewRecoveryEngine(
+		DefaultRecoveryConfig(),
+		matcher,
+		cache,
+		func(_ context.Context, _ string) error { return nil },
+		func(_, ref string) (int64, bool) {
+			for i, d := range sc.After {
+				if d.Ref == ref {
+					return int64(1000 + i), true
+				}
+			}
+			return 0, false
+		},
+		func(_ string) []semantic.ElementDescriptor { return sc.After },
+	)
+
+	err := fmt.Errorf("could not find node with id %s", sc.OriginalRef)
+
+	re.AttemptWithClassification(
+		context.Background(),
+		"test-tab",
+		sc.OriginalRef,
+		"click",
+		ClassifyFailure(err),
+		func(_ context.Context, kind string, nodeID int64) (map[string]any, error) {
+			return map[string]any{"clicked": true}, nil
+		},
+	)
+}
diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index 16e70b4..7cbb684 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -90,6 +90,60 @@ cmd/semantic/main.go       CLI tool (find, match, classify)
 
 4. **Pre-commit hook** runs gofmt + golangci-lint automatically on staged files.
 
+## Benchmark Improvement Loop
+
+When implementing changes that affect matching quality, follow this loop:
+
+### Step 1: Ensure baseline exists
+
+```bash
+./dev baseline
+```
+
+Creates `tests/benchmark/baselines/combined.json` if missing.
+
+### Step 2: Implement change
+
+Make one focused improvement at a time.
+
+### Step 3: Run benchmark loop
+
+```bash
+./dev loop
+```
+
+Shows comparison table with deltas:
+- **Green (+)** = improved
+- **Red (-)** = regressed  
+- **Gray** = unchanged
+
+### Step 4: Evaluate and decide
+
+| Result | Action |
+|--------|--------|
+| All metrics improved/unchanged | `./dev baseline update` |
+| Mixed (some up, some down) | Investigate tradeoff |
+| Key metrics regressed | Fix before merging |
+
+### Step 5: Iterate
+
+Repeat steps 2-4. Each `baseline update` sets new goalpost.
+
+### Key metrics
+
+- **MRR** — Mean Reciprocal Rank (higher = finds correct element faster)
+- **P@1** — Precision at 1 (is top result correct?)
+- **Hit@3** — Any correct result in top 3?
+- **Margin** — Score gap between best correct and best wrong
+
+### Adding test cases
+
+When a query should work better:
+
+1. Add to `tests/benchmark/corpus/*/queries.json` or `cases/*.json`
+2. Run `./dev lint corpus`
+3. Run `./dev loop` — benchmark will show regression until fixed
+
 ## Public API Surface
 
 Only these symbols are visible to consumers:
diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh
index ef5603d..84d68d1 100755
--- a/tests/benchmark/scripts/calibrate-thresholds.sh
+++ b/tests/benchmark/scripts/calibrate-thresholds.sh
@@ -1,30 +1,20 @@
 #!/bin/bash
 #
-# Calibrate threshold recommendations for find and recovery.
+# Threshold Calibration Benchmark
+#
+# Calculates optimal thresholds for semantic matching by evaluating
+# recall, precision, and false-positive rates across threshold levels.
 #
 # Usage:
 #   ./calibrate-thresholds.sh [--corpus <dir>]
 #
-# Reports recall/precision/false-positive-rate by threshold.
-#
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BENCHMARK_DIR="${SCRIPT_DIR}/.."
 CORPUS_DIR="${BENCHMARK_DIR}/corpus"
+CASES_DIR="${BENCHMARK_DIR}/cases"
 RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read config
-if [[ -f "$CONFIG_FILE" ]]; then
-    STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-    LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
-    EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
-else
-    STRATEGY="combined"
-    LEXICAL_WEIGHT=0.6
-    EMBEDDING_WEIGHT=0.4
-fi
 
 SPECIFIC_CORPUS=""
 while [[ $# -gt 0 ]]; do
@@ -45,164 +35,306 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json"
 
 # Thresholds to test
-THRESHOLDS=(0.01 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.60 0.70 0.80 0.90)
-
-echo "Testing ${#THRESHOLDS[@]} thresholds: ${THRESHOLDS[*]}"
-echo ""
+THRESHOLDS=(0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60)
 
 # Initialize report
 jq -n \
     --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg strategy "${STRATEGY}" \
+    --argjson thresholds "$(printf '%s\n' "${THRESHOLDS[@]}" | jq -s '.')" \
     '{
-        timestamp: $ts,
-        strategy: $strategy,
-        thresholds: [],
+        calibration: {
+            timestamp: $ts,
+            thresholds_tested: $thresholds
+        },
+        by_threshold: {},
+        by_tag: {},
         recommendations: {}
     }' > "${REPORT_FILE}"
 
-# Collect results for each threshold
-for thresh in "${THRESHOLDS[@]}"; do
-    echo "Testing threshold: ${thresh}"
+echo ""
+echo "=== Threshold Calibration ==="
+echo "Testing thresholds: ${THRESHOLDS[*]}"
+echo ""
 
-    total=0
-    true_positives=0
-    false_positives=0
-    false_negatives=0
+# Collect all test cases
+declare -a ALL_QUERIES=()
+declare -a ALL_SNAPSHOTS=()
+declare -a ALL_RELEVANT=()
+declare -a ALL_EXPECT_NO_MATCH=()
+declare -a ALL_IDS=()
 
-    for corpus in "${CORPUS_DIR}"/*/; do
-        [[ -d "$corpus" ]] || continue
+load_corpus() {
+    local corpus_path="$1"
+    local snapshot="${corpus_path}/snapshot.json"
+    local queries="${corpus_path}/queries.json"
 
-        if [[ -n "$SPECIFIC_CORPUS" ]] && [[ "$(basename "$corpus")" != "$SPECIFIC_CORPUS" ]]; then
+    if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then
+        return
+    fi
+
+    local count
+    count=$(jq length "$queries")
+
+    for i in $(seq 0 $((count - 1))); do
+        local query relevant id expect_no_match
+        id=$(jq -r ".[$i].id" "$queries")
+        query=$(jq -r ".[$i].query" "$queries")
+        relevant=$(jq -c ".[$i].relevant_refs // []" "$queries")
+        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$queries")
+
+        ALL_IDS+=("$id")
+        ALL_QUERIES+=("$query")
+        ALL_SNAPSHOTS+=("$snapshot")
+        ALL_RELEVANT+=("$relevant")
+        ALL_EXPECT_NO_MATCH+=("$expect_no_match")
+    done
+}
+
+load_cases() {
+    local cases_file="$1"
+    local snapshots_dir="${BENCHMARK_DIR}/../e2e/assets/snapshots"
+
+    if [[ ! -f "$cases_file" ]]; then
+        return
+    fi
+
+    local count
+    count=$(jq length "$cases_file")
+
+    for i in $(seq 0 $((count - 1))); do
+        local id query snapshot_name expect_no_match expect_ref expect_ref_alt relevant
+        id=$(jq -r ".[$i].id" "$cases_file")
+        query=$(jq -r ".[$i].query" "$cases_file")
+        snapshot_name=$(jq -r ".[$i].snapshot" "$cases_file")
+        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$cases_file")
+        expect_ref=$(jq -r ".[$i].expect_ref // \"\"" "$cases_file")
+        expect_ref_alt=$(jq -c ".[$i].expect_ref_alt // []" "$cases_file")
+
+        if [[ -n "$expect_ref" && "$expect_ref" != "null" ]]; then
+            relevant=$(echo "$expect_ref_alt" | jq --arg r "$expect_ref" '. + [$r]')
+        else
+            relevant="[]"
+        fi
+
+        local snapshot="${snapshots_dir}/${snapshot_name}"
+        if [[ ! -f "$snapshot" ]]; then
             continue
         fi
 
-        snapshot="${corpus}/snapshot.json"
-        queries="${corpus}/queries.json"
+        ALL_IDS+=("$id")
+        ALL_QUERIES+=("$query")
+        ALL_SNAPSHOTS+=("$snapshot")
+        ALL_RELEVANT+=("$relevant")
+        ALL_EXPECT_NO_MATCH+=("$expect_no_match")
+    done
+}
 
-        [[ -f "$snapshot" ]] && [[ -f "$queries" ]] || continue
+echo "Loading test cases..."
+if [[ -n "${SPECIFIC_CORPUS}" ]]; then
+    load_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}"
+else
+    for corpus in "${CORPUS_DIR}"/*/; do
+        [[ -d "$corpus" ]] || continue
+        load_corpus "$corpus"
+    done
+fi
 
-        count=$(jq length "$queries")
+load_cases "${CASES_DIR}/negative-threshold.json"
 
-        for i in $(seq 0 $((count - 1))); do
-            query=$(jq -r ".[$i].query" "$queries")
-            relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries")
+TOTAL_CASES=${#ALL_QUERIES[@]}
+echo "Loaded ${TOTAL_CASES} test cases"
+echo ""
 
-            result=$("${SEMANTIC}" find "${query}" \
-                --snapshot "${snapshot}" \
-                --strategy "${STRATEGY}" \
-                --threshold "${thresh}" \
-                --top-k 5 \
-                --lexical-weight "${LEXICAL_WEIGHT}" \
-                --embedding-weight "${EMBEDDING_WEIGHT}" \
-                --format json 2>/dev/null) || continue
+for threshold in "${THRESHOLDS[@]}"; do
+    echo "Testing threshold ${threshold}..."
 
-            best_ref=$(echo "$result" | jq -r '.best_ref // ""')
-            num_matches=$(echo "$result" | jq '.matches | length')
+    tp=0 fp=0 fn=0 tn=0
 
-            total=$((total + 1))
+    for i in $(seq 0 $((TOTAL_CASES - 1))); do
+        query="${ALL_QUERIES[$i]}"
+        snapshot="${ALL_SNAPSHOTS[$i]}"
+        relevant="${ALL_RELEVANT[$i]}"
+        expect_no_match="${ALL_EXPECT_NO_MATCH[$i]}"
 
-            # Check if best match is relevant
-            if [[ -n "$best_ref" ]] && echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-                true_positives=$((true_positives + 1))
-            elif [[ -n "$best_ref" ]] && [[ "$num_matches" -gt 0 ]]; then
-                false_positives=$((false_positives + 1))
+        result=$("${SEMANTIC}" find "${query}" \
+            --snapshot "${snapshot}" \
+            --strategy combined \
+            --threshold "${threshold}" \
+            --top-k 5 \
+            --format json 2>/dev/null) || result='{"matches":[]}'
+
+        match_count=$(echo "$result" | jq '.matches | length')
+        best_ref=$(echo "$result" | jq -r '.best_ref // ""')
+
+        if [[ "$expect_no_match" == "true" ]]; then
+            if [[ $match_count -eq 0 ]]; then
+                tn=$((tn + 1))
+            else
+                fp=$((fp + 1))
+            fi
+        else
+            relevant_count=$(echo "$relevant" | jq 'length')
+            if [[ $relevant_count -eq 0 ]]; then
+                continue
             fi
 
-            # If no match but there should be one
-            if [[ -z "$best_ref" ]] || [[ "$num_matches" -eq 0 ]]; then
-                rel_count=$(echo "$relevant_refs" | jq 'length')
-                if [[ "$rel_count" -gt 0 ]]; then
-                    false_negatives=$((false_negatives + 1))
-                fi
+            if [[ $match_count -eq 0 ]]; then
+                fn=$((fn + 1))
+            elif echo "$relevant" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
+                tp=$((tp + 1))
+            else
+                fp=$((fp + 1))
             fi
-        done
+        fi
     done
 
-    # Calculate metrics
-    if [[ $total -eq 0 ]]; then
-        echo "  No queries processed"
-        continue
-    fi
-
-    precision=0
-    recall=0
-    fpr=0
+    total_positive=$((tp + fn))
+    total_negative=$((tn + fp))
 
-    if [[ $((true_positives + false_positives)) -gt 0 ]]; then
-        precision=$(echo "scale=4; $true_positives / ($true_positives + $false_positives)" | bc)
+    if [[ $total_positive -gt 0 ]]; then
+        recall=$(echo "scale=4; $tp / $total_positive" | bc)
+    else
+        recall="0"
     fi
 
-    if [[ $((true_positives + false_negatives)) -gt 0 ]]; then
-        recall=$(echo "scale=4; $true_positives / ($true_positives + $false_negatives)" | bc)
+    if [[ $((tp + fp)) -gt 0 ]]; then
+        precision=$(echo "scale=4; $tp / ($tp + $fp)" | bc)
+    else
+        precision="1"
     fi
 
-    if [[ $((false_positives + true_positives)) -gt 0 ]]; then
-        fpr=$(echo "scale=4; $false_positives / $total" | bc)
+    if [[ $total_negative -gt 0 ]]; then
+        fpr=$(echo "scale=4; $fp / $total_negative" | bc)
+    else
+        fpr="0"
     fi
 
-    f1=0
-    if (( $(echo "$precision + $recall > 0" | bc -l) )); then
+    if [[ $(echo "$precision + $recall > 0" | bc) -eq 1 ]]; then
         f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc)
+    else
+        f1="0"
     fi
 
-    printf "  Precision: %.3f | Recall: %.3f | FPR: %.3f | F1: %.3f\n" "$precision" "$recall" "$fpr" "$f1"
+    printf "  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f FPR=%.3f F1=%.3f\n" \
+        "$threshold" "$tp" "$fp" "$fn" "$tn" "$recall" "$precision" "$fpr" "$f1"
 
-    # Append to report
     tmp=$(mktemp)
-    jq --argjson thresh "$thresh" \
-       --argjson total "$total" \
-       --argjson tp "$true_positives" \
-       --argjson fp "$false_positives" \
-       --argjson fn "$false_negatives" \
-       --argjson precision "$precision" \
-       --argjson recall "$recall" \
-       --argjson fpr "$fpr" \
-       --argjson f1 "$f1" \
-       '.thresholds += [{
-           threshold: $thresh,
-           total: $total,
-           true_positives: $tp,
-           false_positives: $fp,
-           false_negatives: $fn,
-           precision: $precision,
-           recall: $recall,
-           false_positive_rate: $fpr,
-           f1: $f1
-       }]' "$REPORT_FILE" > "$tmp"
+    jq --arg t "$threshold" \
+       --argjson tp "$tp" --argjson fp "$fp" --argjson fn "$fn" --argjson tn "$tn" \
+       --argjson recall "$recall" --argjson precision "$precision" \
+       --argjson fpr "$fpr" --argjson f1 "$f1" \
+       '.by_threshold[$t] = {
+           tp: $tp, fp: $fp, fn: $fn, tn: $tn,
+           recall: $recall, precision: $precision,
+           false_positive_rate: $fpr, f1: $f1
+       }' "$REPORT_FILE" > "$tmp"
     mv "$tmp" "$REPORT_FILE"
 done
 
-# Calculate recommendations
 echo ""
 echo "Calculating recommendations..."
 
-# Best F1 for general find
-BEST_FIND=$(jq -r '[.thresholds[] | select(.f1 > 0)] | max_by(.f1) | .threshold // 0.3' "$REPORT_FILE")
+best_f1_threshold="" best_f1=0
+best_recall_threshold="" best_recall=0
+
+for threshold in "${THRESHOLDS[@]}"; do
+    metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE")
+    f1=$(echo "$metrics" | jq -r '.f1')
+    recall=$(echo "$metrics" | jq -r '.recall')
+
+    if (( $(echo "$f1 > $best_f1" | bc -l) )); then
+        best_f1=$f1
+        best_f1_threshold=$threshold
+    fi
+    if (( $(echo "$recall > $best_recall" | bc -l) )); then
+        best_recall=$recall
+        best_recall_threshold=$threshold
+    fi
+done
+
+recovery_threshold=""
+recovery_precision=0
+for threshold in "${THRESHOLDS[@]}"; do
+    metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE")
+    recall=$(echo "$metrics" | jq -r '.recall')
+    precision=$(echo "$metrics" | jq -r '.precision')
+
+    if (( $(echo "$recall >= 0.85" | bc -l) )); then
+        if (( $(echo "$precision > $recovery_precision" | bc -l) )); then
+            recovery_precision=$precision
+            recovery_threshold=$threshold
+        fi
+    fi
+done
+
+if [[ -z "$recovery_threshold" ]]; then
+    recovery_threshold="${THRESHOLDS[0]}"
+fi
 
-# Best recall with precision > 0.8 for recovery (prioritize not missing)
-BEST_RECOVERY=$(jq -r '[.thresholds[] | select(.precision >= 0.7)] | max_by(.recall) | .threshold // 0.2' "$REPORT_FILE")
+default_threshold="$best_f1_threshold"
 
-# Update recommendations
 tmp=$(mktemp)
-jq --argjson find "$BEST_FIND" \
-   --argjson recovery "$BEST_RECOVERY" \
+jq --arg default "$default_threshold" \
+   --arg recovery "$recovery_threshold" \
+   --arg best_f1 "$best_f1_threshold" \
+   --argjson best_f1_val "$best_f1" \
    '.recommendations = {
-       find: $find,
-       recovery: $recovery,
-       note: "find optimizes F1; recovery optimizes recall with precision >= 0.7"
+       default_threshold: $default,
+       recovery_threshold: $recovery,
+       best_f1: { threshold: $best_f1, value: $best_f1_val },
+       notes: "default_threshold optimizes F1. recovery_threshold prioritizes recall (>=85%)."
    }' "$REPORT_FILE" > "$tmp"
 mv "$tmp" "$REPORT_FILE"
 
-# Cleanup
+SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
+
+cat > "${SUMMARY_FILE}" << EOF
+# Threshold Calibration Report
+
+Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)
+
+## Recommendations
+
+| Use Case | Threshold | Rationale |
+|----------|-----------|-----------|
+| **Default (find)** | **${default_threshold}** | Best F1 score (${best_f1}) |
+| **Recovery** | **${recovery_threshold}** | High recall for element recovery |
+
+## Metrics by Threshold
+
+| Threshold | TP | FP | FN | TN | Recall | Precision | FPR | F1 |
+|-----------|----|----|----|----|--------|-----------|-----|-----|
+$(for t in "${THRESHOLDS[@]}"; do
+    m=$(jq -r ".by_threshold[\"$t\"]" "$REPORT_FILE")
+    printf "| %.2f | %d | %d | %d | %d | %.3f | %.3f | %.3f | %.3f |\n" \
+        "$t" \
+        "$(echo "$m" | jq -r '.tp')" \
+        "$(echo "$m" | jq -r '.fp')" \
+        "$(echo "$m" | jq -r '.fn')" \
+        "$(echo "$m" | jq -r '.tn')" \
+        "$(echo "$m" | jq -r '.recall')" \
+        "$(echo "$m" | jq -r '.precision')" \
+        "$(echo "$m" | jq -r '.false_positive_rate')" \
+        "$(echo "$m" | jq -r '.f1')"
+done)
+
+## Trade-offs
+
+- **Lower threshold** (0.10-0.20): High recall, more false positives. Good for recovery.
+- **Medium threshold** (0.25-0.35): Balanced. Good default for find operations.
+- **Higher threshold** (0.40+): High precision, misses weaker matches.
+EOF
+
 rm -f "${BENCHMARK_DIR}/semantic"
 
 echo ""
 echo "================================================"
-echo "  THRESHOLD CALIBRATION RESULTS"
+echo "  THRESHOLD CALIBRATION COMPLETE"
 echo "================================================"
-echo "  Recommended for Find:     ${BEST_FIND}"
-echo "  Recommended for Recovery: ${BEST_RECOVERY}"
+echo "  Test cases:         ${TOTAL_CASES}"
+echo "  Default threshold:  ${default_threshold} (F1=${best_f1})"
+echo "  Recovery threshold: ${recovery_threshold}"
 echo "================================================"
 echo ""
-echo "Report: ${REPORT_FILE}"
+echo "Report:  ${REPORT_FILE}"
+echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/run-recovery-benchmark.sh b/tests/benchmark/scripts/run-recovery-benchmark.sh
new file mode 100755
index 0000000..93fc88a
--- /dev/null
+++ b/tests/benchmark/scripts/run-recovery-benchmark.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Recovery Engine Benchmark
+#
+# Exercises RecoveryEngine directly using before/after snapshots
+# and intent cache entries from recovery scenarios.
+#
+# Usage:
+#   ./run-recovery-benchmark.sh
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BENCHMARK_DIR="${SCRIPT_DIR}/.."
+RESULTS_DIR="${BENCHMARK_DIR}/results"
+
+mkdir -p "${RESULTS_DIR}"
+
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+REPORT_FILE="${RESULTS_DIR}/recovery_benchmark_${TIMESTAMP}.txt"
+
+echo "=== Recovery Engine Benchmark ==="
+echo ""
+
+cd "${BENCHMARK_DIR}/../.."
+
+# Run the Go test that exercises RecoveryEngine with scenarios
+echo "Running recovery scenarios..."
+echo ""
+
+go test -v -run TestRecoveryBenchmark_Scenarios ./recovery/ 2>&1 | tee "$REPORT_FILE"
+
+# Also run the Go benchmark for performance
+echo ""
+echo "Running performance benchmark..."
+go test -bench=BenchmarkRecoveryEngine_Scenarios -benchmem ./recovery/ 2>&1 | tee -a "$REPORT_FILE"
+
+echo ""
+echo "================================================"
+echo "  RECOVERY BENCHMARK COMPLETE"
+echo "================================================"
+echo "Report: $REPORT_FILE"

From d504268ebe4ca92e12f1cc8916b084e35a656af6 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 15:38:06 +0100
Subject: [PATCH 04/14] feat: add semantic-bench CLI for benchmark management

Go CLI with commands: check, run, compare, lint, catalog.
Replaces bash scripts with structured benchmark framework.
---
 cmd/semantic-bench/main.go     | 113 ++++++++
 internal/benchmark/commands.go | 510 +++++++++++++++++++++++++++++++++
 internal/benchmark/config.go   | 247 ++++++++++++++++
 internal/benchmark/dataset.go  | 117 ++++++++
 internal/benchmark/runner.go   | 384 +++++++++++++++++++++++++
 5 files changed, 1371 insertions(+)
 create mode 100644 cmd/semantic-bench/main.go
 create mode 100644 internal/benchmark/commands.go
 create mode 100644 internal/benchmark/config.go
 create mode 100644 internal/benchmark/dataset.go
 create mode 100644 internal/benchmark/runner.go

diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go
new file mode 100644
index 0000000..35bf051
--- /dev/null
+++ b/cmd/semantic-bench/main.go
@@ -0,0 +1,113 @@
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/pinchtab/semantic/internal/benchmark"
+)
+
+const usage = `semantic-bench - Benchmark runner for semantic matching
+
+Usage:
+  semantic-bench <command> [flags]
+
+Commands:
+  check     Run benchmark and compare against baseline (default)
+  run       Run benchmark suites
+  compare   Compare two reports
+  lint      Validate dataset
+  catalog   Print dataset inventory
+
+Flags:
+  -h, --help    Show help
+
+Run 'semantic-bench <command> --help' for command-specific help.
+`
+
+func main() {
+	if len(os.Args) < 2 {
+		runCheck(os.Args[1:])
+		return
+	}
+
+	cmd := os.Args[1]
+	args := os.Args[2:]
+
+	switch cmd {
+	case "check":
+		runCheck(args)
+	case "run":
+		runRun(args)
+	case "compare":
+		runCompare(args)
+	case "lint":
+		runLint(args)
+	case "catalog":
+		runCatalog(args)
+	case "-h", "--help", "help":
+		fmt.Print(usage)
+	default:
+		fmt.Fprintf(os.Stderr, "unknown command: %s\n\n%s", cmd, usage)
+		os.Exit(2)
+	}
+}
+
+func runCheck(args []string) {
+	cfg := benchmark.ParseCheckFlags(args)
+	result, err := benchmark.RunCheck(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintCheckResult(result, cfg)
+	if result.Status == "fail" {
+		os.Exit(1)
+	}
+}
+
+func runRun(args []string) {
+	cfg := benchmark.ParseRunFlags(args)
+	result, err := benchmark.RunBenchmark(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintRunResult(result, cfg)
+}
+
+func runCompare(args []string) {
+	cfg := benchmark.ParseCompareFlags(args)
+	result, err := benchmark.RunCompare(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintCompareResult(result, cfg)
+	if result.Status == "fail" {
+		os.Exit(1)
+	}
+}
+
+func runLint(args []string) {
+	cfg := benchmark.ParseLintFlags(args)
+	result, err := benchmark.RunLint(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintLintResult(result, cfg)
+	if result.Errors > 0 {
+		os.Exit(1)
+	}
+}
+
+func runCatalog(args []string) {
+	cfg := benchmark.ParseCatalogFlags(args)
+	result, err := benchmark.RunCatalog(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintCatalogResult(result, cfg)
+}
diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go
new file mode 100644
index 0000000..ad22ea3
--- /dev/null
+++ b/internal/benchmark/commands.go
@@ -0,0 +1,510 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+)
+
+type CheckResult struct {
+	Status    string        `json:"status"`
+	Summary   CheckSummary  `json:"summary"`
+	Delta     *MetricsDelta `json:"delta,omitempty"`
+	TopRegs   []Regression  `json:"top_regressions,omitempty"`
+	Artifacts Artifacts     `json:"artifacts"`
+	Report    *Report       `json:"-"`
+}
+
+type CheckSummary struct {
+	PAt1        float64 `json:"p_at_1"`
+	MRR         float64 `json:"mrr"`
+	HitAt3      float64 `json:"hit_at_3"`
+	Total       int     `json:"total"`
+	Regressions int     `json:"regressions"`
+	Warnings    int     `json:"warnings"`
+}
+
+type MetricsDelta struct {
+	PAt1   float64 `json:"p_at_1"`
+	MRR    float64 `json:"mrr"`
+	HitAt3 float64 `json:"hit_at_3"`
+}
+
+type Regression struct {
+	ID           string   `json:"id"`
+	Corpus       string   `json:"corpus"`
+	Query        string   `json:"query"`
+	Expected     []string `json:"expected"`
+	BaselineRef  string   `json:"baseline_ref,omitempty"`
+	CurrentRef   string   `json:"current_ref"`
+	Reason       string   `json:"reason"`
+	DebugCommand string   `json:"debug_command"`
+}
+
+type Artifacts struct {
+	ReportJSON string `json:"report_json"`
+	SummaryMD  string `json:"summary_md"`
+}
+
+type CompareResult struct {
+	Status       string       `json:"status"`
+	Delta        MetricsDelta `json:"delta"`
+	Regressions  []Regression `json:"regressions"`
+	Improvements []string     `json:"improvements"`
+}
+
+type LintResult struct {
+	Errors   int      `json:"errors"`
+	Warnings int      `json:"warnings"`
+	Messages []string `json:"messages"`
+}
+
+type CatalogResult struct {
+	Corpora      []CorpusSummary `json:"corpora"`
+	TotalQueries int             `json:"total_queries"`
+	ByTag        map[string]int  `json:"by_tag,omitempty"`
+	ByDifficulty map[string]int  `json:"by_difficulty,omitempty"`
+}
+
+type CorpusSummary struct {
+	ID      string   `json:"id"`
+	Queries int      `json:"queries"`
+	Tags    []string `json:"tags"`
+}
+
+func RunCheck(cfg CheckConfig) (*CheckResult, error) {
+	root := FindBenchmarkRoot()
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	benchCfg, _ := LoadConfig(root)
+	profile := Profile{
+		Strategy:  "combined",
+		Threshold: 0.01,
+		TopK:      5,
+		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
+	}
+	if benchCfg != nil {
+		profile = ResolveProfile(benchCfg, cfg.Profile)
+	}
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        profile.Strategy,
+		Threshold:       profile.Threshold,
+		TopK:            profile.TopK,
+		LexicalWeight:   profile.Weights.Lexical,
+		EmbeddingWeight: profile.Weights.Embedding,
+		Profile:         cfg.Profile,
+		Mode:            "library",
+		Verbose:         cfg.Verbose,
+		Explain:         cfg.Explain,
+		OutputDir:       cfg.OutputDir,
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	result := &CheckResult{
+		Status: "pass",
+		Report: report,
+	}
+	result.Summary.PAt1 = report.Metrics.Overall.PAt1
+	result.Summary.MRR = report.Metrics.Overall.MRR
+	result.Summary.HitAt3 = report.Metrics.Overall.HitAt3
+	result.Summary.Total = report.Metrics.Overall.Total
+
+	// Count misses
+	for _, r := range report.Results {
+		if r.Status == "miss" {
+			result.TopRegs = append(result.TopRegs, Regression{
+				ID:           r.ID,
+				Corpus:       r.Corpus,
+				Query:        r.Query,
+				Expected:     r.Expected.RelevantRefs,
+				CurrentRef:   r.Actual.BestRef,
+				Reason:       "miss",
+				DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID),
+			})
+		}
+	}
+	result.Summary.Regressions = len(result.TopRegs)
+
+	// Compare to baseline if exists
+	baselinePath := cfg.BaselinePath
+	if baselinePath == "" {
+		baselinePath = filepath.Join(root, "baselines", "combined.json")
+	}
+	if _, err := os.Stat(baselinePath); err == nil {
+		baseline, err := loadReport(baselinePath)
+		if err == nil {
+			result.Delta = &MetricsDelta{
+				PAt1:   report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+				MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+			}
+			if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) {
+				result.Status = "fail"
+			}
+		}
+	}
+
+	// Write artifacts
+	os.MkdirAll(cfg.OutputDir, 0755)
+	ts := time.Now().Format("20060102_150405")
+	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
+	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
+
+	reportJSON, _ := json.MarshalIndent(report, "", "  ")
+	os.WriteFile(reportPath, reportJSON, 0644)
+
+	summaryMD := generateSummaryMD(report, result)
+	os.WriteFile(summaryPath, []byte(summaryMD), 0644)
+
+	result.Artifacts.ReportJSON = reportPath
+	result.Artifacts.SummaryMD = summaryPath
+
+	return result, nil
+}
+
+func RunBenchmark(cfg RunConfig) (*Report, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, err
+	}
+	return RunCorpusBenchmark(ds, cfg)
+}
+
+func RunCompare(cfg CompareConfig) (*CompareResult, error) {
+	baseline, err := loadReport(cfg.BaselinePath)
+	if err != nil {
+		return nil, fmt.Errorf("load baseline: %w", err)
+	}
+	current, err := loadReport(cfg.CurrentPath)
+	if err != nil {
+		return nil, fmt.Errorf("load current: %w", err)
+	}
+
+	result := &CompareResult{
+		Status: "pass",
+		Delta: MetricsDelta{
+			PAt1:   current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+			MRR:    current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+			HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+		},
+	}
+
+	if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 {
+		result.Status = "fail"
+	}
+
+	// Find regressions
+	baselineResults := make(map[string]QueryResult)
+	for _, r := range baseline.Results {
+		baselineResults[r.ID] = r
+	}
+	for _, r := range current.Results {
+		if base, ok := baselineResults[r.ID]; ok {
+			if base.Status == "hit" && r.Status != "hit" {
+				result.Regressions = append(result.Regressions, Regression{
+					ID:          r.ID,
+					Corpus:      r.Corpus,
+					Query:       r.Query,
+					BaselineRef: base.Actual.BestRef,
+					CurrentRef:  r.Actual.BestRef,
+					Reason:      fmt.Sprintf("%s -> %s", base.Status, r.Status),
+				})
+			}
+		}
+	}
+
+	return result, nil
+}
+
+func RunLint(cfg LintConfig) (*LintResult, error) {
+	root := FindBenchmarkRoot()
+	result := &LintResult{}
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		result.Errors++
+		result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err))
+		return result, nil
+	}
+
+	// Check for duplicate IDs
+	ids := make(map[string]string)
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if existing, ok := ids[q.ID]; ok {
+				result.Errors++
+				result.Messages = append(result.Messages,
+					fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing))
+			} else {
+				ids[q.ID] = c.ID
+			}
+		}
+	}
+
+	// Check refs exist
+	for _, c := range ds.Corpora {
+		refs := make(map[string]bool)
+		for _, d := range c.Snapshot {
+			refs[d.Ref] = true
+		}
+		for _, q := range c.Queries {
+			for _, r := range q.RelevantRefs {
+				if !refs[r] {
+					result.Errors++
+					result.Messages = append(result.Messages,
+						fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r))
+				}
+			}
+		}
+	}
+
+	// Check difficulty values
+	validDiff := map[string]bool{"easy": true, "medium": true, "hard": true}
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if q.Difficulty != "" && !validDiff[q.Difficulty] {
+				result.Errors++
+				result.Messages = append(result.Messages,
+					fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID))
+			}
+		}
+	}
+
+	if result.Errors == 0 && result.Warnings == 0 {
+		result.Messages = append(result.Messages, "All checks passed")
+	}
+
+	return result, nil
+}
+
+func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, err
+	}
+
+	result := &CatalogResult{
+		ByTag:        make(map[string]int),
+		ByDifficulty: make(map[string]int),
+	}
+
+	for _, c := range ds.Corpora {
+		tags := make(map[string]bool)
+		for _, q := range c.Queries {
+			result.TotalQueries++
+			result.ByDifficulty[q.Difficulty]++
+			for _, t := range q.Tags {
+				tags[t] = true
+				result.ByTag[t]++
+			}
+		}
+		var tagList []string
+		for t := range tags {
+			tagList = append(tagList, t)
+		}
+		sort.Strings(tagList)
+		result.Corpora = append(result.Corpora, CorpusSummary{
+			ID:      c.ID,
+			Queries: len(c.Queries),
+			Tags:    tagList,
+		})
+	}
+
+	return result, nil
+}
+
+func loadReport(path string) (*Report, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var r Report
+	if err := json.Unmarshal(data, &r); err != nil {
+		return nil, err
+	}
+	return &r, nil
+}
+
+func generateSummaryMD(report *Report, result *CheckResult) string {
+	var sb strings.Builder
+
+	sb.WriteString("# Benchmark Summary\n\n")
+	sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp))
+
+	sb.WriteString("## Overall Metrics\n\n")
+	sb.WriteString("| Metric | Value |\n")
+	sb.WriteString("|--------|-------|\n")
+	sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total))
+	sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR))
+	sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1))
+	sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3))
+	sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin))
+
+	if result.Delta != nil {
+		sb.WriteString("\n## Delta from Baseline\n\n")
+		sb.WriteString("| Metric | Delta |\n")
+		sb.WriteString("|--------|-------|\n")
+		sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1))
+		sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR))
+		sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3))
+	}
+
+	if len(result.TopRegs) > 0 {
+		sb.WriteString("\n## Misses\n\n")
+		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
+		sb.WriteString("|----|--------|-------|-----|----------|\n")
+		for _, r := range result.TopRegs {
+			if len(result.TopRegs) > 10 {
+				break
+			}
+			sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n",
+				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")))
+		}
+	}
+
+	return sb.String()
+}
+
+func PrintCheckResult(result *CheckResult, cfg CheckConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m Benchmark passed\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Benchmark failed\n")
+	}
+	fmt.Printf("\n")
+
+	fmt.Printf("  %-12s %8.4f\n", "MRR", result.Summary.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", result.Summary.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", result.Summary.Total)
+	fmt.Printf("  %-12s %8d\n", "Misses", result.Summary.Regressions)
+
+	if result.Delta != nil {
+		fmt.Printf("\n  Delta from baseline:\n")
+		printDelta("P@1", result.Delta.PAt1)
+		printDelta("MRR", result.Delta.MRR)
+		printDelta("Hit@3", result.Delta.HitAt3)
+	}
+
+	fmt.Printf("\n  Artifacts:\n")
+	fmt.Printf("    Report:  %s\n", result.Artifacts.ReportJSON)
+	fmt.Printf("    Summary: %s\n", result.Artifacts.SummaryMD)
+	fmt.Printf("\n")
+}
+
+func printDelta(name string, delta float64) {
+	color := "\033[0m"
+	sign := ""
+	if delta > 0.001 {
+		color = "\033[32m"
+		sign = "+"
+	} else if delta < -0.001 {
+		color = "\033[31m"
+	}
+	fmt.Printf("    %s%-8s %s%.4f\033[0m\n", color, name, sign, delta)
+}
+
+func PrintRunResult(report *Report, cfg RunConfig) {
+	fmt.Printf("\n")
+	fmt.Printf("  %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", report.Metrics.Overall.Total)
+	fmt.Printf("\n")
+
+	if cfg.Verbose {
+		for _, r := range report.Results {
+			status := "\033[32mHIT \033[0m"
+			switch r.Status {
+			case "miss":
+				status = "\033[31mMISS\033[0m"
+			case "partial":
+				status = "\033[33mPART\033[0m"
+			}
+			fmt.Printf("  [%s] %s | %s | got=%s score=%.3f\n",
+				r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore)
+		}
+	}
+}
+
+func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m No regression\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Regression detected\n")
+	}
+	fmt.Printf("\n")
+	printDelta("P@1", result.Delta.PAt1)
+	printDelta("MRR", result.Delta.MRR)
+	printDelta("Hit@3", result.Delta.HitAt3)
+
+	if len(result.Regressions) > 0 {
+		fmt.Printf("\n  Regressions:\n")
+		for _, r := range result.Regressions {
+			fmt.Printf("    %s: %s (%s)\n", r.ID, r.Reason, r.Query)
+		}
+	}
+	fmt.Printf("\n")
+}
+
+func PrintLintResult(result *LintResult, cfg LintConfig) {
+	for _, msg := range result.Messages {
+		fmt.Println(msg)
+	}
+	fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings)
+}
+
+func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n  Corpora: %d\n", len(result.Corpora))
+	fmt.Printf("  Total Queries: %d\n\n", result.TotalQueries)
+
+	fmt.Printf("  %-30s %8s\n", "Corpus", "Queries")
+	fmt.Printf("  %-30s %8s\n", "------", "-------")
+	for _, c := range result.Corpora {
+		fmt.Printf("  %-30s %8d\n", c.ID, c.Queries)
+	}
+
+	switch cfg.By {
+	case "difficulty":
+		fmt.Printf("\n  By Difficulty:\n")
+		for d, n := range result.ByDifficulty {
+			fmt.Printf("    %-10s %4d\n", d, n)
+		}
+	case "tag":
+		fmt.Printf("\n  By Tag:\n")
+		for t, n := range result.ByTag {
+			fmt.Printf("    %-20s %4d\n", t, n)
+		}
+	}
+	fmt.Printf("\n")
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
new file mode 100644
index 0000000..c8ac10d
--- /dev/null
+++ b/internal/benchmark/config.go
@@ -0,0 +1,247 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"flag"
+	"os"
+	"path/filepath"
+)
+
+type Config struct {
+	Version  string             `json:"version"`
+	Defaults DefaultsConfig     `json:"defaults"`
+	Profiles map[string]Profile `json:"profiles"`
+	Baseline BaselineConfig     `json:"baseline"`
+}
+
+type DefaultsConfig struct {
+	Profile string `json:"profile"`
+}
+
+type Profile struct {
+	Strategy   string   `json:"strategy"`
+	Threshold  float64  `json:"threshold"`
+	TopK       int      `json:"top_k"`
+	Weights    Weights  `json:"weights"`
+	Suites     []string `json:"suites"`
+	Mode       string   `json:"mode"`
+	Inherits   string   `json:"inherits"`
+	Verbose    bool     `json:"verbose"`
+	Explain    bool     `json:"explain"`
+	FailOnReg  bool     `json:"fail_on_regression"`
+}
+
+type Weights struct {
+	Lexical   float64 `json:"lexical"`
+	Embedding float64 `json:"embedding"`
+}
+
+type BaselineConfig struct {
+	Quality BaselineQuality `json:"quality"`
+	Runtime BaselineRuntime `json:"runtime"`
+}
+
+type BaselineQuality struct {
+	MaxOverallPAt1Drop  float64 `json:"max_overall_p_at_1_drop"`
+	MaxOverallMRRDrop   float64 `json:"max_overall_mrr_drop"`
+	MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"`
+	MaxCorpusPAt1Drop   float64 `json:"max_corpus_p_at_1_drop"`
+	MaxTagPAt1Drop      float64 `json:"max_tag_p_at_1_drop"`
+}
+
+type BaselineRuntime struct {
+	MaxNsOpRegressionRatio   float64 `json:"max_ns_op_regression_ratio"`
+	MaxAllocRegressionRatio  float64 `json:"max_alloc_regression_ratio"`
+}
+
+type CheckConfig struct {
+	Profile      string
+	BaselinePath string
+	OutputDir    string
+	Format       string
+	FailOnReg    bool
+	Quick        bool
+	Verbose      bool
+	Explain      bool
+}
+
+type RunConfig struct {
+	Suite           string
+	Corpus          string
+	QueryID         string
+	Strategy        string
+	Threshold       float64
+	TopK            int
+	LexicalWeight   float64
+	EmbeddingWeight float64
+	Profile         string
+	Mode            string
+	Verbose         bool
+	Explain         bool
+	OutputDir       string
+	ReportName      string
+}
+
+type CompareConfig struct {
+	BaselinePath string
+	CurrentPath  string
+	Format       string
+	Verbose      bool
+}
+
+type LintConfig struct {
+	Format  string
+	Verbose bool
+}
+
+type CatalogConfig struct {
+	Format string
+	By     string
+}
+
+func FindBenchmarkRoot() string {
+	cwd, _ := os.Getwd()
+	for d := cwd; d != "/"; d = filepath.Dir(d) {
+		if _, err := os.Stat(filepath.Join(d, "tests/benchmark/config/benchmark.json")); err == nil {
+			return filepath.Join(d, "tests/benchmark")
+		}
+		if _, err := os.Stat(filepath.Join(d, "go.mod")); err == nil {
+			return filepath.Join(d, "tests/benchmark")
+		}
+	}
+	return filepath.Join(cwd, "tests/benchmark")
+}
+
+func LoadConfig(benchmarkRoot string) (*Config, error) {
+	path := filepath.Join(benchmarkRoot, "config/benchmark.json")
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var cfg Config
+	if err := json.Unmarshal(data, &cfg); err != nil {
+		return nil, err
+	}
+	return &cfg, nil
+}
+
+func ResolveProfile(cfg *Config, name string) Profile {
+	p, ok := cfg.Profiles[name]
+	if !ok {
+		return Profile{
+			Strategy:  "combined",
+			Threshold: 0.01,
+			TopK:      5,
+			Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
+			Suites:    []string{"corpus"},
+			Mode:      "library",
+		}
+	}
+	if p.Inherits != "" {
+		base := ResolveProfile(cfg, p.Inherits)
+		if p.Strategy == "" {
+			p.Strategy = base.Strategy
+		}
+		if p.Threshold == 0 {
+			p.Threshold = base.Threshold
+		}
+		if p.TopK == 0 {
+			p.TopK = base.TopK
+		}
+		if p.Weights.Lexical == 0 && p.Weights.Embedding == 0 {
+			p.Weights = base.Weights
+		}
+		if len(p.Suites) == 0 {
+			p.Suites = base.Suites
+		}
+		if p.Mode == "" {
+			p.Mode = base.Mode
+		}
+	}
+	return p
+}
+
+func ParseCheckFlags(args []string) CheckConfig {
+	fs := flag.NewFlagSet("check", flag.ExitOnError)
+	cfg := CheckConfig{
+		Profile:   "default",
+		OutputDir: filepath.Join(FindBenchmarkRoot(), "results"),
+		Format:    "text",
+	}
+	fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile")
+	fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline file path")
+	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (text|json|github)")
+	fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression")
+	fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details")
+	fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations")
+	fs.Parse(args)
+	return cfg
+}
+
+func ParseRunFlags(args []string) RunConfig {
+	fs := flag.NewFlagSet("run", flag.ExitOnError)
+	cfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Profile:         "default",
+		Mode:            "library",
+		OutputDir:       filepath.Join(FindBenchmarkRoot(), "results"),
+	}
+	fs.StringVar(&cfg.Suite, "suite", cfg.Suite, "suite to run (corpus|recovery|classification|runtime|all)")
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to run")
+	fs.StringVar(&cfg.QueryID, "query", "", "specific query ID to run")
+	fs.StringVar(&cfg.Strategy, "strategy", cfg.Strategy, "matching strategy")
+	fs.Float64Var(&cfg.Threshold, "threshold", cfg.Threshold, "score threshold")
+	fs.IntVar(&cfg.TopK, "top-k", cfg.TopK, "number of results")
+	fs.Float64Var(&cfg.LexicalWeight, "lexical-weight", cfg.LexicalWeight, "lexical weight")
+	fs.Float64Var(&cfg.EmbeddingWeight, "embedding-weight", cfg.EmbeddingWeight, "embedding weight")
+	fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile")
+	fs.StringVar(&cfg.Mode, "mode", cfg.Mode, "execution mode (cli|library|both)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.BoolVar(&cfg.Explain, "explain", false, "include explanations")
+	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
+	fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name")
+	fs.Parse(args)
+	return cfg
+}
+
+func ParseCompareFlags(args []string) CompareConfig {
+	fs := flag.NewFlagSet("compare", flag.ExitOnError)
+	cfg := CompareConfig{
+		Format: "text",
+	}
+	fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline report path (required)")
+	fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)")
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}
+
+func ParseLintFlags(args []string) LintConfig {
+	fs := flag.NewFlagSet("lint", flag.ExitOnError)
+	cfg := LintConfig{
+		Format: "text",
+	}
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}
+
+func ParseCatalogFlags(args []string) CatalogConfig {
+	fs := flag.NewFlagSet("catalog", flag.ExitOnError)
+	cfg := CatalogConfig{
+		Format: "table",
+	}
+	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)")
+	fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)")
+	fs.Parse(args)
+	return cfg
+}
diff --git a/internal/benchmark/dataset.go b/internal/benchmark/dataset.go
new file mode 100644
index 0000000..555b503
--- /dev/null
+++ b/internal/benchmark/dataset.go
@@ -0,0 +1,117 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+
+	"github.com/pinchtab/semantic"
+)
+
+type Query struct {
+	ID                    string   `json:"id"`
+	QueryText             string   `json:"query"`
+	RelevantRefs          []string `json:"relevant_refs"`
+	PartiallyRelevantRefs []string `json:"partially_relevant_refs"`
+	Difficulty            string   `json:"difficulty"`
+	Tags                  []string `json:"tags"`
+	Intent                string   `json:"intent,omitempty"`
+	PageType              string   `json:"page_type,omitempty"`
+	Threshold             *float64 `json:"threshold,omitempty"`
+	TopK                  *int     `json:"top_k,omitempty"`
+	ExpectNoMatch         bool     `json:"expect_no_match,omitempty"`
+	MinScore              *float64 `json:"min_score,omitempty"`
+	Notes                 string   `json:"notes,omitempty"`
+}
+
+type Corpus struct {
+	ID        string
+	Path      string
+	Snapshot  []semantic.ElementDescriptor
+	Queries   []Query
+}
+
+type Dataset struct {
+	Root    string
+	Corpora []Corpus
+}
+
+func LoadDataset(benchmarkRoot string) (*Dataset, error) {
+	corpusDir := filepath.Join(benchmarkRoot, "corpus")
+	entries, err := os.ReadDir(corpusDir)
+	if err != nil {
+		return nil, err
+	}
+
+	ds := &Dataset{Root: benchmarkRoot}
+
+	for _, entry := range entries {
+		if !entry.IsDir() {
+			continue
+		}
+
+		corpusPath := filepath.Join(corpusDir, entry.Name())
+		snapshotPath := filepath.Join(corpusPath, "snapshot.json")
+		queriesPath := filepath.Join(corpusPath, "queries.json")
+
+		if _, err := os.Stat(snapshotPath); os.IsNotExist(err) {
+			continue
+		}
+		if _, err := os.Stat(queriesPath); os.IsNotExist(err) {
+			continue
+		}
+
+		corpus, err := loadCorpus(entry.Name(), corpusPath)
+		if err != nil {
+			return nil, err
+		}
+
+		ds.Corpora = append(ds.Corpora, *corpus)
+	}
+
+	return ds, nil
+}
+
+func loadCorpus(id, path string) (*Corpus, error) {
+	snapshotPath := filepath.Join(path, "snapshot.json")
+	queriesPath := filepath.Join(path, "queries.json")
+
+	snapshotData, err := os.ReadFile(snapshotPath)
+	if err != nil {
+		return nil, err
+	}
+
+	var snapshot []semantic.ElementDescriptor
+	if err := json.Unmarshal(snapshotData, &snapshot); err != nil {
+		return nil, err
+	}
+
+	queriesData, err := os.ReadFile(queriesPath)
+	if err != nil {
+		return nil, err
+	}
+
+	var queries []Query
+	if err := json.Unmarshal(queriesData, &queries); err != nil {
+		return nil, err
+	}
+
+	return &Corpus{
+		ID:       id,
+		Path:     path,
+		Snapshot: snapshot,
+		Queries:  queries,
+	}, nil
+}
+
+func (ds *Dataset) QueryCount() int {
+	count := 0
+	for _, c := range ds.Corpora {
+		count += len(c.Queries)
+	}
+	return count
+}
+
+func (ds *Dataset) CorpusCount() int {
+	return len(ds.Corpora)
+}
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
new file mode 100644
index 0000000..391cc0a
--- /dev/null
+++ b/internal/benchmark/runner.go
@@ -0,0 +1,384 @@
+package benchmark
+
+import (
+	"context"
+	"time"
+
+	"github.com/pinchtab/semantic"
+)
+
+type QueryResult struct {
+	ID       string   `json:"id"`
+	Corpus   string   `json:"corpus"`
+	Query    string   `json:"query"`
+	Difficulty string `json:"difficulty"`
+	Tags     []string `json:"tags"`
+	Intent   string   `json:"intent,omitempty"`
+	PageType string   `json:"page_type,omitempty"`
+	Expected struct {
+		RelevantRefs          []string `json:"relevant_refs"`
+		PartiallyRelevantRefs []string `json:"partially_relevant_refs"`
+	} `json:"expected"`
+	Actual struct {
+		BestRef   string  `json:"best_ref"`
+		BestScore float64 `json:"best_score"`
+		Matches   []Match `json:"matches"`
+	} `json:"actual"`
+	Metrics struct {
+		RR                float64 `json:"rr"`
+		PAt1              float64 `json:"p_at_1"`
+		PAt3              float64 `json:"p_at_3"`
+		HitAt3            int     `json:"hit_at_3"`
+		HitAt5            int     `json:"hit_at_5"`
+		BestRelevantRank  *int    `json:"best_relevant_rank"`
+		BestRelevantScore float64 `json:"best_relevant_score"`
+		BestWrongScore    float64 `json:"best_wrong_score"`
+		Margin            float64 `json:"margin"`
+	} `json:"metrics"`
+	Latency struct {
+		LibraryMs int64 `json:"library_ms"`
+		CLIMs     *int64 `json:"cli_ms,omitempty"`
+	} `json:"latency"`
+	Status string `json:"status"`
+}
+
+type Match struct {
+	Ref   string  `json:"ref"`
+	Score float64 `json:"score"`
+	Role  string  `json:"role"`
+	Name  string  `json:"name"`
+}
+
+type Report struct {
+	SchemaVersion string `json:"schema_version"`
+	Run           struct {
+		ID        string `json:"id"`
+		Timestamp string `json:"timestamp"`
+		Tool      string `json:"tool"`
+		GitSHA    string `json:"git_sha,omitempty"`
+		GitDirty  bool   `json:"git_dirty,omitempty"`
+		Command   string `json:"command"`
+	} `json:"run"`
+	Dataset struct {
+		Name         string `json:"name"`
+		Version      string `json:"version,omitempty"`
+		QueryCount   int    `json:"query_count"`
+		CorpusCount  int    `json:"corpus_count"`
+	} `json:"dataset"`
+	Config struct {
+		Profile   string  `json:"profile"`
+		Strategy  string  `json:"strategy"`
+		Threshold float64 `json:"threshold"`
+		TopK      int     `json:"top_k"`
+		Weights   Weights `json:"weights"`
+	} `json:"config"`
+	Status  string `json:"status"`
+	Metrics struct {
+		Overall    OverallMetrics           `json:"overall"`
+		Latency    LatencyMetrics           `json:"latency"`
+		ByCorpus   map[string]CorpusMetrics `json:"by_corpus"`
+		ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"`
+		ByTag      map[string]CorpusMetrics `json:"by_tag"`
+	} `json:"metrics"`
+	Results []QueryResult `json:"results"`
+}
+
+type OverallMetrics struct {
+	Total     int     `json:"total"`
+	MRR       float64 `json:"mrr"`
+	PAt1      float64 `json:"p_at_1"`
+	PAt3      float64 `json:"p_at_3"`
+	HitAt3    float64 `json:"hit_at_3"`
+	HitAt5    float64 `json:"hit_at_5"`
+	AvgMargin float64 `json:"avg_margin"`
+}
+
+type LatencyMetrics struct {
+	LibraryP50Ms int64  `json:"library_p50_ms"`
+	LibraryP95Ms int64  `json:"library_p95_ms"`
+	CLIP50Ms     *int64 `json:"cli_p50_ms,omitempty"`
+	CLIP95Ms     *int64 `json:"cli_p95_ms,omitempty"`
+}
+
+type CorpusMetrics struct {
+	Count     int     `json:"count"`
+	MRR       float64 `json:"mrr"`
+	PAt1      float64 `json:"p_at_1"`
+	HitAt3    float64 `json:"hit_at_3"`
+	AvgMargin float64 `json:"avg_margin"`
+}
+
+func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
+	matcher := createMatcher(cfg)
+
+	report := &Report{
+		SchemaVersion: "1.0.0",
+		Status:        "pass",
+	}
+	report.Run.ID = time.Now().Format("20060102-150405") + "-" + cfg.Profile
+	report.Run.Timestamp = time.Now().UTC().Format(time.RFC3339)
+	report.Run.Tool = "semantic-bench"
+	report.Dataset.Name = "semantic-ui-matching-corpus"
+	report.Dataset.QueryCount = ds.QueryCount()
+	report.Dataset.CorpusCount = ds.CorpusCount()
+	report.Config.Profile = cfg.Profile
+	report.Config.Strategy = cfg.Strategy
+	report.Config.Threshold = cfg.Threshold
+	report.Config.TopK = cfg.TopK
+	report.Config.Weights = Weights{Lexical: cfg.LexicalWeight, Embedding: cfg.EmbeddingWeight}
+
+	report.Metrics.ByCorpus = make(map[string]CorpusMetrics)
+	report.Metrics.ByDifficulty = make(map[string]CorpusMetrics)
+	report.Metrics.ByTag = make(map[string]CorpusMetrics)
+
+	var allLatencies []int64
+
+	for _, corpus := range ds.Corpora {
+		if cfg.Corpus != "" && corpus.ID != cfg.Corpus {
+			continue
+		}
+
+		for _, query := range corpus.Queries {
+			if cfg.QueryID != "" && query.ID != cfg.QueryID {
+				continue
+			}
+
+			result := runQuery(matcher, corpus, query, cfg)
+			report.Results = append(report.Results, result)
+			allLatencies = append(allLatencies, result.Latency.LibraryMs)
+		}
+	}
+
+	aggregateMetrics(report, allLatencies)
+	return report, nil
+}
+
+func createMatcher(cfg RunConfig) semantic.ElementMatcher {
+	embedder := semantic.NewHashingEmbedder(128)
+	switch cfg.Strategy {
+	case "lexical":
+		return semantic.NewLexicalMatcher()
+	case "embedding":
+		return semantic.NewEmbeddingMatcher(embedder)
+	default:
+		return semantic.NewCombinedMatcher(embedder)
+	}
+}
+
+func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg RunConfig) QueryResult {
+	result := QueryResult{
+		ID:         query.ID,
+		Corpus:     corpus.ID,
+		Query:      query.QueryText,
+		Difficulty: query.Difficulty,
+		Tags:       query.Tags,
+		Intent:     query.Intent,
+		PageType:   query.PageType,
+	}
+	result.Expected.RelevantRefs = query.RelevantRefs
+	result.Expected.PartiallyRelevantRefs = query.PartiallyRelevantRefs
+
+	threshold := cfg.Threshold
+	if query.Threshold != nil {
+		threshold = *query.Threshold
+	}
+	topK := cfg.TopK
+	if query.TopK != nil {
+		topK = *query.TopK
+	}
+
+	start := time.Now()
+	findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{
+		Threshold: threshold,
+		TopK:      topK,
+	})
+	result.Latency.LibraryMs = time.Since(start).Milliseconds()
+
+	result.Actual.BestRef = findResult.BestRef
+	result.Actual.BestScore = findResult.BestScore
+	for _, m := range findResult.Matches {
+		result.Actual.Matches = append(result.Actual.Matches, Match{
+			Ref:   m.Ref,
+			Score: m.Score,
+			Role:  m.Role,
+			Name:  m.Name,
+		})
+	}
+
+	computeQueryMetrics(&result, query)
+	return result
+}
+
+func computeQueryMetrics(result *QueryResult, query Query) {
+	relevantSet := make(map[string]bool)
+	for _, r := range query.RelevantRefs {
+		relevantSet[r] = true
+	}
+	partialSet := make(map[string]bool)
+	for _, r := range query.PartiallyRelevantRefs {
+		partialSet[r] = true
+	}
+
+	// Reciprocal Rank
+	for i, m := range result.Actual.Matches {
+		if relevantSet[m.Ref] {
+			result.Metrics.RR = 1.0 / float64(i+1)
+			break
+		}
+	}
+
+	// P@1
+	if len(result.Actual.Matches) > 0 {
+		if relevantSet[result.Actual.Matches[0].Ref] {
+			result.Metrics.PAt1 = 1.0
+		} else if partialSet[result.Actual.Matches[0].Ref] {
+			result.Metrics.PAt1 = 0.5
+		}
+	}
+
+	// P@3, Hit@3, Hit@5
+	relevantInTop3 := 0
+	partialInTop3 := 0
+	for i, m := range result.Actual.Matches {
+		if i >= 5 {
+			break
+		}
+		if relevantSet[m.Ref] {
+			if result.Metrics.BestRelevantRank == nil {
+				rank := i + 1
+				result.Metrics.BestRelevantRank = &rank
+			}
+			if result.Metrics.BestRelevantScore == 0 || m.Score > result.Metrics.BestRelevantScore {
+				result.Metrics.BestRelevantScore = m.Score
+			}
+			if i < 3 {
+				relevantInTop3++
+				result.Metrics.HitAt3 = 1
+			}
+			result.Metrics.HitAt5 = 1
+		} else if partialSet[m.Ref] {
+			if i < 3 {
+				partialInTop3++
+			}
+		} else {
+			if m.Score > result.Metrics.BestWrongScore {
+				result.Metrics.BestWrongScore = m.Score
+			}
+		}
+	}
+	result.Metrics.PAt3 = (float64(relevantInTop3) + float64(partialInTop3)*0.5) / 3.0
+	result.Metrics.Margin = result.Metrics.BestRelevantScore - result.Metrics.BestWrongScore
+
+	// Status
+	if query.ExpectNoMatch {
+		if len(result.Actual.Matches) == 0 {
+			result.Status = "no_match_expected"
+		} else {
+			result.Status = "unexpected_match"
+		}
+	} else if result.Metrics.PAt1 >= 1.0 {
+		result.Status = "hit"
+	} else if result.Metrics.PAt1 >= 0.5 {
+		result.Status = "partial"
+	} else {
+		result.Status = "miss"
+	}
+}
+
+func aggregateMetrics(report *Report, latencies []int64) {
+	n := len(report.Results)
+	if n == 0 {
+		return
+	}
+
+	report.Metrics.Overall.Total = n
+
+	var sumRR, sumP1, sumP3, sumHit3, sumHit5, sumMargin float64
+	corpusAgg := make(map[string]*aggregator)
+	diffAgg := make(map[string]*aggregator)
+	tagAgg := make(map[string]*aggregator)
+
+	for _, r := range report.Results {
+		sumRR += r.Metrics.RR
+		sumP1 += r.Metrics.PAt1
+		sumP3 += r.Metrics.PAt3
+		sumHit3 += float64(r.Metrics.HitAt3)
+		sumHit5 += float64(r.Metrics.HitAt5)
+		sumMargin += r.Metrics.Margin
+
+		addToAgg(corpusAgg, r.Corpus, r)
+		addToAgg(diffAgg, r.Difficulty, r)
+		for _, t := range r.Tags {
+			addToAgg(tagAgg, t, r)
+		}
+	}
+
+	report.Metrics.Overall.MRR = sumRR / float64(n)
+	report.Metrics.Overall.PAt1 = sumP1 / float64(n)
+	report.Metrics.Overall.PAt3 = sumP3 / float64(n)
+	report.Metrics.Overall.HitAt3 = sumHit3 / float64(n)
+	report.Metrics.Overall.HitAt5 = sumHit5 / float64(n)
+	report.Metrics.Overall.AvgMargin = sumMargin / float64(n)
+
+	for k, a := range corpusAgg {
+		report.Metrics.ByCorpus[k] = a.toMetrics()
+	}
+	for k, a := range diffAgg {
+		report.Metrics.ByDifficulty[k] = a.toMetrics()
+	}
+	for k, a := range tagAgg {
+		report.Metrics.ByTag[k] = a.toMetrics()
+	}
+
+	// Latency percentiles
+	if len(latencies) > 0 {
+		sorted := make([]int64, len(latencies))
+		copy(sorted, latencies)
+		sortInt64(sorted)
+		report.Metrics.Latency.LibraryP50Ms = sorted[len(sorted)*50/100]
+		report.Metrics.Latency.LibraryP95Ms = sorted[len(sorted)*95/100]
+	}
+}
+
+type aggregator struct {
+	count     int
+	sumRR     float64
+	sumP1     float64
+	sumHit3   float64
+	sumMargin float64
+}
+
+func addToAgg(m map[string]*aggregator, key string, r QueryResult) {
+	if _, ok := m[key]; !ok {
+		m[key] = &aggregator{}
+	}
+	a := m[key]
+	a.count++
+	a.sumRR += r.Metrics.RR
+	a.sumP1 += r.Metrics.PAt1
+	a.sumHit3 += float64(r.Metrics.HitAt3)
+	a.sumMargin += r.Metrics.Margin
+}
+
+func (a *aggregator) toMetrics() CorpusMetrics {
+	if a.count == 0 {
+		return CorpusMetrics{}
+	}
+	return CorpusMetrics{
+		Count:     a.count,
+		MRR:       a.sumRR / float64(a.count),
+		PAt1:      a.sumP1 / float64(a.count),
+		HitAt3:    a.sumHit3 / float64(a.count),
+		AvgMargin: a.sumMargin / float64(a.count),
+	}
+}
+
+func sortInt64(s []int64) {
+	for i := range s {
+		for j := i + 1; j < len(s); j++ {
+			if s[j] < s[i] {
+				s[i], s[j] = s[j], s[i]
+			}
+		}
+	}
+}

From fd7a3f195170fdde80af5fccdba951b4277f8be2 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 15:38:26 +0100
Subject: [PATCH 05/14] chore: ignore semantic-bench binary

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 8a46978..9a58d8e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Binary
 /semantic
+/semantic-bench
 tests/benchmark/semantic
 tests/e2e/semantic
 *.exe

From 8b21ba78ba88bfbfeca4f47fba309853ba926e36 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 15:48:00 +0100
Subject: [PATCH 06/14] feat: add baseline, calibrate, tune commands to Go CLI

Move benchmark management from bash scripts to Go:
- `semantic-bench baseline create/update` - manage quality baselines
- `semantic-bench calibrate` - threshold optimization via precision/recall
- `semantic-bench tune` - grid-search lexical/embedding weights

Update dev tool to use Go CLI for all benchmark commands.
---
 cmd/semantic-bench/main.go     |  49 ++++-
 dev                            |  10 +-
 internal/benchmark/commands.go | 363 +++++++++++++++++++++++++++++++++
 internal/benchmark/config.go   |  59 ++++++
 4 files changed, 471 insertions(+), 10 deletions(-)

diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go
index 35bf051..4866601 100644
--- a/cmd/semantic-bench/main.go
+++ b/cmd/semantic-bench/main.go
@@ -13,11 +13,14 @@ Usage:
   semantic-bench <command> [flags]
 
 Commands:
-  check     Run benchmark and compare against baseline (default)
-  run       Run benchmark suites
-  compare   Compare two reports
-  lint      Validate dataset
-  catalog   Print dataset inventory
+  check       Run benchmark and compare against baseline (default)
+  run         Run benchmark suites
+  compare     Compare two reports
+  lint        Validate dataset
+  catalog     Print dataset inventory
+  baseline    Manage quality baselines (create, update)
+  calibrate   Find optimal thresholds via precision/recall analysis
+  tune        Grid-search lexical/embedding weights
 
 Flags:
   -h, --help    Show help
@@ -45,6 +48,12 @@ func main() {
 		runLint(args)
 	case "catalog":
 		runCatalog(args)
+	case "baseline":
+		runBaseline(args)
+	case "calibrate":
+		runCalibrate(args)
+	case "tune":
+		runTune(args)
 	case "-h", "--help", "help":
 		fmt.Print(usage)
 	default:
@@ -111,3 +120,33 @@ func runCatalog(args []string) {
 	}
 	benchmark.PrintCatalogResult(result, cfg)
 }
+
+func runBaseline(args []string) {
+	cfg := benchmark.ParseBaselineFlags(args)
+	result, err := benchmark.RunBaseline(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintBaselineResult(result, cfg)
+}
+
+func runCalibrate(args []string) {
+	cfg := benchmark.ParseCalibrateFlags(args)
+	result, err := benchmark.RunCalibrate(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintCalibrateResult(result, cfg)
+}
+
+func runTune(args []string) {
+	cfg := benchmark.ParseTuneFlags(args)
+	result, err := benchmark.RunTune(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintTuneResult(result, cfg)
+}
diff --git a/dev b/dev
index a7f6247..da0f70c 100755
--- a/dev
+++ b/dev
@@ -177,22 +177,22 @@ run_lint_docs() {
 
 run_baseline() {
   echo "  ${ACCENT}${BOLD}📏 Creating quality baseline${NC}"
-  bash tests/benchmark/scripts/create-baseline.sh "$@"
+  go run ./cmd/semantic-bench baseline create "$@"
 }
 
 run_baseline_check() {
   echo "  ${ACCENT}${BOLD}📏 Checking against baseline${NC}"
-  bash tests/benchmark/scripts/check-baseline.sh "$@"
+  go run ./cmd/semantic-bench check "$@"
 }
 
 run_baseline_update() {
   echo "  ${ACCENT}${BOLD}📏 Updating baseline${NC}"
-  bash tests/benchmark/scripts/update-baseline.sh --accept "$@"
+  go run ./cmd/semantic-bench baseline update --accept "$@"
 }
 
 run_calibrate() {
   echo "  ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}"
-  bash tests/benchmark/scripts/calibrate-thresholds.sh "$@"
+  go run ./cmd/semantic-bench calibrate -verbose "$@"
 }
 
 run_runtime() {
@@ -202,7 +202,7 @@ run_runtime() {
 
 run_tune() {
   echo "  ${ACCENT}${BOLD}🎛️ Tuning combined weights${NC}"
-  bash tests/benchmark/scripts/tune-weights.sh "$@"
+  go run ./cmd/semantic-bench tune -verbose "$@"
 }
 
 run_e2e() {
diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go
index ad22ea3..7f37ed5 100644
--- a/internal/benchmark/commands.go
+++ b/internal/benchmark/commands.go
@@ -8,6 +8,8 @@ import (
 	"sort"
 	"strings"
 	"time"
+
+	"github.com/pinchtab/semantic"
 )
 
 type CheckResult struct {
@@ -508,3 +510,364 @@ func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
 	}
 	fmt.Printf("\n")
 }
+
+// Baseline management
+
+type BaselineResult struct {
+	Action   string         `json:"action"`
+	Path     string         `json:"path"`
+	Metrics  OverallMetrics `json:"metrics"`
+	Previous *OverallMetrics `json:"previous,omitempty"`
+}
+
+func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) {
+	root := FindBenchmarkRoot()
+	baselinesDir := filepath.Join(root, "baselines")
+	if err := os.MkdirAll(baselinesDir, 0755); err != nil {
+		return nil, err
+	}
+
+	baselinePath := filepath.Join(baselinesDir, cfg.Name+".json")
+
+	switch cfg.Action {
+	case "create":
+		return createBaseline(root, baselinePath, cfg)
+	case "update":
+		if !cfg.Accept {
+			return nil, fmt.Errorf("use --accept to confirm baseline update")
+		}
+		return updateBaseline(root, baselinePath, cfg)
+	default:
+		return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action)
+	}
+}
+
+func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Mode:            "library",
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	data, err := json.MarshalIndent(report, "", "  ")
+	if err != nil {
+		return nil, err
+	}
+	if err := os.WriteFile(baselinePath, data, 0644); err != nil {
+		return nil, err
+	}
+
+	return &BaselineResult{
+		Action:  "create",
+		Path:    baselinePath,
+		Metrics: report.Metrics.Overall,
+	}, nil
+}
+
+func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	var previous *OverallMetrics
+	if data, err := os.ReadFile(baselinePath); err == nil {
+		var old Report
+		if json.Unmarshal(data, &old) == nil {
+			previous = &old.Metrics.Overall
+		}
+		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
+		os.WriteFile(backupPath, data, 0644)
+	}
+
+	result, err := createBaseline(root, baselinePath, cfg)
+	if err != nil {
+		return nil, err
+	}
+	result.Action = "update"
+	result.Previous = previous
+	return result, nil
+}
+
+func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) {
+	fmt.Printf("\n  Baseline %sd: %s\n\n", result.Action, result.Path)
+	fmt.Printf("  MRR:    %.4f\n", result.Metrics.MRR)
+	fmt.Printf("  P@1:    %.4f\n", result.Metrics.PAt1)
+	fmt.Printf("  Hit@3:  %.4f\n", result.Metrics.HitAt3)
+
+	if result.Previous != nil {
+		fmt.Printf("\n  Previous:\n")
+		fmt.Printf("    MRR:    %.4f\n", result.Previous.MRR)
+		fmt.Printf("    P@1:    %.4f\n", result.Previous.PAt1)
+		fmt.Printf("    Hit@3:  %.4f\n", result.Previous.HitAt3)
+	}
+	fmt.Println()
+}
+
+// Threshold calibration
+
+type CalibrateResult struct {
+	ByThreshold     map[string]ThresholdMetrics `json:"by_threshold"`
+	Recommendations CalibrateRecommendations    `json:"recommendations"`
+	TotalCases      int                         `json:"total_cases"`
+}
+
+type ThresholdMetrics struct {
+	TP        int     `json:"tp"`
+	FP        int     `json:"fp"`
+	FN        int     `json:"fn"`
+	TN        int     `json:"tn"`
+	Recall    float64 `json:"recall"`
+	Precision float64 `json:"precision"`
+	FPR       float64 `json:"false_positive_rate"`
+	F1        float64 `json:"f1"`
+}
+
+type CalibrateRecommendations struct {
+	DefaultThreshold  float64 `json:"default_threshold"`
+	RecoveryThreshold float64 `json:"recovery_threshold"`
+	BestF1            float64 `json:"best_f1"`
+}
+
+func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &CalibrateResult{
+		ByThreshold: make(map[string]ThresholdMetrics),
+	}
+
+	type testCase struct {
+		query         Query
+		corpus        *Corpus
+	}
+
+	var cases []testCase
+	for i := range ds.Corpora {
+		corpus := &ds.Corpora[i]
+		if cfg.Corpus != "" && corpus.ID != cfg.Corpus {
+			continue
+		}
+		for _, q := range corpus.Queries {
+			cases = append(cases, testCase{query: q, corpus: corpus})
+		}
+	}
+	result.TotalCases = len(cases)
+
+	if cfg.Verbose {
+		fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases))
+	}
+
+	runCfg := RunConfig{
+		Strategy:        "combined",
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+	}
+	matcher := createMatcher(runCfg)
+
+	var bestF1, bestF1Threshold float64
+	var bestRecallThreshold float64
+	var bestRecallWithPrecision float64
+
+	for _, threshold := range cfg.Thresholds {
+		tp, fp, fn, tn := 0, 0, 0, 0
+
+		for _, tc := range cases {
+			findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
+				Threshold: threshold,
+				TopK:      5,
+			})
+
+			hasMatch := len(findResult.Matches) > 0
+			topRef := ""
+			if hasMatch {
+				topRef = findResult.Matches[0].Ref
+			}
+
+			if tc.query.ExpectNoMatch {
+				if hasMatch {
+					fp++
+				} else {
+					tn++
+				}
+			} else if len(tc.query.RelevantRefs) > 0 {
+				if !hasMatch {
+					fn++
+				} else if contains(tc.query.RelevantRefs, topRef) {
+					tp++
+				} else {
+					fp++
+				}
+			}
+		}
+
+		totalPos := tp + fn
+		totalNeg := tn + fp
+
+		var recall, precision, fpr, f1 float64
+		if totalPos > 0 {
+			recall = float64(tp) / float64(totalPos)
+		}
+		if tp+fp > 0 {
+			precision = float64(tp) / float64(tp+fp)
+		}
+		if totalNeg > 0 {
+			fpr = float64(fp) / float64(totalNeg)
+		}
+		if precision+recall > 0 {
+			f1 = 2 * precision * recall / (precision + recall)
+		}
+
+		key := fmt.Sprintf("%.2f", threshold)
+		result.ByThreshold[key] = ThresholdMetrics{
+			TP: tp, FP: fp, FN: fn, TN: tn,
+			Recall: recall, Precision: precision, FPR: fpr, F1: f1,
+		}
+
+		if f1 > bestF1 {
+			bestF1 = f1
+			bestF1Threshold = threshold
+		}
+		if recall >= 0.85 && precision > bestRecallWithPrecision {
+			bestRecallWithPrecision = precision
+			bestRecallThreshold = threshold
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n",
+				threshold, tp, fp, fn, tn, recall, precision, f1)
+		}
+	}
+
+	if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 {
+		bestRecallThreshold = cfg.Thresholds[0]
+	}
+
+	result.Recommendations = CalibrateRecommendations{
+		DefaultThreshold:  bestF1Threshold,
+		RecoveryThreshold: bestRecallThreshold,
+		BestF1:            bestF1,
+	}
+
+	return result, nil
+}
+
+func contains(refs []string, ref string) bool {
+	for _, r := range refs {
+		if r == ref {
+			return true
+		}
+	}
+	return false
+}
+
+func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) {
+	fmt.Printf("\n  Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold))
+
+	fmt.Printf("  Recommendations:\n")
+	fmt.Printf("    Default (best F1):   %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1)
+	fmt.Printf("    Recovery (recall):   %.2f\n", result.Recommendations.RecoveryThreshold)
+	fmt.Println()
+}
+
+// Weight tuning
+
+type TuneResult struct {
+	Results []TuneRun `json:"results"`
+	Best    *TuneRun  `json:"best"`
+}
+
+type TuneRun struct {
+	LexicalWeight   float64 `json:"lexical_weight"`
+	EmbeddingWeight float64 `json:"embedding_weight"`
+	MRR             float64 `json:"mrr"`
+	PAt1            float64 `json:"p_at_1"`
+	HitAt3          float64 `json:"hit_at_3"`
+}
+
+func RunTune(cfg TuneConfig) (*TuneResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &TuneResult{}
+
+	if cfg.Verbose {
+		fmt.Printf("  %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3")
+	}
+
+	for w := 0.0; w <= 1.0001; w += cfg.Step {
+		lexW := w
+		embW := 1.0 - w
+
+		runCfg := RunConfig{
+			Suite:           "corpus",
+			Strategy:        "combined",
+			Threshold:       0.01,
+			TopK:            5,
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			Mode:            "library",
+		}
+
+		if cfg.Corpus != "" {
+			runCfg.Corpus = cfg.Corpus
+		}
+
+		report, err := RunCorpusBenchmark(ds, runCfg)
+		if err != nil {
+			return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err)
+		}
+
+		run := TuneRun{
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			MRR:             report.Metrics.Overall.MRR,
+			PAt1:            report.Metrics.Overall.PAt1,
+			HitAt3:          report.Metrics.Overall.HitAt3,
+		}
+		result.Results = append(result.Results, run)
+
+		if result.Best == nil || run.PAt1 > result.Best.PAt1 ||
+			(run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) {
+			best := run
+			result.Best = &best
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n",
+				lexW, embW, run.MRR, run.PAt1, run.HitAt3)
+		}
+	}
+
+	return result, nil
+}
+
+func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
+	fmt.Printf("\n  Tested %d weight combinations\n\n", len(result.Results))
+
+	if result.Best != nil {
+		fmt.Printf("  Best weights:\n")
+		fmt.Printf("    Lexical:   %.2f\n", result.Best.LexicalWeight)
+		fmt.Printf("    Embedding: %.2f\n", result.Best.EmbeddingWeight)
+		fmt.Printf("    MRR:       %.4f\n", result.Best.MRR)
+		fmt.Printf("    P@1:       %.4f\n", result.Best.PAt1)
+		fmt.Printf("    Hit@3:     %.4f\n", result.Best.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index c8ac10d..eb2fe57 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -99,6 +99,25 @@ type CatalogConfig struct {
 	By     string
 }
 
+type BaselineCmdConfig struct {
+	Action  string // "create" or "update"
+	Name    string
+	Accept  bool
+	Verbose bool
+}
+
+type CalibrateConfig struct {
+	Corpus     string
+	Thresholds []float64
+	Verbose    bool
+}
+
+type TuneConfig struct {
+	Corpus  string
+	Step    float64
+	Verbose bool
+}
+
 func FindBenchmarkRoot() string {
 	cwd, _ := os.Getwd()
 	for d := cwd; d != "/"; d = filepath.Dir(d) {
@@ -245,3 +264,43 @@ func ParseCatalogFlags(args []string) CatalogConfig {
 	fs.Parse(args)
 	return cfg
 }
+
+func ParseBaselineFlags(args []string) BaselineCmdConfig {
+	fs := flag.NewFlagSet("baseline", flag.ExitOnError)
+	cfg := BaselineCmdConfig{
+		Action: "create",
+		Name:   "combined",
+	}
+	fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name")
+	fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+
+	if len(fs.Args()) > 0 {
+		cfg.Action = fs.Args()[0]
+	}
+	return cfg
+}
+
+func ParseCalibrateFlags(args []string) CalibrateConfig {
+	fs := flag.NewFlagSet("calibrate", flag.ExitOnError)
+	cfg := CalibrateConfig{
+		Thresholds: []float64{0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60},
+	}
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}
+
+func ParseTuneFlags(args []string) TuneConfig {
+	fs := flag.NewFlagSet("tune", flag.ExitOnError)
+	cfg := TuneConfig{
+		Step: 0.1,
+	}
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against")
+	fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}

From 5e39de714da80fc5e1c133fbaaa98e1c2bbc368c Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 17:31:21 +0100
Subject: [PATCH 07/14] chore: remove bash scripts replaced by Go CLI

Keep only check-runtime-baseline.sh (wraps go test -bench).
---
 .../benchmark/scripts/calibrate-thresholds.sh | 340 ------------
 tests/benchmark/scripts/check-baseline.sh     | 140 -----
 tests/benchmark/scripts/create-baseline.sh    |  86 ---
 tests/benchmark/scripts/finalize-report.sh    | 115 ----
 tests/benchmark/scripts/lint-corpus.sh        | 197 -------
 tests/benchmark/scripts/record-result.sh      |  44 --
 tests/benchmark/scripts/run-benchmark.sh      | 226 --------
 .../benchmark/scripts/run-corpus-benchmark.sh | 514 ------------------
 tests/benchmark/scripts/run-full-benchmark.sh | 317 -----------
 .../scripts/run-recovery-benchmark.sh         |  42 --
 tests/benchmark/scripts/tune-weights.sh       | 167 ------
 tests/benchmark/scripts/update-baseline.sh    |  70 ---
 12 files changed, 2258 deletions(-)
 delete mode 100755 tests/benchmark/scripts/calibrate-thresholds.sh
 delete mode 100755 tests/benchmark/scripts/check-baseline.sh
 delete mode 100755 tests/benchmark/scripts/create-baseline.sh
 delete mode 100755 tests/benchmark/scripts/finalize-report.sh
 delete mode 100755 tests/benchmark/scripts/lint-corpus.sh
 delete mode 100755 tests/benchmark/scripts/record-result.sh
 delete mode 100755 tests/benchmark/scripts/run-benchmark.sh
 delete mode 100755 tests/benchmark/scripts/run-corpus-benchmark.sh
 delete mode 100755 tests/benchmark/scripts/run-full-benchmark.sh
 delete mode 100755 tests/benchmark/scripts/run-recovery-benchmark.sh
 delete mode 100755 tests/benchmark/scripts/tune-weights.sh
 delete mode 100755 tests/benchmark/scripts/update-baseline.sh

diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh
deleted file mode 100755
index 84d68d1..0000000
--- a/tests/benchmark/scripts/calibrate-thresholds.sh
+++ /dev/null
@@ -1,340 +0,0 @@
-#!/bin/bash
-#
-# Threshold Calibration Benchmark
-#
-# Calculates optimal thresholds for semantic matching by evaluating
-# recall, precision, and false-positive rates across threshold levels.
-#
-# Usage:
-#   ./calibrate-thresholds.sh [--corpus <dir>]
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-CASES_DIR="${BENCHMARK_DIR}/cases"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-SPECIFIC_CORPUS=""
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json"
-
-# Thresholds to test
-THRESHOLDS=(0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60)
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --argjson thresholds "$(printf '%s\n' "${THRESHOLDS[@]}" | jq -s '.')" \
-    '{
-        calibration: {
-            timestamp: $ts,
-            thresholds_tested: $thresholds
-        },
-        by_threshold: {},
-        by_tag: {},
-        recommendations: {}
-    }' > "${REPORT_FILE}"
-
-echo ""
-echo "=== Threshold Calibration ==="
-echo "Testing thresholds: ${THRESHOLDS[*]}"
-echo ""
-
-# Collect all test cases
-declare -a ALL_QUERIES=()
-declare -a ALL_SNAPSHOTS=()
-declare -a ALL_RELEVANT=()
-declare -a ALL_EXPECT_NO_MATCH=()
-declare -a ALL_IDS=()
-
-load_corpus() {
-    local corpus_path="$1"
-    local snapshot="${corpus_path}/snapshot.json"
-    local queries="${corpus_path}/queries.json"
-
-    if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then
-        return
-    fi
-
-    local count
-    count=$(jq length "$queries")
-
-    for i in $(seq 0 $((count - 1))); do
-        local query relevant id expect_no_match
-        id=$(jq -r ".[$i].id" "$queries")
-        query=$(jq -r ".[$i].query" "$queries")
-        relevant=$(jq -c ".[$i].relevant_refs // []" "$queries")
-        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$queries")
-
-        ALL_IDS+=("$id")
-        ALL_QUERIES+=("$query")
-        ALL_SNAPSHOTS+=("$snapshot")
-        ALL_RELEVANT+=("$relevant")
-        ALL_EXPECT_NO_MATCH+=("$expect_no_match")
-    done
-}
-
-load_cases() {
-    local cases_file="$1"
-    local snapshots_dir="${BENCHMARK_DIR}/../e2e/assets/snapshots"
-
-    if [[ ! -f "$cases_file" ]]; then
-        return
-    fi
-
-    local count
-    count=$(jq length "$cases_file")
-
-    for i in $(seq 0 $((count - 1))); do
-        local id query snapshot_name expect_no_match expect_ref expect_ref_alt relevant
-        id=$(jq -r ".[$i].id" "$cases_file")
-        query=$(jq -r ".[$i].query" "$cases_file")
-        snapshot_name=$(jq -r ".[$i].snapshot" "$cases_file")
-        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$cases_file")
-        expect_ref=$(jq -r ".[$i].expect_ref // \"\"" "$cases_file")
-        expect_ref_alt=$(jq -c ".[$i].expect_ref_alt // []" "$cases_file")
-
-        if [[ -n "$expect_ref" && "$expect_ref" != "null" ]]; then
-            relevant=$(echo "$expect_ref_alt" | jq --arg r "$expect_ref" '. + [$r]')
-        else
-            relevant="[]"
-        fi
-
-        local snapshot="${snapshots_dir}/${snapshot_name}"
-        if [[ ! -f "$snapshot" ]]; then
-            continue
-        fi
-
-        ALL_IDS+=("$id")
-        ALL_QUERIES+=("$query")
-        ALL_SNAPSHOTS+=("$snapshot")
-        ALL_RELEVANT+=("$relevant")
-        ALL_EXPECT_NO_MATCH+=("$expect_no_match")
-    done
-}
-
-echo "Loading test cases..."
-if [[ -n "${SPECIFIC_CORPUS}" ]]; then
-    load_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}"
-else
-    for corpus in "${CORPUS_DIR}"/*/; do
-        [[ -d "$corpus" ]] || continue
-        load_corpus "$corpus"
-    done
-fi
-
-load_cases "${CASES_DIR}/negative-threshold.json"
-
-TOTAL_CASES=${#ALL_QUERIES[@]}
-echo "Loaded ${TOTAL_CASES} test cases"
-echo ""
-
-for threshold in "${THRESHOLDS[@]}"; do
-    echo "Testing threshold ${threshold}..."
-
-    tp=0 fp=0 fn=0 tn=0
-
-    for i in $(seq 0 $((TOTAL_CASES - 1))); do
-        query="${ALL_QUERIES[$i]}"
-        snapshot="${ALL_SNAPSHOTS[$i]}"
-        relevant="${ALL_RELEVANT[$i]}"
-        expect_no_match="${ALL_EXPECT_NO_MATCH[$i]}"
-
-        result=$("${SEMANTIC}" find "${query}" \
-            --snapshot "${snapshot}" \
-            --strategy combined \
-            --threshold "${threshold}" \
-            --top-k 5 \
-            --format json 2>/dev/null) || result='{"matches":[]}'
-
-        match_count=$(echo "$result" | jq '.matches | length')
-        best_ref=$(echo "$result" | jq -r '.best_ref // ""')
-
-        if [[ "$expect_no_match" == "true" ]]; then
-            if [[ $match_count -eq 0 ]]; then
-                tn=$((tn + 1))
-            else
-                fp=$((fp + 1))
-            fi
-        else
-            relevant_count=$(echo "$relevant" | jq 'length')
-            if [[ $relevant_count -eq 0 ]]; then
-                continue
-            fi
-
-            if [[ $match_count -eq 0 ]]; then
-                fn=$((fn + 1))
-            elif echo "$relevant" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-                tp=$((tp + 1))
-            else
-                fp=$((fp + 1))
-            fi
-        fi
-    done
-
-    total_positive=$((tp + fn))
-    total_negative=$((tn + fp))
-
-    if [[ $total_positive -gt 0 ]]; then
-        recall=$(echo "scale=4; $tp / $total_positive" | bc)
-    else
-        recall="0"
-    fi
-
-    if [[ $((tp + fp)) -gt 0 ]]; then
-        precision=$(echo "scale=4; $tp / ($tp + $fp)" | bc)
-    else
-        precision="1"
-    fi
-
-    if [[ $total_negative -gt 0 ]]; then
-        fpr=$(echo "scale=4; $fp / $total_negative" | bc)
-    else
-        fpr="0"
-    fi
-
-    if [[ $(echo "$precision + $recall > 0" | bc) -eq 1 ]]; then
-        f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc)
-    else
-        f1="0"
-    fi
-
-    printf "  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f FPR=%.3f F1=%.3f\n" \
-        "$threshold" "$tp" "$fp" "$fn" "$tn" "$recall" "$precision" "$fpr" "$f1"
-
-    tmp=$(mktemp)
-    jq --arg t "$threshold" \
-       --argjson tp "$tp" --argjson fp "$fp" --argjson fn "$fn" --argjson tn "$tn" \
-       --argjson recall "$recall" --argjson precision "$precision" \
-       --argjson fpr "$fpr" --argjson f1 "$f1" \
-       '.by_threshold[$t] = {
-           tp: $tp, fp: $fp, fn: $fn, tn: $tn,
-           recall: $recall, precision: $precision,
-           false_positive_rate: $fpr, f1: $f1
-       }' "$REPORT_FILE" > "$tmp"
-    mv "$tmp" "$REPORT_FILE"
-done
-
-echo ""
-echo "Calculating recommendations..."
-
-best_f1_threshold="" best_f1=0
-best_recall_threshold="" best_recall=0
-
-for threshold in "${THRESHOLDS[@]}"; do
-    metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE")
-    f1=$(echo "$metrics" | jq -r '.f1')
-    recall=$(echo "$metrics" | jq -r '.recall')
-
-    if (( $(echo "$f1 > $best_f1" | bc -l) )); then
-        best_f1=$f1
-        best_f1_threshold=$threshold
-    fi
-    if (( $(echo "$recall > $best_recall" | bc -l) )); then
-        best_recall=$recall
-        best_recall_threshold=$threshold
-    fi
-done
-
-recovery_threshold=""
-recovery_precision=0
-for threshold in "${THRESHOLDS[@]}"; do
-    metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE")
-    recall=$(echo "$metrics" | jq -r '.recall')
-    precision=$(echo "$metrics" | jq -r '.precision')
-
-    if (( $(echo "$recall >= 0.85" | bc -l) )); then
-        if (( $(echo "$precision > $recovery_precision" | bc -l) )); then
-            recovery_precision=$precision
-            recovery_threshold=$threshold
-        fi
-    fi
-done
-
-if [[ -z "$recovery_threshold" ]]; then
-    recovery_threshold="${THRESHOLDS[0]}"
-fi
-
-default_threshold="$best_f1_threshold"
-
-tmp=$(mktemp)
-jq --arg default "$default_threshold" \
-   --arg recovery "$recovery_threshold" \
-   --arg best_f1 "$best_f1_threshold" \
-   --argjson best_f1_val "$best_f1" \
-   '.recommendations = {
-       default_threshold: $default,
-       recovery_threshold: $recovery,
-       best_f1: { threshold: $best_f1, value: $best_f1_val },
-       notes: "default_threshold optimizes F1. recovery_threshold prioritizes recall (>=85%)."
-   }' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-cat > "${SUMMARY_FILE}" << EOF
-# Threshold Calibration Report
-
-Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)
-
-## Recommendations
-
-| Use Case | Threshold | Rationale |
-|----------|-----------|-----------|
-| **Default (find)** | **${default_threshold}** | Best F1 score (${best_f1}) |
-| **Recovery** | **${recovery_threshold}** | High recall for element recovery |
-
-## Metrics by Threshold
-
-| Threshold | TP | FP | FN | TN | Recall | Precision | FPR | F1 |
-|-----------|----|----|----|----|--------|-----------|-----|-----|
-$(for t in "${THRESHOLDS[@]}"; do
-    m=$(jq -r ".by_threshold[\"$t\"]" "$REPORT_FILE")
-    printf "| %.2f | %d | %d | %d | %d | %.3f | %.3f | %.3f | %.3f |\n" \
-        "$t" \
-        "$(echo "$m" | jq -r '.tp')" \
-        "$(echo "$m" | jq -r '.fp')" \
-        "$(echo "$m" | jq -r '.fn')" \
-        "$(echo "$m" | jq -r '.tn')" \
-        "$(echo "$m" | jq -r '.recall')" \
-        "$(echo "$m" | jq -r '.precision')" \
-        "$(echo "$m" | jq -r '.false_positive_rate')" \
-        "$(echo "$m" | jq -r '.f1')"
-done)
-
-## Trade-offs
-
-- **Lower threshold** (0.10-0.20): High recall, more false positives. Good for recovery.
-- **Medium threshold** (0.25-0.35): Balanced. Good default for find operations.
-- **Higher threshold** (0.40+): High precision, misses weaker matches.
-EOF
-
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "================================================"
-echo "  THRESHOLD CALIBRATION COMPLETE"
-echo "================================================"
-echo "  Test cases:         ${TOTAL_CASES}"
-echo "  Default threshold:  ${default_threshold} (F1=${best_f1})"
-echo "  Recovery threshold: ${recovery_threshold}"
-echo "================================================"
-echo ""
-echo "Report:  ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/check-baseline.sh b/tests/benchmark/scripts/check-baseline.sh
deleted file mode 100755
index f6e95ae..0000000
--- a/tests/benchmark/scripts/check-baseline.sh
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/bin/bash
-#
-# Check current benchmark results against a baseline.
-#
-# Usage:
-#   ./check-baseline.sh [--baseline <file>] [--fail-on-regression]
-#
-# Exit codes:
-#   0 - No regressions detected
-#   1 - Regressions detected (if --fail-on-regression)
-#   2 - Error (missing files, invalid config)
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-BASELINES_DIR="${BENCHMARK_DIR}/baselines"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-NC='\033[0m'
-
-# Read config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 2
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-MAX_P1_DROP=$(jq -r '.baseline.quality.max_overall_p_at_1_drop // 0.02' "$CONFIG_FILE")
-MAX_MRR_DROP=$(jq -r '.baseline.quality.max_overall_mrr_drop // 0.02' "$CONFIG_FILE")
-MAX_HIT3_DROP=$(jq -r '.baseline.quality.max_overall_hit_at_3_drop // 0.02' "$CONFIG_FILE")
-MAX_CORPUS_P1_DROP=$(jq -r '.baseline.quality.max_corpus_p_at_1_drop // 0.08' "$CONFIG_FILE")
-MAX_MARGIN_DROP=$(jq -r '.baseline.quality.max_margin_drop_report // 0.15' "$CONFIG_FILE")
-
-# Parse args
-BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json"
-FAIL_ON_REGRESSION=false
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --baseline) BASELINE_FILE="$2"; shift 2 ;;
-        --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;;
-        *) echo "Unknown option: $1"; exit 2 ;;
-    esac
-done
-
-if [[ ! -f "$BASELINE_FILE" ]]; then
-    echo "ERROR: Baseline not found: $BASELINE_FILE" >&2
-    echo "Run ./create-baseline.sh first" >&2
-    exit 2
-fi
-
-echo "Checking against baseline: ${BASELINE_FILE}"
-echo "Tolerances: P@1=${MAX_P1_DROP}, MRR=${MAX_MRR_DROP}, Hit@3=${MAX_HIT3_DROP}"
-echo ""
-
-# Run current benchmark
-TEMP_DIR=$(mktemp -d)
-trap 'rm -rf "$TEMP_DIR"' EXIT
-
-"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" > "${TEMP_DIR}/output.log" 2>&1
-
-# Find the latest report
-LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1)
-
-if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then
-    echo "ERROR: Could not find benchmark report" >&2
-    exit 2
-fi
-
-# Compare metrics
-REGRESSIONS=0
-WARNINGS=0
-
-compare_metric() {
-    local name="$1"
-    local baseline_val="$2"
-    local current_val="$3"
-    local max_drop="$4"
-
-    local diff
-    diff=$(echo "scale=4; $current_val - $baseline_val" | bc)
-    local drop
-    drop=$(echo "scale=4; $baseline_val - $current_val" | bc)
-
-    if (( $(echo "$drop > $max_drop" | bc -l) )); then
-        echo -e "${RED}REGRESSION${NC} $name: $baseline_val -> $current_val (drop: $drop, max: $max_drop)"
-        REGRESSIONS=$((REGRESSIONS + 1))
-    elif (( $(echo "$drop > 0" | bc -l) )); then
-        echo -e "${YELLOW}WARNING${NC} $name: $baseline_val -> $current_val (drop: $drop)"
-        WARNINGS=$((WARNINGS + 1))
-    else
-        echo -e "${GREEN}OK${NC} $name: $baseline_val -> $current_val (${diff:0:6})"
-    fi
-}
-
-echo "=== Overall Metrics ==="
-echo ""
-
-BASELINE_MRR=$(jq -r '.metrics.mrr' "$BASELINE_FILE")
-CURRENT_MRR=$(jq -r '.metrics.mrr' "$LATEST_REPORT")
-compare_metric "MRR" "$BASELINE_MRR" "$CURRENT_MRR" "$MAX_MRR_DROP"
-
-BASELINE_P1=$(jq -r '.metrics.p_at_1' "$BASELINE_FILE")
-CURRENT_P1=$(jq -r '.metrics.p_at_1' "$LATEST_REPORT")
-compare_metric "P@1" "$BASELINE_P1" "$CURRENT_P1" "$MAX_P1_DROP"
-
-BASELINE_HIT3=$(jq -r '.metrics.hit_at_3' "$BASELINE_FILE")
-CURRENT_HIT3=$(jq -r '.metrics.hit_at_3' "$LATEST_REPORT")
-compare_metric "Hit@3" "$BASELINE_HIT3" "$CURRENT_HIT3" "$MAX_HIT3_DROP"
-
-BASELINE_MARGIN=$(jq -r '.metrics.avg_margin' "$BASELINE_FILE")
-CURRENT_MARGIN=$(jq -r '.metrics.avg_margin' "$LATEST_REPORT")
-compare_metric "Margin" "$BASELINE_MARGIN" "$CURRENT_MARGIN" "$MAX_MARGIN_DROP"
-
-echo ""
-echo "=== Per-Corpus ==="
-echo ""
-
-for corpus in $(jq -r '.by_corpus | keys[]' "$BASELINE_FILE"); do
-    BASELINE_CORPUS_P1=$(jq -r ".by_corpus[\"$corpus\"].p_at_1 // 0" "$BASELINE_FILE")
-    CURRENT_CORPUS_P1=$(jq -r ".metrics.by_corpus[\"$corpus\"].p_at_1 // 0" "$LATEST_REPORT")
-    compare_metric "$corpus P@1" "$BASELINE_CORPUS_P1" "$CURRENT_CORPUS_P1" "$MAX_CORPUS_P1_DROP"
-done
-
-echo ""
-echo "================================================"
-if [[ $REGRESSIONS -gt 0 ]]; then
-    echo -e "${RED}REGRESSIONS: $REGRESSIONS${NC}"
-    if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
-        exit 1
-    fi
-elif [[ $WARNINGS -gt 0 ]]; then
-    echo -e "${YELLOW}WARNINGS: $WARNINGS (no regressions)${NC}"
-else
-    echo -e "${GREEN}ALL CHECKS PASSED${NC}"
-fi
-echo "================================================"
diff --git a/tests/benchmark/scripts/create-baseline.sh b/tests/benchmark/scripts/create-baseline.sh
deleted file mode 100755
index cd4696a..0000000
--- a/tests/benchmark/scripts/create-baseline.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-#
-# Create a quality baseline from current corpus benchmark results.
-#
-# Usage:
-#   ./create-baseline.sh [--name <name>]
-#
-# This runs run-corpus-benchmark.sh and saves the results as a baseline.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-BASELINES_DIR="${BENCHMARK_DIR}/baselines"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read defaults from config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-
-# Parse args
-BASELINE_NAME="${STRATEGY}"
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --name) BASELINE_NAME="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${BASELINES_DIR}"
-
-BASELINE_FILE="${BASELINES_DIR}/${BASELINE_NAME}.json"
-
-echo "Creating baseline: ${BASELINE_NAME}"
-echo "Strategy: ${STRATEGY}"
-echo ""
-
-# Run corpus benchmark
-TEMP_DIR=$(mktemp -d)
-trap 'rm -rf "$TEMP_DIR"' EXIT
-
-"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" 2>&1 | tee "${TEMP_DIR}/output.log"
-
-# Find the latest report
-LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1)
-
-if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then
-    echo "ERROR: Could not find benchmark report" >&2
-    exit 1
-fi
-
-# Extract baseline data
-jq '{
-    created_at: .benchmark.timestamp,
-    strategy: .benchmark.strategy,
-    threshold: .benchmark.threshold,
-    top_k: .benchmark.top_k,
-    weights: .benchmark.weights,
-    metrics: {
-        total: .metrics.total,
-        mrr: .metrics.mrr,
-        p_at_1: .metrics.p_at_1,
-        p_at_3: .metrics.p_at_3,
-        hit_at_3: .metrics.hit_at_3,
-        hit_at_5: .metrics.hit_at_5,
-        avg_margin: .metrics.avg_margin,
-        latency_p50_ms: .metrics.latency_p50_ms,
-        latency_p95_ms: .metrics.latency_p95_ms
-    },
-    by_difficulty: .metrics.by_difficulty,
-    by_corpus: .metrics.by_corpus,
-    per_query: [.results[] | {id, corpus, difficulty, p_at_1, rr, margin}]
-}' "$LATEST_REPORT" > "$BASELINE_FILE"
-
-echo ""
-echo "================================================"
-echo "  BASELINE CREATED"
-echo "================================================"
-echo "  File: ${BASELINE_FILE}"
-echo ""
-jq -r '"  MRR:     \(.metrics.mrr)\n  P@1:     \(.metrics.p_at_1)\n  Hit@3:   \(.metrics.hit_at_3)\n  Margin:  \(.metrics.avg_margin)"' "$BASELINE_FILE"
-echo "================================================"
diff --git a/tests/benchmark/scripts/finalize-report.sh b/tests/benchmark/scripts/finalize-report.sh
deleted file mode 100755
index 38d314f..0000000
--- a/tests/benchmark/scripts/finalize-report.sh
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/bin/bash
-#
-# Finalize benchmark report and generate summary
-#
-# Usage:
-#   ./finalize-report.sh <report_file>
-#
-set -euo pipefail
-
-if [[ $# -lt 1 ]]; then
-    echo "Usage: $0 <report_file>"
-    exit 1
-fi
-
-REPORT_FILE="$1"
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-# Calculate final metrics
-TMP_FILE=$(mktemp)
-jq '
-    .summary.accuracy = (if .summary.total > 0 then (.summary.passed / .summary.total * 10000 | floor / 100) else 0 end) |
-    .summary.avg_score = (if (.results | length) > 0 then ([.results[].score] | add / length | . * 1000 | floor / 1000) else 0 end) |
-    .summary.avg_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | add / length | floor) else 0 end) |
-    .summary.min_score = (if (.results | length) > 0 then ([.results[].score] | min) else 0 end) |
-    .summary.max_score = (if (.results | length) > 0 then ([.results[].score] | max) else 0 end) |
-    .summary.min_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | min) else 0 end) |
-    .summary.max_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | max) else 0 end)
-' "${REPORT_FILE}" > "${TMP_FILE}"
-mv "${TMP_FILE}" "${REPORT_FILE}"
-
-# Generate markdown summary
-TIMESTAMP=$(jq -r '.benchmark.timestamp' "${REPORT_FILE}")
-STRATEGY=$(jq -r '.benchmark.strategy' "${REPORT_FILE}")
-VERSION=$(jq -r '.benchmark.version' "${REPORT_FILE}")
-TOTAL=$(jq -r '.summary.total' "${REPORT_FILE}")
-PASSED=$(jq -r '.summary.passed' "${REPORT_FILE}")
-FAILED=$(jq -r '.summary.failed' "${REPORT_FILE}")
-SKIPPED=$(jq -r '.summary.skipped' "${REPORT_FILE}")
-ACCURACY=$(jq -r '.summary.accuracy' "${REPORT_FILE}")
-AVG_SCORE=$(jq -r '.summary.avg_score' "${REPORT_FILE}")
-AVG_LATENCY=$(jq -r '.summary.avg_latency_ms' "${REPORT_FILE}")
-MIN_SCORE=$(jq -r '.summary.min_score' "${REPORT_FILE}")
-MAX_SCORE=$(jq -r '.summary.max_score' "${REPORT_FILE}")
-MIN_LATENCY=$(jq -r '.summary.min_latency_ms' "${REPORT_FILE}")
-MAX_LATENCY=$(jq -r '.summary.max_latency_ms' "${REPORT_FILE}")
-
-cat > "${SUMMARY_FILE}" << EOF
-# Semantic Matching Benchmark Results
-
-## Benchmark Info
-
-| Field | Value |
-|-------|-------|
-| Timestamp | ${TIMESTAMP} |
-| Strategy | ${STRATEGY} |
-| Version | ${VERSION} |
-
-## Results Summary
-
-| Metric | Value |
-|--------|-------|
-| Total Cases | ${TOTAL} |
-| Passed | ${PASSED} |
-| Failed | ${FAILED} |
-| Skipped | ${SKIPPED} |
-| **Accuracy** | **${ACCURACY}%** |
-
-## Score Distribution
-
-| Metric | Value |
-|--------|-------|
-| Average Score | ${AVG_SCORE} |
-| Min Score | ${MIN_SCORE} |
-| Max Score | ${MAX_SCORE} |
-
-## Latency
-
-| Metric | Value |
-|--------|-------|
-| Average | ${AVG_LATENCY} ms |
-| Min | ${MIN_LATENCY} ms |
-| Max | ${MAX_LATENCY} ms |
-
-## Failed Cases
-
-EOF
-
-# Add failed cases
-jq -r '.results[] | select(.status == "fail") | "| \(.id) | \(.notes) |"' "${REPORT_FILE}" >> "${SUMMARY_FILE}"
-
-if [[ $(jq '[.results[] | select(.status == "fail")] | length' "${REPORT_FILE}") -eq 0 ]]; then
-    echo "_No failures_" >> "${SUMMARY_FILE}"
-else
-    # Add header
-    sed -i.bak '/## Failed Cases/a\
-| ID | Notes |\
-|-----|-------|' "${SUMMARY_FILE}"
-    rm -f "${SUMMARY_FILE}.bak"
-fi
-
-echo ""
-echo "================================================"
-echo "  BENCHMARK SUMMARY"
-echo "================================================"
-echo "  Strategy:  ${STRATEGY}"
-echo "  Total:     ${TOTAL}"
-echo "  Passed:    ${PASSED}"
-echo "  Failed:    ${FAILED}"
-echo "  Accuracy:  ${ACCURACY}%"
-echo "  Avg Score: ${AVG_SCORE}"
-echo "  Avg Latency: ${AVG_LATENCY} ms"
-echo "================================================"
-echo ""
-echo "Report: ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh
deleted file mode 100755
index 783e546..0000000
--- a/tests/benchmark/scripts/lint-corpus.sh
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-CASES_DIR="${BENCHMARK_DIR}/cases"
-SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
-
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-NC='\033[0m'
-
-ERRORS=0
-WARNINGS=0
-
-error() {
-    echo -e "${RED}ERROR:${NC} $1"
-    ERRORS=$((ERRORS + 1))
-}
-
-warn() {
-    echo -e "${YELLOW}WARN:${NC} $1"
-    WARNINGS=$((WARNINGS + 1))
-}
-
-ok() {
-    echo -e "${GREEN}✓${NC} $1"
-}
-
-echo "=== Corpus Lint ==="
-echo ""
-
-# 1. Check for invalid JSON in all benchmark files
-echo "Checking JSON validity..."
-for f in "${CORPUS_DIR}"/*/*.json "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        if ! jq . "$f" >/dev/null 2>&1; then
-            error "Invalid JSON: $f"
-        fi
-    fi
-done
-
-# 2. Check for duplicate query IDs across corpus files
-echo "Checking for duplicate query IDs..."
-declare -A QUERY_IDS
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r id; do
-            if [[ -n "$id" && "$id" != "null" ]]; then
-                if [[ -n "${QUERY_IDS[$id]:-}" ]]; then
-                    error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})"
-                else
-                    QUERY_IDS[$id]="$f"
-                fi
-            fi
-        done < <(jq -r '.[].id // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# Also check cases files
-for f in "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r id; do
-            if [[ -n "$id" && "$id" != "null" ]]; then
-                if [[ -n "${QUERY_IDS[$id]:-}" ]]; then
-                    error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})"
-                else
-                    QUERY_IDS[$id]="$f"
-                fi
-            fi
-        done < <(jq -r '.[].id // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 3. Check for duplicate refs within snapshots
-echo "Checking for duplicate refs in snapshots..."
-for f in "${CORPUS_DIR}"/*/snapshot.json; do
-    if [[ -f "$f" ]]; then
-        dupes=$(jq -r '.[].ref' "$f" 2>/dev/null | sort | uniq -d)
-        if [[ -n "$dupes" ]]; then
-            error "Duplicate refs in $f: $dupes"
-        fi
-    fi
-done
-
-# 4. Check that relevant_refs exist in snapshot
-echo "Checking relevant_refs exist in snapshots..."
-for corpus_dir in "${CORPUS_DIR}"/*/; do
-    corpus_name=$(basename "$corpus_dir")
-    snapshot="${corpus_dir}snapshot.json"
-    queries="${corpus_dir}queries.json"
-
-    if [[ -f "$snapshot" && -f "$queries" ]]; then
-        # Get all refs from snapshot
-        refs=$(jq -r '.[].ref' "$snapshot" 2>/dev/null | sort | uniq)
-
-        # Check relevant_refs
-        while IFS= read -r ref; do
-            if [[ -n "$ref" && "$ref" != "null" ]]; then
-                if ! echo "$refs" | grep -qx "$ref"; then
-                    error "[$corpus_name] relevant_ref '$ref' not found in snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].relevant_refs[]? // empty' "$queries" 2>/dev/null)
-
-        # Check partially_relevant_refs
-        while IFS= read -r ref; do
-            if [[ -n "$ref" && "$ref" != "null" ]]; then
-                if ! echo "$refs" | grep -qx "$ref"; then
-                    error "[$corpus_name] partially_relevant_ref '$ref' not found in snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].partially_relevant_refs[]? // empty' "$queries" 2>/dev/null)
-    fi
-done
-
-# 5. Check for empty relevant_refs (except no-match cases)
-echo "Checking for empty relevant_refs..."
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        empty_relevant=$(jq -r '.[] | select(.relevant_refs | length == 0) | select(.partially_relevant_refs | length == 0) | select(.expect_no_match != true) | .id' "$f" 2>/dev/null)
-        for id in $empty_relevant; do
-            if [[ -n "$id" ]]; then
-                warn "Query '$id' in $f has empty relevant_refs"
-            fi
-        done
-    fi
-done
-
-# 6. Check difficulty values
-echo "Checking difficulty values..."
-VALID_DIFFICULTIES="easy medium hard"
-for f in "${CORPUS_DIR}"/*/queries.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r line; do
-            id=$(echo "$line" | cut -d'|' -f1)
-            diff=$(echo "$line" | cut -d'|' -f2)
-            if [[ -n "$diff" && "$diff" != "null" ]]; then
-                if ! echo "$VALID_DIFFICULTIES" | grep -qw "$diff"; then
-                    error "Invalid difficulty '$diff' for query '$id' in $f"
-                fi
-            fi
-        done < <(jq -r '.[] | "\(.id)|\(.difficulty // "null")"' "$f" 2>/dev/null)
-    fi
-done
-
-# 7. Check for known tags (warn on unknown)
-echo "Checking tags..."
-KNOWN_TAGS="absent-control accessibility action action-synonym action-verb adversarial alertdialog all-stopwords auth basket-cart bulk-action button cell checkbox combobox compound context-exclusion conversational dashboard description descriptive dialog directional disambiguation domain-intent download-export duplicate-labels ecommerce empty-query empty-snapshot exact exact-match filter find-search generic-verb github guard icon implicit input interactive-boost keyboard-mash legal link literal-text login login-signin long-query lookup-search media menu menuitem missing-letter name-match natural-language navigation negative-context no-match noise-tokens nonsense option ordinal pagination parent-context partial position preferences-settings purchase-buy question-form radio register-create registration repeated-word row-context search searchbox section section-context signout-logout single-char social special-chars spinbutton stale-ref state switch synonym synonym-chain tab table textbox threshold toggle transposition typo vague-query visual weak-match wikipedia"
-for f in "${CORPUS_DIR}"/*/queries.json "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r tag; do
-            if [[ -n "$tag" && "$tag" != "null" ]]; then
-                if ! echo "$KNOWN_TAGS" | grep -qw "$tag"; then
-                    warn "Unknown tag '$tag' in $f"
-                fi
-            fi
-        done < <(jq -r '.[].tags[]? // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 8. Check case files reference existing snapshots
-echo "Checking case file snapshot references..."
-for f in "${CASES_DIR}"/*.json; do
-    if [[ -f "$f" ]]; then
-        while IFS= read -r snapshot; do
-            if [[ -n "$snapshot" && "$snapshot" != "null" ]]; then
-                if [[ ! -f "${SNAPSHOTS_DIR}/${snapshot}" ]]; then
-                    error "Case file $f references missing snapshot: $snapshot"
-                fi
-            fi
-        done < <(jq -r '.[].snapshot // empty' "$f" 2>/dev/null)
-    fi
-done
-
-# 9. Check for generated result files in source tree
-echo "Checking for generated result files..."
-if ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | grep -v '.gitkeep' | head -1 >/dev/null 2>&1; then
-    result_count=$(ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | wc -l | tr -d ' ')
-    warn "Found $result_count generated result files in tests/benchmark/results/ (should be gitignored)"
-fi
-
-echo ""
-echo "=== Summary ==="
-if [[ $ERRORS -eq 0 && $WARNINGS -eq 0 ]]; then
-    ok "All checks passed"
-    exit 0
-elif [[ $ERRORS -eq 0 ]]; then
-    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
-    exit 0
-else
-    echo -e "${RED}Errors: $ERRORS${NC}"
-    echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
-    exit 1
-fi
diff --git a/tests/benchmark/scripts/record-result.sh b/tests/benchmark/scripts/record-result.sh
deleted file mode 100755
index 2288f7c..0000000
--- a/tests/benchmark/scripts/record-result.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-#
-# Record a benchmark result
-#
-# Usage:
-#   ./record-result.sh <report_file> <id> <pass|fail|skip> <score> <latency_ms> "notes"
-#
-set -euo pipefail
-
-if [[ $# -lt 5 ]]; then
-    echo "Usage: $0 <report_file> <id> <pass|fail|skip> <score> <latency_ms> [notes]"
-    exit 1
-fi
-
-REPORT_FILE="$1"
-ID="$2"
-STATUS="$3"
-SCORE="$4"
-LATENCY_MS="$5"
-NOTES="${6:-}"
-TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
-
-# Create result entry
-RESULT_JSON=$(jq -n \
-    --arg id "${ID}" \
-    --arg status "${STATUS}" \
-    --argjson score "${SCORE}" \
-    --argjson latency "${LATENCY_MS}" \
-    --arg notes "${NOTES}" \
-    --arg ts "${TIMESTAMP}" \
-    '{id: $id, status: $status, score: $score, latency_ms: $latency, notes: $notes, timestamp: $ts}')
-
-# Append to report
-TMP_FILE=$(mktemp)
-jq --argjson result "${RESULT_JSON}" \
-   --arg status "${STATUS}" \
-   '.results += [$result] |
-    .summary.total += 1 |
-    if $status == "pass" then .summary.passed += 1
-    elif $status == "fail" then .summary.failed += 1
-    else .summary.skipped += 1 end' \
-   "${REPORT_FILE}" > "${TMP_FILE}"
-
-mv "${TMP_FILE}" "${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh
deleted file mode 100755
index 29c8a22..0000000
--- a/tests/benchmark/scripts/run-benchmark.sh
+++ /dev/null
@@ -1,226 +0,0 @@
-#!/bin/bash
-#
-# Run semantic matching benchmark
-#
-# Usage:
-#   ./run-benchmark.sh [--strategy <name>] [--cases <file>]
-#
-# Options:
-#   --strategy <name>   Strategy to benchmark (lexical, embedding, combined)
-#   --cases <file>      Specific case file to run (default: all)
-#   --output <dir>      Output directory (default: ../results)
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CASES_DIR="${BENCHMARK_DIR}/cases"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-# Read defaults from config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
-TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
-CASE_FILE=""
-
-# Parse args (override config)
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --strategy) STRATEGY="$2"; shift 2 ;;
-        --cases) CASE_FILE="$2"; shift 2 ;;
-        --output) RESULTS_DIR="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-case "${STRATEGY}" in
-    lexical|embedding|combined) ;;
-    *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;;
-esac
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.json"
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg strategy "${STRATEGY}" \
-    --arg version "$(${SEMANTIC} --version 2>/dev/null || echo 'dev')" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            strategy: $strategy,
-            version: $version
-        },
-        results: [],
-        summary: {
-            total: 0,
-            passed: 0,
-            failed: 0,
-            skipped: 0,
-            accuracy: 0,
-            avg_score: 0,
-            avg_latency_ms: 0
-        }
-    }' > "${REPORT_FILE}"
-
-# Run cases
-score_at_least() {
-    local score="$1"
-    local min_score="$2"
-    awk -v score="${score}" -v min_score="${min_score}" 'BEGIN { exit (score + 0 >= min_score + 0) ? 0 : 1 }'
-}
-
-run_case() {
-    local case_file="$1"
-    local case_name
-    case_name=$(basename "$case_file" .json)
-
-    echo ""
-    echo "=== Running: ${case_name} ==="
-
-    local count
-    count=$(jq length "$case_file")
-
-    for i in $(seq 0 $((count - 1))); do
-        local id query snapshot expect_ref expect_ref_alt expect_no_match expect_no_crash expect_has_matches threshold min_score
-
-        id=$(jq -r ".[$i].id" "$case_file")
-        query=$(jq -r ".[$i].query" "$case_file")
-        snapshot=$(jq -r ".[$i].snapshot" "$case_file")
-        expect_ref=$(jq -r ".[$i].expect_ref // empty" "$case_file")
-        expect_ref_alt=$(jq -r ".[$i].expect_ref_alt // [] | join(\",\")" "$case_file")
-        expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$case_file")
-        expect_no_crash=$(jq -r ".[$i].expect_no_crash // false" "$case_file")
-        expect_has_matches=$(jq -r ".[$i].expect_has_matches // false" "$case_file")
-        threshold=$(jq -r ".[$i].threshold // 0.3" "$case_file")
-        min_score=$(jq -r ".[$i].min_score // 0" "$case_file")
-
-        local snapshot_path="${SNAPSHOTS_DIR}/${snapshot}"
-        if [[ ! -f "${snapshot_path}" ]]; then
-            echo "  [${id}] SKIP: snapshot not found: ${snapshot}"
-            "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "skip" 0 0 "snapshot not found"
-            continue
-        fi
-
-        # Run query and measure time
-        local start_ms end_ms duration_ms result exit_code
-        start_ms=$(python3 -c 'import time; print(int(time.time() * 1000))')
-
-        set +e
-        result=$("${SEMANTIC}" find "${query}" \
-            --snapshot "${snapshot_path}" \
-            --strategy "${STRATEGY}" \
-            --threshold "${threshold}" \
-            --format json 2>&1)
-        exit_code=$?
-        set -e
-
-        end_ms=$(python3 -c 'import time; print(int(time.time() * 1000))')
-        duration_ms=$((end_ms - start_ms))
-
-        # Evaluate result
-        local status="fail"
-        local got_ref=""
-        local got_score=0
-        local notes=""
-
-        if [[ ${exit_code} -ne 0 ]]; then
-            if [[ "${expect_no_crash}" == "true" ]]; then
-                # Some crashes are expected (empty query, etc)
-                status="pass"
-                notes="exit ${exit_code} (expected)"
-            else
-                notes="exit ${exit_code}: ${result}"
-            fi
-        else
-            got_ref=$(echo "$result" | jq -r '.best_ref // empty')
-            got_score=$(echo "$result" | jq -r '.best_score // 0')
-            local match_count
-            match_count=$(echo "$result" | jq -r '.matches | length')
-
-            if [[ "${expect_no_match}" == "true" ]]; then
-                if [[ ${match_count} -eq 0 ]]; then
-                    status="pass"
-                    notes="no matches (expected)"
-                else
-                    notes="expected no matches, got ${match_count}"
-                fi
-            elif [[ "${expect_has_matches}" == "true" ]]; then
-                if [[ ${match_count} -gt 0 ]]; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="${match_count} matches, score=${got_score}"
-                    else
-                        notes="${match_count} matches, score=${got_score} below min_score=${min_score}"
-                    fi
-                else
-                    notes="expected matches, got 0"
-                fi
-            elif [[ -n "${expect_ref}" ]]; then
-                if [[ "${got_ref}" == "${expect_ref}" ]]; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="ref=${got_ref}, score=${got_score}"
-                    else
-                        notes="ref=${got_ref}, score=${got_score} below min_score=${min_score}"
-                    fi
-                elif [[ -n "${expect_ref_alt}" ]] && echo ",${expect_ref_alt}," | grep -q ",${got_ref},"; then
-                    if score_at_least "${got_score}" "${min_score}"; then
-                        status="pass"
-                        notes="ref=${got_ref} (alt), score=${got_score}"
-                    else
-                        notes="ref=${got_ref} (alt), score=${got_score} below min_score=${min_score}"
-                    fi
-                else
-                    notes="got ${got_ref}, want ${expect_ref}"
-                fi
-            elif [[ "${expect_no_crash}" == "true" ]]; then
-                status="pass"
-                notes="no crash"
-            fi
-        fi
-
-        # Record result
-        "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "${status}" "${got_score}" "${duration_ms}" "${notes}"
-
-        if [[ "${status}" == "pass" ]]; then
-            echo "  [${id}] PASS: ${notes}"
-        else
-            echo "  [${id}] FAIL: ${notes}"
-        fi
-    done
-}
-
-# Find case files
-if [[ -n "${CASE_FILE}" ]]; then
-    run_case "${CASES_DIR}/${CASE_FILE}"
-else
-    for case_file in "${CASES_DIR}"/*.json; do
-        [[ -f "$case_file" ]] || continue
-        run_case "$case_file"
-    done
-fi
-
-# Finalize report
-"${SCRIPT_DIR}/finalize-report.sh" "${REPORT_FILE}"
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "Benchmark complete: ${REPORT_FILE}"
diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh
deleted file mode 100755
index 53216af..0000000
--- a/tests/benchmark/scripts/run-corpus-benchmark.sh
+++ /dev/null
@@ -1,514 +0,0 @@
-#!/bin/bash
-#
-# Run semantic matching benchmark with ranking metrics
-#
-# Usage:
-#   ./run-corpus-benchmark.sh [--strategy <name>] [--corpus <dir>] [--lexical-weight <n>] [--embedding-weight <n>]
-#
-# Metrics:
-#   - MRR (Mean Reciprocal Rank)
-#   - P@1 (Precision at 1)
-#   - P@3 (Precision at 3)
-#   - Latency distribution (p50, p95, p99)
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read defaults from config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
-TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
-LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
-EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
-SPECIFIC_CORPUS=""
-
-# Parse args (override config)
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --strategy) STRATEGY="$2"; shift 2 ;;
-        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
-        --threshold) THRESHOLD="$2"; shift 2 ;;
-        --top-k) TOP_K="$2"; shift 2 ;;
-        --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;;
-        --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-case "${STRATEGY}" in
-    lexical|embedding|combined) ;;
-    *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;;
-esac
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json"
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg strategy "${STRATEGY}" \
-    --argjson threshold "${THRESHOLD}" \
-    --argjson top_k "${TOP_K}" \
-    --argjson lexical_weight "${LEXICAL_WEIGHT}" \
-    --argjson embedding_weight "${EMBEDDING_WEIGHT}" \
-    --arg config_file "${CONFIG_FILE}" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            strategy: $strategy,
-            threshold: $threshold,
-            top_k: $top_k,
-            type: "corpus",
-            config_source: $config_file,
-            weights: {
-                lexical: $lexical_weight,
-                embedding: $embedding_weight
-            }
-        },
-        results: [],
-        metrics: {
-            total: 0,
-            mrr: 0,
-            p_at_1: 0,
-            p_at_3: 0,
-            latencies_ms: [],
-            by_difficulty: {},
-            by_tag: {}
-        }
-    }' > "${REPORT_FILE}"
-
-# Arrays to collect metrics
-declare -a ALL_RRS=()
-declare -a ALL_P1=()
-declare -a ALL_P3=()
-declare -a ALL_HIT3=()
-declare -a ALL_HIT5=()
-declare -a ALL_MARGINS=()
-declare -a ALL_LATENCIES=()
-
-run_corpus() {
-    local corpus_path="$1"
-    local corpus_name
-    corpus_name=$(basename "$corpus_path")
-
-    local snapshot="${corpus_path}/snapshot.json"
-    local queries="${corpus_path}/queries.json"
-
-    if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then
-        if [[ -f "${corpus_path}/cases.json" ]] || [[ -f "${corpus_path}/scenarios.json" ]]; then
-            return
-        fi
-        echo "  Skipping ${corpus_name}: missing files"
-        return
-    fi
-
-    echo ""
-    echo "=== Corpus: ${corpus_name} ==="
-
-    local count
-    count=$(jq length "$queries")
-
-    for i in $(seq 0 $((count - 1))); do
-        local id query relevant_refs partial_refs difficulty tags
-
-        id=$(jq -r ".[$i].id" "$queries")
-        query=$(jq -r ".[$i].query" "$queries")
-        relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries")
-        partial_refs=$(jq -c ".[$i].partially_relevant_refs // []" "$queries")
-        difficulty=$(jq -r ".[$i].difficulty // \"medium\"" "$queries")
-        tags=$(jq -c ".[$i].tags // []" "$queries")
-
-        # Run query and measure time
-        local start_ns end_ns duration_ms result
-        start_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))')
-
-        if ! result=$("${SEMANTIC}" find "${query}" \
-            --snapshot "${snapshot}" \
-            --strategy "${STRATEGY}" \
-            --threshold "${THRESHOLD}" \
-            --top-k "${TOP_K}" \
-            --lexical-weight "${LEXICAL_WEIGHT}" \
-            --embedding-weight "${EMBEDDING_WEIGHT}" \
-            --format json 2>&1); then
-            echo "  [${id}] ERROR: semantic find failed for query: ${query}" >&2
-            echo "${result}" >&2
-            exit 1
-        fi
-
-        if ! echo "$result" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then
-            echo "  [${id}] ERROR: semantic find returned invalid JSON" >&2
-            echo "${result}" >&2
-            exit 1
-        fi
-
-        end_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))')
-        duration_ms=$(( (end_ns - start_ns) / 1000 ))
-
-        # Extract results
-        local matches best_ref best_score
-        matches=$(echo "$result" | jq -c '[.matches[].ref]')
-        best_ref=$(echo "$result" | jq -r '.best_ref // ""')
-        best_score=$(echo "$result" | jq -r '.best_score // 0')
-
-        # Calculate Reciprocal Rank
-        local rr=0
-        for rank in $(seq 1 ${TOP_K}); do
-            local ref_at_rank
-            ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                rr=$(echo "scale=4; 1 / ${rank}" | bc)
-                break
-            fi
-        done
-
-        # Calculate P@1
-        local p1=0
-        if echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-            p1=1
-        elif echo "$partial_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then
-            p1=0.5
-        fi
-
-        # Calculate P@3 (count relevant in top 3, partials count as 0.5)
-        local relevant_in_top3=0
-        local partial_in_top3=0
-        local hit_at_3=0
-        local hit_at_5=0
-        local best_relevant_rank="null"
-        for rank in 1 2 3 4 5; do
-            local ref_at_rank
-            ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                if [[ "$best_relevant_rank" == "null" ]]; then
-                    best_relevant_rank=$rank
-                fi
-                if [[ $rank -le 3 ]]; then
-                    relevant_in_top3=$((relevant_in_top3 + 1))
-                    hit_at_3=1
-                fi
-                hit_at_5=1
-            elif [[ $rank -le 3 ]]; then
-                if echo "$partial_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then
-                    partial_in_top3=$((partial_in_top3 + 1))
-                fi
-            fi
-        done
-        local p3
-        p3=$(echo "scale=4; (${relevant_in_top3} + ${partial_in_top3} * 0.5) / 3" | bc)
-
-        # Calculate best_relevant_score, best_wrong_score, and margin
-        local best_relevant_score=0
-        local best_wrong_score=0
-        local num_matches
-        num_matches=$(echo "$result" | jq '.matches | length')
-        for idx in $(seq 0 $((num_matches - 1))); do
-            local ref_at_idx score_at_idx
-            ref_at_idx=$(echo "$result" | jq -r ".matches[$idx].ref // \"\"")
-            score_at_idx=$(echo "$result" | jq -r ".matches[$idx].score // 0")
-            if echo "$relevant_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then
-                if (( $(echo "$score_at_idx > $best_relevant_score" | bc -l) )); then
-                    best_relevant_score=$score_at_idx
-                fi
-            elif echo "$partial_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then
-                : # partials don't count as wrong
-            else
-                if (( $(echo "$score_at_idx > $best_wrong_score" | bc -l) )); then
-                    best_wrong_score=$score_at_idx
-                fi
-            fi
-        done
-        local margin
-        margin=$(echo "scale=4; $best_relevant_score - $best_wrong_score" | bc)
-
-        # Collect metrics
-        ALL_RRS+=("$rr")
-        ALL_P1+=("$p1")
-        ALL_P3+=("$p3")
-        ALL_HIT3+=("$hit_at_3")
-        ALL_HIT5+=("$hit_at_5")
-        ALL_MARGINS+=("$margin")
-        ALL_LATENCIES+=("$duration_ms")
-
-        # Status indicator
-        local status="MISS"
-        if (( $(echo "$p1 >= 1" | bc -l) )); then
-            status="HIT "
-        elif (( $(echo "$p1 >= 0.5" | bc -l) )); then
-            status="PART"
-        fi
-
-        printf "  [%s] %s | RR=%.2f P@1=%.1f P@3=%.2f | %dms | %s\n" \
-            "$id" "$status" "$rr" "$p1" "$p3" "$duration_ms" "$query"
-
-        # Record to report
-        local result_json
-        result_json=$(jq -n \
-            --arg id "$id" \
-            --arg query "$query" \
-            --arg corpus "$corpus_name" \
-            --arg difficulty "$difficulty" \
-            --argjson tags "$tags" \
-            --arg best_ref "$best_ref" \
-            --argjson best_score "$best_score" \
-            --argjson matches "$matches" \
-            --argjson relevant "$relevant_refs" \
-            --argjson rr "$rr" \
-            --argjson p1 "$p1" \
-            --argjson p3 "$p3" \
-            --argjson hit_at_3 "$hit_at_3" \
-            --argjson hit_at_5 "$hit_at_5" \
-            --argjson best_relevant_rank "$best_relevant_rank" \
-            --argjson best_relevant_score "$best_relevant_score" \
-            --argjson best_wrong_score "$best_wrong_score" \
-            --argjson margin "$margin" \
-            --argjson latency "$duration_ms" \
-            '{
-                id: $id, query: $query, corpus: $corpus,
-                difficulty: $difficulty, tags: $tags,
-                best_ref: $best_ref, best_score: $best_score,
-                matches: $matches, relevant_refs: $relevant,
-                rr: $rr, p_at_1: $p1, p_at_3: $p3,
-                hit_at_3: $hit_at_3, hit_at_5: $hit_at_5,
-                best_relevant_rank: $best_relevant_rank,
-                best_relevant_score: $best_relevant_score,
-                best_wrong_score: $best_wrong_score,
-                margin: $margin,
-                latency_ms: $latency
-            }')
-
-        # Append to report
-        local tmp
-        tmp=$(mktemp)
-        jq --argjson r "$result_json" '.results += [$r]' "$REPORT_FILE" > "$tmp"
-        mv "$tmp" "$REPORT_FILE"
-    done
-}
-
-# Run benchmarks
-if [[ -n "${SPECIFIC_CORPUS}" ]]; then
-    run_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}"
-else
-    for corpus in "${CORPUS_DIR}"/*/; do
-        [[ -d "$corpus" ]] || continue
-        run_corpus "$corpus"
-    done
-fi
-
-# Calculate aggregate metrics
-echo ""
-echo "Calculating aggregate metrics..."
-
-TOTAL=${#ALL_RRS[@]}
-if [[ $TOTAL -eq 0 ]]; then
-    echo "No results to aggregate"
-    exit 1
-fi
-
-# MRR
-MRR=$(printf '%s\n' "${ALL_RRS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# P@1
-P1=$(printf '%s\n' "${ALL_P1[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# P@3
-P3=$(printf '%s\n' "${ALL_P3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Hit@3
-HIT3=$(printf '%s\n' "${ALL_HIT3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Hit@5
-HIT5=$(printf '%s\n' "${ALL_HIT5[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Average margin
-AVG_MARGIN=$(printf '%s\n' "${ALL_MARGINS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}')
-
-# Latency percentiles
-SORTED_LAT=($(printf '%s\n' "${ALL_LATENCIES[@]}" | sort -n))
-P50_IDX=$(( TOTAL * 50 / 100 ))
-P95_IDX=$(( TOTAL * 95 / 100 ))
-P99_IDX=$(( TOTAL * 99 / 100 ))
-LAT_P50=${SORTED_LAT[$P50_IDX]:-0}
-LAT_P95=${SORTED_LAT[$P95_IDX]:-0}
-LAT_P99=${SORTED_LAT[$P99_IDX]:-0}
-LAT_AVG=$(printf '%s\n' "${ALL_LATENCIES[@]}" | awk '{s+=$1} END {printf "%.0f", s/NR}')
-
-# Update report with aggregates
-tmp=$(mktemp)
-jq \
-    --argjson total "$TOTAL" \
-    --argjson mrr "$MRR" \
-    --argjson p1 "$P1" \
-    --argjson p3 "$P3" \
-    --argjson hit3 "$HIT3" \
-    --argjson hit5 "$HIT5" \
-    --argjson avg_margin "$AVG_MARGIN" \
-    --argjson lat_avg "$LAT_AVG" \
-    --argjson lat_p50 "$LAT_P50" \
-    --argjson lat_p95 "$LAT_P95" \
-    --argjson lat_p99 "$LAT_P99" \
-    '.metrics = {
-        total: $total,
-        mrr: $mrr,
-        p_at_1: $p1,
-        p_at_3: $p3,
-        hit_at_3: $hit3,
-        hit_at_5: $hit5,
-        avg_margin: $avg_margin,
-        latency_avg_ms: $lat_avg,
-        latency_p50_ms: $lat_p50,
-        latency_p95_ms: $lat_p95,
-        latency_p99_ms: $lat_p99
-    }' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-difficulty breakdown
-tmp=$(mktemp)
-jq '.metrics.by_difficulty = (
-    .results | group_by(.difficulty) | map({
-        key: .[0].difficulty,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    }) | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-corpus breakdown
-tmp=$(mktemp)
-jq '.metrics.by_corpus = (
-    .results | group_by(.corpus) | map({
-        key: .[0].corpus,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    }) | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Add by-tag breakdown
-tmp=$(mktemp)
-jq '.metrics.by_tag = (
-    [.results[] | {tags: .tags, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}]
-    | [.[] | .tags[] as $tag | {tag: $tag, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}]
-    | group_by(.tag)
-    | map({
-        key: .[0].tag,
-        value: {
-            count: length,
-            mrr: ([.[].rr] | add / length),
-            p_at_1: ([.[].p_at_1] | add / length),
-            hit_at_3: ([.[].hit_at_3] | add / length),
-            hit_at_5: ([.[].hit_at_5] | add / length),
-            avg_margin: ([.[].margin] | add / length)
-        }
-    })
-    | from_entries
-)' "$REPORT_FILE" > "$tmp"
-mv "$tmp" "$REPORT_FILE"
-
-# Generate summary
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-cat > "${SUMMARY_FILE}" << EOF
-# Semantic Matching Benchmark Results
-
-## Configuration
-
-| Field | Value |
-|-------|-------|
-| Timestamp | $(jq -r '.benchmark.timestamp' "$REPORT_FILE") |
-| Strategy | ${STRATEGY} |
-| Lexical Weight | ${LEXICAL_WEIGHT} |
-| Embedding Weight | ${EMBEDDING_WEIGHT} |
-| Top-K | ${TOP_K} |
-| Total Queries | ${TOTAL} |
-
-## Ranking Metrics
-
-| Metric | Value | Description |
-|--------|-------|-------------|
-| **MRR** | **${MRR}** | Mean Reciprocal Rank |
-| **P@1** | **${P1}** | Precision at rank 1 |
-| **P@3** | **${P3}** | Precision at rank 3 |
-| **Hit@3** | **${HIT3}** | Any relevant in top 3 |
-| **Hit@5** | **${HIT5}** | Any relevant in top 5 |
-| **Avg Margin** | **${AVG_MARGIN}** | best_relevant - best_wrong |
-
-## Latency
-
-| Percentile | Value |
-|------------|-------|
-| Average | ${LAT_AVG} ms |
-| P50 | ${LAT_P50} ms |
-| P95 | ${LAT_P95} ms |
-| P99 | ${LAT_P99} ms |
-
-## By Difficulty
-
-| Difficulty | Count | MRR | P@1 | Hit@3 | Margin |
-|------------|-------|-----|-----|-------|--------|
-$(jq -r '.metrics.by_difficulty | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE")
-
-## By Corpus
-
-| Corpus | Count | MRR | P@1 | Hit@3 | Margin |
-|--------|-------|-----|-----|-------|--------|
-$(jq -r '.metrics.by_corpus | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE")
-
-## Misses (P@1 = 0)
-
-| ID | Query | Got | Expected |
-|----|-------|-----|----------|
-$(jq -r '.results[] | select(.p_at_1 == 0) | "| \(.id) | \(.query) | \(.best_ref) | \(.relevant_refs | join(",")) |"' "$REPORT_FILE")
-
-EOF
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "================================================"
-echo "  CORPUS BENCHMARK RESULTS"
-echo "================================================"
-echo "  Strategy:    ${STRATEGY}"
-echo "  Weights:     lexical=${LEXICAL_WEIGHT} embedding=${EMBEDDING_WEIGHT}"
-echo "  Queries:     ${TOTAL}"
-echo "  MRR:         ${MRR}"
-echo "  P@1:         ${P1}"
-echo "  P@3:         ${P3}"
-echo "  Hit@3:       ${HIT3}"
-echo "  Hit@5:       ${HIT5}"
-echo "  Avg Margin:  ${AVG_MARGIN}"
-echo "  Latency P50: ${LAT_P50} ms"
-echo "  Latency P95: ${LAT_P95} ms"
-echo "================================================"
-echo ""
-echo "Report:  ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh
deleted file mode 100755
index 5c759dc..0000000
--- a/tests/benchmark/scripts/run-full-benchmark.sh
+++ /dev/null
@@ -1,317 +0,0 @@
-#!/bin/bash
-#
-# Full semantic benchmark: Find + Recovery + Classification
-#
-# Produces a composite score for overall system health.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-CORPUS_DIR="${BENCHMARK_DIR}/corpus"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read defaults from config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
-TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
-LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE")
-EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE")
-
-mkdir -p "${RESULTS_DIR}"
-
-# Build semantic binary with recovery support
-echo "Building semantic..."
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-SEMANTIC="${BENCHMARK_DIR}/semantic"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/full_benchmark_${TIMESTAMP}.json"
-
-has_role_keyword() {
-    local query="$1"
-    echo "$query" | grep -Eiq '(^|[^[:alnum:]])(button|input|link|textbox|checkbox|radio|select|option|tab|menu|form|search)([^[:alnum:]]|$)'
-}
-
-enrich_recovery_query() {
-    local query="$1"
-    local role="$2"
-
-    if [[ -z "$query" || -z "$role" ]]; then
-        printf '%s' "$query"
-        return
-    fi
-    if has_role_keyword "$query"; then
-        printf '%s' "$query"
-        return
-    fi
-    printf '%s %s' "$query" "$role"
-}
-
-# Initialize report
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    '{
-        timestamp: $ts,
-        find: { total: 0, mrr: 0, p_at_1: 0, latency_p50: 0 },
-        recovery: { total: 0, recovered: 0, rate: 0 },
-        classification: { total: 0, correct: 0, accuracy: 0 },
-        composite: { score: 0, grade: "" }
-    }' > "${REPORT_FILE}"
-
-echo ""
-echo "=============================================="
-echo "  PHASE 1: FIND BENCHMARK"
-echo "=============================================="
-
-# Run corpus benchmark and capture metrics
-FIND_OUTPUT=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" 2>&1)
-echo "$FIND_OUTPUT"
-
-# Extract metrics from the corpus report rather than the human-readable output.
-FIND_REPORT=$(echo "$FIND_OUTPUT" | awk '/^Report:/ {print $2}' | tail -1)
-if [[ -z "${FIND_REPORT}" ]] || [[ ! -f "${FIND_REPORT}" ]]; then
-    echo "error: could not locate corpus benchmark report" >&2
-    exit 1
-fi
-FIND_MRR=$(jq -r '.metrics.mrr' "$FIND_REPORT")
-FIND_P1=$(jq -r '.metrics.p_at_1' "$FIND_REPORT")
-FIND_TOTAL=$(jq -r '.metrics.total' "$FIND_REPORT")
-FIND_LAT=$(jq -r '.metrics.latency_p50_ms' "$FIND_REPORT")
-
-# Rebuild semantic binary (corpus benchmark deletes it)
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic)
-
-echo ""
-echo "=============================================="
-echo "  PHASE 2: RECOVERY BENCHMARK"
-echo "=============================================="
-
-SCENARIOS_FILE="${CORPUS_DIR}/recovery-scenarios/scenarios.json"
-RECOVERY_TOTAL=0
-RECOVERY_SUCCESS=0
-
-if [[ -f "$SCENARIOS_FILE" ]]; then
-    SCENARIO_COUNT=$(jq length "$SCENARIOS_FILE")
-
-    for i in $(seq 0 $((SCENARIO_COUNT - 1))); do
-        ID=$(jq -r ".[$i].id" "$SCENARIOS_FILE")
-        NAME=$(jq -r ".[$i].name" "$SCENARIOS_FILE")
-        RAW_QUERY=$(jq -r ".[$i].original_query" "$SCENARIOS_FILE")
-        ORIGINAL_REF=$(jq -r ".[$i].original_ref // empty" "$SCENARIOS_FILE")
-        ORIGINAL_ROLE=$(jq -r ".[$i].before[]? | select(.ref == \"$ORIGINAL_REF\") | .role // empty" "$SCENARIOS_FILE")
-        QUERY=$(enrich_recovery_query "$RAW_QUERY" "$ORIGINAL_ROLE")
-        EXPECTED=$(jq -r ".[$i].expected_ref // empty" "$SCENARIOS_FILE")
-        EXPECTED_ALT=$(jq -r ".[$i].expected_alt // [] | join(\",\")" "$SCENARIOS_FILE")
-        EXPECT_NO_MATCH=$(jq -r ".[$i].expect_no_match // false" "$SCENARIOS_FILE")
-
-        # Write after snapshot to temp file
-        AFTER_FILE=$(mktemp)
-        jq ".[$i].after" "$SCENARIOS_FILE" > "$AFTER_FILE"
-
-        # Run semantic find on after snapshot with the same minimum score
-        # enforced by DefaultRecoveryConfig in the recovery engine.
-        if ! RESULT=$("${SEMANTIC}" find "$QUERY" --snapshot "$AFTER_FILE" --format json --threshold 0.52 2>&1); then
-            echo "  [$ID] ERROR: semantic find failed during recovery benchmark" >&2
-            echo "$RESULT" >&2
-            rm -f "$AFTER_FILE"
-            exit 1
-        fi
-        if ! echo "$RESULT" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then
-            echo "  [$ID] ERROR: semantic find returned invalid JSON during recovery benchmark" >&2
-            echo "$RESULT" >&2
-            rm -f "$AFTER_FILE"
-            exit 1
-        fi
-        BEST_REF=$(echo "$RESULT" | jq -r '.best_ref // ""')
-
-        rm -f "$AFTER_FILE"
-
-        RECOVERY_TOTAL=$((RECOVERY_TOTAL + 1))
-        STATUS="FAIL"
-
-        if [[ "$EXPECT_NO_MATCH" == "true" ]]; then
-            if [[ -z "$BEST_REF" ]] || [[ "$BEST_REF" == "null" ]]; then
-                STATUS="PASS"
-                RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-            fi
-        elif [[ "$BEST_REF" == "$EXPECTED" ]]; then
-            STATUS="PASS"
-            RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-        elif [[ -n "$EXPECTED_ALT" ]] && echo ",$EXPECTED_ALT," | grep -q ",$BEST_REF,"; then
-            STATUS="PASS"
-            RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1))
-        fi
-
-        printf "  [%s] %s | %s | got=%s want=%s\n" "$ID" "$STATUS" "$NAME" "$BEST_REF" "$EXPECTED"
-    done
-fi
-
-RECOVERY_RATE=0
-if [[ $RECOVERY_TOTAL -gt 0 ]]; then
-    RECOVERY_RATE=$(echo "scale=4; $RECOVERY_SUCCESS / $RECOVERY_TOTAL" | bc)
-fi
-
-echo ""
-echo "  Recovery: $RECOVERY_SUCCESS / $RECOVERY_TOTAL = $RECOVERY_RATE"
-
-echo ""
-echo "=============================================="
-echo "  PHASE 3: CLASSIFICATION BENCHMARK"
-echo "=============================================="
-
-CLASS_FILE="${CORPUS_DIR}/classification/cases.json"
-CLASS_TOTAL=0
-CLASS_CORRECT=0
-
-if [[ -f "$CLASS_FILE" ]]; then
-    CLASS_COUNT=$(jq length "$CLASS_FILE")
-
-    for i in $(seq 0 $((CLASS_COUNT - 1))); do
-        ID=$(jq -r ".[$i].id" "$CLASS_FILE")
-        ERROR=$(jq -r ".[$i].error" "$CLASS_FILE")
-        EXPECTED=$(jq -r ".[$i].expected_type" "$CLASS_FILE")
-
-        # Run semantic classify (extract just the type, first word)
-        if ! RESULT=$("${SEMANTIC}" classify "$ERROR" 2>&1); then
-            echo "  [$ID] ERROR: semantic classify failed" >&2
-            echo "$RESULT" >&2
-            exit 1
-        fi
-        GOT=$(echo "$RESULT" | awk '{print $1}')
-
-        CLASS_TOTAL=$((CLASS_TOTAL + 1))
-        STATUS="FAIL"
-
-        if [[ "$GOT" == "$EXPECTED" ]]; then
-            STATUS="PASS"
-            CLASS_CORRECT=$((CLASS_CORRECT + 1))
-        fi
-
-        printf "  [%s] %s | \"%s\" → %s (want %s)\n" "$ID" "$STATUS" "${ERROR:0:40}" "$GOT" "$EXPECTED"
-    done
-fi
-
-CLASS_ACCURACY=0
-if [[ $CLASS_TOTAL -gt 0 ]]; then
-    CLASS_ACCURACY=$(echo "scale=4; $CLASS_CORRECT / $CLASS_TOTAL" | bc)
-fi
-
-echo ""
-echo "  Classification: $CLASS_CORRECT / $CLASS_TOTAL = $CLASS_ACCURACY"
-
-echo ""
-echo "=============================================="
-echo "  COMPOSITE SCORE"
-echo "=============================================="
-
-# Calculate composite score with weights:
-#   Find P@1:      40%
-#   Find MRR:      20%
-#   Recovery Rate: 25%
-#   Classification: 15%
-
-COMPOSITE=$(echo "scale=4; \
-    ($FIND_P1 * 0.40) + \
-    ($FIND_MRR * 0.20) + \
-    ($RECOVERY_RATE * 0.25) + \
-    ($CLASS_ACCURACY * 0.15)" | bc)
-COMPOSITE=$(awk -v value="$COMPOSITE" 'BEGIN { printf "%.4f", value }')
-
-# Assign grade
-GRADE="F"
-if (( $(echo "$COMPOSITE >= 0.95" | bc -l) )); then GRADE="A+"
-elif (( $(echo "$COMPOSITE >= 0.90" | bc -l) )); then GRADE="A"
-elif (( $(echo "$COMPOSITE >= 0.85" | bc -l) )); then GRADE="B+"
-elif (( $(echo "$COMPOSITE >= 0.80" | bc -l) )); then GRADE="B"
-elif (( $(echo "$COMPOSITE >= 0.75" | bc -l) )); then GRADE="C+"
-elif (( $(echo "$COMPOSITE >= 0.70" | bc -l) )); then GRADE="C"
-elif (( $(echo "$COMPOSITE >= 0.60" | bc -l) )); then GRADE="D"
-fi
-
-# Update report
-TMP=$(mktemp)
-jq \
-    --argjson find_total "${FIND_TOTAL:-0}" \
-    --argjson find_mrr "${FIND_MRR:-0}" \
-    --argjson find_p1 "${FIND_P1:-0}" \
-    --argjson find_lat "${FIND_LAT:-0}" \
-    --argjson rec_total "$RECOVERY_TOTAL" \
-    --argjson rec_success "$RECOVERY_SUCCESS" \
-    --argjson rec_rate "$RECOVERY_RATE" \
-    --argjson class_total "$CLASS_TOTAL" \
-    --argjson class_correct "$CLASS_CORRECT" \
-    --argjson class_acc "$CLASS_ACCURACY" \
-    --argjson composite "$COMPOSITE" \
-    --arg grade "$GRADE" \
-    '.find = { total: $find_total, mrr: $find_mrr, p_at_1: $find_p1, latency_p50: $find_lat } |
-     .recovery = { total: $rec_total, recovered: $rec_success, rate: $rec_rate } |
-     .classification = { total: $class_total, correct: $class_correct, accuracy: $class_acc } |
-     .composite = { score: $composite, grade: $grade }' \
-    "$REPORT_FILE" > "$TMP"
-mv "$TMP" "$REPORT_FILE"
-
-# Generate summary
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-cat > "$SUMMARY_FILE" << EOF
-# Semantic Benchmark Report
-
-## Composite Score: ${COMPOSITE} (${GRADE})
-
-| Component | Weight | Score | Weighted |
-|-----------|--------|-------|----------|
-| Find P@1 | 40% | ${FIND_P1:-0} | $(echo "scale=3; ${FIND_P1:-0} * 0.40" | bc) |
-| Find MRR | 20% | ${FIND_MRR:-0} | $(echo "scale=3; ${FIND_MRR:-0} * 0.20" | bc) |
-| Recovery | 25% | ${RECOVERY_RATE} | $(echo "scale=3; ${RECOVERY_RATE} * 0.25" | bc) |
-| Classification | 15% | ${CLASS_ACCURACY} | $(echo "scale=3; ${CLASS_ACCURACY} * 0.15" | bc) |
-
-## Find Performance
-- Queries: ${FIND_TOTAL:-0}
-- MRR: ${FIND_MRR:-0}
-- P@1: ${FIND_P1:-0}
-- Latency P50: ${FIND_LAT:-0} ms
-
-## Recovery Performance
-- Scenarios: ${RECOVERY_TOTAL}
-- Recovered: ${RECOVERY_SUCCESS}
-- Rate: ${RECOVERY_RATE}
-
-## Classification Performance
-- Cases: ${CLASS_TOTAL}
-- Correct: ${CLASS_CORRECT}
-- Accuracy: ${CLASS_ACCURACY}
-
-## Grade Scale
-| Grade | Score |
-|-------|-------|
-| A+ | >= 0.95 |
-| A | >= 0.90 |
-| B+ | >= 0.85 |
-| B | >= 0.80 |
-| C+ | >= 0.75 |
-| C | >= 0.70 |
-| D | >= 0.60 |
-| F | < 0.60 |
-EOF
-
-# Cleanup
-rm -f "${BENCHMARK_DIR}/semantic"
-
-echo ""
-echo "  ┌─────────────────────────────────────────┐"
-echo "  │  COMPOSITE SCORE: ${COMPOSITE}  GRADE: ${GRADE}      │"
-echo "  ├─────────────────────────────────────────┤"
-echo "  │  Find P@1:       ${FIND_P1:-0}  (40%)            │"
-echo "  │  Find MRR:       ${FIND_MRR:-0}  (20%)            │"
-echo "  │  Recovery:       ${RECOVERY_RATE}  (25%)            │"
-echo "  │  Classification: ${CLASS_ACCURACY}  (15%)            │"
-echo "  └─────────────────────────────────────────┘"
-echo ""
-echo "Report: ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/run-recovery-benchmark.sh b/tests/benchmark/scripts/run-recovery-benchmark.sh
deleted file mode 100755
index 93fc88a..0000000
--- a/tests/benchmark/scripts/run-recovery-benchmark.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-#
-# Recovery Engine Benchmark
-#
-# Exercises RecoveryEngine directly using before/after snapshots
-# and intent cache entries from recovery scenarios.
-#
-# Usage:
-#   ./run-recovery-benchmark.sh
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-
-mkdir -p "${RESULTS_DIR}"
-
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/recovery_benchmark_${TIMESTAMP}.txt"
-
-echo "=== Recovery Engine Benchmark ==="
-echo ""
-
-cd "${BENCHMARK_DIR}/../.."
-
-# Run the Go test that exercises RecoveryEngine with scenarios
-echo "Running recovery scenarios..."
-echo ""
-
-go test -v -run TestRecoveryBenchmark_Scenarios ./recovery/ 2>&1 | tee "$REPORT_FILE"
-
-# Also run the Go benchmark for performance
-echo ""
-echo "Running performance benchmark..."
-go test -bench=BenchmarkRecoveryEngine_Scenarios -benchmem ./recovery/ 2>&1 | tee -a "$REPORT_FILE"
-
-echo ""
-echo "================================================"
-echo "  RECOVERY BENCHMARK COMPLETE"
-echo "================================================"
-echo "Report: $REPORT_FILE"
diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh
deleted file mode 100755
index 011b1b2..0000000
--- a/tests/benchmark/scripts/tune-weights.sh
+++ /dev/null
@@ -1,167 +0,0 @@
-#!/bin/bash
-#
-# Grid-search combined matcher lexical/embedding weights against the corpus.
-#
-# Usage:
-#   ./tune-weights.sh [--corpus <dir>] [--step <n>] [--output <dir>]
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read defaults from config (used for threshold/top_k in grid runs)
-if [[ -f "$CONFIG_FILE" ]]; then
-    THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE")
-    TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE")
-else
-    THRESHOLD=0.01
-    TOP_K=5
-fi
-
-SPECIFIC_CORPUS=""
-STEP="0.1"
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;;
-        --step) STEP="$2"; shift 2 ;;
-        --output) RESULTS_DIR="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${RESULTS_DIR}"
-
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/tuning_weights_${TIMESTAMP}.json"
-SUMMARY_FILE="${REPORT_FILE%.json}_summary.md"
-
-jq -n \
-    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-    --arg step "${STEP}" \
-    '{
-        benchmark: {
-            timestamp: $ts,
-            type: "weight-tuning",
-            strategy: "combined",
-            step: ($step | tonumber)
-        },
-        results: [],
-        best: null
-    }' > "${REPORT_FILE}"
-
-weights=$(awk -v step="${STEP}" 'BEGIN {
-    if (step <= 0 || step > 1) {
-        exit 1
-    }
-    for (w = 0; w <= 1.000001; w += step) {
-        printf "%.4f\n", w
-    }
-}')
-
-if [[ -z "${weights}" ]]; then
-    echo "Invalid step: ${STEP}" >&2
-    exit 1
-fi
-
-echo "Weight tuning: step=${STEP}"
-echo ""
-printf "%-10s %-10s %-8s %-8s %-8s %-8s %-8s\n" "lexical" "embedding" "MRR" "P@1" "P@3" "P50" "report"
-
-while IFS= read -r lexical_weight; do
-    embedding_weight=$(awk -v w="${lexical_weight}" 'BEGIN { printf "%.4f", 1 - w }')
-
-    args=(
-        --strategy combined
-        --lexical-weight "${lexical_weight}"
-        --embedding-weight "${embedding_weight}"
-    )
-    if [[ -n "${SPECIFIC_CORPUS}" ]]; then
-        args+=(--corpus "${SPECIFIC_CORPUS}")
-    fi
-
-    if ! output=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" "${args[@]}" 2>&1); then
-        echo "$output" >&2
-        exit 1
-    fi
-
-    corpus_report=$(echo "$output" | awk '/^Report:/ {print $2}' | tail -1)
-    if [[ -z "${corpus_report}" || ! -f "${corpus_report}" ]]; then
-        echo "Could not find corpus report for lexical=${lexical_weight}" >&2
-        echo "$output" >&2
-        exit 1
-    fi
-
-    mrr=$(jq -r '.metrics.mrr' "$corpus_report")
-    p1=$(jq -r '.metrics.p_at_1' "$corpus_report")
-    p3=$(jq -r '.metrics.p_at_3' "$corpus_report")
-    p50=$(jq -r '.metrics.latency_p50_ms' "$corpus_report")
-    total=$(jq -r '.metrics.total' "$corpus_report")
-
-    printf "%-10s %-10s %-8s %-8s %-8s %-8s %s\n" \
-        "${lexical_weight}" "${embedding_weight}" "${mrr}" "${p1}" "${p3}" "${p50}" "$(basename "$corpus_report")"
-
-    result_json=$(jq -n \
-        --argjson lexical_weight "${lexical_weight}" \
-        --argjson embedding_weight "${embedding_weight}" \
-        --argjson total "${total}" \
-        --argjson mrr "${mrr}" \
-        --argjson p1 "${p1}" \
-        --argjson p3 "${p3}" \
-        --argjson p50 "${p50}" \
-        --arg report "${corpus_report}" \
-        '{
-            lexical_weight: $lexical_weight,
-            embedding_weight: $embedding_weight,
-            total: $total,
-            mrr: $mrr,
-            p_at_1: $p1,
-            p_at_3: $p3,
-            latency_p50_ms: $p50,
-            report: $report
-        }')
-
-    tmp=$(mktemp)
-    jq --argjson result "${result_json}" '.results += [$result]' "${REPORT_FILE}" > "$tmp"
-    mv "$tmp" "${REPORT_FILE}"
-done <<< "${weights}"
-
-tmp=$(mktemp)
-jq '
-    .best = (
-        .results
-        | sort_by(.p_at_1, .mrr, .p_at_3, -(.latency_p50_ms))
-        | last
-    )
-' "${REPORT_FILE}" > "$tmp"
-mv "$tmp" "${REPORT_FILE}"
-
-cat > "${SUMMARY_FILE}" << EOF
-# Combined Weight Tuning
-
-## Best
-
-| Field | Value |
-|-------|-------|
-| Lexical Weight | $(jq -r '.best.lexical_weight' "$REPORT_FILE") |
-| Embedding Weight | $(jq -r '.best.embedding_weight' "$REPORT_FILE") |
-| MRR | $(jq -r '.best.mrr' "$REPORT_FILE") |
-| P@1 | $(jq -r '.best.p_at_1' "$REPORT_FILE") |
-| P@3 | $(jq -r '.best.p_at_3' "$REPORT_FILE") |
-| Latency P50 | $(jq -r '.best.latency_p50_ms' "$REPORT_FILE") ms |
-
-## All Runs
-
-| Lexical | Embedding | MRR | P@1 | P@3 | P50 |
-|---------|-----------|-----|-----|-----|-----|
-$(jq -r '.results | sort_by(-.p_at_1, -.mrr, -.p_at_3, .latency_p50_ms)[] | "| \(.lexical_weight) | \(.embedding_weight) | \(.mrr) | \(.p_at_1) | \(.p_at_3) | \(.latency_p50_ms) ms |"' "$REPORT_FILE")
-EOF
-
-echo ""
-echo "Best weights:"
-jq '.best' "${REPORT_FILE}"
-echo ""
-echo "Report:  ${REPORT_FILE}"
-echo "Summary: ${SUMMARY_FILE}"
diff --git a/tests/benchmark/scripts/update-baseline.sh b/tests/benchmark/scripts/update-baseline.sh
deleted file mode 100755
index ba93089..0000000
--- a/tests/benchmark/scripts/update-baseline.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-#
-# Update baseline after reviewing regressions.
-#
-# Usage:
-#   ./update-baseline.sh --accept [--baseline <file>]
-#
-# This re-runs the benchmark and overwrites the baseline file.
-# Use after reviewing check-baseline.sh output and confirming
-# the changes are intentional.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-BASELINES_DIR="${BENCHMARK_DIR}/baselines"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-
-# Read config
-if [[ ! -f "$CONFIG_FILE" ]]; then
-    echo "ERROR: Config file not found: $CONFIG_FILE" >&2
-    exit 1
-fi
-
-STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE")
-
-# Parse args
-BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json"
-ACCEPT=false
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --accept) ACCEPT=true; shift ;;
-        --baseline) BASELINE_FILE="$2"; shift 2 ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-if [[ "$ACCEPT" != "true" ]]; then
-    echo "Usage: $0 --accept [--baseline <file>]"
-    echo ""
-    echo "This will overwrite the baseline. Run check-baseline.sh first"
-    echo "to review changes before accepting."
-    exit 1
-fi
-
-if [[ ! -f "$BASELINE_FILE" ]]; then
-    echo "Baseline not found: $BASELINE_FILE"
-    echo "Creating new baseline instead..."
-    exec "${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")"
-fi
-
-# Show what will change
-echo "Current baseline: ${BASELINE_FILE}"
-echo ""
-jq -r '"  MRR:   \(.metrics.mrr)\n  P@1:   \(.metrics.p_at_1)\n  Hit@3: \(.metrics.hit_at_3)"' "$BASELINE_FILE"
-echo ""
-echo "Running benchmark to generate new baseline..."
-echo ""
-
-# Backup old baseline
-BACKUP_FILE="${BASELINE_FILE%.json}_$(date +%Y%m%d_%H%M%S).backup.json"
-cp "$BASELINE_FILE" "$BACKUP_FILE"
-echo "Backed up old baseline to: $BACKUP_FILE"
-
-# Create new baseline (overwrites)
-"${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")"
-
-echo ""
-echo "Baseline updated. Old baseline backed up to:"
-echo "  $BACKUP_FILE"

From 96d7142b20e645014105bd2831b62ce19a52b47e Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 17:35:49 +0100
Subject: [PATCH 08/14] feat: move runtime baseline check to Go CLI

Add `semantic-bench runtime` command to check Go benchmark
performance against baseline. Remove last bash script and
the scripts/ directory.
---
 cmd/semantic-bench/main.go                    |  16 ++
 dev                                           |   2 +-
 internal/benchmark/commands.go                | 209 ++++++++++++++++++
 internal/benchmark/config.go                  |  14 ++
 .../scripts/check-runtime-baseline.sh         | 137 ------------
 5 files changed, 240 insertions(+), 138 deletions(-)
 delete mode 100755 tests/benchmark/scripts/check-runtime-baseline.sh

diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go
index 4866601..076d71a 100644
--- a/cmd/semantic-bench/main.go
+++ b/cmd/semantic-bench/main.go
@@ -21,6 +21,7 @@ Commands:
   baseline    Manage quality baselines (create, update)
   calibrate   Find optimal thresholds via precision/recall analysis
   tune        Grid-search lexical/embedding weights
+  runtime     Check Go benchmark performance against baseline
 
 Flags:
   -h, --help    Show help
@@ -54,6 +55,8 @@ func main() {
 		runCalibrate(args)
 	case "tune":
 		runTune(args)
+	case "runtime":
+		runRuntime(args)
 	case "-h", "--help", "help":
 		fmt.Print(usage)
 	default:
@@ -150,3 +153,16 @@ func runTune(args []string) {
 	}
 	benchmark.PrintTuneResult(result, cfg)
 }
+
+func runRuntime(args []string) {
+	cfg := benchmark.ParseRuntimeFlags(args)
+	result, err := benchmark.RunRuntime(cfg)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(2)
+	}
+	benchmark.PrintRuntimeResult(result, cfg)
+	if result.Status == "fail" && cfg.FailOnRegression {
+		os.Exit(1)
+	}
+}
diff --git a/dev b/dev
index da0f70c..987e04c 100755
--- a/dev
+++ b/dev
@@ -197,7 +197,7 @@ run_calibrate() {
 
 run_runtime() {
   echo "  ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}"
-  bash tests/benchmark/scripts/check-runtime-baseline.sh "$@"
+  go run ./cmd/semantic-bench runtime "$@"
 }
 
 run_tune() {
diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go
index 7f37ed5..f537934 100644
--- a/internal/benchmark/commands.go
+++ b/internal/benchmark/commands.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"sort"
 	"strings"
@@ -871,3 +872,211 @@ func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
 	}
 	fmt.Println()
 }
+
+// Runtime baseline
+
+type RuntimeResult struct {
+	Status      string                     `json:"status"`
+	Benchmarks  []RuntimeBenchmark         `json:"benchmarks"`
+	Regressions int                        `json:"regressions"`
+	BaselinePath string                    `json:"baseline_path"`
+	Created     bool                       `json:"created"`
+}
+
+type RuntimeBenchmark struct {
+	Name       string  `json:"name"`
+	NsOp       float64 `json:"ns_op"`
+	BytesOp    int     `json:"bytes_op"`
+	AllocsOp   int     `json:"allocs_op"`
+	BaselineNs float64 `json:"baseline_ns,omitempty"`
+	Ratio      float64 `json:"ratio,omitempty"`
+	Status     string  `json:"status"`
+}
+
+type runtimeBaseline struct {
+	Timestamp  string             `json:"timestamp"`
+	Benchmarks []RuntimeBenchmark `json:"benchmarks"`
+}
+
+func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
+	root := FindBenchmarkRoot()
+	baselinePath := filepath.Join(root, "baselines", "runtime.json")
+
+	benchmarks, err := runGoBenchmarks()
+	if err != nil {
+		return nil, err
+	}
+
+	result := &RuntimeResult{
+		Status:       "pass",
+		Benchmarks:   benchmarks,
+		BaselinePath: baselinePath,
+	}
+
+	if _, err := os.Stat(baselinePath); os.IsNotExist(err) {
+		if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil {
+			return nil, err
+		}
+		result.Created = true
+		return result, nil
+	}
+
+	baseline, err := loadRuntimeBaseline(baselinePath)
+	if err != nil {
+		return nil, err
+	}
+
+	baselineMap := make(map[string]RuntimeBenchmark)
+	for _, b := range baseline.Benchmarks {
+		baselineMap[b.Name] = b
+	}
+
+	maxRatio := 1.25
+	for i, b := range result.Benchmarks {
+		if base, ok := baselineMap[b.Name]; ok {
+			ratio := b.NsOp / base.NsOp
+			result.Benchmarks[i].BaselineNs = base.NsOp
+			result.Benchmarks[i].Ratio = ratio
+
+			if ratio > maxRatio {
+				result.Benchmarks[i].Status = "regression"
+				result.Regressions++
+			} else if ratio > 1.1 {
+				result.Benchmarks[i].Status = "warning"
+			} else {
+				result.Benchmarks[i].Status = "ok"
+			}
+		} else {
+			result.Benchmarks[i].Status = "new"
+		}
+	}
+
+	if result.Regressions > 0 {
+		result.Status = "fail"
+	}
+
+	return result, nil
+}
+
+func runGoBenchmarks() ([]RuntimeBenchmark, error) {
+	root := FindBenchmarkRoot()
+	projectRoot := filepath.Join(root, "..", "..")
+
+	cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...")
+	cmd.Dir = projectRoot
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("go test failed: %w\n%s", err, output)
+	}
+
+	return parseBenchOutput(string(output)), nil
+}
+
+func parseBenchOutput(output string) []RuntimeBenchmark {
+	var results []RuntimeBenchmark
+	lines := strings.Split(output, "\n")
+
+	for _, line := range lines {
+		if !strings.HasPrefix(line, "Benchmark") {
+			continue
+		}
+
+		fields := strings.Fields(line)
+		if len(fields) < 3 {
+			continue
+		}
+
+		name := strings.TrimSuffix(fields[0], "-8")
+		name = strings.TrimSuffix(name, "-10")
+		name = strings.TrimSuffix(name, "-12")
+		name = strings.TrimSuffix(name, "-16")
+
+		var nsOp float64
+		var bytesOp, allocsOp int
+
+		for i, f := range fields {
+			if f == "ns/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%f", &nsOp)
+			}
+			if f == "B/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%d", &bytesOp)
+			}
+			if f == "allocs/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%d", &allocsOp)
+			}
+		}
+
+		if nsOp > 0 {
+			results = append(results, RuntimeBenchmark{
+				Name:     name,
+				NsOp:     nsOp,
+				BytesOp:  bytesOp,
+				AllocsOp: allocsOp,
+			})
+		}
+	}
+
+	return results
+}
+
+func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error {
+	baseline := runtimeBaseline{
+		Timestamp:  time.Now().UTC().Format(time.RFC3339),
+		Benchmarks: benchmarks,
+	}
+	data, err := json.MarshalIndent(baseline, "", "  ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(path, data, 0644)
+}
+
+func loadRuntimeBaseline(path string) (*runtimeBaseline, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var baseline runtimeBaseline
+	if err := json.Unmarshal(data, &baseline); err != nil {
+		return nil, err
+	}
+	return &baseline, nil
+}
+
+func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) {
+	if result.Created {
+		fmt.Printf("\n  Created runtime baseline: %s\n", result.BaselinePath)
+		fmt.Printf("  Benchmarks: %d\n\n", len(result.Benchmarks))
+		return
+	}
+
+	fmt.Printf("\n  Runtime Baseline Check\n\n")
+
+	for _, b := range result.Benchmarks {
+		var status string
+		switch b.Status {
+		case "regression":
+			status = "\033[31mREGRESSION\033[0m"
+		case "warning":
+			status = "\033[33mWARNING\033[0m"
+		case "ok":
+			status = "\033[32mOK\033[0m"
+		case "new":
+			status = "\033[33mNEW\033[0m"
+		}
+
+		if b.BaselineNs > 0 {
+			fmt.Printf("  %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n",
+				status, b.Name, b.BaselineNs, b.NsOp, b.Ratio)
+		} else {
+			fmt.Printf("  %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp)
+		}
+	}
+
+	fmt.Println()
+	if result.Regressions > 0 {
+		fmt.Printf("  \033[31mRegressions: %d\033[0m\n\n", result.Regressions)
+	} else {
+		fmt.Printf("  \033[32mNo regressions\033[0m\n\n")
+	}
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index eb2fe57..83e3f5c 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -118,6 +118,11 @@ type TuneConfig struct {
 	Verbose bool
 }
 
+type RuntimeConfig struct {
+	FailOnRegression bool
+	Verbose          bool
+}
+
 func FindBenchmarkRoot() string {
 	cwd, _ := os.Getwd()
 	for d := cwd; d != "/"; d = filepath.Dir(d) {
@@ -304,3 +309,12 @@ func ParseTuneFlags(args []string) TuneConfig {
 	fs.Parse(args)
 	return cfg
 }
+
+func ParseRuntimeFlags(args []string) RuntimeConfig {
+	fs := flag.NewFlagSet("runtime", flag.ExitOnError)
+	cfg := RuntimeConfig{}
+	fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}
diff --git a/tests/benchmark/scripts/check-runtime-baseline.sh b/tests/benchmark/scripts/check-runtime-baseline.sh
deleted file mode 100755
index 75bc4fc..0000000
--- a/tests/benchmark/scripts/check-runtime-baseline.sh
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/bin/bash
-#
-# Check Go benchmark results against runtime baseline.
-#
-# Usage:
-#   ./check-runtime-baseline.sh [--fail-on-regression]
-#
-# Runs Go benchmarks and compares against saved baseline.
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-BENCHMARK_DIR="${SCRIPT_DIR}/.."
-BASELINES_DIR="${BENCHMARK_DIR}/baselines"
-RESULTS_DIR="${BENCHMARK_DIR}/results"
-CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json"
-PROJECT_ROOT="${BENCHMARK_DIR}/../.."
-
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-NC='\033[0m'
-
-# Read tolerances from config
-if [[ -f "$CONFIG_FILE" ]]; then
-    MAX_NS_RATIO=$(jq -r '.baseline.runtime.max_ns_op_regression_ratio // 1.25' "$CONFIG_FILE")
-    MAX_ALLOC_RATIO=$(jq -r '.baseline.runtime.max_alloc_regression_ratio // 1.25' "$CONFIG_FILE")
-else
-    MAX_NS_RATIO=1.25
-    MAX_ALLOC_RATIO=1.25
-fi
-
-# Parse args
-FAIL_ON_REGRESSION=false
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
-    esac
-done
-
-mkdir -p "${RESULTS_DIR}"
-mkdir -p "${BASELINES_DIR}"
-
-BASELINE_FILE="${BASELINES_DIR}/runtime.json"
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-REPORT_FILE="${RESULTS_DIR}/runtime_${TIMESTAMP}.json"
-
-echo "Running Go benchmarks..."
-echo ""
-
-# Run benchmarks
-BENCH_OUTPUT=$(mktemp)
-(cd "$PROJECT_ROOT" && go test -bench=. -benchmem ./internal/engine/... 2>&1) | tee "$BENCH_OUTPUT"
-
-# Parse benchmark output into JSON
-echo ""
-echo "Parsing results..."
-
-jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" '{timestamp: $ts, benchmarks: []}' > "$REPORT_FILE"
-
-while IFS= read -r line; do
-    if [[ "$line" =~ ^Benchmark ]]; then
-        # Parse: BenchmarkName-N  iterations  ns/op  bytes/op  allocs/op
-        name=$(echo "$line" | awk '{print $1}' | sed 's/-[0-9]*$//')
-        ns_op=$(echo "$line" | grep -oE '[0-9.]+ ns/op' | awk '{print $1}' || echo "0")
-        bytes_op=$(echo "$line" | grep -oE '[0-9]+ B/op' | awk '{print $1}' || echo "0")
-        allocs_op=$(echo "$line" | grep -oE '[0-9]+ allocs/op' | awk '{print $1}' || echo "0")
-
-        if [[ -n "$ns_op" ]] && [[ "$ns_op" != "0" ]]; then
-            tmp=$(mktemp)
-            jq --arg name "$name" \
-               --argjson ns "$ns_op" \
-               --argjson bytes "${bytes_op:-0}" \
-               --argjson allocs "${allocs_op:-0}" \
-               '.benchmarks += [{name: $name, ns_op: $ns, bytes_op: $bytes, allocs_op: $allocs}]' \
-               "$REPORT_FILE" > "$tmp"
-            mv "$tmp" "$REPORT_FILE"
-        fi
-    fi
-done < "$BENCH_OUTPUT"
-
-rm -f "$BENCH_OUTPUT"
-
-# If no baseline exists, create one
-if [[ ! -f "$BASELINE_FILE" ]]; then
-    echo ""
-    echo "No runtime baseline found. Creating initial baseline..."
-    cp "$REPORT_FILE" "$BASELINE_FILE"
-    echo "Baseline saved to: $BASELINE_FILE"
-    exit 0
-fi
-
-# Compare against baseline
-echo ""
-echo "=== Comparing against baseline ==="
-echo ""
-
-REGRESSIONS=0
-
-for name in $(jq -r '.benchmarks[].name' "$REPORT_FILE"); do
-    baseline_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$BASELINE_FILE")
-    current_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$REPORT_FILE")
-
-    baseline_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$BASELINE_FILE")
-    current_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$REPORT_FILE")
-
-    if [[ "$baseline_ns" == "0" ]] || [[ "$baseline_ns" == "null" ]]; then
-        echo -e "${YELLOW}NEW${NC} $name: ${current_ns} ns/op"
-        continue
-    fi
-
-    ratio=$(echo "scale=4; $current_ns / $baseline_ns" | bc)
-
-    if (( $(echo "$ratio > $MAX_NS_RATIO" | bc -l) )); then
-        echo -e "${RED}REGRESSION${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x, max: ${MAX_NS_RATIO}x)"
-        REGRESSIONS=$((REGRESSIONS + 1))
-    elif (( $(echo "$ratio > 1.1" | bc -l) )); then
-        echo -e "${YELLOW}WARNING${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)"
-    else
-        echo -e "${GREEN}OK${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)"
-    fi
-done
-
-echo ""
-echo "================================================"
-if [[ $REGRESSIONS -gt 0 ]]; then
-    echo -e "${RED}RUNTIME REGRESSIONS: $REGRESSIONS${NC}"
-    if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
-        exit 1
-    fi
-else
-    echo -e "${GREEN}NO RUNTIME REGRESSIONS${NC}"
-fi
-echo "================================================"
-echo ""
-echo "Report: ${REPORT_FILE}"

From b7ee014f3bd8a4fc94c1c7823904c05f64d57c41 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 17:39:48 +0100
Subject: [PATCH 09/14] chore: ignore generated baseline files

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9a58d8e..419dfaa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,4 +23,4 @@ cover.out
 tests/e2e/results/*.txt
 tests/benchmark/results/*.json
 tests/benchmark/results/*.md
-tests/benchmark/baselines/*.backup.json
\ No newline at end of file
+tests/benchmark/baselines/*.json
\ No newline at end of file

From 9ef5b362c52659a3c9a3b9c93a08aa81634401f5 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 17:59:19 +0100
Subject: [PATCH 10/14] chore: simplify dev tool and update SKILL.md

- Remove redundant ./dev loop (same as ./dev bench)
- Add cmd/semantic-bench to architecture docs
- Simplify benchmark improvement loop section
---
 README.md                    |  2 +-
 dev                          |  7 ----
 scripts/check-docs-links.sh  | 62 ++++++++++++++++++++++++++++++++++++
 skills/semantic-dev/SKILL.md | 54 +++++++------------------------
 4 files changed, 74 insertions(+), 51 deletions(-)
 create mode 100755 scripts/check-docs-links.sh

diff --git a/README.md b/README.md
index 57e3053..83fb48e 100644
--- a/README.md
+++ b/README.md
@@ -204,7 +204,7 @@ The library uses only the Go standard library. No external dependencies, no mode
 
 ## Design Trade-offs
 
-See [docs/DESIGN.md](docs/DESIGN.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.
+See [docs/architecture/design-decisions.md](docs/architecture/design-decisions.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.
 
 ## Origin
 
diff --git a/dev b/dev
index 987e04c..5d8c88d 100755
--- a/dev
+++ b/dev
@@ -33,7 +33,6 @@ commands=(
   "runtime:⏱️:Check runtime baseline"
   "tune:🎛️:Tune combined weights"
   "e2e:🐳:Run E2E tests (Docker)"
-  "loop:🔄:Benchmark loop (bench → compare → report)"
 )
 
 show_help() {
@@ -214,11 +213,6 @@ run_e2e() {
   bash scripts/e2e.sh
 }
 
-run_loop() {
-  echo "  ${ACCENT}${BOLD}🔄 Benchmark Loop${NC}"
-  go run ./cmd/semantic-bench check -verbose "$@"
-}
-
 case "${1:-help}" in
   pr)        run_pr ;;
   doctor)    exec bash scripts/doctor.sh ;;
@@ -258,6 +252,5 @@ case "${1:-help}" in
   runtime)   shift; run_runtime "$@" ;;
   tune)      shift; run_tune "$@" ;;
   e2e)       run_e2e ;;
-  loop)      run_loop ;;
   help|*)    show_help ;;
 esac
diff --git a/scripts/check-docs-links.sh b/scripts/check-docs-links.sh
new file mode 100755
index 0000000..90a8738
--- /dev/null
+++ b/scripts/check-docs-links.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#
+# Check for broken documentation links
+#
+# Usage:
+#   ./scripts/check-docs-links.sh
+#
+set -uo pipefail
+
+cd "$(dirname "$0")/.."
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NC='\033[0m'
+
+ERRORS=0
+
+echo "Checking documentation links..."
+echo ""
+
+# Find all markdown files and check links
+while IFS= read -r file; do
+    dir=$(dirname "$file")
+
+    # Extract markdown links: [text](path)
+    while IFS= read -r link; do
+        # Skip URLs and anchors
+        if [[ "$link" =~ ^https?:// ]] || [[ "$link" =~ ^mailto: ]] || [[ "$link" =~ ^# ]]; then
+            continue
+        fi
+        
+        # Remove anchor from link
+        link_path="${link%%#*}"
+        
+        # Skip empty paths
+        if [[ -z "$link_path" ]]; then
+            continue
+        fi
+        
+        # Resolve relative path
+        if [[ "$link_path" =~ ^/ ]]; then
+            target="$link_path"
+        else
+            target="$dir/$link_path"
+        fi
+        
+        # Check if target exists
+        if [[ ! -e "$target" ]]; then
+            echo -e "${RED}BROKEN:${NC} $file -> $link"
+            ERRORS=$((ERRORS + 1))
+        fi
+    done < <(grep -oE '\]\([^)]+\)' "$file" 2>/dev/null | sed 's/\](//' | sed 's/)//')
+done < <(find . -name "*.md" -not -path "./.git/*" -not -path "./node_modules/*")
+
+echo ""
+if [[ $ERRORS -eq 0 ]]; then
+    echo -e "${GREEN}✓${NC} All documentation links valid"
+    exit 0
+else
+    echo -e "${RED}Found $ERRORS broken link(s)${NC}"
+    exit 1
+fi
diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md
index 7cbb684..2bea9dd 100644
--- a/skills/semantic-dev/SKILL.md
+++ b/skills/semantic-dev/SKILL.md
@@ -65,6 +65,7 @@ recovery/                  Public subpackage
   failure.go                 FailureType classification
 
 cmd/semantic/main.go       CLI tool (find, match, classify)
+cmd/semantic-bench/        Benchmark CLI (check, baseline, calibrate, tune, runtime)
 ```
 
 ## Key Design Decisions
@@ -92,57 +93,24 @@ cmd/semantic/main.go       CLI tool (find, match, classify)
 
 ## Benchmark Improvement Loop
 
-When implementing changes that affect matching quality, follow this loop:
-
-### Step 1: Ensure baseline exists
-
-```bash
-./dev baseline
-```
-
-Creates `tests/benchmark/baselines/combined.json` if missing.
-
-### Step 2: Implement change
-
-Make one focused improvement at a time.
-
-### Step 3: Run benchmark loop
+When implementing changes that affect matching quality:
 
 ```bash
-./dev loop
+./dev baseline          # create baseline (first time only)
+# ... make changes ...
+./dev bench             # run benchmark, compare to baseline
+./dev baseline update   # accept new baseline (if improved)
 ```
 
-Shows comparison table with deltas:
-- **Green (+)** = improved
-- **Red (-)** = regressed  
-- **Gray** = unchanged
-
-### Step 4: Evaluate and decide
-
-| Result | Action |
-|--------|--------|
-| All metrics improved/unchanged | `./dev baseline update` |
-| Mixed (some up, some down) | Investigate tradeoff |
-| Key metrics regressed | Fix before merging |
-
-### Step 5: Iterate
-
-Repeat steps 2-4. Each `baseline update` sets new goalpost.
-
-### Key metrics
-
+**Key metrics:**
 - **MRR** — Mean Reciprocal Rank (higher = finds correct element faster)
 - **P@1** — Precision at 1 (is top result correct?)
 - **Hit@3** — Any correct result in top 3?
-- **Margin** — Score gap between best correct and best wrong
-
-### Adding test cases
-
-When a query should work better:
 
-1. Add to `tests/benchmark/corpus/*/queries.json` or `cases/*.json`
-2. Run `./dev lint corpus`
-3. Run `./dev loop` — benchmark will show regression until fixed
+**Adding test cases:**
+1. Add to `tests/benchmark/corpus/*/queries.json`
+2. Run `./dev lint corpus` to validate
+3. Run `./dev bench` — shows regression until fixed
 
 ## Public API Surface
 

From 8786b2f430e99ae2b88ec02485234451443bd275 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 18:04:03 +0100
Subject: [PATCH 11/14] refactor: split benchmark commands.go into separate
 files

- types.go: shared result types
- check.go: RunCheck, PrintCheckResult
- compare.go: RunCompare, PrintCompareResult
- lint.go: RunLint, PrintLintResult
- catalog.go: RunCatalog, PrintCatalogResult
- baseline.go: baseline management
- calibrate.go: threshold calibration
- tune.go: weight tuning
- runtime.go: Go benchmark performance
---
 internal/benchmark/baseline.go  |  110 ++++
 internal/benchmark/calibrate.go |  175 +++++
 internal/benchmark/catalog.go   |   75 +++
 internal/benchmark/check.go     |  237 +++++++
 internal/benchmark/commands.go  | 1082 -------------------------------
 internal/benchmark/compare.go   |   78 +++
 internal/benchmark/lint.go      |   68 ++
 internal/benchmark/runtime.go   |  217 +++++++
 internal/benchmark/tune.go      |   90 +++
 internal/benchmark/types.go     |   67 ++
 10 files changed, 1117 insertions(+), 1082 deletions(-)
 create mode 100644 internal/benchmark/baseline.go
 create mode 100644 internal/benchmark/calibrate.go
 create mode 100644 internal/benchmark/catalog.go
 create mode 100644 internal/benchmark/check.go
 delete mode 100644 internal/benchmark/commands.go
 create mode 100644 internal/benchmark/compare.go
 create mode 100644 internal/benchmark/lint.go
 create mode 100644 internal/benchmark/runtime.go
 create mode 100644 internal/benchmark/tune.go
 create mode 100644 internal/benchmark/types.go

diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go
new file mode 100644
index 0000000..de2a371
--- /dev/null
+++ b/internal/benchmark/baseline.go
@@ -0,0 +1,110 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+type BaselineResult struct {
+	Action   string          `json:"action"`
+	Path     string          `json:"path"`
+	Metrics  OverallMetrics  `json:"metrics"`
+	Previous *OverallMetrics `json:"previous,omitempty"`
+}
+
+func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) {
+	root := FindBenchmarkRoot()
+	baselinesDir := filepath.Join(root, "baselines")
+	if err := os.MkdirAll(baselinesDir, 0755); err != nil {
+		return nil, err
+	}
+
+	baselinePath := filepath.Join(baselinesDir, cfg.Name+".json")
+
+	switch cfg.Action {
+	case "create":
+		return createBaseline(root, baselinePath, cfg)
+	case "update":
+		if !cfg.Accept {
+			return nil, fmt.Errorf("use --accept to confirm baseline update")
+		}
+		return updateBaseline(root, baselinePath, cfg)
+	default:
+		return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action)
+	}
+}
+
+func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Mode:            "library",
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	data, err := json.MarshalIndent(report, "", "  ")
+	if err != nil {
+		return nil, err
+	}
+	if err := os.WriteFile(baselinePath, data, 0644); err != nil {
+		return nil, err
+	}
+
+	return &BaselineResult{
+		Action:  "create",
+		Path:    baselinePath,
+		Metrics: report.Metrics.Overall,
+	}, nil
+}
+
+func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	var previous *OverallMetrics
+	if data, err := os.ReadFile(baselinePath); err == nil {
+		var old Report
+		if json.Unmarshal(data, &old) == nil {
+			previous = &old.Metrics.Overall
+		}
+		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
+		os.WriteFile(backupPath, data, 0644)
+	}
+
+	result, err := createBaseline(root, baselinePath, cfg)
+	if err != nil {
+		return nil, err
+	}
+	result.Action = "update"
+	result.Previous = previous
+	return result, nil
+}
+
+func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) {
+	fmt.Printf("\n  Baseline %sd: %s\n\n", result.Action, result.Path)
+	fmt.Printf("  MRR:    %.4f\n", result.Metrics.MRR)
+	fmt.Printf("  P@1:    %.4f\n", result.Metrics.PAt1)
+	fmt.Printf("  Hit@3:  %.4f\n", result.Metrics.HitAt3)
+
+	if result.Previous != nil {
+		fmt.Printf("\n  Previous:\n")
+		fmt.Printf("    MRR:    %.4f\n", result.Previous.MRR)
+		fmt.Printf("    P@1:    %.4f\n", result.Previous.PAt1)
+		fmt.Printf("    Hit@3:  %.4f\n", result.Previous.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go
new file mode 100644
index 0000000..9c9fa33
--- /dev/null
+++ b/internal/benchmark/calibrate.go
@@ -0,0 +1,175 @@
+package benchmark
+
+import (
+	"fmt"
+
+	"github.com/pinchtab/semantic"
+)
+
+type CalibrateResult struct {
+	ByThreshold     map[string]ThresholdMetrics `json:"by_threshold"`
+	Recommendations CalibrateRecommendations    `json:"recommendations"`
+	TotalCases      int                         `json:"total_cases"`
+}
+
+type ThresholdMetrics struct {
+	TP        int     `json:"tp"`
+	FP        int     `json:"fp"`
+	FN        int     `json:"fn"`
+	TN        int     `json:"tn"`
+	Recall    float64 `json:"recall"`
+	Precision float64 `json:"precision"`
+	FPR       float64 `json:"false_positive_rate"`
+	F1        float64 `json:"f1"`
+}
+
+type CalibrateRecommendations struct {
+	DefaultThreshold  float64 `json:"default_threshold"`
+	RecoveryThreshold float64 `json:"recovery_threshold"`
+	BestF1            float64 `json:"best_f1"`
+}
+
+func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &CalibrateResult{
+		ByThreshold: make(map[string]ThresholdMetrics),
+	}
+
+	type testCase struct {
+		query  Query
+		corpus *Corpus
+	}
+
+	var cases []testCase
+	for i := range ds.Corpora {
+		corpus := &ds.Corpora[i]
+		if cfg.Corpus != "" && corpus.ID != cfg.Corpus {
+			continue
+		}
+		for _, q := range corpus.Queries {
+			cases = append(cases, testCase{query: q, corpus: corpus})
+		}
+	}
+	result.TotalCases = len(cases)
+
+	if cfg.Verbose {
+		fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases))
+	}
+
+	runCfg := RunConfig{
+		Strategy:        "combined",
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+	}
+	matcher := createMatcher(runCfg)
+
+	var bestF1, bestF1Threshold float64
+	var bestRecallThreshold float64
+	var bestRecallWithPrecision float64
+
+	for _, threshold := range cfg.Thresholds {
+		tp, fp, fn, tn := 0, 0, 0, 0
+
+		for _, tc := range cases {
+			findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
+				Threshold: threshold,
+				TopK:      5,
+			})
+
+			hasMatch := len(findResult.Matches) > 0
+			topRef := ""
+			if hasMatch {
+				topRef = findResult.Matches[0].Ref
+			}
+
+			if tc.query.ExpectNoMatch {
+				if hasMatch {
+					fp++
+				} else {
+					tn++
+				}
+			} else if len(tc.query.RelevantRefs) > 0 {
+				if !hasMatch {
+					fn++
+				} else if contains(tc.query.RelevantRefs, topRef) {
+					tp++
+				} else {
+					fp++
+				}
+			}
+		}
+
+		totalPos := tp + fn
+		totalNeg := tn + fp
+
+		var recall, precision, fpr, f1 float64
+		if totalPos > 0 {
+			recall = float64(tp) / float64(totalPos)
+		}
+		if tp+fp > 0 {
+			precision = float64(tp) / float64(tp+fp)
+		}
+		if totalNeg > 0 {
+			fpr = float64(fp) / float64(totalNeg)
+		}
+		if precision+recall > 0 {
+			f1 = 2 * precision * recall / (precision + recall)
+		}
+
+		key := fmt.Sprintf("%.2f", threshold)
+		result.ByThreshold[key] = ThresholdMetrics{
+			TP: tp, FP: fp, FN: fn, TN: tn,
+			Recall: recall, Precision: precision, FPR: fpr, F1: f1,
+		}
+
+		if f1 > bestF1 {
+			bestF1 = f1
+			bestF1Threshold = threshold
+		}
+		if recall >= 0.85 && precision > bestRecallWithPrecision {
+			bestRecallWithPrecision = precision
+			bestRecallThreshold = threshold
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n",
+				threshold, tp, fp, fn, tn, recall, precision, f1)
+		}
+	}
+
+	if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 {
+		bestRecallThreshold = cfg.Thresholds[0]
+	}
+
+	result.Recommendations = CalibrateRecommendations{
+		DefaultThreshold:  bestF1Threshold,
+		RecoveryThreshold: bestRecallThreshold,
+		BestF1:            bestF1,
+	}
+
+	return result, nil
+}
+
+func contains(refs []string, ref string) bool {
+	for _, r := range refs {
+		if r == ref {
+			return true
+		}
+	}
+	return false
+}
+
+func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) {
+	fmt.Printf("\n  Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold))
+
+	fmt.Printf("  Recommendations:\n")
+	fmt.Printf("    Default (best F1):   %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1)
+	fmt.Printf("    Recovery (recall):   %.2f\n", result.Recommendations.RecoveryThreshold)
+	fmt.Println()
+}
diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go
new file mode 100644
index 0000000..b4c4ec1
--- /dev/null
+++ b/internal/benchmark/catalog.go
@@ -0,0 +1,75 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"sort"
+)
+
+func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, err
+	}
+
+	result := &CatalogResult{
+		ByTag:        make(map[string]int),
+		ByDifficulty: make(map[string]int),
+	}
+
+	for _, c := range ds.Corpora {
+		tags := make(map[string]bool)
+		for _, q := range c.Queries {
+			result.TotalQueries++
+			result.ByDifficulty[q.Difficulty]++
+			for _, t := range q.Tags {
+				tags[t] = true
+				result.ByTag[t]++
+			}
+		}
+		var tagList []string
+		for t := range tags {
+			tagList = append(tagList, t)
+		}
+		sort.Strings(tagList)
+		result.Corpora = append(result.Corpora, CorpusSummary{
+			ID:      c.ID,
+			Queries: len(c.Queries),
+			Tags:    tagList,
+		})
+	}
+
+	return result, nil
+}
+
+func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n  Corpora: %d\n", len(result.Corpora))
+	fmt.Printf("  Total Queries: %d\n\n", result.TotalQueries)
+
+	fmt.Printf("  %-30s %8s\n", "Corpus", "Queries")
+	fmt.Printf("  %-30s %8s\n", "------", "-------")
+	for _, c := range result.Corpora {
+		fmt.Printf("  %-30s %8d\n", c.ID, c.Queries)
+	}
+
+	switch cfg.By {
+	case "difficulty":
+		fmt.Printf("\n  By Difficulty:\n")
+		for d, n := range result.ByDifficulty {
+			fmt.Printf("    %-10s %4d\n", d, n)
+		}
+	case "tag":
+		fmt.Printf("\n  By Tag:\n")
+		for t, n := range result.ByTag {
+			fmt.Printf("    %-20s %4d\n", t, n)
+		}
+	}
+	fmt.Printf("\n")
+}
diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
new file mode 100644
index 0000000..81171bb
--- /dev/null
+++ b/internal/benchmark/check.go
@@ -0,0 +1,237 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+func RunCheck(cfg CheckConfig) (*CheckResult, error) {
+	root := FindBenchmarkRoot()
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	benchCfg, _ := LoadConfig(root)
+	profile := Profile{
+		Strategy:  "combined",
+		Threshold: 0.01,
+		TopK:      5,
+		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
+	}
+	if benchCfg != nil {
+		profile = ResolveProfile(benchCfg, cfg.Profile)
+	}
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        profile.Strategy,
+		Threshold:       profile.Threshold,
+		TopK:            profile.TopK,
+		LexicalWeight:   profile.Weights.Lexical,
+		EmbeddingWeight: profile.Weights.Embedding,
+		Profile:         cfg.Profile,
+		Mode:            "library",
+		Verbose:         cfg.Verbose,
+		Explain:         cfg.Explain,
+		OutputDir:       cfg.OutputDir,
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	result := &CheckResult{
+		Status: "pass",
+		Report: report,
+	}
+	result.Summary.PAt1 = report.Metrics.Overall.PAt1
+	result.Summary.MRR = report.Metrics.Overall.MRR
+	result.Summary.HitAt3 = report.Metrics.Overall.HitAt3
+	result.Summary.Total = report.Metrics.Overall.Total
+
+	for _, r := range report.Results {
+		if r.Status == "miss" {
+			result.TopRegs = append(result.TopRegs, Regression{
+				ID:           r.ID,
+				Corpus:       r.Corpus,
+				Query:        r.Query,
+				Expected:     r.Expected.RelevantRefs,
+				CurrentRef:   r.Actual.BestRef,
+				Reason:       "miss",
+				DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID),
+			})
+		}
+	}
+	result.Summary.Regressions = len(result.TopRegs)
+
+	baselinePath := cfg.BaselinePath
+	if baselinePath == "" {
+		baselinePath = filepath.Join(root, "baselines", "combined.json")
+	}
+	if _, err := os.Stat(baselinePath); err == nil {
+		baseline, err := loadReport(baselinePath)
+		if err == nil {
+			result.Delta = &MetricsDelta{
+				PAt1:   report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+				MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+			}
+			if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) {
+				result.Status = "fail"
+			}
+		}
+	}
+
+	os.MkdirAll(cfg.OutputDir, 0755)
+	ts := time.Now().Format("20060102_150405")
+	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
+	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
+
+	reportJSON, _ := json.MarshalIndent(report, "", "  ")
+	os.WriteFile(reportPath, reportJSON, 0644)
+
+	summaryMD := generateSummaryMD(report, result)
+	os.WriteFile(summaryPath, []byte(summaryMD), 0644)
+
+	result.Artifacts.ReportJSON = reportPath
+	result.Artifacts.SummaryMD = summaryPath
+
+	return result, nil
+}
+
+func RunBenchmark(cfg RunConfig) (*Report, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, err
+	}
+	return RunCorpusBenchmark(ds, cfg)
+}
+
+func loadReport(path string) (*Report, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var r Report
+	if err := json.Unmarshal(data, &r); err != nil {
+		return nil, err
+	}
+	return &r, nil
+}
+
+func generateSummaryMD(report *Report, result *CheckResult) string {
+	var sb strings.Builder
+
+	sb.WriteString("# Benchmark Summary\n\n")
+	sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp))
+
+	sb.WriteString("## Overall Metrics\n\n")
+	sb.WriteString("| Metric | Value |\n")
+	sb.WriteString("|--------|-------|\n")
+	sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total))
+	sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR))
+	sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1))
+	sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3))
+	sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin))
+
+	if result.Delta != nil {
+		sb.WriteString("\n## Delta from Baseline\n\n")
+		sb.WriteString("| Metric | Delta |\n")
+		sb.WriteString("|--------|-------|\n")
+		sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1))
+		sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR))
+		sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3))
+	}
+
+	if len(result.TopRegs) > 0 {
+		sb.WriteString("\n## Misses\n\n")
+		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
+		sb.WriteString("|----|--------|-------|-----|----------|\n")
+		for _, r := range result.TopRegs {
+			if len(result.TopRegs) > 10 {
+				break
+			}
+			sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n",
+				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")))
+		}
+	}
+
+	return sb.String()
+}
+
+func PrintCheckResult(result *CheckResult, cfg CheckConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m Benchmark passed\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Benchmark failed\n")
+	}
+	fmt.Printf("\n")
+
+	fmt.Printf("  %-12s %8.4f\n", "MRR", result.Summary.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", result.Summary.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", result.Summary.Total)
+	fmt.Printf("  %-12s %8d\n", "Misses", result.Summary.Regressions)
+
+	if result.Delta != nil {
+		fmt.Printf("\n  Delta from baseline:\n")
+		printDelta("P@1", result.Delta.PAt1)
+		printDelta("MRR", result.Delta.MRR)
+		printDelta("Hit@3", result.Delta.HitAt3)
+	}
+
+	fmt.Printf("\n  Artifacts:\n")
+	fmt.Printf("    Report:  %s\n", result.Artifacts.ReportJSON)
+	fmt.Printf("    Summary: %s\n", result.Artifacts.SummaryMD)
+	fmt.Printf("\n")
+}
+
+func printDelta(name string, delta float64) {
+	color := "\033[0m"
+	sign := ""
+	if delta > 0.001 {
+		color = "\033[32m"
+		sign = "+"
+	} else if delta < -0.001 {
+		color = "\033[31m"
+	}
+	fmt.Printf("    %s%-8s %s%.4f\033[0m\n", color, name, sign, delta)
+}
+
+func PrintRunResult(report *Report, cfg RunConfig) {
+	fmt.Printf("\n")
+	fmt.Printf("  %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR)
+	fmt.Printf("  %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1)
+	fmt.Printf("  %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3)
+	fmt.Printf("  %-12s %8d\n", "Total", report.Metrics.Overall.Total)
+	fmt.Printf("\n")
+
+	if cfg.Verbose {
+		for _, r := range report.Results {
+			status := "\033[32mHIT \033[0m"
+			switch r.Status {
+			case "miss":
+				status = "\033[31mMISS\033[0m"
+			case "partial":
+				status = "\033[33mPART\033[0m"
+			}
+			fmt.Printf("  [%s] %s | %s | got=%s score=%.3f\n",
+				r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore)
+		}
+	}
+}
diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go
deleted file mode 100644
index f537934..0000000
--- a/internal/benchmark/commands.go
+++ /dev/null
@@ -1,1082 +0,0 @@
-package benchmark
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"sort"
-	"strings"
-	"time"
-
-	"github.com/pinchtab/semantic"
-)
-
-type CheckResult struct {
-	Status    string        `json:"status"`
-	Summary   CheckSummary  `json:"summary"`
-	Delta     *MetricsDelta `json:"delta,omitempty"`
-	TopRegs   []Regression  `json:"top_regressions,omitempty"`
-	Artifacts Artifacts     `json:"artifacts"`
-	Report    *Report       `json:"-"`
-}
-
-type CheckSummary struct {
-	PAt1        float64 `json:"p_at_1"`
-	MRR         float64 `json:"mrr"`
-	HitAt3      float64 `json:"hit_at_3"`
-	Total       int     `json:"total"`
-	Regressions int     `json:"regressions"`
-	Warnings    int     `json:"warnings"`
-}
-
-type MetricsDelta struct {
-	PAt1   float64 `json:"p_at_1"`
-	MRR    float64 `json:"mrr"`
-	HitAt3 float64 `json:"hit_at_3"`
-}
-
-type Regression struct {
-	ID           string   `json:"id"`
-	Corpus       string   `json:"corpus"`
-	Query        string   `json:"query"`
-	Expected     []string `json:"expected"`
-	BaselineRef  string   `json:"baseline_ref,omitempty"`
-	CurrentRef   string   `json:"current_ref"`
-	Reason       string   `json:"reason"`
-	DebugCommand string   `json:"debug_command"`
-}
-
-type Artifacts struct {
-	ReportJSON string `json:"report_json"`
-	SummaryMD  string `json:"summary_md"`
-}
-
-type CompareResult struct {
-	Status       string       `json:"status"`
-	Delta        MetricsDelta `json:"delta"`
-	Regressions  []Regression `json:"regressions"`
-	Improvements []string     `json:"improvements"`
-}
-
-type LintResult struct {
-	Errors   int      `json:"errors"`
-	Warnings int      `json:"warnings"`
-	Messages []string `json:"messages"`
-}
-
-type CatalogResult struct {
-	Corpora      []CorpusSummary `json:"corpora"`
-	TotalQueries int             `json:"total_queries"`
-	ByTag        map[string]int  `json:"by_tag,omitempty"`
-	ByDifficulty map[string]int  `json:"by_difficulty,omitempty"`
-}
-
-type CorpusSummary struct {
-	ID      string   `json:"id"`
-	Queries int      `json:"queries"`
-	Tags    []string `json:"tags"`
-}
-
-func RunCheck(cfg CheckConfig) (*CheckResult, error) {
-	root := FindBenchmarkRoot()
-
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, fmt.Errorf("load dataset: %w", err)
-	}
-
-	benchCfg, _ := LoadConfig(root)
-	profile := Profile{
-		Strategy:  "combined",
-		Threshold: 0.01,
-		TopK:      5,
-		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
-	}
-	if benchCfg != nil {
-		profile = ResolveProfile(benchCfg, cfg.Profile)
-	}
-
-	runCfg := RunConfig{
-		Suite:           "corpus",
-		Strategy:        profile.Strategy,
-		Threshold:       profile.Threshold,
-		TopK:            profile.TopK,
-		LexicalWeight:   profile.Weights.Lexical,
-		EmbeddingWeight: profile.Weights.Embedding,
-		Profile:         cfg.Profile,
-		Mode:            "library",
-		Verbose:         cfg.Verbose,
-		Explain:         cfg.Explain,
-		OutputDir:       cfg.OutputDir,
-	}
-
-	report, err := RunCorpusBenchmark(ds, runCfg)
-	if err != nil {
-		return nil, fmt.Errorf("run benchmark: %w", err)
-	}
-
-	result := &CheckResult{
-		Status: "pass",
-		Report: report,
-	}
-	result.Summary.PAt1 = report.Metrics.Overall.PAt1
-	result.Summary.MRR = report.Metrics.Overall.MRR
-	result.Summary.HitAt3 = report.Metrics.Overall.HitAt3
-	result.Summary.Total = report.Metrics.Overall.Total
-
-	// Count misses
-	for _, r := range report.Results {
-		if r.Status == "miss" {
-			result.TopRegs = append(result.TopRegs, Regression{
-				ID:           r.ID,
-				Corpus:       r.Corpus,
-				Query:        r.Query,
-				Expected:     r.Expected.RelevantRefs,
-				CurrentRef:   r.Actual.BestRef,
-				Reason:       "miss",
-				DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID),
-			})
-		}
-	}
-	result.Summary.Regressions = len(result.TopRegs)
-
-	// Compare to baseline if exists
-	baselinePath := cfg.BaselinePath
-	if baselinePath == "" {
-		baselinePath = filepath.Join(root, "baselines", "combined.json")
-	}
-	if _, err := os.Stat(baselinePath); err == nil {
-		baseline, err := loadReport(baselinePath)
-		if err == nil {
-			result.Delta = &MetricsDelta{
-				PAt1:   report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
-				MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
-				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
-			}
-			if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) {
-				result.Status = "fail"
-			}
-		}
-	}
-
-	// Write artifacts
-	os.MkdirAll(cfg.OutputDir, 0755)
-	ts := time.Now().Format("20060102_150405")
-	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
-	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
-
-	reportJSON, _ := json.MarshalIndent(report, "", "  ")
-	os.WriteFile(reportPath, reportJSON, 0644)
-
-	summaryMD := generateSummaryMD(report, result)
-	os.WriteFile(summaryPath, []byte(summaryMD), 0644)
-
-	result.Artifacts.ReportJSON = reportPath
-	result.Artifacts.SummaryMD = summaryPath
-
-	return result, nil
-}
-
-func RunBenchmark(cfg RunConfig) (*Report, error) {
-	root := FindBenchmarkRoot()
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, err
-	}
-	return RunCorpusBenchmark(ds, cfg)
-}
-
-func RunCompare(cfg CompareConfig) (*CompareResult, error) {
-	baseline, err := loadReport(cfg.BaselinePath)
-	if err != nil {
-		return nil, fmt.Errorf("load baseline: %w", err)
-	}
-	current, err := loadReport(cfg.CurrentPath)
-	if err != nil {
-		return nil, fmt.Errorf("load current: %w", err)
-	}
-
-	result := &CompareResult{
-		Status: "pass",
-		Delta: MetricsDelta{
-			PAt1:   current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
-			MRR:    current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
-			HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
-		},
-	}
-
-	if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 {
-		result.Status = "fail"
-	}
-
-	// Find regressions
-	baselineResults := make(map[string]QueryResult)
-	for _, r := range baseline.Results {
-		baselineResults[r.ID] = r
-	}
-	for _, r := range current.Results {
-		if base, ok := baselineResults[r.ID]; ok {
-			if base.Status == "hit" && r.Status != "hit" {
-				result.Regressions = append(result.Regressions, Regression{
-					ID:          r.ID,
-					Corpus:      r.Corpus,
-					Query:       r.Query,
-					BaselineRef: base.Actual.BestRef,
-					CurrentRef:  r.Actual.BestRef,
-					Reason:      fmt.Sprintf("%s -> %s", base.Status, r.Status),
-				})
-			}
-		}
-	}
-
-	return result, nil
-}
-
-func RunLint(cfg LintConfig) (*LintResult, error) {
-	root := FindBenchmarkRoot()
-	result := &LintResult{}
-
-	ds, err := LoadDataset(root)
-	if err != nil {
-		result.Errors++
-		result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err))
-		return result, nil
-	}
-
-	// Check for duplicate IDs
-	ids := make(map[string]string)
-	for _, c := range ds.Corpora {
-		for _, q := range c.Queries {
-			if existing, ok := ids[q.ID]; ok {
-				result.Errors++
-				result.Messages = append(result.Messages,
-					fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing))
-			} else {
-				ids[q.ID] = c.ID
-			}
-		}
-	}
-
-	// Check refs exist
-	for _, c := range ds.Corpora {
-		refs := make(map[string]bool)
-		for _, d := range c.Snapshot {
-			refs[d.Ref] = true
-		}
-		for _, q := range c.Queries {
-			for _, r := range q.RelevantRefs {
-				if !refs[r] {
-					result.Errors++
-					result.Messages = append(result.Messages,
-						fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r))
-				}
-			}
-		}
-	}
-
-	// Check difficulty values
-	validDiff := map[string]bool{"easy": true, "medium": true, "hard": true}
-	for _, c := range ds.Corpora {
-		for _, q := range c.Queries {
-			if q.Difficulty != "" && !validDiff[q.Difficulty] {
-				result.Errors++
-				result.Messages = append(result.Messages,
-					fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID))
-			}
-		}
-	}
-
-	if result.Errors == 0 && result.Warnings == 0 {
-		result.Messages = append(result.Messages, "All checks passed")
-	}
-
-	return result, nil
-}
-
-func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) {
-	root := FindBenchmarkRoot()
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, err
-	}
-
-	result := &CatalogResult{
-		ByTag:        make(map[string]int),
-		ByDifficulty: make(map[string]int),
-	}
-
-	for _, c := range ds.Corpora {
-		tags := make(map[string]bool)
-		for _, q := range c.Queries {
-			result.TotalQueries++
-			result.ByDifficulty[q.Difficulty]++
-			for _, t := range q.Tags {
-				tags[t] = true
-				result.ByTag[t]++
-			}
-		}
-		var tagList []string
-		for t := range tags {
-			tagList = append(tagList, t)
-		}
-		sort.Strings(tagList)
-		result.Corpora = append(result.Corpora, CorpusSummary{
-			ID:      c.ID,
-			Queries: len(c.Queries),
-			Tags:    tagList,
-		})
-	}
-
-	return result, nil
-}
-
-func loadReport(path string) (*Report, error) {
-	data, err := os.ReadFile(path)
-	if err != nil {
-		return nil, err
-	}
-	var r Report
-	if err := json.Unmarshal(data, &r); err != nil {
-		return nil, err
-	}
-	return &r, nil
-}
-
-func generateSummaryMD(report *Report, result *CheckResult) string {
-	var sb strings.Builder
-
-	sb.WriteString("# Benchmark Summary\n\n")
-	sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp))
-
-	sb.WriteString("## Overall Metrics\n\n")
-	sb.WriteString("| Metric | Value |\n")
-	sb.WriteString("|--------|-------|\n")
-	sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total))
-	sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR))
-	sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1))
-	sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3))
-	sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin))
-
-	if result.Delta != nil {
-		sb.WriteString("\n## Delta from Baseline\n\n")
-		sb.WriteString("| Metric | Delta |\n")
-		sb.WriteString("|--------|-------|\n")
-		sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1))
-		sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR))
-		sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3))
-	}
-
-	if len(result.TopRegs) > 0 {
-		sb.WriteString("\n## Misses\n\n")
-		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
-		sb.WriteString("|----|--------|-------|-----|----------|\n")
-		for _, r := range result.TopRegs {
-			if len(result.TopRegs) > 10 {
-				break
-			}
-			sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n",
-				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")))
-		}
-	}
-
-	return sb.String()
-}
-
-func PrintCheckResult(result *CheckResult, cfg CheckConfig) {
-	if cfg.Format == "json" {
-		data, _ := json.MarshalIndent(result, "", "  ")
-		fmt.Println(string(data))
-		return
-	}
-
-	fmt.Printf("\n")
-	if result.Status == "pass" {
-		fmt.Printf("  \033[32m✓\033[0m Benchmark passed\n")
-	} else {
-		fmt.Printf("  \033[31m✗\033[0m Benchmark failed\n")
-	}
-	fmt.Printf("\n")
-
-	fmt.Printf("  %-12s %8.4f\n", "MRR", result.Summary.MRR)
-	fmt.Printf("  %-12s %8.4f\n", "P@1", result.Summary.PAt1)
-	fmt.Printf("  %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3)
-	fmt.Printf("  %-12s %8d\n", "Total", result.Summary.Total)
-	fmt.Printf("  %-12s %8d\n", "Misses", result.Summary.Regressions)
-
-	if result.Delta != nil {
-		fmt.Printf("\n  Delta from baseline:\n")
-		printDelta("P@1", result.Delta.PAt1)
-		printDelta("MRR", result.Delta.MRR)
-		printDelta("Hit@3", result.Delta.HitAt3)
-	}
-
-	fmt.Printf("\n  Artifacts:\n")
-	fmt.Printf("    Report:  %s\n", result.Artifacts.ReportJSON)
-	fmt.Printf("    Summary: %s\n", result.Artifacts.SummaryMD)
-	fmt.Printf("\n")
-}
-
-func printDelta(name string, delta float64) {
-	color := "\033[0m"
-	sign := ""
-	if delta > 0.001 {
-		color = "\033[32m"
-		sign = "+"
-	} else if delta < -0.001 {
-		color = "\033[31m"
-	}
-	fmt.Printf("    %s%-8s %s%.4f\033[0m\n", color, name, sign, delta)
-}
-
-func PrintRunResult(report *Report, cfg RunConfig) {
-	fmt.Printf("\n")
-	fmt.Printf("  %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR)
-	fmt.Printf("  %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1)
-	fmt.Printf("  %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3)
-	fmt.Printf("  %-12s %8d\n", "Total", report.Metrics.Overall.Total)
-	fmt.Printf("\n")
-
-	if cfg.Verbose {
-		for _, r := range report.Results {
-			status := "\033[32mHIT \033[0m"
-			switch r.Status {
-			case "miss":
-				status = "\033[31mMISS\033[0m"
-			case "partial":
-				status = "\033[33mPART\033[0m"
-			}
-			fmt.Printf("  [%s] %s | %s | got=%s score=%.3f\n",
-				r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore)
-		}
-	}
-}
-
-func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
-	fmt.Printf("\n")
-	if result.Status == "pass" {
-		fmt.Printf("  \033[32m✓\033[0m No regression\n")
-	} else {
-		fmt.Printf("  \033[31m✗\033[0m Regression detected\n")
-	}
-	fmt.Printf("\n")
-	printDelta("P@1", result.Delta.PAt1)
-	printDelta("MRR", result.Delta.MRR)
-	printDelta("Hit@3", result.Delta.HitAt3)
-
-	if len(result.Regressions) > 0 {
-		fmt.Printf("\n  Regressions:\n")
-		for _, r := range result.Regressions {
-			fmt.Printf("    %s: %s (%s)\n", r.ID, r.Reason, r.Query)
-		}
-	}
-	fmt.Printf("\n")
-}
-
-func PrintLintResult(result *LintResult, cfg LintConfig) {
-	for _, msg := range result.Messages {
-		fmt.Println(msg)
-	}
-	fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings)
-}
-
-func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
-	if cfg.Format == "json" {
-		data, _ := json.MarshalIndent(result, "", "  ")
-		fmt.Println(string(data))
-		return
-	}
-
-	fmt.Printf("\n  Corpora: %d\n", len(result.Corpora))
-	fmt.Printf("  Total Queries: %d\n\n", result.TotalQueries)
-
-	fmt.Printf("  %-30s %8s\n", "Corpus", "Queries")
-	fmt.Printf("  %-30s %8s\n", "------", "-------")
-	for _, c := range result.Corpora {
-		fmt.Printf("  %-30s %8d\n", c.ID, c.Queries)
-	}
-
-	switch cfg.By {
-	case "difficulty":
-		fmt.Printf("\n  By Difficulty:\n")
-		for d, n := range result.ByDifficulty {
-			fmt.Printf("    %-10s %4d\n", d, n)
-		}
-	case "tag":
-		fmt.Printf("\n  By Tag:\n")
-		for t, n := range result.ByTag {
-			fmt.Printf("    %-20s %4d\n", t, n)
-		}
-	}
-	fmt.Printf("\n")
-}
-
-// Baseline management
-
-type BaselineResult struct {
-	Action   string         `json:"action"`
-	Path     string         `json:"path"`
-	Metrics  OverallMetrics `json:"metrics"`
-	Previous *OverallMetrics `json:"previous,omitempty"`
-}
-
-func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) {
-	root := FindBenchmarkRoot()
-	baselinesDir := filepath.Join(root, "baselines")
-	if err := os.MkdirAll(baselinesDir, 0755); err != nil {
-		return nil, err
-	}
-
-	baselinePath := filepath.Join(baselinesDir, cfg.Name+".json")
-
-	switch cfg.Action {
-	case "create":
-		return createBaseline(root, baselinePath, cfg)
-	case "update":
-		if !cfg.Accept {
-			return nil, fmt.Errorf("use --accept to confirm baseline update")
-		}
-		return updateBaseline(root, baselinePath, cfg)
-	default:
-		return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action)
-	}
-}
-
-func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, fmt.Errorf("load dataset: %w", err)
-	}
-
-	runCfg := RunConfig{
-		Suite:           "corpus",
-		Strategy:        "combined",
-		Threshold:       0.01,
-		TopK:            5,
-		LexicalWeight:   0.6,
-		EmbeddingWeight: 0.4,
-		Mode:            "library",
-	}
-
-	report, err := RunCorpusBenchmark(ds, runCfg)
-	if err != nil {
-		return nil, fmt.Errorf("run benchmark: %w", err)
-	}
-
-	data, err := json.MarshalIndent(report, "", "  ")
-	if err != nil {
-		return nil, err
-	}
-	if err := os.WriteFile(baselinePath, data, 0644); err != nil {
-		return nil, err
-	}
-
-	return &BaselineResult{
-		Action:  "create",
-		Path:    baselinePath,
-		Metrics: report.Metrics.Overall,
-	}, nil
-}
-
-func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
-	var previous *OverallMetrics
-	if data, err := os.ReadFile(baselinePath); err == nil {
-		var old Report
-		if json.Unmarshal(data, &old) == nil {
-			previous = &old.Metrics.Overall
-		}
-		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
-		os.WriteFile(backupPath, data, 0644)
-	}
-
-	result, err := createBaseline(root, baselinePath, cfg)
-	if err != nil {
-		return nil, err
-	}
-	result.Action = "update"
-	result.Previous = previous
-	return result, nil
-}
-
-func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) {
-	fmt.Printf("\n  Baseline %sd: %s\n\n", result.Action, result.Path)
-	fmt.Printf("  MRR:    %.4f\n", result.Metrics.MRR)
-	fmt.Printf("  P@1:    %.4f\n", result.Metrics.PAt1)
-	fmt.Printf("  Hit@3:  %.4f\n", result.Metrics.HitAt3)
-
-	if result.Previous != nil {
-		fmt.Printf("\n  Previous:\n")
-		fmt.Printf("    MRR:    %.4f\n", result.Previous.MRR)
-		fmt.Printf("    P@1:    %.4f\n", result.Previous.PAt1)
-		fmt.Printf("    Hit@3:  %.4f\n", result.Previous.HitAt3)
-	}
-	fmt.Println()
-}
-
-// Threshold calibration
-
-type CalibrateResult struct {
-	ByThreshold     map[string]ThresholdMetrics `json:"by_threshold"`
-	Recommendations CalibrateRecommendations    `json:"recommendations"`
-	TotalCases      int                         `json:"total_cases"`
-}
-
-type ThresholdMetrics struct {
-	TP        int     `json:"tp"`
-	FP        int     `json:"fp"`
-	FN        int     `json:"fn"`
-	TN        int     `json:"tn"`
-	Recall    float64 `json:"recall"`
-	Precision float64 `json:"precision"`
-	FPR       float64 `json:"false_positive_rate"`
-	F1        float64 `json:"f1"`
-}
-
-type CalibrateRecommendations struct {
-	DefaultThreshold  float64 `json:"default_threshold"`
-	RecoveryThreshold float64 `json:"recovery_threshold"`
-	BestF1            float64 `json:"best_f1"`
-}
-
-func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
-	root := FindBenchmarkRoot()
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, fmt.Errorf("load dataset: %w", err)
-	}
-
-	result := &CalibrateResult{
-		ByThreshold: make(map[string]ThresholdMetrics),
-	}
-
-	type testCase struct {
-		query         Query
-		corpus        *Corpus
-	}
-
-	var cases []testCase
-	for i := range ds.Corpora {
-		corpus := &ds.Corpora[i]
-		if cfg.Corpus != "" && corpus.ID != cfg.Corpus {
-			continue
-		}
-		for _, q := range corpus.Queries {
-			cases = append(cases, testCase{query: q, corpus: corpus})
-		}
-	}
-	result.TotalCases = len(cases)
-
-	if cfg.Verbose {
-		fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases))
-	}
-
-	runCfg := RunConfig{
-		Strategy:        "combined",
-		TopK:            5,
-		LexicalWeight:   0.6,
-		EmbeddingWeight: 0.4,
-	}
-	matcher := createMatcher(runCfg)
-
-	var bestF1, bestF1Threshold float64
-	var bestRecallThreshold float64
-	var bestRecallWithPrecision float64
-
-	for _, threshold := range cfg.Thresholds {
-		tp, fp, fn, tn := 0, 0, 0, 0
-
-		for _, tc := range cases {
-			findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
-				Threshold: threshold,
-				TopK:      5,
-			})
-
-			hasMatch := len(findResult.Matches) > 0
-			topRef := ""
-			if hasMatch {
-				topRef = findResult.Matches[0].Ref
-			}
-
-			if tc.query.ExpectNoMatch {
-				if hasMatch {
-					fp++
-				} else {
-					tn++
-				}
-			} else if len(tc.query.RelevantRefs) > 0 {
-				if !hasMatch {
-					fn++
-				} else if contains(tc.query.RelevantRefs, topRef) {
-					tp++
-				} else {
-					fp++
-				}
-			}
-		}
-
-		totalPos := tp + fn
-		totalNeg := tn + fp
-
-		var recall, precision, fpr, f1 float64
-		if totalPos > 0 {
-			recall = float64(tp) / float64(totalPos)
-		}
-		if tp+fp > 0 {
-			precision = float64(tp) / float64(tp+fp)
-		}
-		if totalNeg > 0 {
-			fpr = float64(fp) / float64(totalNeg)
-		}
-		if precision+recall > 0 {
-			f1 = 2 * precision * recall / (precision + recall)
-		}
-
-		key := fmt.Sprintf("%.2f", threshold)
-		result.ByThreshold[key] = ThresholdMetrics{
-			TP: tp, FP: fp, FN: fn, TN: tn,
-			Recall: recall, Precision: precision, FPR: fpr, F1: f1,
-		}
-
-		if f1 > bestF1 {
-			bestF1 = f1
-			bestF1Threshold = threshold
-		}
-		if recall >= 0.85 && precision > bestRecallWithPrecision {
-			bestRecallWithPrecision = precision
-			bestRecallThreshold = threshold
-		}
-
-		if cfg.Verbose {
-			fmt.Printf("  threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n",
-				threshold, tp, fp, fn, tn, recall, precision, f1)
-		}
-	}
-
-	if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 {
-		bestRecallThreshold = cfg.Thresholds[0]
-	}
-
-	result.Recommendations = CalibrateRecommendations{
-		DefaultThreshold:  bestF1Threshold,
-		RecoveryThreshold: bestRecallThreshold,
-		BestF1:            bestF1,
-	}
-
-	return result, nil
-}
-
-func contains(refs []string, ref string) bool {
-	for _, r := range refs {
-		if r == ref {
-			return true
-		}
-	}
-	return false
-}
-
-func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) {
-	fmt.Printf("\n  Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold))
-
-	fmt.Printf("  Recommendations:\n")
-	fmt.Printf("    Default (best F1):   %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1)
-	fmt.Printf("    Recovery (recall):   %.2f\n", result.Recommendations.RecoveryThreshold)
-	fmt.Println()
-}
-
-// Weight tuning
-
-type TuneResult struct {
-	Results []TuneRun `json:"results"`
-	Best    *TuneRun  `json:"best"`
-}
-
-type TuneRun struct {
-	LexicalWeight   float64 `json:"lexical_weight"`
-	EmbeddingWeight float64 `json:"embedding_weight"`
-	MRR             float64 `json:"mrr"`
-	PAt1            float64 `json:"p_at_1"`
-	HitAt3          float64 `json:"hit_at_3"`
-}
-
-func RunTune(cfg TuneConfig) (*TuneResult, error) {
-	root := FindBenchmarkRoot()
-	ds, err := LoadDataset(root)
-	if err != nil {
-		return nil, fmt.Errorf("load dataset: %w", err)
-	}
-
-	result := &TuneResult{}
-
-	if cfg.Verbose {
-		fmt.Printf("  %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3")
-	}
-
-	for w := 0.0; w <= 1.0001; w += cfg.Step {
-		lexW := w
-		embW := 1.0 - w
-
-		runCfg := RunConfig{
-			Suite:           "corpus",
-			Strategy:        "combined",
-			Threshold:       0.01,
-			TopK:            5,
-			LexicalWeight:   lexW,
-			EmbeddingWeight: embW,
-			Mode:            "library",
-		}
-
-		if cfg.Corpus != "" {
-			runCfg.Corpus = cfg.Corpus
-		}
-
-		report, err := RunCorpusBenchmark(ds, runCfg)
-		if err != nil {
-			return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err)
-		}
-
-		run := TuneRun{
-			LexicalWeight:   lexW,
-			EmbeddingWeight: embW,
-			MRR:             report.Metrics.Overall.MRR,
-			PAt1:            report.Metrics.Overall.PAt1,
-			HitAt3:          report.Metrics.Overall.HitAt3,
-		}
-		result.Results = append(result.Results, run)
-
-		if result.Best == nil || run.PAt1 > result.Best.PAt1 ||
-			(run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) {
-			best := run
-			result.Best = &best
-		}
-
-		if cfg.Verbose {
-			fmt.Printf("  %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n",
-				lexW, embW, run.MRR, run.PAt1, run.HitAt3)
-		}
-	}
-
-	return result, nil
-}
-
-func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
-	fmt.Printf("\n  Tested %d weight combinations\n\n", len(result.Results))
-
-	if result.Best != nil {
-		fmt.Printf("  Best weights:\n")
-		fmt.Printf("    Lexical:   %.2f\n", result.Best.LexicalWeight)
-		fmt.Printf("    Embedding: %.2f\n", result.Best.EmbeddingWeight)
-		fmt.Printf("    MRR:       %.4f\n", result.Best.MRR)
-		fmt.Printf("    P@1:       %.4f\n", result.Best.PAt1)
-		fmt.Printf("    Hit@3:     %.4f\n", result.Best.HitAt3)
-	}
-	fmt.Println()
-}
-
-// Runtime baseline
-
-type RuntimeResult struct {
-	Status      string                     `json:"status"`
-	Benchmarks  []RuntimeBenchmark         `json:"benchmarks"`
-	Regressions int                        `json:"regressions"`
-	BaselinePath string                    `json:"baseline_path"`
-	Created     bool                       `json:"created"`
-}
-
-type RuntimeBenchmark struct {
-	Name       string  `json:"name"`
-	NsOp       float64 `json:"ns_op"`
-	BytesOp    int     `json:"bytes_op"`
-	AllocsOp   int     `json:"allocs_op"`
-	BaselineNs float64 `json:"baseline_ns,omitempty"`
-	Ratio      float64 `json:"ratio,omitempty"`
-	Status     string  `json:"status"`
-}
-
-type runtimeBaseline struct {
-	Timestamp  string             `json:"timestamp"`
-	Benchmarks []RuntimeBenchmark `json:"benchmarks"`
-}
-
-func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
-	root := FindBenchmarkRoot()
-	baselinePath := filepath.Join(root, "baselines", "runtime.json")
-
-	benchmarks, err := runGoBenchmarks()
-	if err != nil {
-		return nil, err
-	}
-
-	result := &RuntimeResult{
-		Status:       "pass",
-		Benchmarks:   benchmarks,
-		BaselinePath: baselinePath,
-	}
-
-	if _, err := os.Stat(baselinePath); os.IsNotExist(err) {
-		if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil {
-			return nil, err
-		}
-		result.Created = true
-		return result, nil
-	}
-
-	baseline, err := loadRuntimeBaseline(baselinePath)
-	if err != nil {
-		return nil, err
-	}
-
-	baselineMap := make(map[string]RuntimeBenchmark)
-	for _, b := range baseline.Benchmarks {
-		baselineMap[b.Name] = b
-	}
-
-	maxRatio := 1.25
-	for i, b := range result.Benchmarks {
-		if base, ok := baselineMap[b.Name]; ok {
-			ratio := b.NsOp / base.NsOp
-			result.Benchmarks[i].BaselineNs = base.NsOp
-			result.Benchmarks[i].Ratio = ratio
-
-			if ratio > maxRatio {
-				result.Benchmarks[i].Status = "regression"
-				result.Regressions++
-			} else if ratio > 1.1 {
-				result.Benchmarks[i].Status = "warning"
-			} else {
-				result.Benchmarks[i].Status = "ok"
-			}
-		} else {
-			result.Benchmarks[i].Status = "new"
-		}
-	}
-
-	if result.Regressions > 0 {
-		result.Status = "fail"
-	}
-
-	return result, nil
-}
-
-func runGoBenchmarks() ([]RuntimeBenchmark, error) {
-	root := FindBenchmarkRoot()
-	projectRoot := filepath.Join(root, "..", "..")
-
-	cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...")
-	cmd.Dir = projectRoot
-	output, err := cmd.CombinedOutput()
-	if err != nil {
-		return nil, fmt.Errorf("go test failed: %w\n%s", err, output)
-	}
-
-	return parseBenchOutput(string(output)), nil
-}
-
-func parseBenchOutput(output string) []RuntimeBenchmark {
-	var results []RuntimeBenchmark
-	lines := strings.Split(output, "\n")
-
-	for _, line := range lines {
-		if !strings.HasPrefix(line, "Benchmark") {
-			continue
-		}
-
-		fields := strings.Fields(line)
-		if len(fields) < 3 {
-			continue
-		}
-
-		name := strings.TrimSuffix(fields[0], "-8")
-		name = strings.TrimSuffix(name, "-10")
-		name = strings.TrimSuffix(name, "-12")
-		name = strings.TrimSuffix(name, "-16")
-
-		var nsOp float64
-		var bytesOp, allocsOp int
-
-		for i, f := range fields {
-			if f == "ns/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%f", &nsOp)
-			}
-			if f == "B/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%d", &bytesOp)
-			}
-			if f == "allocs/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%d", &allocsOp)
-			}
-		}
-
-		if nsOp > 0 {
-			results = append(results, RuntimeBenchmark{
-				Name:     name,
-				NsOp:     nsOp,
-				BytesOp:  bytesOp,
-				AllocsOp: allocsOp,
-			})
-		}
-	}
-
-	return results
-}
-
-func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error {
-	baseline := runtimeBaseline{
-		Timestamp:  time.Now().UTC().Format(time.RFC3339),
-		Benchmarks: benchmarks,
-	}
-	data, err := json.MarshalIndent(baseline, "", "  ")
-	if err != nil {
-		return err
-	}
-	return os.WriteFile(path, data, 0644)
-}
-
-func loadRuntimeBaseline(path string) (*runtimeBaseline, error) {
-	data, err := os.ReadFile(path)
-	if err != nil {
-		return nil, err
-	}
-	var baseline runtimeBaseline
-	if err := json.Unmarshal(data, &baseline); err != nil {
-		return nil, err
-	}
-	return &baseline, nil
-}
-
-func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) {
-	if result.Created {
-		fmt.Printf("\n  Created runtime baseline: %s\n", result.BaselinePath)
-		fmt.Printf("  Benchmarks: %d\n\n", len(result.Benchmarks))
-		return
-	}
-
-	fmt.Printf("\n  Runtime Baseline Check\n\n")
-
-	for _, b := range result.Benchmarks {
-		var status string
-		switch b.Status {
-		case "regression":
-			status = "\033[31mREGRESSION\033[0m"
-		case "warning":
-			status = "\033[33mWARNING\033[0m"
-		case "ok":
-			status = "\033[32mOK\033[0m"
-		case "new":
-			status = "\033[33mNEW\033[0m"
-		}
-
-		if b.BaselineNs > 0 {
-			fmt.Printf("  %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n",
-				status, b.Name, b.BaselineNs, b.NsOp, b.Ratio)
-		} else {
-			fmt.Printf("  %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp)
-		}
-	}
-
-	fmt.Println()
-	if result.Regressions > 0 {
-		fmt.Printf("  \033[31mRegressions: %d\033[0m\n\n", result.Regressions)
-	} else {
-		fmt.Printf("  \033[32mNo regressions\033[0m\n\n")
-	}
-}
diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go
new file mode 100644
index 0000000..2b0a3d5
--- /dev/null
+++ b/internal/benchmark/compare.go
@@ -0,0 +1,78 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+)
+
+func RunCompare(cfg CompareConfig) (*CompareResult, error) {
+	baseline, err := loadReport(cfg.BaselinePath)
+	if err != nil {
+		return nil, fmt.Errorf("load baseline: %w", err)
+	}
+	current, err := loadReport(cfg.CurrentPath)
+	if err != nil {
+		return nil, fmt.Errorf("load current: %w", err)
+	}
+
+	result := &CompareResult{
+		Status: "pass",
+		Delta: MetricsDelta{
+			PAt1:   current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+			MRR:    current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+			HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+		},
+	}
+
+	if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 {
+		result.Status = "fail"
+	}
+
+	baselineResults := make(map[string]QueryResult)
+	for _, r := range baseline.Results {
+		baselineResults[r.ID] = r
+	}
+	for _, r := range current.Results {
+		if base, ok := baselineResults[r.ID]; ok {
+			if base.Status == "hit" && r.Status != "hit" {
+				result.Regressions = append(result.Regressions, Regression{
+					ID:          r.ID,
+					Corpus:      r.Corpus,
+					Query:       r.Query,
+					BaselineRef: base.Actual.BestRef,
+					CurrentRef:  r.Actual.BestRef,
+					Reason:      fmt.Sprintf("%s -> %s", base.Status, r.Status),
+				})
+			}
+		}
+	}
+
+	return result, nil
+}
+
+func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", "  ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf("  \033[32m✓\033[0m No regression\n")
+	} else {
+		fmt.Printf("  \033[31m✗\033[0m Regression detected\n")
+	}
+	fmt.Printf("\n")
+	printDelta("P@1", result.Delta.PAt1)
+	printDelta("MRR", result.Delta.MRR)
+	printDelta("Hit@3", result.Delta.HitAt3)
+
+	if len(result.Regressions) > 0 {
+		fmt.Printf("\n  Regressions:\n")
+		for _, r := range result.Regressions {
+			fmt.Printf("    %s: %s (%s)\n", r.ID, r.Reason, r.Query)
+		}
+	}
+	fmt.Printf("\n")
+}
diff --git a/internal/benchmark/lint.go b/internal/benchmark/lint.go
new file mode 100644
index 0000000..20565ce
--- /dev/null
+++ b/internal/benchmark/lint.go
@@ -0,0 +1,68 @@
+package benchmark
+
+import "fmt"
+
+func RunLint(cfg LintConfig) (*LintResult, error) {
+	root := FindBenchmarkRoot()
+	result := &LintResult{}
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		result.Errors++
+		result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err))
+		return result, nil
+	}
+
+	ids := make(map[string]string)
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if existing, ok := ids[q.ID]; ok {
+				result.Errors++
+				result.Messages = append(result.Messages,
+					fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing))
+			} else {
+				ids[q.ID] = c.ID
+			}
+		}
+	}
+
+	for _, c := range ds.Corpora {
+		refs := make(map[string]bool)
+		for _, d := range c.Snapshot {
+			refs[d.Ref] = true
+		}
+		for _, q := range c.Queries {
+			for _, r := range q.RelevantRefs {
+				if !refs[r] {
+					result.Errors++
+					result.Messages = append(result.Messages,
+						fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r))
+				}
+			}
+		}
+	}
+
+	validDiff := map[string]bool{"easy": true, "medium": true, "hard": true}
+	for _, c := range ds.Corpora {
+		for _, q := range c.Queries {
+			if q.Difficulty != "" && !validDiff[q.Difficulty] {
+				result.Errors++
+				result.Messages = append(result.Messages,
+					fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID))
+			}
+		}
+	}
+
+	if result.Errors == 0 && result.Warnings == 0 {
+		result.Messages = append(result.Messages, "All checks passed")
+	}
+
+	return result, nil
+}
+
+func PrintLintResult(result *LintResult, cfg LintConfig) {
+	for _, msg := range result.Messages {
+		fmt.Println(msg)
+	}
+	fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings)
+}
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
new file mode 100644
index 0000000..8e28dcb
--- /dev/null
+++ b/internal/benchmark/runtime.go
@@ -0,0 +1,217 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+type RuntimeResult struct {
+	Status       string             `json:"status"`
+	Benchmarks   []RuntimeBenchmark `json:"benchmarks"`
+	Regressions  int                `json:"regressions"`
+	BaselinePath string             `json:"baseline_path"`
+	Created      bool               `json:"created"`
+}
+
+type RuntimeBenchmark struct {
+	Name       string  `json:"name"`
+	NsOp       float64 `json:"ns_op"`
+	BytesOp    int     `json:"bytes_op"`
+	AllocsOp   int     `json:"allocs_op"`
+	BaselineNs float64 `json:"baseline_ns,omitempty"`
+	Ratio      float64 `json:"ratio,omitempty"`
+	Status     string  `json:"status"`
+}
+
+type runtimeBaseline struct {
+	Timestamp  string             `json:"timestamp"`
+	Benchmarks []RuntimeBenchmark `json:"benchmarks"`
+}
+
+func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
+	root := FindBenchmarkRoot()
+	baselinePath := filepath.Join(root, "baselines", "runtime.json")
+
+	benchmarks, err := runGoBenchmarks()
+	if err != nil {
+		return nil, err
+	}
+
+	result := &RuntimeResult{
+		Status:       "pass",
+		Benchmarks:   benchmarks,
+		BaselinePath: baselinePath,
+	}
+
+	if _, err := os.Stat(baselinePath); os.IsNotExist(err) {
+		if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil {
+			return nil, err
+		}
+		result.Created = true
+		return result, nil
+	}
+
+	baseline, err := loadRuntimeBaseline(baselinePath)
+	if err != nil {
+		return nil, err
+	}
+
+	baselineMap := make(map[string]RuntimeBenchmark)
+	for _, b := range baseline.Benchmarks {
+		baselineMap[b.Name] = b
+	}
+
+	maxRatio := 1.25
+	for i, b := range result.Benchmarks {
+		if base, ok := baselineMap[b.Name]; ok {
+			ratio := b.NsOp / base.NsOp
+			result.Benchmarks[i].BaselineNs = base.NsOp
+			result.Benchmarks[i].Ratio = ratio
+
+			if ratio > maxRatio {
+				result.Benchmarks[i].Status = "regression"
+				result.Regressions++
+			} else if ratio > 1.1 {
+				result.Benchmarks[i].Status = "warning"
+			} else {
+				result.Benchmarks[i].Status = "ok"
+			}
+		} else {
+			result.Benchmarks[i].Status = "new"
+		}
+	}
+
+	if result.Regressions > 0 {
+		result.Status = "fail"
+	}
+
+	return result, nil
+}
+
+func runGoBenchmarks() ([]RuntimeBenchmark, error) {
+	root := FindBenchmarkRoot()
+	projectRoot := filepath.Join(root, "..", "..")
+
+	cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...")
+	cmd.Dir = projectRoot
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("go test failed: %w\n%s", err, output)
+	}
+
+	return parseBenchOutput(string(output)), nil
+}
+
+func parseBenchOutput(output string) []RuntimeBenchmark {
+	var results []RuntimeBenchmark
+	lines := strings.Split(output, "\n")
+
+	for _, line := range lines {
+		if !strings.HasPrefix(line, "Benchmark") {
+			continue
+		}
+
+		fields := strings.Fields(line)
+		if len(fields) < 3 {
+			continue
+		}
+
+		name := strings.TrimSuffix(fields[0], "-8")
+		name = strings.TrimSuffix(name, "-10")
+		name = strings.TrimSuffix(name, "-12")
+		name = strings.TrimSuffix(name, "-16")
+
+		var nsOp float64
+		var bytesOp, allocsOp int
+
+		for i, f := range fields {
+			if f == "ns/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%f", &nsOp)
+			}
+			if f == "B/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%d", &bytesOp)
+			}
+			if f == "allocs/op" && i > 0 {
+				fmt.Sscanf(fields[i-1], "%d", &allocsOp)
+			}
+		}
+
+		if nsOp > 0 {
+			results = append(results, RuntimeBenchmark{
+				Name:     name,
+				NsOp:     nsOp,
+				BytesOp:  bytesOp,
+				AllocsOp: allocsOp,
+			})
+		}
+	}
+
+	return results
+}
+
+func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error {
+	baseline := runtimeBaseline{
+		Timestamp:  time.Now().UTC().Format(time.RFC3339),
+		Benchmarks: benchmarks,
+	}
+	data, err := json.MarshalIndent(baseline, "", "  ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(path, data, 0644)
+}
+
+func loadRuntimeBaseline(path string) (*runtimeBaseline, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var baseline runtimeBaseline
+	if err := json.Unmarshal(data, &baseline); err != nil {
+		return nil, err
+	}
+	return &baseline, nil
+}
+
+func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) {
+	if result.Created {
+		fmt.Printf("\n  Created runtime baseline: %s\n", result.BaselinePath)
+		fmt.Printf("  Benchmarks: %d\n\n", len(result.Benchmarks))
+		return
+	}
+
+	fmt.Printf("\n  Runtime Baseline Check\n\n")
+
+	for _, b := range result.Benchmarks {
+		var status string
+		switch b.Status {
+		case "regression":
+			status = "\033[31mREGRESSION\033[0m"
+		case "warning":
+			status = "\033[33mWARNING\033[0m"
+		case "ok":
+			status = "\033[32mOK\033[0m"
+		case "new":
+			status = "\033[33mNEW\033[0m"
+		}
+
+		if b.BaselineNs > 0 {
+			fmt.Printf("  %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n",
+				status, b.Name, b.BaselineNs, b.NsOp, b.Ratio)
+		} else {
+			fmt.Printf("  %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp)
+		}
+	}
+
+	fmt.Println()
+	if result.Regressions > 0 {
+		fmt.Printf("  \033[31mRegressions: %d\033[0m\n\n", result.Regressions)
+	} else {
+		fmt.Printf("  \033[32mNo regressions\033[0m\n\n")
+	}
+}
diff --git a/internal/benchmark/tune.go b/internal/benchmark/tune.go
new file mode 100644
index 0000000..7db259b
--- /dev/null
+++ b/internal/benchmark/tune.go
@@ -0,0 +1,90 @@
+package benchmark
+
+import "fmt"
+
+type TuneResult struct {
+	Results []TuneRun `json:"results"`
+	Best    *TuneRun  `json:"best"`
+}
+
+type TuneRun struct {
+	LexicalWeight   float64 `json:"lexical_weight"`
+	EmbeddingWeight float64 `json:"embedding_weight"`
+	MRR             float64 `json:"mrr"`
+	PAt1            float64 `json:"p_at_1"`
+	HitAt3          float64 `json:"hit_at_3"`
+}
+
+func RunTune(cfg TuneConfig) (*TuneResult, error) {
+	root := FindBenchmarkRoot()
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	result := &TuneResult{}
+
+	if cfg.Verbose {
+		fmt.Printf("  %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3")
+	}
+
+	for w := 0.0; w <= 1.0001; w += cfg.Step {
+		lexW := w
+		embW := 1.0 - w
+
+		runCfg := RunConfig{
+			Suite:           "corpus",
+			Strategy:        "combined",
+			Threshold:       0.01,
+			TopK:            5,
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			Mode:            "library",
+		}
+
+		if cfg.Corpus != "" {
+			runCfg.Corpus = cfg.Corpus
+		}
+
+		report, err := RunCorpusBenchmark(ds, runCfg)
+		if err != nil {
+			return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err)
+		}
+
+		run := TuneRun{
+			LexicalWeight:   lexW,
+			EmbeddingWeight: embW,
+			MRR:             report.Metrics.Overall.MRR,
+			PAt1:            report.Metrics.Overall.PAt1,
+			HitAt3:          report.Metrics.Overall.HitAt3,
+		}
+		result.Results = append(result.Results, run)
+
+		if result.Best == nil || run.PAt1 > result.Best.PAt1 ||
+			(run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) {
+			best := run
+			result.Best = &best
+		}
+
+		if cfg.Verbose {
+			fmt.Printf("  %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n",
+				lexW, embW, run.MRR, run.PAt1, run.HitAt3)
+		}
+	}
+
+	return result, nil
+}
+
+func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
+	fmt.Printf("\n  Tested %d weight combinations\n\n", len(result.Results))
+
+	if result.Best != nil {
+		fmt.Printf("  Best weights:\n")
+		fmt.Printf("    Lexical:   %.2f\n", result.Best.LexicalWeight)
+		fmt.Printf("    Embedding: %.2f\n", result.Best.EmbeddingWeight)
+		fmt.Printf("    MRR:       %.4f\n", result.Best.MRR)
+		fmt.Printf("    P@1:       %.4f\n", result.Best.PAt1)
+		fmt.Printf("    Hit@3:     %.4f\n", result.Best.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/types.go b/internal/benchmark/types.go
new file mode 100644
index 0000000..916978a
--- /dev/null
+++ b/internal/benchmark/types.go
@@ -0,0 +1,67 @@
+package benchmark
+
+type CheckResult struct {
+	Status    string        `json:"status"`
+	Summary   CheckSummary  `json:"summary"`
+	Delta     *MetricsDelta `json:"delta,omitempty"`
+	TopRegs   []Regression  `json:"top_regressions,omitempty"`
+	Artifacts Artifacts     `json:"artifacts"`
+	Report    *Report       `json:"-"`
+}
+
+type CheckSummary struct {
+	PAt1        float64 `json:"p_at_1"`
+	MRR         float64 `json:"mrr"`
+	HitAt3      float64 `json:"hit_at_3"`
+	Total       int     `json:"total"`
+	Regressions int     `json:"regressions"`
+	Warnings    int     `json:"warnings"`
+}
+
+type MetricsDelta struct {
+	PAt1   float64 `json:"p_at_1"`
+	MRR    float64 `json:"mrr"`
+	HitAt3 float64 `json:"hit_at_3"`
+}
+
+type Regression struct {
+	ID           string   `json:"id"`
+	Corpus       string   `json:"corpus"`
+	Query        string   `json:"query"`
+	Expected     []string `json:"expected"`
+	BaselineRef  string   `json:"baseline_ref,omitempty"`
+	CurrentRef   string   `json:"current_ref"`
+	Reason       string   `json:"reason"`
+	DebugCommand string   `json:"debug_command"`
+}
+
+type Artifacts struct {
+	ReportJSON string `json:"report_json"`
+	SummaryMD  string `json:"summary_md"`
+}
+
+type CompareResult struct {
+	Status       string       `json:"status"`
+	Delta        MetricsDelta `json:"delta"`
+	Regressions  []Regression `json:"regressions"`
+	Improvements []string     `json:"improvements"`
+}
+
+type LintResult struct {
+	Errors   int      `json:"errors"`
+	Warnings int      `json:"warnings"`
+	Messages []string `json:"messages"`
+}
+
+type CatalogResult struct {
+	Corpora      []CorpusSummary `json:"corpora"`
+	TotalQueries int             `json:"total_queries"`
+	ByTag        map[string]int  `json:"by_tag,omitempty"`
+	ByDifficulty map[string]int  `json:"by_difficulty,omitempty"`
+}
+
+type CorpusSummary struct {
+	ID      string   `json:"id"`
+	Queries int      `json:"queries"`
+	Tags    []string `json:"tags"`
+}

From 4f166857d1cfb1818b07c7651b5c9fd1579622f9 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 18:50:28 +0100
Subject: [PATCH 12/14] fix: resolve golangci-lint errors in benchmark package

- Fix unchecked error returns (errcheck)
- Convert if-else chains to switch statements (gocritic)
- Use context.Background() instead of nil context (staticcheck)
- Replace WriteString(fmt.Sprintf) with fmt.Fprintf (staticcheck)
---
 dev                             | 14 +++++++--
 internal/benchmark/baseline.go  |  2 +-
 internal/benchmark/calibrate.go | 28 +++++++++---------
 internal/benchmark/check.go     | 28 +++++++++---------
 internal/benchmark/config.go    | 50 ++++++++++++++++-----------------
 internal/benchmark/dataset.go   |  8 +++---
 internal/benchmark/runner.go    | 50 +++++++++++++++++----------------
 internal/benchmark/runtime.go   | 13 +++++----
 recovery/benchmark_test.go      |  2 +-
 9 files changed, 103 insertions(+), 92 deletions(-)

diff --git a/dev b/dev
index 5d8c88d..11d53d9 100755
--- a/dev
+++ b/dev
@@ -128,9 +128,19 @@ run_check() {
   if [ -n "$unformatted" ]; then
     echo "  ${ERROR}✗${NC} Unformatted files:"
     echo "$unformatted"
-    exit 1
+    echo ""
+    printf "  Fix formatting now? (Y/n) "
+    read -r answer
+    if [ "$answer" != "n" ] && [ "$answer" != "N" ]; then
+      gofmt -w .
+      echo "  ${SUCCESS}✓${NC} Format (fixed)"
+    else
+      echo "  ${MUTED}Run: gofmt -w .${NC}"
+      exit 1
+    fi
+  else
+    echo "  ${SUCCESS}✓${NC} Format"
   fi
-  echo "  ${SUCCESS}✓${NC} Format"
 
   echo "  ${MUTED}2/4 Vet${NC}"
   go vet ./...
diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go
index de2a371..07cc418 100644
--- a/internal/benchmark/baseline.go
+++ b/internal/benchmark/baseline.go
@@ -82,7 +82,7 @@ func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*Baseline
 			previous = &old.Metrics.Overall
 		}
 		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
-		os.WriteFile(backupPath, data, 0644)
+		_ = os.WriteFile(backupPath, data, 0644)
 	}
 
 	result, err := createBaseline(root, baselinePath, cfg)
diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go
index 9c9fa33..48ec06e 100644
--- a/internal/benchmark/calibrate.go
+++ b/internal/benchmark/calibrate.go
@@ -1,6 +1,7 @@
 package benchmark
 
 import (
+	"context"
 	"fmt"
 
 	"github.com/pinchtab/semantic"
@@ -77,7 +78,7 @@ func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
 		tp, fp, fn, tn := 0, 0, 0, 0
 
 		for _, tc := range cases {
-			findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
+			findResult, _ := matcher.Find(context.Background(), tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{
 				Threshold: threshold,
 				TopK:      5,
 			})
@@ -88,20 +89,17 @@ func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) {
 				topRef = findResult.Matches[0].Ref
 			}
 
-			if tc.query.ExpectNoMatch {
-				if hasMatch {
-					fp++
-				} else {
-					tn++
-				}
-			} else if len(tc.query.RelevantRefs) > 0 {
-				if !hasMatch {
-					fn++
-				} else if contains(tc.query.RelevantRefs, topRef) {
-					tp++
-				} else {
-					fp++
-				}
+			switch {
+			case tc.query.ExpectNoMatch && hasMatch:
+				fp++
+			case tc.query.ExpectNoMatch && !hasMatch:
+				tn++
+			case len(tc.query.RelevantRefs) > 0 && !hasMatch:
+				fn++
+			case len(tc.query.RelevantRefs) > 0 && contains(tc.query.RelevantRefs, topRef):
+				tp++
+			case len(tc.query.RelevantRefs) > 0:
+				fp++
 			}
 		}
 
diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
index 81171bb..e2ceedc 100644
--- a/internal/benchmark/check.go
+++ b/internal/benchmark/check.go
@@ -89,16 +89,16 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 		}
 	}
 
-	os.MkdirAll(cfg.OutputDir, 0755)
+	_ = os.MkdirAll(cfg.OutputDir, 0755)
 	ts := time.Now().Format("20060102_150405")
 	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
 	summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts))
 
 	reportJSON, _ := json.MarshalIndent(report, "", "  ")
-	os.WriteFile(reportPath, reportJSON, 0644)
+	_ = os.WriteFile(reportPath, reportJSON, 0644)
 
 	summaryMD := generateSummaryMD(report, result)
-	os.WriteFile(summaryPath, []byte(summaryMD), 0644)
+	_ = os.WriteFile(summaryPath, []byte(summaryMD), 0644)
 
 	result.Artifacts.ReportJSON = reportPath
 	result.Artifacts.SummaryMD = summaryPath
@@ -131,24 +131,24 @@ func generateSummaryMD(report *Report, result *CheckResult) string {
 	var sb strings.Builder
 
 	sb.WriteString("# Benchmark Summary\n\n")
-	sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp))
+	fmt.Fprintf(&sb, "Generated: %s\n\n", report.Run.Timestamp)
 
 	sb.WriteString("## Overall Metrics\n\n")
 	sb.WriteString("| Metric | Value |\n")
 	sb.WriteString("|--------|-------|\n")
-	sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total))
-	sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR))
-	sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1))
-	sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3))
-	sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin))
+	fmt.Fprintf(&sb, "| Total | %d |\n", report.Metrics.Overall.Total)
+	fmt.Fprintf(&sb, "| MRR | %.4f |\n", report.Metrics.Overall.MRR)
+	fmt.Fprintf(&sb, "| P@1 | %.4f |\n", report.Metrics.Overall.PAt1)
+	fmt.Fprintf(&sb, "| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3)
+	fmt.Fprintf(&sb, "| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin)
 
 	if result.Delta != nil {
 		sb.WriteString("\n## Delta from Baseline\n\n")
 		sb.WriteString("| Metric | Delta |\n")
 		sb.WriteString("|--------|-------|\n")
-		sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1))
-		sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR))
-		sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3))
+		fmt.Fprintf(&sb, "| P@1 | %+.4f |\n", result.Delta.PAt1)
+		fmt.Fprintf(&sb, "| MRR | %+.4f |\n", result.Delta.MRR)
+		fmt.Fprintf(&sb, "| Hit@3 | %+.4f |\n", result.Delta.HitAt3)
 	}
 
 	if len(result.TopRegs) > 0 {
@@ -159,8 +159,8 @@ func generateSummaryMD(report *Report, result *CheckResult) string {
 			if len(result.TopRegs) > 10 {
 				break
 			}
-			sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n",
-				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")))
+			fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n",
+				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))
 		}
 	}
 
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index 83e3f5c..e41fe1c 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -19,16 +19,16 @@ type DefaultsConfig struct {
 }
 
 type Profile struct {
-	Strategy   string   `json:"strategy"`
-	Threshold  float64  `json:"threshold"`
-	TopK       int      `json:"top_k"`
-	Weights    Weights  `json:"weights"`
-	Suites     []string `json:"suites"`
-	Mode       string   `json:"mode"`
-	Inherits   string   `json:"inherits"`
-	Verbose    bool     `json:"verbose"`
-	Explain    bool     `json:"explain"`
-	FailOnReg  bool     `json:"fail_on_regression"`
+	Strategy  string   `json:"strategy"`
+	Threshold float64  `json:"threshold"`
+	TopK      int      `json:"top_k"`
+	Weights   Weights  `json:"weights"`
+	Suites    []string `json:"suites"`
+	Mode      string   `json:"mode"`
+	Inherits  string   `json:"inherits"`
+	Verbose   bool     `json:"verbose"`
+	Explain   bool     `json:"explain"`
+	FailOnReg bool     `json:"fail_on_regression"`
 }
 
 type Weights struct {
@@ -42,16 +42,16 @@ type BaselineConfig struct {
 }
 
 type BaselineQuality struct {
-	MaxOverallPAt1Drop  float64 `json:"max_overall_p_at_1_drop"`
-	MaxOverallMRRDrop   float64 `json:"max_overall_mrr_drop"`
+	MaxOverallPAt1Drop   float64 `json:"max_overall_p_at_1_drop"`
+	MaxOverallMRRDrop    float64 `json:"max_overall_mrr_drop"`
 	MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"`
-	MaxCorpusPAt1Drop   float64 `json:"max_corpus_p_at_1_drop"`
-	MaxTagPAt1Drop      float64 `json:"max_tag_p_at_1_drop"`
+	MaxCorpusPAt1Drop    float64 `json:"max_corpus_p_at_1_drop"`
+	MaxTagPAt1Drop       float64 `json:"max_tag_p_at_1_drop"`
 }
 
 type BaselineRuntime struct {
-	MaxNsOpRegressionRatio   float64 `json:"max_ns_op_regression_ratio"`
-	MaxAllocRegressionRatio  float64 `json:"max_alloc_regression_ratio"`
+	MaxNsOpRegressionRatio  float64 `json:"max_ns_op_regression_ratio"`
+	MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"`
 }
 
 type CheckConfig struct {
@@ -200,7 +200,7 @@ func ParseCheckFlags(args []string) CheckConfig {
 	fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details")
 	fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -231,7 +231,7 @@ func ParseRunFlags(args []string) RunConfig {
 	fs.BoolVar(&cfg.Explain, "explain", false, "include explanations")
 	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
 	fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -244,7 +244,7 @@ func ParseCompareFlags(args []string) CompareConfig {
 	fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)")
 	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -255,7 +255,7 @@ func ParseLintFlags(args []string) LintConfig {
 	}
 	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -266,7 +266,7 @@ func ParseCatalogFlags(args []string) CatalogConfig {
 	}
 	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)")
 	fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -279,7 +279,7 @@ func ParseBaselineFlags(args []string) BaselineCmdConfig {
 	fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name")
 	fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 
 	if len(fs.Args()) > 0 {
 		cfg.Action = fs.Args()[0]
@@ -294,7 +294,7 @@ func ParseCalibrateFlags(args []string) CalibrateConfig {
 	}
 	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -306,7 +306,7 @@ func ParseTuneFlags(args []string) TuneConfig {
 	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against")
 	fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
 
@@ -315,6 +315,6 @@ func ParseRuntimeFlags(args []string) RuntimeConfig {
 	cfg := RuntimeConfig{}
 	fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
-	fs.Parse(args)
+	_ = fs.Parse(args)
 	return cfg
 }
diff --git a/internal/benchmark/dataset.go b/internal/benchmark/dataset.go
index 555b503..86c5014 100644
--- a/internal/benchmark/dataset.go
+++ b/internal/benchmark/dataset.go
@@ -25,10 +25,10 @@ type Query struct {
 }
 
 type Corpus struct {
-	ID        string
-	Path      string
-	Snapshot  []semantic.ElementDescriptor
-	Queries   []Query
+	ID       string
+	Path     string
+	Snapshot []semantic.ElementDescriptor
+	Queries  []Query
 }
 
 type Dataset struct {
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
index 391cc0a..f5b3a7d 100644
--- a/internal/benchmark/runner.go
+++ b/internal/benchmark/runner.go
@@ -8,14 +8,14 @@ import (
 )
 
 type QueryResult struct {
-	ID       string   `json:"id"`
-	Corpus   string   `json:"corpus"`
-	Query    string   `json:"query"`
-	Difficulty string `json:"difficulty"`
-	Tags     []string `json:"tags"`
-	Intent   string   `json:"intent,omitempty"`
-	PageType string   `json:"page_type,omitempty"`
-	Expected struct {
+	ID         string   `json:"id"`
+	Corpus     string   `json:"corpus"`
+	Query      string   `json:"query"`
+	Difficulty string   `json:"difficulty"`
+	Tags       []string `json:"tags"`
+	Intent     string   `json:"intent,omitempty"`
+	PageType   string   `json:"page_type,omitempty"`
+	Expected   struct {
 		RelevantRefs          []string `json:"relevant_refs"`
 		PartiallyRelevantRefs []string `json:"partially_relevant_refs"`
 	} `json:"expected"`
@@ -36,7 +36,7 @@ type QueryResult struct {
 		Margin            float64 `json:"margin"`
 	} `json:"metrics"`
 	Latency struct {
-		LibraryMs int64 `json:"library_ms"`
+		LibraryMs int64  `json:"library_ms"`
 		CLIMs     *int64 `json:"cli_ms,omitempty"`
 	} `json:"latency"`
 	Status string `json:"status"`
@@ -60,10 +60,10 @@ type Report struct {
 		Command   string `json:"command"`
 	} `json:"run"`
 	Dataset struct {
-		Name         string `json:"name"`
-		Version      string `json:"version,omitempty"`
-		QueryCount   int    `json:"query_count"`
-		CorpusCount  int    `json:"corpus_count"`
+		Name        string `json:"name"`
+		Version     string `json:"version,omitempty"`
+		QueryCount  int    `json:"query_count"`
+		CorpusCount int    `json:"corpus_count"`
 	} `json:"dataset"`
 	Config struct {
 		Profile   string  `json:"profile"`
@@ -74,11 +74,11 @@ type Report struct {
 	} `json:"config"`
 	Status  string `json:"status"`
 	Metrics struct {
-		Overall    OverallMetrics           `json:"overall"`
-		Latency    LatencyMetrics           `json:"latency"`
-		ByCorpus   map[string]CorpusMetrics `json:"by_corpus"`
+		Overall      OverallMetrics           `json:"overall"`
+		Latency      LatencyMetrics           `json:"latency"`
+		ByCorpus     map[string]CorpusMetrics `json:"by_corpus"`
 		ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"`
-		ByTag      map[string]CorpusMetrics `json:"by_tag"`
+		ByTag        map[string]CorpusMetrics `json:"by_tag"`
 	} `json:"metrics"`
 	Results []QueryResult `json:"results"`
 }
@@ -243,7 +243,8 @@ func computeQueryMetrics(result *QueryResult, query Query) {
 		if i >= 5 {
 			break
 		}
-		if relevantSet[m.Ref] {
+		switch {
+		case relevantSet[m.Ref]:
 			if result.Metrics.BestRelevantRank == nil {
 				rank := i + 1
 				result.Metrics.BestRelevantRank = &rank
@@ -256,11 +257,11 @@ func computeQueryMetrics(result *QueryResult, query Query) {
 				result.Metrics.HitAt3 = 1
 			}
 			result.Metrics.HitAt5 = 1
-		} else if partialSet[m.Ref] {
+		case partialSet[m.Ref]:
 			if i < 3 {
 				partialInTop3++
 			}
-		} else {
+		default:
 			if m.Score > result.Metrics.BestWrongScore {
 				result.Metrics.BestWrongScore = m.Score
 			}
@@ -270,17 +271,18 @@ func computeQueryMetrics(result *QueryResult, query Query) {
 	result.Metrics.Margin = result.Metrics.BestRelevantScore - result.Metrics.BestWrongScore
 
 	// Status
-	if query.ExpectNoMatch {
+	switch {
+	case query.ExpectNoMatch:
 		if len(result.Actual.Matches) == 0 {
 			result.Status = "no_match_expected"
 		} else {
 			result.Status = "unexpected_match"
 		}
-	} else if result.Metrics.PAt1 >= 1.0 {
+	case result.Metrics.PAt1 >= 1.0:
 		result.Status = "hit"
-	} else if result.Metrics.PAt1 >= 0.5 {
+	case result.Metrics.PAt1 >= 0.5:
 		result.Status = "partial"
-	} else {
+	default:
 		result.Status = "miss"
 	}
 }
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
index 8e28dcb..e7622f1 100644
--- a/internal/benchmark/runtime.go
+++ b/internal/benchmark/runtime.go
@@ -73,12 +73,13 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
 			result.Benchmarks[i].BaselineNs = base.NsOp
 			result.Benchmarks[i].Ratio = ratio
 
-			if ratio > maxRatio {
+			switch {
+			case ratio > maxRatio:
 				result.Benchmarks[i].Status = "regression"
 				result.Regressions++
-			} else if ratio > 1.1 {
+			case ratio > 1.1:
 				result.Benchmarks[i].Status = "warning"
-			} else {
+			default:
 				result.Benchmarks[i].Status = "ok"
 			}
 		} else {
@@ -131,13 +132,13 @@ func parseBenchOutput(output string) []RuntimeBenchmark {
 
 		for i, f := range fields {
 			if f == "ns/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%f", &nsOp)
+				_, _ = fmt.Sscanf(fields[i-1], "%f", &nsOp)
 			}
 			if f == "B/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%d", &bytesOp)
+				_, _ = fmt.Sscanf(fields[i-1], "%d", &bytesOp)
 			}
 			if f == "allocs/op" && i > 0 {
-				fmt.Sscanf(fields[i-1], "%d", &allocsOp)
+				_, _ = fmt.Sscanf(fields[i-1], "%d", &allocsOp)
 			}
 		}
 
diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go
index 9670a68..1261dd6 100644
--- a/recovery/benchmark_test.go
+++ b/recovery/benchmark_test.go
@@ -237,7 +237,7 @@ func runBenchmarkScenarioB(b *testing.B, matcher semantic.ElementMatcher, sc Ben
 
 	err := fmt.Errorf("could not find node with id %s", sc.OriginalRef)
 
-	re.AttemptWithClassification(
+	_, _, _ = re.AttemptWithClassification(
 		context.Background(),
 		"test-tab",
 		sc.OriginalRef,

From b37cd438aa18e035fbb24754cfbb9125b8ca224a Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 22:52:43 +0100
Subject: [PATCH 13/14] feat: config-driven thresholds, validation, and
 deterministic output

---
 internal/benchmark/catalog.go     |  19 ++-
 internal/benchmark/check.go       |  45 +++++-
 internal/benchmark/compare.go     |  11 ++
 internal/benchmark/config.go      | 239 ++++++++++++++++++++++++++++--
 internal/benchmark/config_test.go | 147 ++++++++++++++++++
 internal/benchmark/runner.go      |  84 ++++++++++-
 internal/benchmark/runtime.go     |  42 +++++-
 internal/engine/benchmark_test.go | 119 ++++++++++++++-
 8 files changed, 673 insertions(+), 33 deletions(-)
 create mode 100644 internal/benchmark/config_test.go

diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go
index b4c4ec1..69a3091 100644
--- a/internal/benchmark/catalog.go
+++ b/internal/benchmark/catalog.go
@@ -62,14 +62,25 @@ func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) {
 	switch cfg.By {
 	case "difficulty":
 		fmt.Printf("\n  By Difficulty:\n")
-		for d, n := range result.ByDifficulty {
-			fmt.Printf("    %-10s %4d\n", d, n)
+		diffs := sortedKeys(result.ByDifficulty)
+		for _, d := range diffs {
+			fmt.Printf("    %-10s %4d\n", d, result.ByDifficulty[d])
 		}
 	case "tag":
 		fmt.Printf("\n  By Tag:\n")
-		for t, n := range result.ByTag {
-			fmt.Printf("    %-20s %4d\n", t, n)
+		tags := sortedKeys(result.ByTag)
+		for _, t := range tags {
+			fmt.Printf("    %-20s %4d\n", t, result.ByTag[t])
 		}
 	}
 	fmt.Printf("\n")
 }
+
+func sortedKeys(m map[string]int) []string {
+	keys := make([]string, 0, len(m))
+	for k := range m {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	return keys
+}
diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
index e2ceedc..0528059 100644
--- a/internal/benchmark/check.go
+++ b/internal/benchmark/check.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"sort"
 	"strings"
 	"time"
 )
@@ -40,6 +41,7 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 		Verbose:         cfg.Verbose,
 		Explain:         cfg.Explain,
 		OutputDir:       cfg.OutputDir,
+		Quick:           cfg.Quick,
 	}
 
 	report, err := RunCorpusBenchmark(ds, runCfg)
@@ -71,10 +73,28 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 	}
 	result.Summary.Regressions = len(result.TopRegs)
 
+	// Determine baseline path from config
 	baselinePath := cfg.BaselinePath
 	if baselinePath == "" {
-		baselinePath = filepath.Join(root, "baselines", "combined.json")
+		if benchCfg != nil {
+			baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json")
+		} else {
+			baselinePath = filepath.Join(root, "baselines", "combined.json")
+		}
+	}
+
+	// Get quality thresholds from config
+	var thresholds BaselineQuality
+	if benchCfg != nil {
+		thresholds = benchCfg.QualityThresholds()
+	} else {
+		thresholds = BaselineQuality{
+			MaxOverallPAt1Drop:   0.02,
+			MaxOverallMRRDrop:    0.02,
+			MaxOverallHitAt3Drop: 0.02,
+		}
 	}
+
 	if _, err := os.Stat(baselinePath); err == nil {
 		baseline, err := loadReport(baselinePath)
 		if err == nil {
@@ -83,12 +103,24 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 				MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
 				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
 			}
-			if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) {
-				result.Status = "fail"
+			if cfg.FailOnReg {
+				if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop ||
+					result.Delta.MRR < -thresholds.MaxOverallMRRDrop ||
+					result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop {
+					result.Status = "fail"
+				}
 			}
 		}
 	}
 
+	// Sort regressions for deterministic output
+	sort.Slice(result.TopRegs, func(i, j int) bool {
+		if result.TopRegs[i].Corpus != result.TopRegs[j].Corpus {
+			return result.TopRegs[i].Corpus < result.TopRegs[j].Corpus
+		}
+		return result.TopRegs[i].ID < result.TopRegs[j].ID
+	})
+
 	_ = os.MkdirAll(cfg.OutputDir, 0755)
 	ts := time.Now().Format("20060102_150405")
 	reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts))
@@ -155,13 +187,16 @@ func generateSummaryMD(report *Report, result *CheckResult) string {
 		sb.WriteString("\n## Misses\n\n")
 		sb.WriteString("| ID | Corpus | Query | Got | Expected |\n")
 		sb.WriteString("|----|--------|-------|-----|----------|\n")
-		for _, r := range result.TopRegs {
-			if len(result.TopRegs) > 10 {
+		for i, r := range result.TopRegs {
+			if i >= 10 {
 				break
 			}
 			fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n",
 				r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))
 		}
+		if len(result.TopRegs) > 10 {
+			fmt.Fprintf(&sb, "\n*Showing 10 of %d misses.*\n", len(result.TopRegs))
+		}
 	}
 
 	return sb.String()
diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go
index 2b0a3d5..f0e6ccf 100644
--- a/internal/benchmark/compare.go
+++ b/internal/benchmark/compare.go
@@ -3,6 +3,7 @@ package benchmark
 import (
 	"encoding/json"
 	"fmt"
+	"sort"
 )
 
 func RunCompare(cfg CompareConfig) (*CompareResult, error) {
@@ -70,9 +71,19 @@ func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
 
 	if len(result.Regressions) > 0 {
 		fmt.Printf("\n  Regressions:\n")
+		sortRegressions(result.Regressions)
 		for _, r := range result.Regressions {
 			fmt.Printf("    %s: %s (%s)\n", r.ID, r.Reason, r.Query)
 		}
 	}
 	fmt.Printf("\n")
 }
+
+func sortRegressions(regs []Regression) {
+	sort.Slice(regs, func(i, j int) bool {
+		if regs[i].Corpus != regs[j].Corpus {
+			return regs[i].Corpus < regs[j].Corpus
+		}
+		return regs[i].ID < regs[j].ID
+	})
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index e41fe1c..cd0bbec 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -2,20 +2,35 @@ package benchmark
 
 import (
 	"encoding/json"
+	"errors"
 	"flag"
+	"fmt"
 	"os"
 	"path/filepath"
 )
 
 type Config struct {
-	Version  string             `json:"version"`
-	Defaults DefaultsConfig     `json:"defaults"`
-	Profiles map[string]Profile `json:"profiles"`
-	Baseline BaselineConfig     `json:"baseline"`
+	Version      string             `json:"version"`
+	Defaults     DefaultsConfig     `json:"defaults"`
+	Profiles     map[string]Profile `json:"profiles"`
+	Baseline     BaselineConfig     `json:"baseline"`
+	Results      ResultsConfig      `json:"results"`
+	Strategies   []string           `json:"strategies"`
+	SnapshotsDir string             `json:"snapshots_dir"`
 }
 
 type DefaultsConfig struct {
-	Profile string `json:"profile"`
+	Profile   string  `json:"profile"`
+	Strategy  string  `json:"strategy"`
+	Threshold float64 `json:"threshold"`
+	TopK      int     `json:"top_k"`
+	Weights   Weights `json:"weights"`
+}
+
+type ResultsConfig struct {
+	Dir                  string `json:"dir"`
+	BaselinesDir         string `json:"baselines_dir"`
+	GeneratedFilesPolicy string `json:"generated_files_policy"`
 }
 
 type Profile struct {
@@ -42,16 +57,20 @@ type BaselineConfig struct {
 }
 
 type BaselineQuality struct {
-	MaxOverallPAt1Drop   float64 `json:"max_overall_p_at_1_drop"`
-	MaxOverallMRRDrop    float64 `json:"max_overall_mrr_drop"`
-	MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"`
-	MaxCorpusPAt1Drop    float64 `json:"max_corpus_p_at_1_drop"`
-	MaxTagPAt1Drop       float64 `json:"max_tag_p_at_1_drop"`
+	MaxOverallPAt1Drop    float64 `json:"max_overall_p_at_1_drop"`
+	MaxOverallMRRDrop     float64 `json:"max_overall_mrr_drop"`
+	MaxOverallHitAt3Drop  float64 `json:"max_overall_hit_at_3_drop"`
+	MaxCorpusPAt1Drop     float64 `json:"max_corpus_p_at_1_drop"`
+	MaxDifficultyPAt1Drop float64 `json:"max_difficulty_p_at_1_drop"`
+	MaxTagPAt1Drop        float64 `json:"max_tag_p_at_1_drop"`
+	MaxMarginDropReport   float64 `json:"max_margin_drop_report"`
 }
 
 type BaselineRuntime struct {
 	MaxNsOpRegressionRatio  float64 `json:"max_ns_op_regression_ratio"`
 	MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"`
+	MaxCorpusLatencyP50MS   int     `json:"max_corpus_latency_p50_ms"`
+	MaxCorpusLatencyP95MS   int     `json:"max_corpus_latency_p95_ms"`
 }
 
 type CheckConfig struct {
@@ -80,6 +99,7 @@ type RunConfig struct {
 	Explain         bool
 	OutputDir       string
 	ReportName      string
+	Quick           bool
 }
 
 type CompareConfig struct {
@@ -152,11 +172,28 @@ func LoadConfig(benchmarkRoot string) (*Config, error) {
 func ResolveProfile(cfg *Config, name string) Profile {
 	p, ok := cfg.Profiles[name]
 	if !ok {
+		// Use defaults from config, falling back to hardcoded values
+		strategy := cfg.Defaults.Strategy
+		if strategy == "" {
+			strategy = "combined"
+		}
+		threshold := cfg.Defaults.Threshold
+		if threshold == 0 {
+			threshold = 0.01
+		}
+		topK := cfg.Defaults.TopK
+		if topK == 0 {
+			topK = 5
+		}
+		weights := cfg.Defaults.Weights
+		if weights.Lexical == 0 && weights.Embedding == 0 {
+			weights = Weights{Lexical: 0.6, Embedding: 0.4}
+		}
 		return Profile{
-			Strategy:  "combined",
-			Threshold: 0.01,
-			TopK:      5,
-			Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
+			Strategy:  strategy,
+			Threshold: threshold,
+			TopK:      topK,
+			Weights:   weights,
 			Suites:    []string{"corpus"},
 			Mode:      "library",
 		}
@@ -185,6 +222,180 @@ func ResolveProfile(cfg *Config, name string) Profile {
 	return p
 }
 
+// projectRoot returns the project root (parent of tests/benchmark).
+func projectRoot(benchmarkRoot string) string {
+	return filepath.Dir(filepath.Dir(benchmarkRoot))
+}
+
+// ResultsDir returns the configured results directory.
+func (c *Config) ResultsDir(benchmarkRoot string) string {
+	if c.Results.Dir != "" {
+		if filepath.IsAbs(c.Results.Dir) {
+			return c.Results.Dir
+		}
+		return filepath.Join(projectRoot(benchmarkRoot), c.Results.Dir)
+	}
+	return filepath.Join(benchmarkRoot, "results")
+}
+
+// BaselinesDir returns the configured baselines directory.
+func (c *Config) BaselinesDir(benchmarkRoot string) string {
+	if c.Results.BaselinesDir != "" {
+		if filepath.IsAbs(c.Results.BaselinesDir) {
+			return c.Results.BaselinesDir
+		}
+		return filepath.Join(projectRoot(benchmarkRoot), c.Results.BaselinesDir)
+	}
+	return filepath.Join(benchmarkRoot, "baselines")
+}
+
+// QualityThresholds returns quality thresholds with fallback defaults.
+func (c *Config) QualityThresholds() BaselineQuality {
+	q := c.Baseline.Quality
+	if q.MaxOverallPAt1Drop == 0 {
+		q.MaxOverallPAt1Drop = 0.02
+	}
+	if q.MaxOverallMRRDrop == 0 {
+		q.MaxOverallMRRDrop = 0.02
+	}
+	if q.MaxOverallHitAt3Drop == 0 {
+		q.MaxOverallHitAt3Drop = 0.02
+	}
+	if q.MaxCorpusPAt1Drop == 0 {
+		q.MaxCorpusPAt1Drop = 0.08
+	}
+	if q.MaxDifficultyPAt1Drop == 0 {
+		q.MaxDifficultyPAt1Drop = 0.08
+	}
+	if q.MaxTagPAt1Drop == 0 {
+		q.MaxTagPAt1Drop = 0.08
+	}
+	if q.MaxMarginDropReport == 0 {
+		q.MaxMarginDropReport = 0.15
+	}
+	return q
+}
+
+// RuntimeThresholds returns runtime thresholds with fallback defaults.
+func (c *Config) RuntimeThresholds() BaselineRuntime {
+	r := c.Baseline.Runtime
+	if r.MaxNsOpRegressionRatio == 0 {
+		r.MaxNsOpRegressionRatio = 1.25
+	}
+	if r.MaxAllocRegressionRatio == 0 {
+		r.MaxAllocRegressionRatio = 1.25
+	}
+	return r
+}
+
+// ValidateConfig checks the config for errors and returns a descriptive error if invalid.
+func ValidateConfig(cfg *Config) error {
+	var errs []error
+
+	// Validate strategies
+	if len(cfg.Strategies) == 0 {
+		errs = append(errs, errors.New("strategies list is empty"))
+	} else {
+		validStrategies := make(map[string]bool)
+		for _, s := range cfg.Strategies {
+			validStrategies[s] = true
+		}
+		// Check default strategy is in list
+		if cfg.Defaults.Strategy != "" && !validStrategies[cfg.Defaults.Strategy] {
+			errs = append(errs, fmt.Errorf("default strategy %q not in strategies list", cfg.Defaults.Strategy))
+		}
+		// Check profile strategies
+		for name, p := range cfg.Profiles {
+			if p.Strategy != "" && !validStrategies[p.Strategy] {
+				errs = append(errs, fmt.Errorf("profile %q uses strategy %q not in strategies list", name, p.Strategy))
+			}
+		}
+	}
+
+	// Validate weights
+	if cfg.Defaults.Weights.Lexical < 0 {
+		errs = append(errs, errors.New("defaults.weights.lexical must be non-negative"))
+	}
+	if cfg.Defaults.Weights.Embedding < 0 {
+		errs = append(errs, errors.New("defaults.weights.embedding must be non-negative"))
+	}
+	if cfg.Defaults.Weights.Lexical == 0 && cfg.Defaults.Weights.Embedding == 0 {
+		errs = append(errs, errors.New("defaults.weights: lexical and embedding cannot both be zero"))
+	}
+
+	// Validate profile weights
+	for name, p := range cfg.Profiles {
+		if p.Weights.Lexical < 0 {
+			errs = append(errs, fmt.Errorf("profile %q: weights.lexical must be non-negative", name))
+		}
+		if p.Weights.Embedding < 0 {
+			errs = append(errs, fmt.Errorf("profile %q: weights.embedding must be non-negative", name))
+		}
+	}
+
+	// Validate quality thresholds (should be positive when set)
+	q := cfg.Baseline.Quality
+	if q.MaxOverallPAt1Drop < 0 {
+		errs = append(errs, errors.New("baseline.quality.max_overall_p_at_1_drop must be non-negative"))
+	}
+	if q.MaxOverallMRRDrop < 0 {
+		errs = append(errs, errors.New("baseline.quality.max_overall_mrr_drop must be non-negative"))
+	}
+	if q.MaxOverallHitAt3Drop < 0 {
+		errs = append(errs, errors.New("baseline.quality.max_overall_hit_at_3_drop must be non-negative"))
+	}
+
+	// Validate runtime thresholds (must be >= 1)
+	r := cfg.Baseline.Runtime
+	if r.MaxNsOpRegressionRatio != 0 && r.MaxNsOpRegressionRatio < 1 {
+		errs = append(errs, errors.New("baseline.runtime.max_ns_op_regression_ratio must be >= 1"))
+	}
+	if r.MaxAllocRegressionRatio != 0 && r.MaxAllocRegressionRatio < 1 {
+		errs = append(errs, errors.New("baseline.runtime.max_alloc_regression_ratio must be >= 1"))
+	}
+
+	// Validate profile inheritance
+	if err := validateProfileInheritance(cfg); err != nil {
+		errs = append(errs, err)
+	}
+
+	if len(errs) == 0 {
+		return nil
+	}
+	if len(errs) == 1 {
+		return errs[0]
+	}
+	return fmt.Errorf("config has %d errors: %v", len(errs), errs)
+}
+
+// validateProfileInheritance checks for missing references and cycles.
+func validateProfileInheritance(cfg *Config) error {
+	for name, p := range cfg.Profiles {
+		if p.Inherits == "" {
+			continue
+		}
+		// Check reference exists
+		if _, ok := cfg.Profiles[p.Inherits]; !ok {
+			return fmt.Errorf("profile %q inherits from non-existent profile %q", name, p.Inherits)
+		}
+		// Check for cycles
+		visited := map[string]bool{name: true}
+		current := p.Inherits
+		for current != "" {
+			if visited[current] {
+				return fmt.Errorf("profile inheritance cycle detected: %q -> %q", name, current)
+			}
+			visited[current] = true
+			if parent, ok := cfg.Profiles[current]; ok {
+				current = parent.Inherits
+			} else {
+				break
+			}
+		}
+	}
+	return nil
+}
+
 func ParseCheckFlags(args []string) CheckConfig {
 	fs := flag.NewFlagSet("check", flag.ExitOnError)
 	cfg := CheckConfig{
diff --git a/internal/benchmark/config_test.go b/internal/benchmark/config_test.go
new file mode 100644
index 0000000..2590556
--- /dev/null
+++ b/internal/benchmark/config_test.go
@@ -0,0 +1,147 @@
+package benchmark
+
+import "testing"
+
+func TestValidateConfig_Valid(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"lexical", "embedding", "combined"},
+		Defaults: DefaultsConfig{
+			Strategy: "combined",
+			Weights:  Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Quality: BaselineQuality{
+				MaxOverallPAt1Drop: 0.02,
+			},
+			Runtime: BaselineRuntime{
+				MaxNsOpRegressionRatio: 1.25,
+			},
+		},
+	}
+	if err := ValidateConfig(cfg); err != nil {
+		t.Errorf("expected valid config, got error: %v", err)
+	}
+}
+
+func TestValidateConfig_EmptyStrategies(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for empty strategies")
+	}
+}
+
+func TestValidateConfig_InvalidDefaultStrategy(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"lexical", "embedding"},
+		Defaults: DefaultsConfig{
+			Strategy: "combined",
+			Weights:  Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for invalid default strategy")
+	}
+}
+
+func TestValidateConfig_NegativeWeights(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: -0.5, Embedding: 0.4},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for negative weight")
+	}
+}
+
+func TestValidateConfig_BothWeightsZero(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0, Embedding: 0},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error when both weights are zero")
+	}
+}
+
+func TestValidateConfig_RuntimeRatioTooLow(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Runtime: BaselineRuntime{
+				MaxNsOpRegressionRatio: 0.5,
+			},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for runtime ratio < 1")
+	}
+}
+
+func TestValidateConfig_ProfileInheritsMissing(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Profiles: map[string]Profile{
+			"fast": {Inherits: "nonexistent"},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for missing inherited profile")
+	}
+}
+
+func TestValidateConfig_ProfileInheritanceCycle(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Profiles: map[string]Profile{
+			"a": {Inherits: "b"},
+			"b": {Inherits: "c"},
+			"c": {Inherits: "a"},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for inheritance cycle")
+	}
+}
+
+func TestValidateConfig_NegativeQualityThreshold(t *testing.T) {
+	cfg := &Config{
+		Strategies: []string{"combined"},
+		Defaults: DefaultsConfig{
+			Weights: Weights{Lexical: 0.6, Embedding: 0.4},
+		},
+		Baseline: BaselineConfig{
+			Quality: BaselineQuality{
+				MaxOverallPAt1Drop: -0.02,
+			},
+		},
+	}
+	err := ValidateConfig(cfg)
+	if err == nil {
+		t.Error("expected error for negative quality threshold")
+	}
+}
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
index f5b3a7d..253a4c3 100644
--- a/internal/benchmark/runner.go
+++ b/internal/benchmark/runner.go
@@ -2,6 +2,8 @@ package benchmark
 
 import (
 	"context"
+	"os/exec"
+	"strings"
 	"time"
 
 	"github.com/pinchtab/semantic"
@@ -118,6 +120,7 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
 	report.Run.ID = time.Now().Format("20060102-150405") + "-" + cfg.Profile
 	report.Run.Timestamp = time.Now().UTC().Format(time.RFC3339)
 	report.Run.Tool = "semantic-bench"
+	report.Run.GitSHA, report.Run.GitDirty = getGitInfo()
 	report.Dataset.Name = "semantic-ui-matching-corpus"
 	report.Dataset.QueryCount = ds.QueryCount()
 	report.Dataset.CorpusCount = ds.CorpusCount()
@@ -138,7 +141,12 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
 			continue
 		}
 
-		for _, query := range corpus.Queries {
+		queries := corpus.Queries
+		if cfg.Quick {
+			queries = selectQuickSubset(corpus.Queries)
+		}
+
+		for _, query := range queries {
 			if cfg.QueryID != "" && query.ID != cfg.QueryID {
 				continue
 			}
@@ -153,6 +161,56 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
 	return report, nil
 }
 
+// selectQuickSubset returns a deterministic subset of queries for quick mode.
+// It selects at most 3 queries per corpus, preferring a mix of difficulties.
+func selectQuickSubset(queries []Query) []Query {
+	if len(queries) <= 3 {
+		return queries
+	}
+
+	// Group by difficulty
+	byDiff := make(map[string][]Query)
+	for _, q := range queries {
+		diff := q.Difficulty
+		if diff == "" {
+			diff = "medium"
+		}
+		byDiff[diff] = append(byDiff[diff], q)
+	}
+
+	// Select one from each difficulty level, up to 3 total
+	var selected []Query
+	for _, diff := range []string{"easy", "medium", "hard"} {
+		if qs, ok := byDiff[diff]; ok && len(qs) > 0 {
+			selected = append(selected, qs[0])
+			if len(selected) >= 3 {
+				break
+			}
+		}
+	}
+
+	// If we don't have 3 yet, fill from remaining
+	if len(selected) < 3 {
+		for _, q := range queries {
+			found := false
+			for _, s := range selected {
+				if s.ID == q.ID {
+					found = true
+					break
+				}
+			}
+			if !found {
+				selected = append(selected, q)
+				if len(selected) >= 3 {
+					break
+				}
+			}
+		}
+	}
+
+	return selected
+}
+
 func createMatcher(cfg RunConfig) semantic.ElementMatcher {
 	embedder := semantic.NewHashingEmbedder(128)
 	switch cfg.Strategy {
@@ -189,8 +247,11 @@ func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg R
 
 	start := time.Now()
 	findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{
-		Threshold: threshold,
-		TopK:      topK,
+		Threshold:       threshold,
+		TopK:            topK,
+		LexicalWeight:   cfg.LexicalWeight,
+		EmbeddingWeight: cfg.EmbeddingWeight,
+		Explain:         cfg.Explain,
 	})
 	result.Latency.LibraryMs = time.Since(start).Milliseconds()
 
@@ -384,3 +445,20 @@ func sortInt64(s []int64) {
 		}
 	}
 }
+
+func getGitInfo() (sha string, dirty bool) {
+	cmd := exec.Command("git", "rev-parse", "HEAD")
+	out, err := cmd.Output()
+	if err != nil {
+		return "", false
+	}
+	sha = strings.TrimSpace(string(out))
+
+	cmd = exec.Command("git", "status", "--porcelain")
+	out, err = cmd.Output()
+	if err != nil {
+		return sha, false
+	}
+	dirty = len(strings.TrimSpace(string(out))) > 0
+	return sha, dirty
+}
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
index e7622f1..6545913 100644
--- a/internal/benchmark/runtime.go
+++ b/internal/benchmark/runtime.go
@@ -35,7 +35,26 @@ type runtimeBaseline struct {
 
 func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
 	root := FindBenchmarkRoot()
-	baselinePath := filepath.Join(root, "baselines", "runtime.json")
+
+	// Load config for thresholds
+	benchCfg, _ := LoadConfig(root)
+	var thresholds BaselineRuntime
+	if benchCfg != nil {
+		thresholds = benchCfg.RuntimeThresholds()
+	} else {
+		thresholds = BaselineRuntime{
+			MaxNsOpRegressionRatio:  1.25,
+			MaxAllocRegressionRatio: 1.25,
+		}
+	}
+
+	// Determine baseline path from config
+	var baselinePath string
+	if benchCfg != nil {
+		baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "runtime.json")
+	} else {
+		baselinePath = filepath.Join(root, "baselines", "runtime.json")
+	}
 
 	benchmarks, err := runGoBenchmarks()
 	if err != nil {
@@ -66,18 +85,29 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
 		baselineMap[b.Name] = b
 	}
 
-	maxRatio := 1.25
+	// Warning threshold is halfway between 1.0 and max ratio
+	warnRatio := 1.0 + ((thresholds.MaxNsOpRegressionRatio - 1.0) / 2.0)
+
 	for i, b := range result.Benchmarks {
 		if base, ok := baselineMap[b.Name]; ok {
-			ratio := b.NsOp / base.NsOp
+			nsRatio := b.NsOp / base.NsOp
 			result.Benchmarks[i].BaselineNs = base.NsOp
-			result.Benchmarks[i].Ratio = ratio
+			result.Benchmarks[i].Ratio = nsRatio
+
+			// Check allocation regression if baseline has allocation data
+			var allocRatio float64
+			if base.AllocsOp > 0 && b.AllocsOp > 0 {
+				allocRatio = float64(b.AllocsOp) / float64(base.AllocsOp)
+			}
 
 			switch {
-			case ratio > maxRatio:
+			case nsRatio > thresholds.MaxNsOpRegressionRatio:
+				result.Benchmarks[i].Status = "regression"
+				result.Regressions++
+			case allocRatio > thresholds.MaxAllocRegressionRatio:
 				result.Benchmarks[i].Status = "regression"
 				result.Regressions++
-			case ratio > 1.1:
+			case nsRatio > warnRatio:
 				result.Benchmarks[i].Status = "warning"
 			default:
 				result.Benchmarks[i].Status = "ok"
diff --git a/internal/engine/benchmark_test.go b/internal/engine/benchmark_test.go
index c37528c..0ebc2c6 100644
--- a/internal/engine/benchmark_test.go
+++ b/internal/engine/benchmark_test.go
@@ -2,9 +2,10 @@ package engine
 
 import (
 	"context"
-	"github.com/pinchtab/semantic/internal/types"
 	"strconv"
 	"testing"
+
+	"github.com/pinchtab/semantic/internal/types"
 )
 
 // benchElements returns a realistic set of elements for benchmarking.
@@ -244,3 +245,119 @@ func BenchmarkCombinedFind_Issue24_100Elements(b *testing.B) {
 		})
 	}
 }
+
+// Focused microbenchmarks for individual components
+
+func BenchmarkParseQueryContext(b *testing.B) {
+	queries := []string{
+		"sign in button",
+		"the first email textbox in the login form",
+		"button not submit near the checkout section",
+		"second item in the dropdown menu",
+	}
+	b.ReportAllocs()
+
+	for b.Loop() {
+		for _, q := range queries {
+			ParseQueryContext(q)
+		}
+	}
+}
+
+func BenchmarkParseQueryContext_Complex(b *testing.B) {
+	q := "the third blue submit button in the checkout form not disabled"
+	b.ReportAllocs()
+
+	for b.Loop() {
+		ParseQueryContext(q)
+	}
+}
+
+func BenchmarkRemoveStopwords(b *testing.B) {
+	tokenSets := [][]string{
+		{"click", "the", "sign", "in", "button"},
+		{"find", "the", "email", "address", "textbox"},
+		{"the", "first", "item", "in", "a", "dropdown", "menu"},
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, tokens := range tokenSets {
+			removeStopwords(tokens)
+		}
+	}
+}
+
+func BenchmarkScoreFusion(b *testing.B) {
+	// Test the score fusion calculation
+	lexScores := make([]float64, 100)
+	embScores := make([]float64, 100)
+	for i := range lexScores {
+		lexScores[i] = float64(i) / 100.0
+		embScores[i] = float64(100-i) / 100.0
+	}
+	lexWeight, embWeight := 0.6, 0.4
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for j := range lexScores {
+			_ = lexWeight*lexScores[j] + embWeight*embScores[j]
+		}
+	}
+}
+
+func BenchmarkLexicalScore_Variants(b *testing.B) {
+	cases := []struct {
+		name  string
+		query string
+		desc  string
+	}{
+		{"exact", "Sign In", "button: Sign In"},
+		{"partial", "sign", "button: Sign In"},
+		{"synonym", "login", "button: Sign In"},
+		{"mismatch", "checkout", "button: Sign In"},
+		{"long_query", "click the sign in button on the login page", "button: Sign In"},
+	}
+	for _, tc := range cases {
+		b.Run(tc.name, func(b *testing.B) {
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				LexicalScore(tc.query, tc.desc)
+			}
+		})
+	}
+}
+
+func BenchmarkCombinedFind_WeightVariants(b *testing.B) {
+	elements := benchElements()
+	ctx := context.Background()
+
+	weights := []struct {
+		name string
+		lex  float64
+		emb  float64
+	}{
+		{"lex_only", 1.0, 0.0},
+		{"emb_only", 0.0, 1.0},
+		{"balanced", 0.5, 0.5},
+		{"lex_heavy", 0.8, 0.2},
+		{"emb_heavy", 0.2, 0.8},
+	}
+
+	for _, w := range weights {
+		b.Run(w.name, func(b *testing.B) {
+			m := NewCombinedMatcher(NewHashingEmbedder(128))
+			opts := types.FindOptions{
+				Threshold:       0.3,
+				TopK:            3,
+				LexicalWeight:   w.lex,
+				EmbeddingWeight: w.emb,
+			}
+			b.ReportAllocs()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				_, _ = m.Find(ctx, "sign in button", elements, opts)
+			}
+		})
+	}
+}

From b6fadf1d4f35feba339ac7ebdb5ab25dfeae32c0 Mon Sep 17 00:00:00 2001
From: Luigi Agosti <luigi@tengio.com>
Date: Fri, 24 Apr 2026 23:05:00 +0100
Subject: [PATCH 14/14] feat: config-driven thresholds with validation and
 enforcement

---
 internal/benchmark/check.go   | 55 ++++++++++++++++++++---------------
 internal/benchmark/config.go  |  5 +++-
 internal/benchmark/runner.go  |  6 ++--
 internal/benchmark/runtime.go | 22 ++++----------
 4 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
index 0528059..88234f6 100644
--- a/internal/benchmark/check.go
+++ b/internal/benchmark/check.go
@@ -18,16 +18,11 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 		return nil, fmt.Errorf("load dataset: %w", err)
 	}
 
-	benchCfg, _ := LoadConfig(root)
-	profile := Profile{
-		Strategy:  "combined",
-		Threshold: 0.01,
-		TopK:      5,
-		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
-	}
-	if benchCfg != nil {
-		profile = ResolveProfile(benchCfg, cfg.Profile)
+	benchCfg, err := LoadConfig(root)
+	if err != nil {
+		return nil, fmt.Errorf("load config: %w", err)
 	}
+	profile := ResolveProfile(benchCfg, cfg.Profile)
 
 	runCfg := RunConfig{
 		Suite:           "corpus",
@@ -76,24 +71,11 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 	// Determine baseline path from config
 	baselinePath := cfg.BaselinePath
 	if baselinePath == "" {
-		if benchCfg != nil {
-			baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json")
-		} else {
-			baselinePath = filepath.Join(root, "baselines", "combined.json")
-		}
+		baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json")
 	}
 
 	// Get quality thresholds from config
-	var thresholds BaselineQuality
-	if benchCfg != nil {
-		thresholds = benchCfg.QualityThresholds()
-	} else {
-		thresholds = BaselineQuality{
-			MaxOverallPAt1Drop:   0.02,
-			MaxOverallMRRDrop:    0.02,
-			MaxOverallHitAt3Drop: 0.02,
-		}
-	}
+	thresholds := benchCfg.QualityThresholds()
 
 	if _, err := os.Stat(baselinePath); err == nil {
 		baseline, err := loadReport(baselinePath)
@@ -104,11 +86,36 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) {
 				HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
 			}
 			if cfg.FailOnReg {
+				// Check overall thresholds
 				if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop ||
 					result.Delta.MRR < -thresholds.MaxOverallMRRDrop ||
 					result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop {
 					result.Status = "fail"
 				}
+				// Check corpus-level thresholds
+				for corpus, current := range report.Metrics.ByCorpus {
+					if base, ok := baseline.Metrics.ByCorpus[corpus]; ok {
+						if current.PAt1-base.PAt1 < -thresholds.MaxCorpusPAt1Drop {
+							result.Status = "fail"
+						}
+					}
+				}
+				// Check difficulty-level thresholds
+				for diff, current := range report.Metrics.ByDifficulty {
+					if base, ok := baseline.Metrics.ByDifficulty[diff]; ok {
+						if current.PAt1-base.PAt1 < -thresholds.MaxDifficultyPAt1Drop {
+							result.Status = "fail"
+						}
+					}
+				}
+				// Check tag-level thresholds
+				for tag, current := range report.Metrics.ByTag {
+					if base, ok := baseline.Metrics.ByTag[tag]; ok {
+						if current.PAt1-base.PAt1 < -thresholds.MaxTagPAt1Drop {
+							result.Status = "fail"
+						}
+					}
+				}
 			}
 		}
 	}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index cd0bbec..2d233e2 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -166,6 +166,9 @@ func LoadConfig(benchmarkRoot string) (*Config, error) {
 	if err := json.Unmarshal(data, &cfg); err != nil {
 		return nil, err
 	}
+	if err := ValidateConfig(&cfg); err != nil {
+		return nil, fmt.Errorf("invalid config: %w", err)
+	}
 	return &cfg, nil
 }
 
@@ -408,7 +411,7 @@ func ParseCheckFlags(args []string) CheckConfig {
 	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
 	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (text|json|github)")
 	fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression")
-	fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks")
+	fs.BoolVar(&cfg.Quick, "quick", false, "smoke mode: 3 queries per corpus (not representative)")
 	fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details")
 	fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations")
 	_ = fs.Parse(args)
diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go
index 253a4c3..6f00821 100644
--- a/internal/benchmark/runner.go
+++ b/internal/benchmark/runner.go
@@ -161,8 +161,10 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) {
 	return report, nil
 }
 
-// selectQuickSubset returns a deterministic subset of queries for quick mode.
-// It selects at most 3 queries per corpus, preferring a mix of difficulties.
+// selectQuickSubset returns a deterministic subset for smoke testing.
+// Selects up to 3 queries per corpus by difficulty. This is NOT representative
+// of full corpus coverage—edge-case tags may be missed. Use for fast iteration,
+// not for final regression checks.
 func selectQuickSubset(queries []Query) []Query {
 	if len(queries) <= 3 {
 		return queries
diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go
index 6545913..dd68f75 100644
--- a/internal/benchmark/runtime.go
+++ b/internal/benchmark/runtime.go
@@ -37,24 +37,12 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) {
 	root := FindBenchmarkRoot()
 
 	// Load config for thresholds
-	benchCfg, _ := LoadConfig(root)
-	var thresholds BaselineRuntime
-	if benchCfg != nil {
-		thresholds = benchCfg.RuntimeThresholds()
-	} else {
-		thresholds = BaselineRuntime{
-			MaxNsOpRegressionRatio:  1.25,
-			MaxAllocRegressionRatio: 1.25,
-		}
-	}
-
-	// Determine baseline path from config
-	var baselinePath string
-	if benchCfg != nil {
-		baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "runtime.json")
-	} else {
-		baselinePath = filepath.Join(root, "baselines", "runtime.json")
+	benchCfg, err := LoadConfig(root)
+	if err != nil {
+		return nil, fmt.Errorf("load config: %w", err)
 	}
+	thresholds := benchCfg.RuntimeThresholds()
+	baselinePath := filepath.Join(benchCfg.BaselinesDir(root), "runtime.json")
 
 	benchmarks, err := runGoBenchmarks()
 	if err != nil {