From 5c33b3622dae2c13953da58b387f669d0ddf0ec5 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 11:03:34 +0100 Subject: [PATCH 01/14] chore: expand benchmark corpus and add tuning tools Add baseline management scripts (create/check/update), threshold calibration, and runtime baseline tracking. Centralize benchmark config. Update dev tool with ./dev pr command for pre-PR validation. --- .gitignore | 3 +- dev | 110 ++++++++- skills/semantic-dev/SKILL.md | 36 +-- tests/benchmark/baselines/.gitkeep | 0 tests/benchmark/config/benchmark.json | 40 +++- .../benchmark/scripts/calibrate-thresholds.sh | 208 ++++++++++++++++++ tests/benchmark/scripts/check-baseline.sh | 140 ++++++++++++ .../scripts/check-runtime-baseline.sh | 137 ++++++++++++ tests/benchmark/scripts/create-baseline.sh | 86 ++++++++ tests/benchmark/scripts/lint-corpus.sh | 4 +- tests/benchmark/scripts/run-benchmark.sh | 13 +- .../benchmark/scripts/run-corpus-benchmark.sh | 26 ++- tests/benchmark/scripts/run-full-benchmark.sh | 13 ++ tests/benchmark/scripts/tune-weights.sh | 10 + tests/benchmark/scripts/update-baseline.sh | 70 ++++++ 15 files changed, 855 insertions(+), 41 deletions(-) create mode 100644 tests/benchmark/baselines/.gitkeep create mode 100755 tests/benchmark/scripts/calibrate-thresholds.sh create mode 100755 tests/benchmark/scripts/check-baseline.sh create mode 100755 tests/benchmark/scripts/check-runtime-baseline.sh create mode 100755 tests/benchmark/scripts/create-baseline.sh create mode 100755 tests/benchmark/scripts/update-baseline.sh diff --git a/.gitignore b/.gitignore index 2f3b5cc..8a46978 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,5 @@ cover.out .claude tests/e2e/results/*.txt tests/benchmark/results/*.json -tests/benchmark/results/*.md \ No newline at end of file +tests/benchmark/results/*.md +tests/benchmark/baselines/*.backup.json \ No newline at end of file diff --git a/dev b/dev index dc15e75..215b566 100755 --- a/dev +++ b/dev @@ -11,17 +11,26 @@ 
ERROR=$'\033[38;2;230;57;70m' NC=$'\033[0m' commands=( + "pr:πŸš€:Pre-PR checks (check + e2e + bench)" "doctor:🩺:Setup dev environment" "test:πŸ§ͺ:Run unit tests" "test verbose:πŸ§ͺ:Run unit tests (verbose)" "test race:πŸ§ͺ:Run unit tests with race detector" "coverage:πŸ“Š:Run tests with coverage report" "lint:πŸ”:Run golangci-lint" + "lint corpus:πŸ”:Lint benchmark corpus" "fmt:✨:Format code" "vet:πŸ”¬:Run go vet" "check:βœ…:Run all checks (fmt + vet + lint + test)" "build:πŸ“¦:Build CLI binary" - "bench:πŸ‹:Run corpus benchmark suite" + "bench:πŸ‹:Run corpus benchmark" + "bench full:πŸ‹:Run full benchmark suite" + "baseline:πŸ“:Create quality baseline" + "baseline check:πŸ“:Check against baseline" + "baseline update:πŸ“:Update baseline (--accept)" + "calibrate:🎯:Calibrate threshold recommendations" + "runtime:⏱️:Check runtime baseline" + "tune:πŸŽ›οΈ:Tune combined weights" "e2e:🐳:Run E2E tests (Docker)" ) @@ -36,6 +45,36 @@ show_help() { echo "" } +run_pr() { + echo " ${ACCENT}${BOLD}πŸš€ Pre-PR checks${NC}" + echo "" + + echo " ${MUTED}1/4 All checks (fmt + vet + lint + test)${NC}" + run_check + + echo "" + echo " ${MUTED}2/4 E2E tests${NC}" + if [[ -f tests/e2e/run.sh ]]; then + go build -o /tmp/semantic ./cmd/semantic + PATH="/tmp:$PATH" bash tests/e2e/run.sh + echo " ${SUCCESS}βœ“${NC} E2E passed" + else + echo " ${MUTED}Skipped (no e2e/run.sh)${NC}" + fi + + echo "" + echo " ${MUTED}3/4 Lint corpus${NC}" + run_lint_corpus + + echo "" + echo " ${MUTED}4/4 Corpus benchmark${NC}" + run_bench > /dev/null 2>&1 + echo " ${SUCCESS}βœ“${NC} Benchmark complete" + + echo "" + echo " ${SUCCESS}${BOLD}πŸš€ Ready for PR${NC}" +} + run_test() { echo " ${ACCENT}${BOLD}πŸ§ͺ Running tests${NC}" go test ./... 
-count=1 @@ -115,8 +154,48 @@ run_build() { } run_bench() { - echo " ${ACCENT}${BOLD}⏱️ Running corpus benchmark suite${NC}" - bash tests/benchmark/scripts/run-corpus-benchmark.sh + echo " ${ACCENT}${BOLD}πŸ‹ Running corpus benchmark${NC}" + bash tests/benchmark/scripts/run-corpus-benchmark.sh "$@" +} + +run_bench_full() { + echo " ${ACCENT}${BOLD}πŸ‹ Running full benchmark suite${NC}" + bash tests/benchmark/scripts/run-full-benchmark.sh +} + +run_lint_corpus() { + echo " ${ACCENT}${BOLD}πŸ” Linting benchmark corpus${NC}" + bash tests/benchmark/scripts/lint-corpus.sh +} + +run_baseline() { + echo " ${ACCENT}${BOLD}πŸ“ Creating quality baseline${NC}" + bash tests/benchmark/scripts/create-baseline.sh "$@" +} + +run_baseline_check() { + echo " ${ACCENT}${BOLD}πŸ“ Checking against baseline${NC}" + bash tests/benchmark/scripts/check-baseline.sh "$@" +} + +run_baseline_update() { + echo " ${ACCENT}${BOLD}πŸ“ Updating baseline${NC}" + bash tests/benchmark/scripts/update-baseline.sh --accept "$@" +} + +run_calibrate() { + echo " ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}" + bash tests/benchmark/scripts/calibrate-thresholds.sh "$@" +} + +run_runtime() { + echo " ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}" + bash tests/benchmark/scripts/check-runtime-baseline.sh "$@" +} + +run_tune() { + echo " ${ACCENT}${BOLD}πŸŽ›οΈ Tuning combined weights${NC}" + bash tests/benchmark/scripts/tune-weights.sh "$@" } run_e2e() { @@ -129,6 +208,7 @@ run_e2e() { } case "${1:-help}" in + pr) run_pr ;; doctor) exec bash scripts/doctor.sh ;; test) case "${2:-}" in @@ -138,12 +218,32 @@ case "${1:-help}" in esac ;; coverage) run_coverage ;; - lint) run_lint ;; + lint) + case "${2:-}" in + corpus) run_lint_corpus ;; + *) run_lint ;; + esac + ;; fmt) run_fmt ;; vet) run_vet ;; check) run_check ;; build) run_build ;; - bench|benchmark) run_bench ;; + bench|benchmark) + case "${2:-}" in + full) run_bench_full ;; + *) shift; run_bench "$@" ;; + esac + ;; + baseline) + case "${2:-}" in + 
check) shift 2; run_baseline_check "$@" ;; + update) shift 2; run_baseline_update "$@" ;; + *) shift; run_baseline "$@" ;; + esac + ;; + calibrate) shift; run_calibrate "$@" ;; + runtime) shift; run_runtime "$@" ;; + tune) shift; run_tune "$@" ;; e2e) run_e2e ;; help|*) show_help ;; esac diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md index 84ade33..b813297 100644 --- a/skills/semantic-dev/SKILL.md +++ b/skills/semantic-dev/SKILL.md @@ -15,22 +15,26 @@ cd ~/dev/semantic ## Dev Commands -All development commands run via `./dev`: - -| Command | Description | -|---------|-------------| -| `./dev doctor` | Setup dev environment | -| `./dev test` | Run unit tests | -| `./dev test verbose` | Run unit tests (verbose) | -| `./dev test race` | Run unit tests with race detector | -| `./dev coverage` | Run tests with coverage report | -| `./dev lint` | Run golangci-lint | -| `./dev fmt` | Format code | -| `./dev vet` | Run go vet | -| `./dev check` | All checks (fmt + vet + lint + test) | -| `./dev build` | Build CLI binary | -| `./dev bench` | Run corpus benchmark suite | -| `./dev e2e` | Run E2E tests (Docker) | +```bash +# Before opening a PR (runs all checks + e2e + benchmark) +./dev pr + +# Quick iteration +./dev test # unit tests +./dev check # fmt + vet + lint + test race + +# Benchmarking +./dev bench # corpus benchmark +./dev baseline # create baseline (first time) +./dev baseline check # check for regressions + +# Other +./dev build # build ./semantic binary +./dev e2e # e2e tests (Docker) +./dev lint corpus # validate benchmark data +./dev calibrate # find optimal thresholds +./dev tune # grid-search weights +``` ## Architecture diff --git a/tests/benchmark/baselines/.gitkeep b/tests/benchmark/baselines/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/benchmark/config/benchmark.json b/tests/benchmark/config/benchmark.json index 23b5661..7b06060 100644 --- a/tests/benchmark/config/benchmark.json +++ 
b/tests/benchmark/config/benchmark.json @@ -1,13 +1,35 @@ { - "version": "1.0.0", - "strategies": ["lexical", "embedding", "combined"], - "default_strategy": "combined", - "default_threshold": 0.3, - "default_top_k": 3, - "metrics": { - "min_accuracy": 0.85, - "min_avg_score": 0.5, - "max_latency_ms": 100 + "version": "1.1.0", + "defaults": { + "strategy": "combined", + "threshold": 0.01, + "top_k": 5, + "weights": { + "lexical": 0.6, + "embedding": 0.4 + } + }, + "baseline": { + "quality": { + "max_overall_p_at_1_drop": 0.02, + "max_overall_mrr_drop": 0.02, + "max_overall_hit_at_3_drop": 0.02, + "max_corpus_p_at_1_drop": 0.08, + "max_difficulty_p_at_1_drop": 0.08, + "max_margin_drop_report": 0.15 + }, + "runtime": { + "max_ns_op_regression_ratio": 1.25, + "max_alloc_regression_ratio": 1.25, + "max_corpus_latency_p50_ms": 75, + "max_corpus_latency_p95_ms": 200 + } }, + "results": { + "dir": "tests/benchmark/results", + "baselines_dir": "tests/benchmark/baselines", + "generated_files_policy": "warn" + }, + "strategies": ["lexical", "embedding", "combined"], "snapshots_dir": "../e2e/assets/snapshots" } diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh new file mode 100755 index 0000000..ef5603d --- /dev/null +++ b/tests/benchmark/scripts/calibrate-thresholds.sh @@ -0,0 +1,208 @@ +#!/bin/bash +# +# Calibrate threshold recommendations for find and recovery. +# +# Usage: +# ./calibrate-thresholds.sh [--corpus <name>] +# +# Reports recall/precision/false-positive-rate by threshold. +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BENCHMARK_DIR="${SCRIPT_DIR}/.." 
+CORPUS_DIR="${BENCHMARK_DIR}/corpus" +RESULTS_DIR="${BENCHMARK_DIR}/results" +CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" + +# Read config +if [[ -f "$CONFIG_FILE" ]]; then + STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") + LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE") + EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE") +else + STRATEGY="combined" + LEXICAL_WEIGHT=0.6 + EMBEDDING_WEIGHT=0.4 +fi + +SPECIFIC_CORPUS="" +while [[ $# -gt 0 ]]; do + case "$1" in + --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +mkdir -p "${RESULTS_DIR}" + +# Build semantic binary +echo "Building semantic..." +(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) + +SEMANTIC="${BENCHMARK_DIR}/semantic" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json" + +# Thresholds to test +THRESHOLDS=(0.01 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.60 0.70 0.80 0.90) + +echo "Testing ${#THRESHOLDS[@]} thresholds: ${THRESHOLDS[*]}" +echo "" + +# Initialize report +jq -n \ + --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg strategy "${STRATEGY}" \ + '{ + timestamp: $ts, + strategy: $strategy, + thresholds: [], + recommendations: {} + }' > "${REPORT_FILE}" + +# Collect results for each threshold +for thresh in "${THRESHOLDS[@]}"; do + echo "Testing threshold: ${thresh}" + + total=0 + true_positives=0 + false_positives=0 + false_negatives=0 + + for corpus in "${CORPUS_DIR}"/*/; do + [[ -d "$corpus" ]] || continue + + if [[ -n "$SPECIFIC_CORPUS" ]] && [[ "$(basename "$corpus")" != "$SPECIFIC_CORPUS" ]]; then + continue + fi + + snapshot="${corpus}/snapshot.json" + queries="${corpus}/queries.json" + + [[ -f "$snapshot" ]] && [[ -f "$queries" ]] || continue + + count=$(jq length "$queries") + + for i in $(seq 0 $((count - 1))); do + query=$(jq -r ".[$i].query" "$queries") + 
relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries") + + result=$("${SEMANTIC}" find "${query}" \ + --snapshot "${snapshot}" \ + --strategy "${STRATEGY}" \ + --threshold "${thresh}" \ + --top-k 5 \ + --lexical-weight "${LEXICAL_WEIGHT}" \ + --embedding-weight "${EMBEDDING_WEIGHT}" \ + --format json 2>/dev/null) || continue + + best_ref=$(echo "$result" | jq -r '.best_ref // ""') + num_matches=$(echo "$result" | jq '.matches | length') + + total=$((total + 1)) + + # Check if best match is relevant + if [[ -n "$best_ref" ]] && echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then + true_positives=$((true_positives + 1)) + elif [[ -n "$best_ref" ]] && [[ "$num_matches" -gt 0 ]]; then + false_positives=$((false_positives + 1)) + fi + + # If no match but there should be one + if [[ -z "$best_ref" ]] || [[ "$num_matches" -eq 0 ]]; then + rel_count=$(echo "$relevant_refs" | jq 'length') + if [[ "$rel_count" -gt 0 ]]; then + false_negatives=$((false_negatives + 1)) + fi + fi + done + done + + # Calculate metrics + if [[ $total -eq 0 ]]; then + echo " No queries processed" + continue + fi + + precision=0 + recall=0 + fpr=0 + + if [[ $((true_positives + false_positives)) -gt 0 ]]; then + precision=$(echo "scale=4; $true_positives / ($true_positives + $false_positives)" | bc) + fi + + if [[ $((true_positives + false_negatives)) -gt 0 ]]; then + recall=$(echo "scale=4; $true_positives / ($true_positives + $false_negatives)" | bc) + fi + + if [[ $((false_positives + true_positives)) -gt 0 ]]; then + fpr=$(echo "scale=4; $false_positives / $total" | bc) + fi + + f1=0 + if (( $(echo "$precision + $recall > 0" | bc -l) )); then + f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc) + fi + + printf " Precision: %.3f | Recall: %.3f | FPR: %.3f | F1: %.3f\n" "$precision" "$recall" "$fpr" "$f1" + + # Append to report + tmp=$(mktemp) + jq --argjson thresh "$thresh" \ + --argjson total "$total" \ + --argjson tp "$true_positives" \ + 
--argjson fp "$false_positives" \ + --argjson fn "$false_negatives" \ + --argjson precision "$precision" \ + --argjson recall "$recall" \ + --argjson fpr "$fpr" \ + --argjson f1 "$f1" \ + '.thresholds += [{ + threshold: $thresh, + total: $total, + true_positives: $tp, + false_positives: $fp, + false_negatives: $fn, + precision: $precision, + recall: $recall, + false_positive_rate: $fpr, + f1: $f1 + }]' "$REPORT_FILE" > "$tmp" + mv "$tmp" "$REPORT_FILE" +done + +# Calculate recommendations +echo "" +echo "Calculating recommendations..." + +# Best F1 for general find +BEST_FIND=$(jq -r '[.thresholds[] | select(.f1 > 0)] | max_by(.f1) | .threshold // 0.3' "$REPORT_FILE") + +# Best recall with precision >= 0.7 for recovery (prioritize not missing) +BEST_RECOVERY=$(jq -r '[.thresholds[] | select(.precision >= 0.7)] | max_by(.recall) | .threshold // 0.2' "$REPORT_FILE") + +# Update recommendations +tmp=$(mktemp) +jq --argjson find "$BEST_FIND" \ + --argjson recovery "$BEST_RECOVERY" \ + '.recommendations = { + find: $find, + recovery: $recovery, + note: "find optimizes F1; recovery optimizes recall with precision >= 0.7" + }' "$REPORT_FILE" > "$tmp" +mv "$tmp" "$REPORT_FILE" + +# Cleanup +rm -f "${BENCHMARK_DIR}/semantic" + +echo "" +echo "================================================" +echo " THRESHOLD CALIBRATION RESULTS" +echo "================================================" +echo " Recommended for Find: ${BEST_FIND}" +echo " Recommended for Recovery: ${BEST_RECOVERY}" +echo "================================================" +echo "" +echo "Report: ${REPORT_FILE}" diff --git a/tests/benchmark/scripts/check-baseline.sh b/tests/benchmark/scripts/check-baseline.sh new file mode 100755 index 0000000..f6e95ae --- /dev/null +++ b/tests/benchmark/scripts/check-baseline.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# +# Check current benchmark results against a baseline. 
+# +# Usage: +# ./check-baseline.sh [--baseline <file>] [--fail-on-regression] +# +# Exit codes: +# 0 - No regressions detected (or regressions found without --fail-on-regression) +# 1 - Regressions detected (if --fail-on-regression) +# 2 - Error (missing files, invalid config) +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BENCHMARK_DIR="${SCRIPT_DIR}/.." +BASELINES_DIR="${BENCHMARK_DIR}/baselines" +CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +NC='\033[0m' + +# Read config +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "ERROR: Config file not found: $CONFIG_FILE" >&2 + exit 2 +fi + +STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") +MAX_P1_DROP=$(jq -r '.baseline.quality.max_overall_p_at_1_drop // 0.02' "$CONFIG_FILE") +MAX_MRR_DROP=$(jq -r '.baseline.quality.max_overall_mrr_drop // 0.02' "$CONFIG_FILE") +MAX_HIT3_DROP=$(jq -r '.baseline.quality.max_overall_hit_at_3_drop // 0.02' "$CONFIG_FILE") +MAX_CORPUS_P1_DROP=$(jq -r '.baseline.quality.max_corpus_p_at_1_drop // 0.08' "$CONFIG_FILE") +MAX_MARGIN_DROP=$(jq -r '.baseline.quality.max_margin_drop_report // 0.15' "$CONFIG_FILE") + +# Parse args +BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json" +FAIL_ON_REGRESSION=false +while [[ $# -gt 0 ]]; do + case "$1" in + --baseline) BASELINE_FILE="$2"; shift 2 ;; + --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;; + *) echo "Unknown option: $1"; exit 2 ;; + esac +done + +if [[ ! 
-f "$BASELINE_FILE" ]]; then + echo "ERROR: Baseline not found: $BASELINE_FILE" >&2 + echo "Run ./create-baseline.sh first" >&2 + exit 2 +fi + +echo "Checking against baseline: ${BASELINE_FILE}" +echo "Tolerances: P@1=${MAX_P1_DROP}, MRR=${MAX_MRR_DROP}, Hit@3=${MAX_HIT3_DROP}" +echo "" + +# Run current benchmark +TEMP_DIR=$(mktemp -d) +trap 'rm -rf "$TEMP_DIR"' EXIT + +"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" > "${TEMP_DIR}/output.log" 2>&1 + +# Find the latest report +LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1) + +if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then + echo "ERROR: Could not find benchmark report" >&2 + exit 2 +fi + +# Compare metrics +REGRESSIONS=0 +WARNINGS=0 + +compare_metric() { + local name="$1" + local baseline_val="$2" + local current_val="$3" + local max_drop="$4" + + local diff + diff=$(echo "scale=4; $current_val - $baseline_val" | bc) + local drop + drop=$(echo "scale=4; $baseline_val - $current_val" | bc) + + if (( $(echo "$drop > $max_drop" | bc -l) )); then + echo -e "${RED}REGRESSION${NC} $name: $baseline_val -> $current_val (drop: $drop, max: $max_drop)" + REGRESSIONS=$((REGRESSIONS + 1)) + elif (( $(echo "$drop > 0" | bc -l) )); then + echo -e "${YELLOW}WARNING${NC} $name: $baseline_val -> $current_val (drop: $drop)" + WARNINGS=$((WARNINGS + 1)) + else + echo -e "${GREEN}OK${NC} $name: $baseline_val -> $current_val (${diff:0:6})" + fi +} + +echo "=== Overall Metrics ===" +echo "" + +BASELINE_MRR=$(jq -r '.metrics.mrr' "$BASELINE_FILE") +CURRENT_MRR=$(jq -r '.metrics.mrr' "$LATEST_REPORT") +compare_metric "MRR" "$BASELINE_MRR" "$CURRENT_MRR" "$MAX_MRR_DROP" + +BASELINE_P1=$(jq -r '.metrics.p_at_1' "$BASELINE_FILE") +CURRENT_P1=$(jq -r '.metrics.p_at_1' "$LATEST_REPORT") +compare_metric "P@1" "$BASELINE_P1" "$CURRENT_P1" "$MAX_P1_DROP" + +BASELINE_HIT3=$(jq -r '.metrics.hit_at_3' "$BASELINE_FILE") +CURRENT_HIT3=$(jq -r '.metrics.hit_at_3' 
"$LATEST_REPORT") +compare_metric "Hit@3" "$BASELINE_HIT3" "$CURRENT_HIT3" "$MAX_HIT3_DROP" + +BASELINE_MARGIN=$(jq -r '.metrics.avg_margin' "$BASELINE_FILE") +CURRENT_MARGIN=$(jq -r '.metrics.avg_margin' "$LATEST_REPORT") +compare_metric "Margin" "$BASELINE_MARGIN" "$CURRENT_MARGIN" "$MAX_MARGIN_DROP" + +echo "" +echo "=== Per-Corpus ===" +echo "" + +for corpus in $(jq -r '.by_corpus | keys[]' "$BASELINE_FILE"); do + BASELINE_CORPUS_P1=$(jq -r ".by_corpus[\"$corpus\"].p_at_1 // 0" "$BASELINE_FILE") + CURRENT_CORPUS_P1=$(jq -r ".metrics.by_corpus[\"$corpus\"].p_at_1 // 0" "$LATEST_REPORT") + compare_metric "$corpus P@1" "$BASELINE_CORPUS_P1" "$CURRENT_CORPUS_P1" "$MAX_CORPUS_P1_DROP" +done + +echo "" +echo "================================================" +if [[ $REGRESSIONS -gt 0 ]]; then + echo -e "${RED}REGRESSIONS: $REGRESSIONS${NC}" + if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then + exit 1 + fi +elif [[ $WARNINGS -gt 0 ]]; then + echo -e "${YELLOW}WARNINGS: $WARNINGS (no regressions)${NC}" +else + echo -e "${GREEN}ALL CHECKS PASSED${NC}" +fi +echo "================================================" diff --git a/tests/benchmark/scripts/check-runtime-baseline.sh b/tests/benchmark/scripts/check-runtime-baseline.sh new file mode 100755 index 0000000..75bc4fc --- /dev/null +++ b/tests/benchmark/scripts/check-runtime-baseline.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# +# Check Go benchmark results against runtime baseline. +# +# Usage: +# ./check-runtime-baseline.sh [--fail-on-regression] +# +# Runs Go benchmarks and compares against saved baseline. +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BENCHMARK_DIR="${SCRIPT_DIR}/.." +BASELINES_DIR="${BENCHMARK_DIR}/baselines" +RESULTS_DIR="${BENCHMARK_DIR}/results" +CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" +PROJECT_ROOT="${BENCHMARK_DIR}/../.." 
+ +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +NC='\033[0m' + +# Read tolerances from config +if [[ -f "$CONFIG_FILE" ]]; then + MAX_NS_RATIO=$(jq -r '.baseline.runtime.max_ns_op_regression_ratio // 1.25' "$CONFIG_FILE") + MAX_ALLOC_RATIO=$(jq -r '.baseline.runtime.max_alloc_regression_ratio // 1.25' "$CONFIG_FILE") +else + MAX_NS_RATIO=1.25 + MAX_ALLOC_RATIO=1.25 +fi + +# Parse args +FAIL_ON_REGRESSION=false +while [[ $# -gt 0 ]]; do + case "$1" in + --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +mkdir -p "${RESULTS_DIR}" +mkdir -p "${BASELINES_DIR}" + +BASELINE_FILE="${BASELINES_DIR}/runtime.json" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +REPORT_FILE="${RESULTS_DIR}/runtime_${TIMESTAMP}.json" + +echo "Running Go benchmarks..." +echo "" + +# Run benchmarks +BENCH_OUTPUT=$(mktemp) +(cd "$PROJECT_ROOT" && go test -bench=. -benchmem ./internal/engine/... 2>&1) | tee "$BENCH_OUTPUT" + +# Parse benchmark output into JSON +echo "" +echo "Parsing results..." 
+ +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" '{timestamp: $ts, benchmarks: []}' > "$REPORT_FILE" + +while IFS= read -r line; do + if [[ "$line" =~ ^Benchmark ]]; then + # Parse: BenchmarkName-N iterations ns/op bytes/op allocs/op + name=$(echo "$line" | awk '{print $1}' | sed 's/-[0-9]*$//') + ns_op=$(echo "$line" | grep -oE '[0-9.]+ ns/op' | awk '{print $1}' || echo "0") + bytes_op=$(echo "$line" | grep -oE '[0-9]+ B/op' | awk '{print $1}' || echo "0") + allocs_op=$(echo "$line" | grep -oE '[0-9]+ allocs/op' | awk '{print $1}' || echo "0") + + if [[ -n "$ns_op" ]] && [[ "$ns_op" != "0" ]]; then + tmp=$(mktemp) + jq --arg name "$name" \ + --argjson ns "$ns_op" \ + --argjson bytes "${bytes_op:-0}" \ + --argjson allocs "${allocs_op:-0}" \ + '.benchmarks += [{name: $name, ns_op: $ns, bytes_op: $bytes, allocs_op: $allocs}]' \ + "$REPORT_FILE" > "$tmp" + mv "$tmp" "$REPORT_FILE" + fi + fi +done < "$BENCH_OUTPUT" + +rm -f "$BENCH_OUTPUT" + +# If no baseline exists, create one +if [[ ! -f "$BASELINE_FILE" ]]; then + echo "" + echo "No runtime baseline found. Creating initial baseline..." 
+ cp "$REPORT_FILE" "$BASELINE_FILE" + echo "Baseline saved to: $BASELINE_FILE" + exit 0 +fi + +# Compare against baseline +echo "" +echo "=== Comparing against baseline ===" +echo "" + +REGRESSIONS=0 + +for name in $(jq -r '.benchmarks[].name' "$REPORT_FILE"); do + baseline_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$BASELINE_FILE") + current_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$REPORT_FILE") + + baseline_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$BASELINE_FILE") + current_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$REPORT_FILE") + + if [[ "$baseline_ns" == "0" ]] || [[ "$baseline_ns" == "null" ]]; then + echo -e "${YELLOW}NEW${NC} $name: ${current_ns} ns/op" + continue + fi + + ratio=$(echo "scale=4; $current_ns / $baseline_ns" | bc) + + if (( $(echo "$ratio > $MAX_NS_RATIO" | bc -l) )); then + echo -e "${RED}REGRESSION${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x, max: ${MAX_NS_RATIO}x)" + REGRESSIONS=$((REGRESSIONS + 1)) + elif (( $(echo "$ratio > 1.1" | bc -l) )); then + echo -e "${YELLOW}WARNING${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)" + else + echo -e "${GREEN}OK${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)" + fi +done + +echo "" +echo "================================================" +if [[ $REGRESSIONS -gt 0 ]]; then + echo -e "${RED}RUNTIME REGRESSIONS: $REGRESSIONS${NC}" + if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then + exit 1 + fi +else + echo -e "${GREEN}NO RUNTIME REGRESSIONS${NC}" +fi +echo "================================================" +echo "" +echo "Report: ${REPORT_FILE}" diff --git a/tests/benchmark/scripts/create-baseline.sh b/tests/benchmark/scripts/create-baseline.sh new file mode 100755 index 0000000..cd4696a --- /dev/null +++ b/tests/benchmark/scripts/create-baseline.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# +# Create a quality baseline from current 
corpus benchmark results. +# +# Usage: +# ./create-baseline.sh [--name <name>] +# +# This runs run-corpus-benchmark.sh and saves the results as a baseline. +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BENCHMARK_DIR="${SCRIPT_DIR}/.." +BASELINES_DIR="${BENCHMARK_DIR}/baselines" +CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" + +# Read defaults from config +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "ERROR: Config file not found: $CONFIG_FILE" >&2 + exit 1 +fi + +STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") + +# Parse args +BASELINE_NAME="${STRATEGY}" +while [[ $# -gt 0 ]]; do + case "$1" in + --name) BASELINE_NAME="$2"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +mkdir -p "${BASELINES_DIR}" + +BASELINE_FILE="${BASELINES_DIR}/${BASELINE_NAME}.json" + +echo "Creating baseline: ${BASELINE_NAME}" +echo "Strategy: ${STRATEGY}" +echo "" + +# Run corpus benchmark +TEMP_DIR=$(mktemp -d) +trap 'rm -rf "$TEMP_DIR"' EXIT + +"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" 2>&1 | tee "${TEMP_DIR}/output.log" + +# Find the latest report +LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1) + +if [[ -z "$LATEST_REPORT" ]] || [[ ! 
-f "$LATEST_REPORT" ]]; then + echo "ERROR: Could not find benchmark report" >&2 + exit 1 +fi + +# Extract baseline data +jq '{ + created_at: .benchmark.timestamp, + strategy: .benchmark.strategy, + threshold: .benchmark.threshold, + top_k: .benchmark.top_k, + weights: .benchmark.weights, + metrics: { + total: .metrics.total, + mrr: .metrics.mrr, + p_at_1: .metrics.p_at_1, + p_at_3: .metrics.p_at_3, + hit_at_3: .metrics.hit_at_3, + hit_at_5: .metrics.hit_at_5, + avg_margin: .metrics.avg_margin, + latency_p50_ms: .metrics.latency_p50_ms, + latency_p95_ms: .metrics.latency_p95_ms + }, + by_difficulty: .metrics.by_difficulty, + by_corpus: .metrics.by_corpus, + per_query: [.results[] | {id, corpus, difficulty, p_at_1, rr, margin}] +}' "$LATEST_REPORT" > "$BASELINE_FILE" + +echo "" +echo "================================================" +echo " BASELINE CREATED" +echo "================================================" +echo " File: ${BASELINE_FILE}" +echo "" +jq -r '" MRR: \(.metrics.mrr)\n P@1: \(.metrics.p_at_1)\n Hit@3: \(.metrics.hit_at_3)\n Margin: \(.metrics.avg_margin)"' "$BASELINE_FILE" +echo "================================================" diff --git a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh index 29f81b2..783e546 100755 --- a/tests/benchmark/scripts/lint-corpus.sh +++ b/tests/benchmark/scripts/lint-corpus.sh @@ -17,12 +17,12 @@ WARNINGS=0 error() { echo -e "${RED}ERROR:${NC} $1" - ((ERRORS++)) + ERRORS=$((ERRORS + 1)) } warn() { echo -e "${YELLOW}WARN:${NC} $1" - ((WARNINGS++)) + WARNINGS=$((WARNINGS + 1)) } ok() { diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh index 4ce67d6..29c8a22 100755 --- a/tests/benchmark/scripts/run-benchmark.sh +++ b/tests/benchmark/scripts/run-benchmark.sh @@ -19,9 +19,18 @@ CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots" RESULTS_DIR="${BENCHMARK_DIR}/results" -# Parse args 
-STRATEGY="combined" +# Read defaults from config +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "ERROR: Config file not found: $CONFIG_FILE" >&2 + exit 1 +fi + +STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") +THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE") +TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE") CASE_FILE="" + +# Parse args (override config) while [[ $# -gt 0 ]]; do case "$1" in --strategy) STRATEGY="$2"; shift 2 ;; diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh index b5579bf..53216af 100755 --- a/tests/benchmark/scripts/run-corpus-benchmark.sh +++ b/tests/benchmark/scripts/run-corpus-benchmark.sh @@ -17,17 +17,27 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BENCHMARK_DIR="${SCRIPT_DIR}/.." CORPUS_DIR="${BENCHMARK_DIR}/corpus" RESULTS_DIR="${BENCHMARK_DIR}/results" +CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" -# Parse args -STRATEGY="combined" +# Read defaults from config +if [[ ! 
-f "$CONFIG_FILE" ]]; then + echo "ERROR: Config file not found: $CONFIG_FILE" >&2 + exit 1 +fi + +STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") +THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE") +TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE") +LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE") +EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE") SPECIFIC_CORPUS="" -TOP_K=5 -LEXICAL_WEIGHT=0.6 -EMBEDDING_WEIGHT=0.4 + +# Parse args (override config) while [[ $# -gt 0 ]]; do case "$1" in --strategy) STRATEGY="$2"; shift 2 ;; --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;; + --threshold) THRESHOLD="$2"; shift 2 ;; --top-k) TOP_K="$2"; shift 2 ;; --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;; --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;; @@ -54,15 +64,19 @@ REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json" jq -n \ --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ --arg strategy "${STRATEGY}" \ + --argjson threshold "${THRESHOLD}" \ --argjson top_k "${TOP_K}" \ --argjson lexical_weight "${LEXICAL_WEIGHT}" \ --argjson embedding_weight "${EMBEDDING_WEIGHT}" \ + --arg config_file "${CONFIG_FILE}" \ '{ benchmark: { timestamp: $ts, strategy: $strategy, + threshold: $threshold, top_k: $top_k, type: "corpus", + config_source: $config_file, weights: { lexical: $lexical_weight, embedding: $embedding_weight @@ -128,7 +142,7 @@ run_corpus() { if ! 
result=$("${SEMANTIC}" find "${query}" \ --snapshot "${snapshot}" \ --strategy "${STRATEGY}" \ - --threshold 0.01 \ + --threshold "${THRESHOLD}" \ --top-k "${TOP_K}" \ --lexical-weight "${LEXICAL_WEIGHT}" \ --embedding-weight "${EMBEDDING_WEIGHT}" \ diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh index eadaad7..5c759dc 100755 --- a/tests/benchmark/scripts/run-full-benchmark.sh +++ b/tests/benchmark/scripts/run-full-benchmark.sh @@ -10,6 +10,19 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BENCHMARK_DIR="${SCRIPT_DIR}/.." CORPUS_DIR="${BENCHMARK_DIR}/corpus" RESULTS_DIR="${BENCHMARK_DIR}/results" +CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" + +# Read defaults from config +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "ERROR: Config file not found: $CONFIG_FILE" >&2 + exit 1 +fi + +STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") +THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE") +TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE") +LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE") +EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE") mkdir -p "${RESULTS_DIR}" diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh index ef61d88..011b1b2 100755 --- a/tests/benchmark/scripts/tune-weights.sh +++ b/tests/benchmark/scripts/tune-weights.sh @@ -10,6 +10,16 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BENCHMARK_DIR="${SCRIPT_DIR}/.." 
RESULTS_DIR="${BENCHMARK_DIR}/results" +CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" + +# Read defaults from config (used for threshold/top_k in grid runs) +if [[ -f "$CONFIG_FILE" ]]; then + THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE") + TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE") +else + THRESHOLD=0.01 + TOP_K=5 +fi SPECIFIC_CORPUS="" STEP="0.1" diff --git a/tests/benchmark/scripts/update-baseline.sh b/tests/benchmark/scripts/update-baseline.sh new file mode 100755 index 0000000..ba93089 --- /dev/null +++ b/tests/benchmark/scripts/update-baseline.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Update baseline after reviewing regressions. +# +# Usage: +# ./update-baseline.sh --accept [--baseline ] +# +# This re-runs the benchmark and overwrites the baseline file. +# Use after reviewing check-baseline.sh output and confirming +# the changes are intentional. +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BENCHMARK_DIR="${SCRIPT_DIR}/.." +BASELINES_DIR="${BENCHMARK_DIR}/baselines" +CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" + +# Read config +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "ERROR: Config file not found: $CONFIG_FILE" >&2 + exit 1 +fi + +STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") + +# Parse args +BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json" +ACCEPT=false +while [[ $# -gt 0 ]]; do + case "$1" in + --accept) ACCEPT=true; shift ;; + --baseline) BASELINE_FILE="$2"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +if [[ "$ACCEPT" != "true" ]]; then + echo "Usage: $0 --accept [--baseline ]" + echo "" + echo "This will overwrite the baseline. Run check-baseline.sh first" + echo "to review changes before accepting." + exit 1 +fi + +if [[ ! -f "$BASELINE_FILE" ]]; then + echo "Baseline not found: $BASELINE_FILE" + echo "Creating new baseline instead..." 
+ exec "${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")" +fi + +# Show what will change +echo "Current baseline: ${BASELINE_FILE}" +echo "" +jq -r '" MRR: \(.metrics.mrr)\n P@1: \(.metrics.p_at_1)\n Hit@3: \(.metrics.hit_at_3)"' "$BASELINE_FILE" +echo "" +echo "Running benchmark to generate new baseline..." +echo "" + +# Backup old baseline +BACKUP_FILE="${BASELINE_FILE%.json}_$(date +%Y%m%d_%H%M%S).backup.json" +cp "$BASELINE_FILE" "$BACKUP_FILE" +echo "Backed up old baseline to: $BACKUP_FILE" + +# Create new baseline (overwrites) +"${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")" + +echo "" +echo "Baseline updated. Old baseline backed up to:" +echo " $BACKUP_FILE" From a5a3d55d0473326c7d944b59c03b50aca7072de3 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 11:04:00 +0100 Subject: [PATCH 02/14] docs: improve SKILL.md for LLM usage Add scenario-based command table to help LLM assistants pick the right dev command for each situation. --- skills/semantic-dev/SKILL.md | 51 ++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md index b813297..16e70b4 100644 --- a/skills/semantic-dev/SKILL.md +++ b/skills/semantic-dev/SKILL.md @@ -5,37 +5,44 @@ description: Develop and contribute to the Semantic project. Use when working on # Semantic Development -Semantic is a zero-dependency Go library for matching natural language queries against accessibility tree elements. +Zero-dependency Go library for matching natural language queries against accessibility tree elements. 
-## Project Location +## Essential Commands +**Before any PR:** ```bash -cd ~/dev/semantic +./dev pr # runs: check + e2e + lint corpus + bench ``` -## Dev Commands +**During development:** +```bash +./dev test # unit tests (fast) +./dev check # fmt + vet + lint + test race (full validation) +./dev build # build ./semantic CLI binary +``` +**Quality regression checks:** ```bash -# Before opening a PR (runs all checks + e2e + benchmark) -./dev pr - -# Quick iteration -./dev test # unit tests -./dev check # fmt + vet + lint + test race - -# Benchmarking -./dev bench # corpus benchmark -./dev baseline # create baseline (first time) -./dev baseline check # check for regressions - -# Other -./dev build # build ./semantic binary -./dev e2e # e2e tests (Docker) -./dev lint corpus # validate benchmark data -./dev calibrate # find optimal thresholds -./dev tune # grid-search weights +./dev baseline check # compare quality against baseline +./dev runtime # compare performance against baseline ``` +**When quality changes intentionally:** +```bash +./dev baseline update # accept new quality baseline (after review) +``` + +## When to Use Each + +| Scenario | Command | +|----------|---------| +| Made code changes, quick sanity | `./dev test` | +| Ready to commit | `./dev check` | +| Before opening PR | `./dev pr` | +| Changed scoring/matching logic | `./dev baseline check` | +| Performance-sensitive changes | `./dev runtime` | +| Tuning weights | `./dev tune` then `./dev bench` | + ## Architecture ``` From 93eee1ee9024ec6dec08d8892ee27df4ced70716 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 15:37:42 +0100 Subject: [PATCH 03/14] refactor: use Go CLI instead of bash scripts in dev tool Replace bash implementations of bench, lint corpus, and loop commands with calls to go run ./cmd/semantic-bench. Removes ~100 lines of duplicate bash logic. 
--- dev | 20 +- recovery/benchmark_test.go | 250 ++++++++++++ skills/semantic-dev/SKILL.md | 54 +++ .../benchmark/scripts/calibrate-thresholds.sh | 368 ++++++++++++------ .../scripts/run-recovery-benchmark.sh | 42 ++ 5 files changed, 613 insertions(+), 121 deletions(-) create mode 100644 recovery/benchmark_test.go create mode 100755 tests/benchmark/scripts/run-recovery-benchmark.sh diff --git a/dev b/dev index 215b566..a7f6247 100755 --- a/dev +++ b/dev @@ -19,6 +19,7 @@ commands=( "coverage:πŸ“Š:Run tests with coverage report" "lint:πŸ”:Run golangci-lint" "lint corpus:πŸ”:Lint benchmark corpus" + "lint docs:πŸ”:Check documentation links" "fmt:✨:Format code" "vet:πŸ”¬:Run go vet" "check:βœ…:Run all checks (fmt + vet + lint + test)" @@ -32,6 +33,7 @@ commands=( "runtime:⏱️:Check runtime baseline" "tune:πŸŽ›οΈ:Tune combined weights" "e2e:🐳:Run E2E tests (Docker)" + "loop:πŸ”„:Benchmark loop (bench β†’ compare β†’ report)" ) show_help() { @@ -155,17 +157,22 @@ run_build() { run_bench() { echo " ${ACCENT}${BOLD}πŸ‹ Running corpus benchmark${NC}" - bash tests/benchmark/scripts/run-corpus-benchmark.sh "$@" + go run ./cmd/semantic-bench check "$@" } run_bench_full() { echo " ${ACCENT}${BOLD}πŸ‹ Running full benchmark suite${NC}" - bash tests/benchmark/scripts/run-full-benchmark.sh + go run ./cmd/semantic-bench run -suite=all "$@" } run_lint_corpus() { echo " ${ACCENT}${BOLD}πŸ” Linting benchmark corpus${NC}" - bash tests/benchmark/scripts/lint-corpus.sh + go run ./cmd/semantic-bench lint "$@" +} + +run_lint_docs() { + echo " ${ACCENT}${BOLD}πŸ” Checking documentation links${NC}" + bash scripts/check-docs-links.sh } run_baseline() { @@ -207,6 +214,11 @@ run_e2e() { bash scripts/e2e.sh } +run_loop() { + echo " ${ACCENT}${BOLD}πŸ”„ Benchmark Loop${NC}" + go run ./cmd/semantic-bench check -verbose "$@" +} + case "${1:-help}" in pr) run_pr ;; doctor) exec bash scripts/doctor.sh ;; @@ -221,6 +233,7 @@ case "${1:-help}" in lint) case "${2:-}" in corpus) run_lint_corpus 
;; + docs) run_lint_docs ;; *) run_lint ;; esac ;; @@ -245,5 +258,6 @@ case "${1:-help}" in runtime) shift; run_runtime "$@" ;; tune) shift; run_tune "$@" ;; e2e) run_e2e ;; + loop) run_loop ;; help|*) show_help ;; esac diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go new file mode 100644 index 0000000..9670a68 --- /dev/null +++ b/recovery/benchmark_test.go @@ -0,0 +1,250 @@ +package recovery + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "runtime" + "testing" + "time" + + "github.com/pinchtab/semantic" +) + +type BenchmarkScenario struct { + ID string `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + OriginalQuery string `json:"original_query"` + OriginalRef string `json:"original_ref"` + Before []semantic.ElementDescriptor `json:"before"` + After []semantic.ElementDescriptor `json:"after"` + ExpectedRef *string `json:"expected_ref"` + ExpectedAlt []string `json:"expected_alt"` + ExpectNoMatch bool `json:"expect_no_match"` + Difficulty string `json:"difficulty"` +} + +func loadScenarios(t *testing.T) []BenchmarkScenario { + _, thisFile, _, _ := runtime.Caller(0) + repoRoot := filepath.Join(filepath.Dir(thisFile), "..") + scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json") + + data, err := os.ReadFile(scenariosPath) + if err != nil { + t.Fatalf("failed to read scenarios: %v", err) + } + + var scenarios []BenchmarkScenario + if err := json.Unmarshal(data, &scenarios); err != nil { + t.Fatalf("failed to parse scenarios: %v", err) + } + + return scenarios +} + +func TestRecoveryBenchmark_Scenarios(t *testing.T) { + scenarios := loadScenarios(t) + matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128)) + + passed, failed := 0, 0 + + for _, sc := range scenarios { + t.Run(sc.ID, func(t *testing.T) { + result := runBenchmarkScenario(t, matcher, sc) + + if result.pass { + passed++ + t.Logf("PASS: recovered=%v 
got=%s expected=%s score=%.3f", + result.recovered, result.gotRef, result.expectedRef, result.score) + } else { + failed++ + t.Errorf("FAIL: recovered=%v got=%s expected=%s score=%.3f error=%s", + result.recovered, result.gotRef, result.expectedRef, result.score, result.err) + } + }) + } + + t.Logf("Summary: %d passed, %d failed out of %d scenarios", passed, failed, len(scenarios)) +} + +type scenarioResult struct { + pass bool + recovered bool + gotRef string + expectedRef string + score float64 + confidence string + latencyMs int64 + err string +} + +func runBenchmarkScenario(t *testing.T, matcher semantic.ElementMatcher, sc BenchmarkScenario) scenarioResult { + result := scenarioResult{} + + if sc.ExpectedRef != nil { + result.expectedRef = *sc.ExpectedRef + } + + var origDesc semantic.ElementDescriptor + for _, d := range sc.Before { + if d.Ref == sc.OriginalRef { + origDesc = d + break + } + } + + cache := NewIntentCache(100, 5*time.Minute) + cache.Store("test-tab", sc.OriginalRef, IntentEntry{ + Query: sc.OriginalQuery, + Descriptor: origDesc, + Score: 0.95, + Confidence: "high", + Strategy: "combined", + }) + + re := NewRecoveryEngine( + DefaultRecoveryConfig(), + matcher, + cache, + func(_ context.Context, _ string) error { return nil }, + func(_, ref string) (int64, bool) { + for i, d := range sc.After { + if d.Ref == ref { + return int64(1000 + i), true + } + } + return 0, false + }, + func(_ string) []semantic.ElementDescriptor { return sc.After }, + ) + + start := time.Now() + + err := fmt.Errorf("could not find node with id %s", sc.OriginalRef) + + if !re.ShouldAttempt(err, sc.OriginalRef) { + result.err = "ShouldAttempt returned false" + result.pass = sc.ExpectNoMatch + result.latencyMs = time.Since(start).Milliseconds() + return result + } + + rr, _, recErr := re.AttemptWithClassification( + context.Background(), + "test-tab", + sc.OriginalRef, + "click", + ClassifyFailure(err), + func(_ context.Context, kind string, nodeID int64) (map[string]any, 
error) { + return map[string]any{"clicked": true}, nil + }, + ) + + result.latencyMs = time.Since(start).Milliseconds() + result.recovered = rr.Recovered + result.gotRef = rr.NewRef + result.score = rr.Score + result.confidence = rr.Confidence + + if recErr != nil { + result.err = recErr.Error() + } + + if sc.ExpectNoMatch { + result.pass = !rr.Recovered + } else if sc.ExpectedRef != nil { + if rr.NewRef == *sc.ExpectedRef { + result.pass = true + } else { + for _, alt := range sc.ExpectedAlt { + if rr.NewRef == alt { + result.pass = true + break + } + } + } + } + + return result +} + +func BenchmarkRecoveryEngine_Scenarios(b *testing.B) { + scenarios := loadScenariosB(b) + matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128)) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, sc := range scenarios { + runBenchmarkScenarioB(b, matcher, sc) + } + } +} + +func loadScenariosB(b *testing.B) []BenchmarkScenario { + _, thisFile, _, _ := runtime.Caller(0) + repoRoot := filepath.Join(filepath.Dir(thisFile), "..") + scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json") + + data, err := os.ReadFile(scenariosPath) + if err != nil { + b.Fatalf("failed to read scenarios: %v", err) + } + + var scenarios []BenchmarkScenario + if err := json.Unmarshal(data, &scenarios); err != nil { + b.Fatalf("failed to parse scenarios: %v", err) + } + + return scenarios +} + +func runBenchmarkScenarioB(b *testing.B, matcher semantic.ElementMatcher, sc BenchmarkScenario) { + var origDesc semantic.ElementDescriptor + for _, d := range sc.Before { + if d.Ref == sc.OriginalRef { + origDesc = d + break + } + } + + cache := NewIntentCache(100, 5*time.Minute) + cache.Store("test-tab", sc.OriginalRef, IntentEntry{ + Query: sc.OriginalQuery, + Descriptor: origDesc, + Score: 0.95, + Confidence: "high", + Strategy: "combined", + }) + + re := NewRecoveryEngine( + DefaultRecoveryConfig(), + matcher, + cache, + func(_ 
context.Context, _ string) error { return nil }, + func(_, ref string) (int64, bool) { + for i, d := range sc.After { + if d.Ref == ref { + return int64(1000 + i), true + } + } + return 0, false + }, + func(_ string) []semantic.ElementDescriptor { return sc.After }, + ) + + err := fmt.Errorf("could not find node with id %s", sc.OriginalRef) + + re.AttemptWithClassification( + context.Background(), + "test-tab", + sc.OriginalRef, + "click", + ClassifyFailure(err), + func(_ context.Context, kind string, nodeID int64) (map[string]any, error) { + return map[string]any{"clicked": true}, nil + }, + ) +} diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md index 16e70b4..7cbb684 100644 --- a/skills/semantic-dev/SKILL.md +++ b/skills/semantic-dev/SKILL.md @@ -90,6 +90,60 @@ cmd/semantic/main.go CLI tool (find, match, classify) 4. **Pre-commit hook** runs gofmt + golangci-lint automatically on staged files. +## Benchmark Improvement Loop + +When implementing changes that affect matching quality, follow this loop: + +### Step 1: Ensure baseline exists + +```bash +./dev baseline +``` + +Creates `tests/benchmark/baselines/combined.json` if missing. + +### Step 2: Implement change + +Make one focused improvement at a time. + +### Step 3: Run benchmark loop + +```bash +./dev loop +``` + +Shows comparison table with deltas: +- **Green (+)** = improved +- **Red (-)** = regressed +- **Gray** = unchanged + +### Step 4: Evaluate and decide + +| Result | Action | +|--------|--------| +| All metrics improved/unchanged | `./dev baseline update` | +| Mixed (some up, some down) | Investigate tradeoff | +| Key metrics regressed | Fix before merging | + +### Step 5: Iterate + +Repeat steps 2-4. Each `baseline update` sets new goalpost. + +### Key metrics + +- **MRR** β€” Mean Reciprocal Rank (higher = finds correct element faster) +- **P@1** β€” Precision at 1 (is top result correct?) +- **Hit@3** β€” Any correct result in top 3? 
+- **Margin** β€” Score gap between best correct and best wrong + +### Adding test cases + +When a query should work better: + +1. Add to `tests/benchmark/corpus/*/queries.json` or `cases/*.json` +2. Run `./dev lint corpus` +3. Run `./dev loop` β€” benchmark will show regression until fixed + ## Public API Surface Only these symbols are visible to consumers: diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh index ef5603d..84d68d1 100755 --- a/tests/benchmark/scripts/calibrate-thresholds.sh +++ b/tests/benchmark/scripts/calibrate-thresholds.sh @@ -1,30 +1,20 @@ #!/bin/bash # -# Calibrate threshold recommendations for find and recovery. +# Threshold Calibration Benchmark +# +# Calculates optimal thresholds for semantic matching by evaluating +# recall, precision, and false-positive rates across threshold levels. # # Usage: # ./calibrate-thresholds.sh [--corpus ] # -# Reports recall/precision/false-positive-rate by threshold. -# set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BENCHMARK_DIR="${SCRIPT_DIR}/.." 
CORPUS_DIR="${BENCHMARK_DIR}/corpus" +CASES_DIR="${BENCHMARK_DIR}/cases" RESULTS_DIR="${BENCHMARK_DIR}/results" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" - -# Read config -if [[ -f "$CONFIG_FILE" ]]; then - STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") - LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE") - EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE") -else - STRATEGY="combined" - LEXICAL_WEIGHT=0.6 - EMBEDDING_WEIGHT=0.4 -fi SPECIFIC_CORPUS="" while [[ $# -gt 0 ]]; do @@ -45,164 +35,306 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json" # Thresholds to test -THRESHOLDS=(0.01 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.60 0.70 0.80 0.90) - -echo "Testing ${#THRESHOLDS[@]} thresholds: ${THRESHOLDS[*]}" -echo "" +THRESHOLDS=(0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60) # Initialize report jq -n \ --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg strategy "${STRATEGY}" \ + --argjson thresholds "$(printf '%s\n' "${THRESHOLDS[@]}" | jq -s '.')" \ '{ - timestamp: $ts, - strategy: $strategy, - thresholds: [], + calibration: { + timestamp: $ts, + thresholds_tested: $thresholds + }, + by_threshold: {}, + by_tag: {}, recommendations: {} }' > "${REPORT_FILE}" -# Collect results for each threshold -for thresh in "${THRESHOLDS[@]}"; do - echo "Testing threshold: ${thresh}" +echo "" +echo "=== Threshold Calibration ===" +echo "Testing thresholds: ${THRESHOLDS[*]}" +echo "" - total=0 - true_positives=0 - false_positives=0 - false_negatives=0 +# Collect all test cases +declare -a ALL_QUERIES=() +declare -a ALL_SNAPSHOTS=() +declare -a ALL_RELEVANT=() +declare -a ALL_EXPECT_NO_MATCH=() +declare -a ALL_IDS=() - for corpus in "${CORPUS_DIR}"/*/; do - [[ -d "$corpus" ]] || continue +load_corpus() { + local corpus_path="$1" + local snapshot="${corpus_path}/snapshot.json" + local queries="${corpus_path}/queries.json" - 
if [[ -n "$SPECIFIC_CORPUS" ]] && [[ "$(basename "$corpus")" != "$SPECIFIC_CORPUS" ]]; then + if [[ ! -f "$snapshot" ]] || [[ ! -f "$queries" ]]; then + return + fi + + local count + count=$(jq length "$queries") + + for i in $(seq 0 $((count - 1))); do + local query relevant id expect_no_match + id=$(jq -r ".[$i].id" "$queries") + query=$(jq -r ".[$i].query" "$queries") + relevant=$(jq -c ".[$i].relevant_refs // []" "$queries") + expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$queries") + + ALL_IDS+=("$id") + ALL_QUERIES+=("$query") + ALL_SNAPSHOTS+=("$snapshot") + ALL_RELEVANT+=("$relevant") + ALL_EXPECT_NO_MATCH+=("$expect_no_match") + done +} + +load_cases() { + local cases_file="$1" + local snapshots_dir="${BENCHMARK_DIR}/../e2e/assets/snapshots" + + if [[ ! -f "$cases_file" ]]; then + return + fi + + local count + count=$(jq length "$cases_file") + + for i in $(seq 0 $((count - 1))); do + local id query snapshot_name expect_no_match expect_ref expect_ref_alt relevant + id=$(jq -r ".[$i].id" "$cases_file") + query=$(jq -r ".[$i].query" "$cases_file") + snapshot_name=$(jq -r ".[$i].snapshot" "$cases_file") + expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$cases_file") + expect_ref=$(jq -r ".[$i].expect_ref // \"\"" "$cases_file") + expect_ref_alt=$(jq -c ".[$i].expect_ref_alt // []" "$cases_file") + + if [[ -n "$expect_ref" && "$expect_ref" != "null" ]]; then + relevant=$(echo "$expect_ref_alt" | jq --arg r "$expect_ref" '. + [$r]') + else + relevant="[]" + fi + + local snapshot="${snapshots_dir}/${snapshot_name}" + if [[ ! -f "$snapshot" ]]; then continue fi - snapshot="${corpus}/snapshot.json" - queries="${corpus}/queries.json" + ALL_IDS+=("$id") + ALL_QUERIES+=("$query") + ALL_SNAPSHOTS+=("$snapshot") + ALL_RELEVANT+=("$relevant") + ALL_EXPECT_NO_MATCH+=("$expect_no_match") + done +} - [[ -f "$snapshot" ]] && [[ -f "$queries" ]] || continue +echo "Loading test cases..." 
+if [[ -n "${SPECIFIC_CORPUS}" ]]; then + load_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}" +else + for corpus in "${CORPUS_DIR}"/*/; do + [[ -d "$corpus" ]] || continue + load_corpus "$corpus" + done +fi - count=$(jq length "$queries") +load_cases "${CASES_DIR}/negative-threshold.json" - for i in $(seq 0 $((count - 1))); do - query=$(jq -r ".[$i].query" "$queries") - relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries") +TOTAL_CASES=${#ALL_QUERIES[@]} +echo "Loaded ${TOTAL_CASES} test cases" +echo "" - result=$("${SEMANTIC}" find "${query}" \ - --snapshot "${snapshot}" \ - --strategy "${STRATEGY}" \ - --threshold "${thresh}" \ - --top-k 5 \ - --lexical-weight "${LEXICAL_WEIGHT}" \ - --embedding-weight "${EMBEDDING_WEIGHT}" \ - --format json 2>/dev/null) || continue +for threshold in "${THRESHOLDS[@]}"; do + echo "Testing threshold ${threshold}..." - best_ref=$(echo "$result" | jq -r '.best_ref // ""') - num_matches=$(echo "$result" | jq '.matches | length') + tp=0 fp=0 fn=0 tn=0 - total=$((total + 1)) + for i in $(seq 0 $((TOTAL_CASES - 1))); do + query="${ALL_QUERIES[$i]}" + snapshot="${ALL_SNAPSHOTS[$i]}" + relevant="${ALL_RELEVANT[$i]}" + expect_no_match="${ALL_EXPECT_NO_MATCH[$i]}" - # Check if best match is relevant - if [[ -n "$best_ref" ]] && echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then - true_positives=$((true_positives + 1)) - elif [[ -n "$best_ref" ]] && [[ "$num_matches" -gt 0 ]]; then - false_positives=$((false_positives + 1)) + result=$("${SEMANTIC}" find "${query}" \ + --snapshot "${snapshot}" \ + --strategy combined \ + --threshold "${threshold}" \ + --top-k 5 \ + --format json 2>/dev/null) || result='{"matches":[]}' + + match_count=$(echo "$result" | jq '.matches | length') + best_ref=$(echo "$result" | jq -r '.best_ref // ""') + + if [[ "$expect_no_match" == "true" ]]; then + if [[ $match_count -eq 0 ]]; then + tn=$((tn + 1)) + else + fp=$((fp + 1)) + fi + else + relevant_count=$(echo "$relevant" | jq 'length') + if 
[[ $relevant_count -eq 0 ]]; then + continue fi - # If no match but there should be one - if [[ -z "$best_ref" ]] || [[ "$num_matches" -eq 0 ]]; then - rel_count=$(echo "$relevant_refs" | jq 'length') - if [[ "$rel_count" -gt 0 ]]; then - false_negatives=$((false_negatives + 1)) - fi + if [[ $match_count -eq 0 ]]; then + fn=$((fn + 1)) + elif echo "$relevant" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then + tp=$((tp + 1)) + else + fp=$((fp + 1)) fi - done + fi done - # Calculate metrics - if [[ $total -eq 0 ]]; then - echo " No queries processed" - continue - fi - - precision=0 - recall=0 - fpr=0 + total_positive=$((tp + fn)) + total_negative=$((tn + fp)) - if [[ $((true_positives + false_positives)) -gt 0 ]]; then - precision=$(echo "scale=4; $true_positives / ($true_positives + $false_positives)" | bc) + if [[ $total_positive -gt 0 ]]; then + recall=$(echo "scale=4; $tp / $total_positive" | bc) + else + recall="0" fi - if [[ $((true_positives + false_negatives)) -gt 0 ]]; then - recall=$(echo "scale=4; $true_positives / ($true_positives + $false_negatives)" | bc) + if [[ $((tp + fp)) -gt 0 ]]; then + precision=$(echo "scale=4; $tp / ($tp + $fp)" | bc) + else + precision="1" fi - if [[ $((false_positives + true_positives)) -gt 0 ]]; then - fpr=$(echo "scale=4; $false_positives / $total" | bc) + if [[ $total_negative -gt 0 ]]; then + fpr=$(echo "scale=4; $fp / $total_negative" | bc) + else + fpr="0" fi - f1=0 - if (( $(echo "$precision + $recall > 0" | bc -l) )); then + if [[ $(echo "$precision + $recall > 0" | bc) -eq 1 ]]; then f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc) + else + f1="0" fi - printf " Precision: %.3f | Recall: %.3f | FPR: %.3f | F1: %.3f\n" "$precision" "$recall" "$fpr" "$f1" + printf " threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f FPR=%.3f F1=%.3f\n" \ + "$threshold" "$tp" "$fp" "$fn" "$tn" "$recall" "$precision" "$fpr" "$f1" - # Append to report tmp=$(mktemp) - jq --argjson 
thresh "$thresh" \ - --argjson total "$total" \ - --argjson tp "$true_positives" \ - --argjson fp "$false_positives" \ - --argjson fn "$false_negatives" \ - --argjson precision "$precision" \ - --argjson recall "$recall" \ - --argjson fpr "$fpr" \ - --argjson f1 "$f1" \ - '.thresholds += [{ - threshold: $thresh, - total: $total, - true_positives: $tp, - false_positives: $fp, - false_negatives: $fn, - precision: $precision, - recall: $recall, - false_positive_rate: $fpr, - f1: $f1 - }]' "$REPORT_FILE" > "$tmp" + jq --arg t "$threshold" \ + --argjson tp "$tp" --argjson fp "$fp" --argjson fn "$fn" --argjson tn "$tn" \ + --argjson recall "$recall" --argjson precision "$precision" \ + --argjson fpr "$fpr" --argjson f1 "$f1" \ + '.by_threshold[$t] = { + tp: $tp, fp: $fp, fn: $fn, tn: $tn, + recall: $recall, precision: $precision, + false_positive_rate: $fpr, f1: $f1 + }' "$REPORT_FILE" > "$tmp" mv "$tmp" "$REPORT_FILE" done -# Calculate recommendations echo "" echo "Calculating recommendations..." 
-# Best F1 for general find -BEST_FIND=$(jq -r '[.thresholds[] | select(.f1 > 0)] | max_by(.f1) | .threshold // 0.3' "$REPORT_FILE") +best_f1_threshold="" best_f1=0 +best_recall_threshold="" best_recall=0 + +for threshold in "${THRESHOLDS[@]}"; do + metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE") + f1=$(echo "$metrics" | jq -r '.f1') + recall=$(echo "$metrics" | jq -r '.recall') + + if (( $(echo "$f1 > $best_f1" | bc -l) )); then + best_f1=$f1 + best_f1_threshold=$threshold + fi + if (( $(echo "$recall > $best_recall" | bc -l) )); then + best_recall=$recall + best_recall_threshold=$threshold + fi +done + +recovery_threshold="" +recovery_precision=0 +for threshold in "${THRESHOLDS[@]}"; do + metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE") + recall=$(echo "$metrics" | jq -r '.recall') + precision=$(echo "$metrics" | jq -r '.precision') + + if (( $(echo "$recall >= 0.85" | bc -l) )); then + if (( $(echo "$precision > $recovery_precision" | bc -l) )); then + recovery_precision=$precision + recovery_threshold=$threshold + fi + fi +done + +if [[ -z "$recovery_threshold" ]]; then + recovery_threshold="${THRESHOLDS[0]}" +fi -# Best recall with precision > 0.8 for recovery (prioritize not missing) -BEST_RECOVERY=$(jq -r '[.thresholds[] | select(.precision >= 0.7)] | max_by(.recall) | .threshold // 0.2' "$REPORT_FILE") +default_threshold="$best_f1_threshold" -# Update recommendations tmp=$(mktemp) -jq --argjson find "$BEST_FIND" \ - --argjson recovery "$BEST_RECOVERY" \ +jq --arg default "$default_threshold" \ + --arg recovery "$recovery_threshold" \ + --arg best_f1 "$best_f1_threshold" \ + --argjson best_f1_val "$best_f1" \ '.recommendations = { - find: $find, - recovery: $recovery, - note: "find optimizes F1; recovery optimizes recall with precision >= 0.7" + default_threshold: $default, + recovery_threshold: $recovery, + best_f1: { threshold: $best_f1, value: $best_f1_val }, + notes: "default_threshold optimizes F1. 
recovery_threshold prioritizes recall (>=85%)." }' "$REPORT_FILE" > "$tmp" mv "$tmp" "$REPORT_FILE" -# Cleanup +SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" + +cat > "${SUMMARY_FILE}" << EOF +# Threshold Calibration Report + +Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ) + +## Recommendations + +| Use Case | Threshold | Rationale | +|----------|-----------|-----------| +| **Default (find)** | **${default_threshold}** | Best F1 score (${best_f1}) | +| **Recovery** | **${recovery_threshold}** | High recall for element recovery | + +## Metrics by Threshold + +| Threshold | TP | FP | FN | TN | Recall | Precision | FPR | F1 | +|-----------|----|----|----|----|--------|-----------|-----|-----| +$(for t in "${THRESHOLDS[@]}"; do + m=$(jq -r ".by_threshold[\"$t\"]" "$REPORT_FILE") + printf "| %.2f | %d | %d | %d | %d | %.3f | %.3f | %.3f | %.3f |\n" \ + "$t" \ + "$(echo "$m" | jq -r '.tp')" \ + "$(echo "$m" | jq -r '.fp')" \ + "$(echo "$m" | jq -r '.fn')" \ + "$(echo "$m" | jq -r '.tn')" \ + "$(echo "$m" | jq -r '.recall')" \ + "$(echo "$m" | jq -r '.precision')" \ + "$(echo "$m" | jq -r '.false_positive_rate')" \ + "$(echo "$m" | jq -r '.f1')" +done) + +## Trade-offs + +- **Lower threshold** (0.10-0.20): High recall, more false positives. Good for recovery. +- **Medium threshold** (0.25-0.35): Balanced. Good default for find operations. +- **Higher threshold** (0.40+): High precision, misses weaker matches. 
+EOF + rm -f "${BENCHMARK_DIR}/semantic" echo "" echo "================================================" -echo " THRESHOLD CALIBRATION RESULTS" +echo " THRESHOLD CALIBRATION COMPLETE" echo "================================================" -echo " Recommended for Find: ${BEST_FIND}" -echo " Recommended for Recovery: ${BEST_RECOVERY}" +echo " Test cases: ${TOTAL_CASES}" +echo " Default threshold: ${default_threshold} (F1=${best_f1})" +echo " Recovery threshold: ${recovery_threshold}" echo "================================================" echo "" -echo "Report: ${REPORT_FILE}" +echo "Report: ${REPORT_FILE}" +echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/run-recovery-benchmark.sh b/tests/benchmark/scripts/run-recovery-benchmark.sh new file mode 100755 index 0000000..93fc88a --- /dev/null +++ b/tests/benchmark/scripts/run-recovery-benchmark.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# Recovery Engine Benchmark +# +# Exercises RecoveryEngine directly using before/after snapshots +# and intent cache entries from recovery scenarios. +# +# Usage: +# ./run-recovery-benchmark.sh +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BENCHMARK_DIR="${SCRIPT_DIR}/.." +RESULTS_DIR="${BENCHMARK_DIR}/results" + +mkdir -p "${RESULTS_DIR}" + +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +REPORT_FILE="${RESULTS_DIR}/recovery_benchmark_${TIMESTAMP}.txt" + +echo "=== Recovery Engine Benchmark ===" +echo "" + +cd "${BENCHMARK_DIR}/../.." + +# Run the Go test that exercises RecoveryEngine with scenarios +echo "Running recovery scenarios..." +echo "" + +go test -v -run TestRecoveryBenchmark_Scenarios ./recovery/ 2>&1 | tee "$REPORT_FILE" + +# Also run the Go benchmark for performance +echo "" +echo "Running performance benchmark..." 
+go test -bench=BenchmarkRecoveryEngine_Scenarios -benchmem ./recovery/ 2>&1 | tee -a "$REPORT_FILE" + +echo "" +echo "================================================" +echo " RECOVERY BENCHMARK COMPLETE" +echo "================================================" +echo "Report: $REPORT_FILE" From d504268ebe4ca92e12f1cc8916b084e35a656af6 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 15:38:06 +0100 Subject: [PATCH 04/14] feat: add semantic-bench CLI for benchmark management Go CLI with commands: check, run, compare, lint, catalog. Replaces bash scripts with structured benchmark framework. --- cmd/semantic-bench/main.go | 113 ++++++++ internal/benchmark/commands.go | 510 +++++++++++++++++++++++++++++++++ internal/benchmark/config.go | 247 ++++++++++++++++ internal/benchmark/dataset.go | 117 ++++++++ internal/benchmark/runner.go | 384 +++++++++++++++++++++++++ 5 files changed, 1371 insertions(+) create mode 100644 cmd/semantic-bench/main.go create mode 100644 internal/benchmark/commands.go create mode 100644 internal/benchmark/config.go create mode 100644 internal/benchmark/dataset.go create mode 100644 internal/benchmark/runner.go diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go new file mode 100644 index 0000000..35bf051 --- /dev/null +++ b/cmd/semantic-bench/main.go @@ -0,0 +1,113 @@ +package main + +import ( + "fmt" + "os" + + "github.com/pinchtab/semantic/internal/benchmark" +) + +const usage = `semantic-bench - Benchmark runner for semantic matching + +Usage: + semantic-bench [flags] + +Commands: + check Run benchmark and compare against baseline (default) + run Run benchmark suites + compare Compare two reports + lint Validate dataset + catalog Print dataset inventory + +Flags: + -h, --help Show help + +Run 'semantic-bench --help' for command-specific help. 
+` + +func main() { + if len(os.Args) < 2 { + runCheck(os.Args[1:]) + return + } + + cmd := os.Args[1] + args := os.Args[2:] + + switch cmd { + case "check": + runCheck(args) + case "run": + runRun(args) + case "compare": + runCompare(args) + case "lint": + runLint(args) + case "catalog": + runCatalog(args) + case "-h", "--help", "help": + fmt.Print(usage) + default: + fmt.Fprintf(os.Stderr, "unknown command: %s\n\n%s", cmd, usage) + os.Exit(2) + } +} + +func runCheck(args []string) { + cfg := benchmark.ParseCheckFlags(args) + result, err := benchmark.RunCheck(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCheckResult(result, cfg) + if result.Status == "fail" { + os.Exit(1) + } +} + +func runRun(args []string) { + cfg := benchmark.ParseRunFlags(args) + result, err := benchmark.RunBenchmark(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintRunResult(result, cfg) +} + +func runCompare(args []string) { + cfg := benchmark.ParseCompareFlags(args) + result, err := benchmark.RunCompare(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCompareResult(result, cfg) + if result.Status == "fail" { + os.Exit(1) + } +} + +func runLint(args []string) { + cfg := benchmark.ParseLintFlags(args) + result, err := benchmark.RunLint(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintLintResult(result, cfg) + if result.Errors > 0 { + os.Exit(1) + } +} + +func runCatalog(args []string) { + cfg := benchmark.ParseCatalogFlags(args) + result, err := benchmark.RunCatalog(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCatalogResult(result, cfg) +} diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go new file mode 100644 index 0000000..ad22ea3 --- /dev/null +++ b/internal/benchmark/commands.go @@ -0,0 +1,510 @@ 
+package benchmark + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +type CheckResult struct { + Status string `json:"status"` + Summary CheckSummary `json:"summary"` + Delta *MetricsDelta `json:"delta,omitempty"` + TopRegs []Regression `json:"top_regressions,omitempty"` + Artifacts Artifacts `json:"artifacts"` + Report *Report `json:"-"` +} + +type CheckSummary struct { + PAt1 float64 `json:"p_at_1"` + MRR float64 `json:"mrr"` + HitAt3 float64 `json:"hit_at_3"` + Total int `json:"total"` + Regressions int `json:"regressions"` + Warnings int `json:"warnings"` +} + +type MetricsDelta struct { + PAt1 float64 `json:"p_at_1"` + MRR float64 `json:"mrr"` + HitAt3 float64 `json:"hit_at_3"` +} + +type Regression struct { + ID string `json:"id"` + Corpus string `json:"corpus"` + Query string `json:"query"` + Expected []string `json:"expected"` + BaselineRef string `json:"baseline_ref,omitempty"` + CurrentRef string `json:"current_ref"` + Reason string `json:"reason"` + DebugCommand string `json:"debug_command"` +} + +type Artifacts struct { + ReportJSON string `json:"report_json"` + SummaryMD string `json:"summary_md"` +} + +type CompareResult struct { + Status string `json:"status"` + Delta MetricsDelta `json:"delta"` + Regressions []Regression `json:"regressions"` + Improvements []string `json:"improvements"` +} + +type LintResult struct { + Errors int `json:"errors"` + Warnings int `json:"warnings"` + Messages []string `json:"messages"` +} + +type CatalogResult struct { + Corpora []CorpusSummary `json:"corpora"` + TotalQueries int `json:"total_queries"` + ByTag map[string]int `json:"by_tag,omitempty"` + ByDifficulty map[string]int `json:"by_difficulty,omitempty"` +} + +type CorpusSummary struct { + ID string `json:"id"` + Queries int `json:"queries"` + Tags []string `json:"tags"` +} + +func RunCheck(cfg CheckConfig) (*CheckResult, error) { + root := FindBenchmarkRoot() + + ds, err := LoadDataset(root) + if err != nil { + 
return nil, fmt.Errorf("load dataset: %w", err) + } + + benchCfg, _ := LoadConfig(root) + profile := Profile{ + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + } + if benchCfg != nil { + profile = ResolveProfile(benchCfg, cfg.Profile) + } + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: profile.Strategy, + Threshold: profile.Threshold, + TopK: profile.TopK, + LexicalWeight: profile.Weights.Lexical, + EmbeddingWeight: profile.Weights.Embedding, + Profile: cfg.Profile, + Mode: "library", + Verbose: cfg.Verbose, + Explain: cfg.Explain, + OutputDir: cfg.OutputDir, + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run benchmark: %w", err) + } + + result := &CheckResult{ + Status: "pass", + Report: report, + } + result.Summary.PAt1 = report.Metrics.Overall.PAt1 + result.Summary.MRR = report.Metrics.Overall.MRR + result.Summary.HitAt3 = report.Metrics.Overall.HitAt3 + result.Summary.Total = report.Metrics.Overall.Total + + // Count misses + for _, r := range report.Results { + if r.Status == "miss" { + result.TopRegs = append(result.TopRegs, Regression{ + ID: r.ID, + Corpus: r.Corpus, + Query: r.Query, + Expected: r.Expected.RelevantRefs, + CurrentRef: r.Actual.BestRef, + Reason: "miss", + DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID), + }) + } + } + result.Summary.Regressions = len(result.TopRegs) + + // Compare to baseline if exists + baselinePath := cfg.BaselinePath + if baselinePath == "" { + baselinePath = filepath.Join(root, "baselines", "combined.json") + } + if _, err := os.Stat(baselinePath); err == nil { + baseline, err := loadReport(baselinePath) + if err == nil { + result.Delta = &MetricsDelta{ + PAt1: report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1, + MRR: report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, + HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, + } + if 
cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) { + result.Status = "fail" + } + } + } + + // Write artifacts + os.MkdirAll(cfg.OutputDir, 0755) + ts := time.Now().Format("20060102_150405") + reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts)) + summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts)) + + reportJSON, _ := json.MarshalIndent(report, "", " ") + os.WriteFile(reportPath, reportJSON, 0644) + + summaryMD := generateSummaryMD(report, result) + os.WriteFile(summaryPath, []byte(summaryMD), 0644) + + result.Artifacts.ReportJSON = reportPath + result.Artifacts.SummaryMD = summaryPath + + return result, nil +} + +func RunBenchmark(cfg RunConfig) (*Report, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, err + } + return RunCorpusBenchmark(ds, cfg) +} + +func RunCompare(cfg CompareConfig) (*CompareResult, error) { + baseline, err := loadReport(cfg.BaselinePath) + if err != nil { + return nil, fmt.Errorf("load baseline: %w", err) + } + current, err := loadReport(cfg.CurrentPath) + if err != nil { + return nil, fmt.Errorf("load current: %w", err) + } + + result := &CompareResult{ + Status: "pass", + Delta: MetricsDelta{ + PAt1: current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1, + MRR: current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, + HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, + }, + } + + if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 { + result.Status = "fail" + } + + // Find regressions + baselineResults := make(map[string]QueryResult) + for _, r := range baseline.Results { + baselineResults[r.ID] = r + } + for _, r := range current.Results { + if base, ok := baselineResults[r.ID]; ok { + if base.Status == "hit" && r.Status != "hit" { + result.Regressions = append(result.Regressions, Regression{ + ID: r.ID, + Corpus: r.Corpus, + Query: r.Query, + BaselineRef: 
base.Actual.BestRef, + CurrentRef: r.Actual.BestRef, + Reason: fmt.Sprintf("%s -> %s", base.Status, r.Status), + }) + } + } + } + + return result, nil +} + +func RunLint(cfg LintConfig) (*LintResult, error) { + root := FindBenchmarkRoot() + result := &LintResult{} + + ds, err := LoadDataset(root) + if err != nil { + result.Errors++ + result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err)) + return result, nil + } + + // Check for duplicate IDs + ids := make(map[string]string) + for _, c := range ds.Corpora { + for _, q := range c.Queries { + if existing, ok := ids[q.ID]; ok { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing)) + } else { + ids[q.ID] = c.ID + } + } + } + + // Check refs exist + for _, c := range ds.Corpora { + refs := make(map[string]bool) + for _, d := range c.Snapshot { + refs[d.Ref] = true + } + for _, q := range c.Queries { + for _, r := range q.RelevantRefs { + if !refs[r] { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r)) + } + } + } + } + + // Check difficulty values + validDiff := map[string]bool{"easy": true, "medium": true, "hard": true} + for _, c := range ds.Corpora { + for _, q := range c.Queries { + if q.Difficulty != "" && !validDiff[q.Difficulty] { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID)) + } + } + } + + if result.Errors == 0 && result.Warnings == 0 { + result.Messages = append(result.Messages, "All checks passed") + } + + return result, nil +} + +func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, err + } + + result := &CatalogResult{ + ByTag: make(map[string]int), + ByDifficulty: 
make(map[string]int), + } + + for _, c := range ds.Corpora { + tags := make(map[string]bool) + for _, q := range c.Queries { + result.TotalQueries++ + result.ByDifficulty[q.Difficulty]++ + for _, t := range q.Tags { + tags[t] = true + result.ByTag[t]++ + } + } + var tagList []string + for t := range tags { + tagList = append(tagList, t) + } + sort.Strings(tagList) + result.Corpora = append(result.Corpora, CorpusSummary{ + ID: c.ID, + Queries: len(c.Queries), + Tags: tagList, + }) + } + + return result, nil +} + +func loadReport(path string) (*Report, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var r Report + if err := json.Unmarshal(data, &r); err != nil { + return nil, err + } + return &r, nil +} + +func generateSummaryMD(report *Report, result *CheckResult) string { + var sb strings.Builder + + sb.WriteString("# Benchmark Summary\n\n") + sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp)) + + sb.WriteString("## Overall Metrics\n\n") + sb.WriteString("| Metric | Value |\n") + sb.WriteString("|--------|-------|\n") + sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total)) + sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR)) + sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1)) + sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3)) + sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin)) + + if result.Delta != nil { + sb.WriteString("\n## Delta from Baseline\n\n") + sb.WriteString("| Metric | Delta |\n") + sb.WriteString("|--------|-------|\n") + sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1)) + sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR)) + sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3)) + } + + if len(result.TopRegs) > 0 { + sb.WriteString("\n## Misses\n\n") + sb.WriteString("| ID | Corpus | Query | Got | 
Expected |\n") + sb.WriteString("|----|--------|-------|-----|----------|\n") + for _, r := range result.TopRegs { + if len(result.TopRegs) > 10 { + break + } + sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n", + r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))) + } + } + + return sb.String() +} + +func PrintCheckResult(result *CheckResult, cfg CheckConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n") + if result.Status == "pass" { + fmt.Printf(" \033[32mβœ“\033[0m Benchmark passed\n") + } else { + fmt.Printf(" \033[31mβœ—\033[0m Benchmark failed\n") + } + fmt.Printf("\n") + + fmt.Printf(" %-12s %8.4f\n", "MRR", result.Summary.MRR) + fmt.Printf(" %-12s %8.4f\n", "P@1", result.Summary.PAt1) + fmt.Printf(" %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3) + fmt.Printf(" %-12s %8d\n", "Total", result.Summary.Total) + fmt.Printf(" %-12s %8d\n", "Misses", result.Summary.Regressions) + + if result.Delta != nil { + fmt.Printf("\n Delta from baseline:\n") + printDelta("P@1", result.Delta.PAt1) + printDelta("MRR", result.Delta.MRR) + printDelta("Hit@3", result.Delta.HitAt3) + } + + fmt.Printf("\n Artifacts:\n") + fmt.Printf(" Report: %s\n", result.Artifacts.ReportJSON) + fmt.Printf(" Summary: %s\n", result.Artifacts.SummaryMD) + fmt.Printf("\n") +} + +func printDelta(name string, delta float64) { + color := "\033[0m" + sign := "" + if delta > 0.001 { + color = "\033[32m" + sign = "+" + } else if delta < -0.001 { + color = "\033[31m" + } + fmt.Printf(" %s%-8s %s%.4f\033[0m\n", color, name, sign, delta) +} + +func PrintRunResult(report *Report, cfg RunConfig) { + fmt.Printf("\n") + fmt.Printf(" %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR) + fmt.Printf(" %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1) + fmt.Printf(" %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3) + fmt.Printf(" %-12s %8d\n", "Total", report.Metrics.Overall.Total) + 
fmt.Printf("\n") + + if cfg.Verbose { + for _, r := range report.Results { + status := "\033[32mHIT \033[0m" + switch r.Status { + case "miss": + status = "\033[31mMISS\033[0m" + case "partial": + status = "\033[33mPART\033[0m" + } + fmt.Printf(" [%s] %s | %s | got=%s score=%.3f\n", + r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore) + } + } +} + +func PrintCompareResult(result *CompareResult, cfg CompareConfig) { + fmt.Printf("\n") + if result.Status == "pass" { + fmt.Printf(" \033[32mβœ“\033[0m No regression\n") + } else { + fmt.Printf(" \033[31mβœ—\033[0m Regression detected\n") + } + fmt.Printf("\n") + printDelta("P@1", result.Delta.PAt1) + printDelta("MRR", result.Delta.MRR) + printDelta("Hit@3", result.Delta.HitAt3) + + if len(result.Regressions) > 0 { + fmt.Printf("\n Regressions:\n") + for _, r := range result.Regressions { + fmt.Printf(" %s: %s (%s)\n", r.ID, r.Reason, r.Query) + } + } + fmt.Printf("\n") +} + +func PrintLintResult(result *LintResult, cfg LintConfig) { + for _, msg := range result.Messages { + fmt.Println(msg) + } + fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings) +} + +func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n Corpora: %d\n", len(result.Corpora)) + fmt.Printf(" Total Queries: %d\n\n", result.TotalQueries) + + fmt.Printf(" %-30s %8s\n", "Corpus", "Queries") + fmt.Printf(" %-30s %8s\n", "------", "-------") + for _, c := range result.Corpora { + fmt.Printf(" %-30s %8d\n", c.ID, c.Queries) + } + + switch cfg.By { + case "difficulty": + fmt.Printf("\n By Difficulty:\n") + for d, n := range result.ByDifficulty { + fmt.Printf(" %-10s %4d\n", d, n) + } + case "tag": + fmt.Printf("\n By Tag:\n") + for t, n := range result.ByTag { + fmt.Printf(" %-20s %4d\n", t, n) + } + } + fmt.Printf("\n") +} diff --git a/internal/benchmark/config.go 
// Config mirrors config/benchmark.json under the benchmark root.
type Config struct {
	Version  string             `json:"version"`
	Defaults DefaultsConfig     `json:"defaults"`
	Profiles map[string]Profile `json:"profiles"`
	Baseline BaselineConfig     `json:"baseline"`
}

// DefaultsConfig holds the "defaults" section of the benchmark config.
type DefaultsConfig struct {
	Profile string `json:"profile"`
}

// Profile is a named set of benchmark settings. A profile may name another
// profile in Inherits; ResolveProfile fills unset fields from that base.
type Profile struct {
	Strategy  string   `json:"strategy"`
	Threshold float64  `json:"threshold"`
	TopK      int      `json:"top_k"`
	Weights   Weights  `json:"weights"`
	Suites    []string `json:"suites"`
	Mode      string   `json:"mode"`
	Inherits  string   `json:"inherits"`
	Verbose   bool     `json:"verbose"`
	Explain   bool     `json:"explain"`
	FailOnReg bool     `json:"fail_on_regression"`
}

// Weights holds the lexical and embedding weights of a profile.
type Weights struct {
	Lexical   float64 `json:"lexical"`
	Embedding float64 `json:"embedding"`
}

// BaselineConfig holds the "baseline" section of the benchmark config.
type BaselineConfig struct {
	Quality BaselineQuality `json:"quality"`
	Runtime BaselineRuntime `json:"runtime"`
}

// BaselineQuality holds the configured quality-drop limits.
type BaselineQuality struct {
	MaxOverallPAt1Drop   float64 `json:"max_overall_p_at_1_drop"`
	MaxOverallMRRDrop    float64 `json:"max_overall_mrr_drop"`
	MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"`
	MaxCorpusPAt1Drop    float64 `json:"max_corpus_p_at_1_drop"`
	MaxTagPAt1Drop       float64 `json:"max_tag_p_at_1_drop"`
}

// BaselineRuntime holds the configured runtime-regression limits.
type BaselineRuntime struct {
	MaxNsOpRegressionRatio  float64 `json:"max_ns_op_regression_ratio"`
	MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"`
}

// CheckConfig holds the options of the "check" command.
type CheckConfig struct {
	Profile      string
	BaselinePath string
	OutputDir    string
	Format       string
	FailOnReg    bool
	Quick        bool
	Verbose      bool
	Explain      bool
}

// RunConfig holds the options of the "run" command.
type RunConfig struct {
	Suite           string
	Corpus          string
	QueryID         string
	Strategy        string
	Threshold       float64
	TopK            int
	LexicalWeight   float64
	EmbeddingWeight float64
	Profile         string
	Mode            string
	Verbose         bool
	Explain         bool
	OutputDir       string
	ReportName      string
}

// CompareConfig holds the options of the "compare" command.
type CompareConfig struct {
	BaselinePath string
	CurrentPath  string
	Format       string
	Verbose      bool
}

// LintConfig holds the options of the "lint" command.
type LintConfig struct {
	Format  string
	Verbose bool
}

// CatalogConfig holds the options of the "catalog" command.
type CatalogConfig struct {
	Format string
	By     string
}

// FindBenchmarkRoot returns the path of the tests/benchmark directory.
//
// It walks from the working directory towards the filesystem root and picks
// the first directory that contains tests/benchmark/config/benchmark.json or
// a go.mod file. When nothing matches, or the working directory cannot be
// determined, tests/benchmark relative to the working directory is returned.
func FindBenchmarkRoot() string {
	cwd, err := os.Getwd()
	if err != nil {
		// Without a working directory there is nothing to walk; fall back
		// to the relative location.
		return filepath.Join("tests", "benchmark")
	}

	for dir := cwd; ; {
		candidate := filepath.Join(dir, "tests", "benchmark")
		if _, err := os.Stat(filepath.Join(candidate, "config", "benchmark.json")); err == nil {
			return candidate
		}
		if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil {
			return candidate
		}

		// filepath.Dir returns its argument unchanged once the top is
		// reached ("/" on Unix, a volume root on Windows, "." for a
		// relative path), so this comparison ends the walk on every
		// platform.
		parent := filepath.Dir(dir)
		if parent == dir {
			break
		}
		dir = parent
	}
	return filepath.Join(cwd, "tests", "benchmark")
}

// LoadConfig reads and decodes config/benchmark.json under benchmarkRoot.
// The error from reading the file is returned unchanged, so callers can use
// os.IsNotExist to detect a missing config.
func LoadConfig(benchmarkRoot string) (*Config, error) {
	path := filepath.Join(benchmarkRoot, "config", "benchmark.json")
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var cfg Config
	if err := json.Unmarshal(data, &cfg); err != nil {
		return nil, err
	}
	return &cfg, nil
}

// defaultProfile returns the built-in profile used when a profile name is
// not present in the config.
func defaultProfile() Profile {
	return Profile{
		Strategy:  "combined",
		Threshold: 0.01,
		TopK:      5,
		Weights:   Weights{Lexical: 0.6, Embedding: 0.4},
		Suites:    []string{"corpus"},
		Mode:      "library",
	}
}

// ResolveProfile returns the profile called name with its inheritance chain
// applied.
//
// An unknown name, or a nil cfg, yields the built-in default profile. For a
// profile with Inherits set, Strategy, Threshold, TopK, Weights, Suites and
// Mode are taken from the resolved base whenever they hold their zero value;
// the boolean fields are not inherited. An inheritance cycle is cut at the
// first profile that repeats, so resolution always terminates.
func ResolveProfile(cfg *Config, name string) Profile {
	if cfg == nil {
		return defaultProfile()
	}
	return resolveProfile(cfg, name, make(map[string]bool))
}

// resolveProfile implements ResolveProfile; visiting records the profiles on
// the current inheritance chain so that cycles can be detected.
func resolveProfile(cfg *Config, name string, visiting map[string]bool) Profile {
	p, ok := cfg.Profiles[name]
	if !ok {
		return defaultProfile()
	}
	visiting[name] = true
	if p.Inherits == "" || visiting[p.Inherits] {
		return p
	}

	base := resolveProfile(cfg, p.Inherits, visiting)
	if p.Strategy == "" {
		p.Strategy = base.Strategy
	}
	if p.Threshold == 0 {
		p.Threshold = base.Threshold
	}
	if p.TopK == 0 {
		p.TopK = base.TopK
	}
	if p.Weights.Lexical == 0 && p.Weights.Embedding == 0 {
		p.Weights = base.Weights
	}
	if len(p.Suites) == 0 {
		p.Suites = base.Suites
	}
	if p.Mode == "" {
		p.Mode = base.Mode
	}
	return p
}

// ParseCheckFlags parses the flags of the "check" command. The flag set uses
// flag.ExitOnError, so invalid input terminates the process.
func ParseCheckFlags(args []string) CheckConfig {
	fs := flag.NewFlagSet("check", flag.ExitOnError)
	cfg := CheckConfig{
		Profile:   "default",
		OutputDir: filepath.Join(FindBenchmarkRoot(), "results"),
		Format:    "text",
	}
	fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile")
	fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline file path")
	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (text|json|github)")
	fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression")
	fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks")
	fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details")
	fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations")
	_ = fs.Parse(args) // ExitOnError: Parse exits instead of returning an error
	return cfg
}

// ParseRunFlags parses the flags of the "run" command. The flag set uses
// flag.ExitOnError, so invalid input terminates the process.
func ParseRunFlags(args []string) RunConfig {
	fs := flag.NewFlagSet("run", flag.ExitOnError)
	cfg := RunConfig{
		Suite:           "corpus",
		Strategy:        "combined",
		Threshold:       0.01,
		TopK:            5,
		LexicalWeight:   0.6,
		EmbeddingWeight: 0.4,
		Profile:         "default",
		Mode:            "library",
		OutputDir:       filepath.Join(FindBenchmarkRoot(), "results"),
	}
	fs.StringVar(&cfg.Suite, "suite", cfg.Suite, "suite to run (corpus|recovery|classification|runtime|all)")
	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to run")
	fs.StringVar(&cfg.QueryID, "query", "", "specific query ID to run")
	fs.StringVar(&cfg.Strategy, "strategy", cfg.Strategy, "matching strategy")
	fs.Float64Var(&cfg.Threshold, "threshold", cfg.Threshold, "score threshold")
	fs.IntVar(&cfg.TopK, "top-k", cfg.TopK, "number of results")
	fs.Float64Var(&cfg.LexicalWeight, "lexical-weight", cfg.LexicalWeight, "lexical weight")
	fs.Float64Var(&cfg.EmbeddingWeight, "embedding-weight", cfg.EmbeddingWeight, "embedding weight")
	fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile")
	fs.StringVar(&cfg.Mode, "mode", cfg.Mode, "execution mode (cli|library|both)")
	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
	fs.BoolVar(&cfg.Explain, "explain", false, "include explanations")
	fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory")
	fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name")
	_ = fs.Parse(args) // ExitOnError: Parse exits instead of returning an error
	return cfg
}

// ParseCompareFlags parses the flags of the "compare" command. The flag set
// uses flag.ExitOnError, so invalid input terminates the process.
func ParseCompareFlags(args []string) CompareConfig {
	fs := flag.NewFlagSet("compare", flag.ExitOnError)
	cfg := CompareConfig{
		Format: "text",
	}
	fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline report path (required)")
	fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)")
	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
	_ = fs.Parse(args) // ExitOnError: Parse exits instead of returning an error
	return cfg
}

// ParseLintFlags parses the flags of the "lint" command. The flag set uses
// flag.ExitOnError, so invalid input terminates the process.
func ParseLintFlags(args []string) LintConfig {
	fs := flag.NewFlagSet("lint", flag.ExitOnError)
	cfg := LintConfig{
		Format: "text",
	}
	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format")
	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
	_ = fs.Parse(args) // ExitOnError: Parse exits instead of returning an error
	return cfg
}

// ParseCatalogFlags parses the flags of the "catalog" command. The flag set
// uses flag.ExitOnError, so invalid input terminates the process.
func ParseCatalogFlags(args []string) CatalogConfig {
	fs := flag.NewFlagSet("catalog", flag.ExitOnError)
	cfg := CatalogConfig{
		Format: "table",
	}
	fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)")
	fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)")
	_ = fs.Parse(args) // ExitOnError: Parse exits instead of returning an error
	return cfg
}
`json:"expect_no_match,omitempty"` + MinScore *float64 `json:"min_score,omitempty"` + Notes string `json:"notes,omitempty"` +} + +type Corpus struct { + ID string + Path string + Snapshot []semantic.ElementDescriptor + Queries []Query +} + +type Dataset struct { + Root string + Corpora []Corpus +} + +func LoadDataset(benchmarkRoot string) (*Dataset, error) { + corpusDir := filepath.Join(benchmarkRoot, "corpus") + entries, err := os.ReadDir(corpusDir) + if err != nil { + return nil, err + } + + ds := &Dataset{Root: benchmarkRoot} + + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + corpusPath := filepath.Join(corpusDir, entry.Name()) + snapshotPath := filepath.Join(corpusPath, "snapshot.json") + queriesPath := filepath.Join(corpusPath, "queries.json") + + if _, err := os.Stat(snapshotPath); os.IsNotExist(err) { + continue + } + if _, err := os.Stat(queriesPath); os.IsNotExist(err) { + continue + } + + corpus, err := loadCorpus(entry.Name(), corpusPath) + if err != nil { + return nil, err + } + + ds.Corpora = append(ds.Corpora, *corpus) + } + + return ds, nil +} + +func loadCorpus(id, path string) (*Corpus, error) { + snapshotPath := filepath.Join(path, "snapshot.json") + queriesPath := filepath.Join(path, "queries.json") + + snapshotData, err := os.ReadFile(snapshotPath) + if err != nil { + return nil, err + } + + var snapshot []semantic.ElementDescriptor + if err := json.Unmarshal(snapshotData, &snapshot); err != nil { + return nil, err + } + + queriesData, err := os.ReadFile(queriesPath) + if err != nil { + return nil, err + } + + var queries []Query + if err := json.Unmarshal(queriesData, &queries); err != nil { + return nil, err + } + + return &Corpus{ + ID: id, + Path: path, + Snapshot: snapshot, + Queries: queries, + }, nil +} + +func (ds *Dataset) QueryCount() int { + count := 0 + for _, c := range ds.Corpora { + count += len(c.Queries) + } + return count +} + +func (ds *Dataset) CorpusCount() int { + return len(ds.Corpora) +} diff 
--git a/internal/benchmark/runner.go b/internal/benchmark/runner.go new file mode 100644 index 0000000..391cc0a --- /dev/null +++ b/internal/benchmark/runner.go @@ -0,0 +1,384 @@ +package benchmark + +import ( + "context" + "time" + + "github.com/pinchtab/semantic" +) + +type QueryResult struct { + ID string `json:"id"` + Corpus string `json:"corpus"` + Query string `json:"query"` + Difficulty string `json:"difficulty"` + Tags []string `json:"tags"` + Intent string `json:"intent,omitempty"` + PageType string `json:"page_type,omitempty"` + Expected struct { + RelevantRefs []string `json:"relevant_refs"` + PartiallyRelevantRefs []string `json:"partially_relevant_refs"` + } `json:"expected"` + Actual struct { + BestRef string `json:"best_ref"` + BestScore float64 `json:"best_score"` + Matches []Match `json:"matches"` + } `json:"actual"` + Metrics struct { + RR float64 `json:"rr"` + PAt1 float64 `json:"p_at_1"` + PAt3 float64 `json:"p_at_3"` + HitAt3 int `json:"hit_at_3"` + HitAt5 int `json:"hit_at_5"` + BestRelevantRank *int `json:"best_relevant_rank"` + BestRelevantScore float64 `json:"best_relevant_score"` + BestWrongScore float64 `json:"best_wrong_score"` + Margin float64 `json:"margin"` + } `json:"metrics"` + Latency struct { + LibraryMs int64 `json:"library_ms"` + CLIMs *int64 `json:"cli_ms,omitempty"` + } `json:"latency"` + Status string `json:"status"` +} + +type Match struct { + Ref string `json:"ref"` + Score float64 `json:"score"` + Role string `json:"role"` + Name string `json:"name"` +} + +type Report struct { + SchemaVersion string `json:"schema_version"` + Run struct { + ID string `json:"id"` + Timestamp string `json:"timestamp"` + Tool string `json:"tool"` + GitSHA string `json:"git_sha,omitempty"` + GitDirty bool `json:"git_dirty,omitempty"` + Command string `json:"command"` + } `json:"run"` + Dataset struct { + Name string `json:"name"` + Version string `json:"version,omitempty"` + QueryCount int `json:"query_count"` + CorpusCount int 
`json:"corpus_count"` + } `json:"dataset"` + Config struct { + Profile string `json:"profile"` + Strategy string `json:"strategy"` + Threshold float64 `json:"threshold"` + TopK int `json:"top_k"` + Weights Weights `json:"weights"` + } `json:"config"` + Status string `json:"status"` + Metrics struct { + Overall OverallMetrics `json:"overall"` + Latency LatencyMetrics `json:"latency"` + ByCorpus map[string]CorpusMetrics `json:"by_corpus"` + ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"` + ByTag map[string]CorpusMetrics `json:"by_tag"` + } `json:"metrics"` + Results []QueryResult `json:"results"` +} + +type OverallMetrics struct { + Total int `json:"total"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + PAt3 float64 `json:"p_at_3"` + HitAt3 float64 `json:"hit_at_3"` + HitAt5 float64 `json:"hit_at_5"` + AvgMargin float64 `json:"avg_margin"` +} + +type LatencyMetrics struct { + LibraryP50Ms int64 `json:"library_p50_ms"` + LibraryP95Ms int64 `json:"library_p95_ms"` + CLIP50Ms *int64 `json:"cli_p50_ms,omitempty"` + CLIP95Ms *int64 `json:"cli_p95_ms,omitempty"` +} + +type CorpusMetrics struct { + Count int `json:"count"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + HitAt3 float64 `json:"hit_at_3"` + AvgMargin float64 `json:"avg_margin"` +} + +func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) { + matcher := createMatcher(cfg) + + report := &Report{ + SchemaVersion: "1.0.0", + Status: "pass", + } + report.Run.ID = time.Now().Format("20060102-150405") + "-" + cfg.Profile + report.Run.Timestamp = time.Now().UTC().Format(time.RFC3339) + report.Run.Tool = "semantic-bench" + report.Dataset.Name = "semantic-ui-matching-corpus" + report.Dataset.QueryCount = ds.QueryCount() + report.Dataset.CorpusCount = ds.CorpusCount() + report.Config.Profile = cfg.Profile + report.Config.Strategy = cfg.Strategy + report.Config.Threshold = cfg.Threshold + report.Config.TopK = cfg.TopK + report.Config.Weights = Weights{Lexical: 
cfg.LexicalWeight, Embedding: cfg.EmbeddingWeight} + + report.Metrics.ByCorpus = make(map[string]CorpusMetrics) + report.Metrics.ByDifficulty = make(map[string]CorpusMetrics) + report.Metrics.ByTag = make(map[string]CorpusMetrics) + + var allLatencies []int64 + + for _, corpus := range ds.Corpora { + if cfg.Corpus != "" && corpus.ID != cfg.Corpus { + continue + } + + for _, query := range corpus.Queries { + if cfg.QueryID != "" && query.ID != cfg.QueryID { + continue + } + + result := runQuery(matcher, corpus, query, cfg) + report.Results = append(report.Results, result) + allLatencies = append(allLatencies, result.Latency.LibraryMs) + } + } + + aggregateMetrics(report, allLatencies) + return report, nil +} + +func createMatcher(cfg RunConfig) semantic.ElementMatcher { + embedder := semantic.NewHashingEmbedder(128) + switch cfg.Strategy { + case "lexical": + return semantic.NewLexicalMatcher() + case "embedding": + return semantic.NewEmbeddingMatcher(embedder) + default: + return semantic.NewCombinedMatcher(embedder) + } +} + +func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg RunConfig) QueryResult { + result := QueryResult{ + ID: query.ID, + Corpus: corpus.ID, + Query: query.QueryText, + Difficulty: query.Difficulty, + Tags: query.Tags, + Intent: query.Intent, + PageType: query.PageType, + } + result.Expected.RelevantRefs = query.RelevantRefs + result.Expected.PartiallyRelevantRefs = query.PartiallyRelevantRefs + + threshold := cfg.Threshold + if query.Threshold != nil { + threshold = *query.Threshold + } + topK := cfg.TopK + if query.TopK != nil { + topK = *query.TopK + } + + start := time.Now() + findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{ + Threshold: threshold, + TopK: topK, + }) + result.Latency.LibraryMs = time.Since(start).Milliseconds() + + result.Actual.BestRef = findResult.BestRef + result.Actual.BestScore = findResult.BestScore + for _, m := range 
findResult.Matches { + result.Actual.Matches = append(result.Actual.Matches, Match{ + Ref: m.Ref, + Score: m.Score, + Role: m.Role, + Name: m.Name, + }) + } + + computeQueryMetrics(&result, query) + return result +} + +func computeQueryMetrics(result *QueryResult, query Query) { + relevantSet := make(map[string]bool) + for _, r := range query.RelevantRefs { + relevantSet[r] = true + } + partialSet := make(map[string]bool) + for _, r := range query.PartiallyRelevantRefs { + partialSet[r] = true + } + + // Reciprocal Rank + for i, m := range result.Actual.Matches { + if relevantSet[m.Ref] { + result.Metrics.RR = 1.0 / float64(i+1) + break + } + } + + // P@1 + if len(result.Actual.Matches) > 0 { + if relevantSet[result.Actual.Matches[0].Ref] { + result.Metrics.PAt1 = 1.0 + } else if partialSet[result.Actual.Matches[0].Ref] { + result.Metrics.PAt1 = 0.5 + } + } + + // P@3, Hit@3, Hit@5 + relevantInTop3 := 0 + partialInTop3 := 0 + for i, m := range result.Actual.Matches { + if i >= 5 { + break + } + if relevantSet[m.Ref] { + if result.Metrics.BestRelevantRank == nil { + rank := i + 1 + result.Metrics.BestRelevantRank = &rank + } + if result.Metrics.BestRelevantScore == 0 || m.Score > result.Metrics.BestRelevantScore { + result.Metrics.BestRelevantScore = m.Score + } + if i < 3 { + relevantInTop3++ + result.Metrics.HitAt3 = 1 + } + result.Metrics.HitAt5 = 1 + } else if partialSet[m.Ref] { + if i < 3 { + partialInTop3++ + } + } else { + if m.Score > result.Metrics.BestWrongScore { + result.Metrics.BestWrongScore = m.Score + } + } + } + result.Metrics.PAt3 = (float64(relevantInTop3) + float64(partialInTop3)*0.5) / 3.0 + result.Metrics.Margin = result.Metrics.BestRelevantScore - result.Metrics.BestWrongScore + + // Status + if query.ExpectNoMatch { + if len(result.Actual.Matches) == 0 { + result.Status = "no_match_expected" + } else { + result.Status = "unexpected_match" + } + } else if result.Metrics.PAt1 >= 1.0 { + result.Status = "hit" + } else if result.Metrics.PAt1 >= 
0.5 { + result.Status = "partial" + } else { + result.Status = "miss" + } +} + +func aggregateMetrics(report *Report, latencies []int64) { + n := len(report.Results) + if n == 0 { + return + } + + report.Metrics.Overall.Total = n + + var sumRR, sumP1, sumP3, sumHit3, sumHit5, sumMargin float64 + corpusAgg := make(map[string]*aggregator) + diffAgg := make(map[string]*aggregator) + tagAgg := make(map[string]*aggregator) + + for _, r := range report.Results { + sumRR += r.Metrics.RR + sumP1 += r.Metrics.PAt1 + sumP3 += r.Metrics.PAt3 + sumHit3 += float64(r.Metrics.HitAt3) + sumHit5 += float64(r.Metrics.HitAt5) + sumMargin += r.Metrics.Margin + + addToAgg(corpusAgg, r.Corpus, r) + addToAgg(diffAgg, r.Difficulty, r) + for _, t := range r.Tags { + addToAgg(tagAgg, t, r) + } + } + + report.Metrics.Overall.MRR = sumRR / float64(n) + report.Metrics.Overall.PAt1 = sumP1 / float64(n) + report.Metrics.Overall.PAt3 = sumP3 / float64(n) + report.Metrics.Overall.HitAt3 = sumHit3 / float64(n) + report.Metrics.Overall.HitAt5 = sumHit5 / float64(n) + report.Metrics.Overall.AvgMargin = sumMargin / float64(n) + + for k, a := range corpusAgg { + report.Metrics.ByCorpus[k] = a.toMetrics() + } + for k, a := range diffAgg { + report.Metrics.ByDifficulty[k] = a.toMetrics() + } + for k, a := range tagAgg { + report.Metrics.ByTag[k] = a.toMetrics() + } + + // Latency percentiles + if len(latencies) > 0 { + sorted := make([]int64, len(latencies)) + copy(sorted, latencies) + sortInt64(sorted) + report.Metrics.Latency.LibraryP50Ms = sorted[len(sorted)*50/100] + report.Metrics.Latency.LibraryP95Ms = sorted[len(sorted)*95/100] + } +} + +type aggregator struct { + count int + sumRR float64 + sumP1 float64 + sumHit3 float64 + sumMargin float64 +} + +func addToAgg(m map[string]*aggregator, key string, r QueryResult) { + if _, ok := m[key]; !ok { + m[key] = &aggregator{} + } + a := m[key] + a.count++ + a.sumRR += r.Metrics.RR + a.sumP1 += r.Metrics.PAt1 + a.sumHit3 += float64(r.Metrics.HitAt3) + 
a.sumMargin += r.Metrics.Margin +} + +func (a *aggregator) toMetrics() CorpusMetrics { + if a.count == 0 { + return CorpusMetrics{} + } + return CorpusMetrics{ + Count: a.count, + MRR: a.sumRR / float64(a.count), + PAt1: a.sumP1 / float64(a.count), + HitAt3: a.sumHit3 / float64(a.count), + AvgMargin: a.sumMargin / float64(a.count), + } +} + +func sortInt64(s []int64) { + for i := range s { + for j := i + 1; j < len(s); j++ { + if s[j] < s[i] { + s[i], s[j] = s[j], s[i] + } + } + } +} From fd7a3f195170fdde80af5fccdba951b4277f8be2 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 15:38:26 +0100 Subject: [PATCH 05/14] chore: ignore semantic-bench binary --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8a46978..9a58d8e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Binary /semantic +/semantic-bench tests/benchmark/semantic tests/e2e/semantic *.exe From 8b21ba78ba88bfbfeca4f47fba309853ba926e36 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 15:48:00 +0100 Subject: [PATCH 06/14] feat: add baseline, calibrate, tune commands to Go CLI Move benchmark management from bash scripts to Go: - `semantic-bench baseline create/update` - manage quality baselines - `semantic-bench calibrate` - threshold optimization via precision/recall - `semantic-bench tune` - grid-search lexical/embedding weights Update dev tool to use Go CLI for all benchmark commands. 
--- cmd/semantic-bench/main.go | 49 ++++- dev | 10 +- internal/benchmark/commands.go | 363 +++++++++++++++++++++++++++++++++ internal/benchmark/config.go | 59 ++++++ 4 files changed, 471 insertions(+), 10 deletions(-) diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go index 35bf051..4866601 100644 --- a/cmd/semantic-bench/main.go +++ b/cmd/semantic-bench/main.go @@ -13,11 +13,14 @@ Usage: semantic-bench [flags] Commands: - check Run benchmark and compare against baseline (default) - run Run benchmark suites - compare Compare two reports - lint Validate dataset - catalog Print dataset inventory + check Run benchmark and compare against baseline (default) + run Run benchmark suites + compare Compare two reports + lint Validate dataset + catalog Print dataset inventory + baseline Manage quality baselines (create, update) + calibrate Find optimal thresholds via precision/recall analysis + tune Grid-search lexical/embedding weights Flags: -h, --help Show help @@ -45,6 +48,12 @@ func main() { runLint(args) case "catalog": runCatalog(args) + case "baseline": + runBaseline(args) + case "calibrate": + runCalibrate(args) + case "tune": + runTune(args) case "-h", "--help", "help": fmt.Print(usage) default: @@ -111,3 +120,33 @@ func runCatalog(args []string) { } benchmark.PrintCatalogResult(result, cfg) } + +func runBaseline(args []string) { + cfg := benchmark.ParseBaselineFlags(args) + result, err := benchmark.RunBaseline(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintBaselineResult(result, cfg) +} + +func runCalibrate(args []string) { + cfg := benchmark.ParseCalibrateFlags(args) + result, err := benchmark.RunCalibrate(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCalibrateResult(result, cfg) +} + +func runTune(args []string) { + cfg := benchmark.ParseTuneFlags(args) + result, err := benchmark.RunTune(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, 
"error: %v\n", err) + os.Exit(2) + } + benchmark.PrintTuneResult(result, cfg) +} diff --git a/dev b/dev index a7f6247..da0f70c 100755 --- a/dev +++ b/dev @@ -177,22 +177,22 @@ run_lint_docs() { run_baseline() { echo " ${ACCENT}${BOLD}πŸ“ Creating quality baseline${NC}" - bash tests/benchmark/scripts/create-baseline.sh "$@" + go run ./cmd/semantic-bench baseline create "$@" } run_baseline_check() { echo " ${ACCENT}${BOLD}πŸ“ Checking against baseline${NC}" - bash tests/benchmark/scripts/check-baseline.sh "$@" + go run ./cmd/semantic-bench check "$@" } run_baseline_update() { echo " ${ACCENT}${BOLD}πŸ“ Updating baseline${NC}" - bash tests/benchmark/scripts/update-baseline.sh --accept "$@" + go run ./cmd/semantic-bench baseline update --accept "$@" } run_calibrate() { echo " ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}" - bash tests/benchmark/scripts/calibrate-thresholds.sh "$@" + go run ./cmd/semantic-bench calibrate -verbose "$@" } run_runtime() { @@ -202,7 +202,7 @@ run_runtime() { run_tune() { echo " ${ACCENT}${BOLD}πŸŽ›οΈ Tuning combined weights${NC}" - bash tests/benchmark/scripts/tune-weights.sh "$@" + go run ./cmd/semantic-bench tune -verbose "$@" } run_e2e() { diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go index ad22ea3..7f37ed5 100644 --- a/internal/benchmark/commands.go +++ b/internal/benchmark/commands.go @@ -8,6 +8,8 @@ import ( "sort" "strings" "time" + + "github.com/pinchtab/semantic" ) type CheckResult struct { @@ -508,3 +510,364 @@ func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) { } fmt.Printf("\n") } + +// Baseline management + +type BaselineResult struct { + Action string `json:"action"` + Path string `json:"path"` + Metrics OverallMetrics `json:"metrics"` + Previous *OverallMetrics `json:"previous,omitempty"` +} + +func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) { + root := FindBenchmarkRoot() + baselinesDir := filepath.Join(root, "baselines") + if err := 
os.MkdirAll(baselinesDir, 0755); err != nil { + return nil, err + } + + baselinePath := filepath.Join(baselinesDir, cfg.Name+".json") + + switch cfg.Action { + case "create": + return createBaseline(root, baselinePath, cfg) + case "update": + if !cfg.Accept { + return nil, fmt.Errorf("use --accept to confirm baseline update") + } + return updateBaseline(root, baselinePath, cfg) + default: + return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action) + } +} + +func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: 0.6, + EmbeddingWeight: 0.4, + Mode: "library", + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run benchmark: %w", err) + } + + data, err := json.MarshalIndent(report, "", " ") + if err != nil { + return nil, err + } + if err := os.WriteFile(baselinePath, data, 0644); err != nil { + return nil, err + } + + return &BaselineResult{ + Action: "create", + Path: baselinePath, + Metrics: report.Metrics.Overall, + }, nil +} + +func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { + var previous *OverallMetrics + if data, err := os.ReadFile(baselinePath); err == nil { + var old Report + if json.Unmarshal(data, &old) == nil { + previous = &old.Metrics.Overall + } + backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json" + os.WriteFile(backupPath, data, 0644) + } + + result, err := createBaseline(root, baselinePath, cfg) + if err != nil { + return nil, err + } + result.Action = "update" + result.Previous = previous + return result, nil +} + +func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) { + 
fmt.Printf("\n Baseline %sd: %s\n\n", result.Action, result.Path) + fmt.Printf(" MRR: %.4f\n", result.Metrics.MRR) + fmt.Printf(" P@1: %.4f\n", result.Metrics.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Metrics.HitAt3) + + if result.Previous != nil { + fmt.Printf("\n Previous:\n") + fmt.Printf(" MRR: %.4f\n", result.Previous.MRR) + fmt.Printf(" P@1: %.4f\n", result.Previous.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Previous.HitAt3) + } + fmt.Println() +} + +// Threshold calibration + +type CalibrateResult struct { + ByThreshold map[string]ThresholdMetrics `json:"by_threshold"` + Recommendations CalibrateRecommendations `json:"recommendations"` + TotalCases int `json:"total_cases"` +} + +type ThresholdMetrics struct { + TP int `json:"tp"` + FP int `json:"fp"` + FN int `json:"fn"` + TN int `json:"tn"` + Recall float64 `json:"recall"` + Precision float64 `json:"precision"` + FPR float64 `json:"false_positive_rate"` + F1 float64 `json:"f1"` +} + +type CalibrateRecommendations struct { + DefaultThreshold float64 `json:"default_threshold"` + RecoveryThreshold float64 `json:"recovery_threshold"` + BestF1 float64 `json:"best_f1"` +} + +func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + result := &CalibrateResult{ + ByThreshold: make(map[string]ThresholdMetrics), + } + + type testCase struct { + query Query + corpus *Corpus + } + + var cases []testCase + for i := range ds.Corpora { + corpus := &ds.Corpora[i] + if cfg.Corpus != "" && corpus.ID != cfg.Corpus { + continue + } + for _, q := range corpus.Queries { + cases = append(cases, testCase{query: q, corpus: corpus}) + } + } + result.TotalCases = len(cases) + + if cfg.Verbose { + fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases)) + } + + runCfg := RunConfig{ + Strategy: "combined", + TopK: 5, + LexicalWeight: 0.6, + 
EmbeddingWeight: 0.4, + } + matcher := createMatcher(runCfg) + + var bestF1, bestF1Threshold float64 + var bestRecallThreshold float64 + var bestRecallWithPrecision float64 + + for _, threshold := range cfg.Thresholds { + tp, fp, fn, tn := 0, 0, 0, 0 + + for _, tc := range cases { + findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{ + Threshold: threshold, + TopK: 5, + }) + + hasMatch := len(findResult.Matches) > 0 + topRef := "" + if hasMatch { + topRef = findResult.Matches[0].Ref + } + + if tc.query.ExpectNoMatch { + if hasMatch { + fp++ + } else { + tn++ + } + } else if len(tc.query.RelevantRefs) > 0 { + if !hasMatch { + fn++ + } else if contains(tc.query.RelevantRefs, topRef) { + tp++ + } else { + fp++ + } + } + } + + totalPos := tp + fn + totalNeg := tn + fp + + var recall, precision, fpr, f1 float64 + if totalPos > 0 { + recall = float64(tp) / float64(totalPos) + } + if tp+fp > 0 { + precision = float64(tp) / float64(tp+fp) + } + if totalNeg > 0 { + fpr = float64(fp) / float64(totalNeg) + } + if precision+recall > 0 { + f1 = 2 * precision * recall / (precision + recall) + } + + key := fmt.Sprintf("%.2f", threshold) + result.ByThreshold[key] = ThresholdMetrics{ + TP: tp, FP: fp, FN: fn, TN: tn, + Recall: recall, Precision: precision, FPR: fpr, F1: f1, + } + + if f1 > bestF1 { + bestF1 = f1 + bestF1Threshold = threshold + } + if recall >= 0.85 && precision > bestRecallWithPrecision { + bestRecallWithPrecision = precision + bestRecallThreshold = threshold + } + + if cfg.Verbose { + fmt.Printf(" threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n", + threshold, tp, fp, fn, tn, recall, precision, f1) + } + } + + if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 { + bestRecallThreshold = cfg.Thresholds[0] + } + + result.Recommendations = CalibrateRecommendations{ + DefaultThreshold: bestF1Threshold, + RecoveryThreshold: bestRecallThreshold, + BestF1: bestF1, + } + + return result, 
nil +} + +func contains(refs []string, ref string) bool { + for _, r := range refs { + if r == ref { + return true + } + } + return false +} + +func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) { + fmt.Printf("\n Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold)) + + fmt.Printf(" Recommendations:\n") + fmt.Printf(" Default (best F1): %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1) + fmt.Printf(" Recovery (recall): %.2f\n", result.Recommendations.RecoveryThreshold) + fmt.Println() +} + +// Weight tuning + +type TuneResult struct { + Results []TuneRun `json:"results"` + Best *TuneRun `json:"best"` +} + +type TuneRun struct { + LexicalWeight float64 `json:"lexical_weight"` + EmbeddingWeight float64 `json:"embedding_weight"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + HitAt3 float64 `json:"hit_at_3"` +} + +func RunTune(cfg TuneConfig) (*TuneResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + result := &TuneResult{} + + if cfg.Verbose { + fmt.Printf(" %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3") + } + + for w := 0.0; w <= 1.0001; w += cfg.Step { + lexW := w + embW := 1.0 - w + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: lexW, + EmbeddingWeight: embW, + Mode: "library", + } + + if cfg.Corpus != "" { + runCfg.Corpus = cfg.Corpus + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err) + } + + run := TuneRun{ + LexicalWeight: lexW, + EmbeddingWeight: embW, + MRR: report.Metrics.Overall.MRR, + PAt1: report.Metrics.Overall.PAt1, + HitAt3: report.Metrics.Overall.HitAt3, + } + result.Results = append(result.Results, run) + + if result.Best == nil || run.PAt1 > result.Best.PAt1 || + 
(run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) {
+			best := run
+			result.Best = &best
+		}
+
+		if cfg.Verbose {
+			fmt.Printf(" %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n",
+				lexW, embW, run.MRR, run.PAt1, run.HitAt3)
+		}
+	}
+
+	return result, nil
+}
+
+func PrintTuneResult(result *TuneResult, cfg TuneConfig) {
+	fmt.Printf("\n Tested %d weight combinations\n\n", len(result.Results))
+
+	if result.Best != nil {
+		fmt.Printf(" Best weights:\n")
+		fmt.Printf(" Lexical: %.2f\n", result.Best.LexicalWeight)
+		fmt.Printf(" Embedding: %.2f\n", result.Best.EmbeddingWeight)
+		fmt.Printf(" MRR: %.4f\n", result.Best.MRR)
+		fmt.Printf(" P@1: %.4f\n", result.Best.PAt1)
+		fmt.Printf(" Hit@3: %.4f\n", result.Best.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
index c8ac10d..eb2fe57 100644
--- a/internal/benchmark/config.go
+++ b/internal/benchmark/config.go
@@ -99,6 +99,25 @@ type CatalogConfig struct {
 	By string
 }
 
+type BaselineCmdConfig struct {
+	Action  string // "create" or "update"
+	Name    string
+	Accept  bool
+	Verbose bool
+}
+
+type CalibrateConfig struct {
+	Corpus     string
+	Thresholds []float64
+	Verbose    bool
+}
+
+type TuneConfig struct {
+	Corpus  string
+	Step    float64
+	Verbose bool
+}
+
 func FindBenchmarkRoot() string {
 	cwd, _ := os.Getwd()
 	for d := cwd; d != "/"; d = filepath.Dir(d) {
@@ -245,3 +264,60 @@ func ParseCatalogFlags(args []string) CatalogConfig {
 	fs.Parse(args)
 	return cfg
 }
+
+// ParseBaselineFlags parses the arguments of the "baseline" subcommand.
+//
+// The action (default "create") is the first positional argument. Flags may
+// appear before or after it, so "baseline update --accept" and
+// "baseline --accept update" are equivalent.
+func ParseBaselineFlags(args []string) BaselineCmdConfig {
+	fs := flag.NewFlagSet("baseline", flag.ExitOnError)
+	cfg := BaselineCmdConfig{
+		Action: "create",
+		Name:   "combined",
+	}
+	fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name")
+	fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+
+	// The flag package stops parsing at the first non-flag argument, so flags
+	// written after the action would be silently dropped. Take the action,
+	// then parse whatever follows it.
+	if rest := fs.Args(); len(rest) > 0 {
+		cfg.Action = rest[0]
+		fs.Parse(rest[1:])
+	}
+	return cfg
+}
+
+// ParseCalibrateFlags parses the arguments of the "calibrate" subcommand.
+// Thresholds is preset to 0.05 through 0.60 in steps of 0.05; no flag
+// overrides it.
+func ParseCalibrateFlags(args []string) CalibrateConfig {
+	fs := flag.NewFlagSet("calibrate", flag.ExitOnError)
+	cfg := CalibrateConfig{
+		Thresholds: []float64{0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60},
+	}
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}
+
+// ParseTuneFlags parses the arguments of the "tune" subcommand. Step defaults
+// to 0.1.
+//
+// NOTE(review): Step is not validated here; RunTune loops with w += cfg.Step,
+// which never terminates for a zero or negative step.
+func ParseTuneFlags(args []string) TuneConfig {
+	fs := flag.NewFlagSet("tune", flag.ExitOnError)
+	cfg := TuneConfig{
+		Step: 0.1,
+	}
+	fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against")
+	fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)")
+	fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output")
+	fs.Parse(args)
+	return cfg
+}
From 5e39de714da80fc5e1c133fbaaa98e1c2bbc368c Mon Sep 17 00:00:00 2001
From: Luigi Agosti
Date: Fri, 24 Apr 2026 17:31:21 +0100
Subject: [PATCH 07/14] chore: remove bash scripts replaced by Go CLI

Keep only check-runtime-baseline.sh (wraps go test -bench).
--- .../benchmark/scripts/calibrate-thresholds.sh | 340 ------------ tests/benchmark/scripts/check-baseline.sh | 140 ----- tests/benchmark/scripts/create-baseline.sh | 86 --- tests/benchmark/scripts/finalize-report.sh | 115 ---- tests/benchmark/scripts/lint-corpus.sh | 197 ------- tests/benchmark/scripts/record-result.sh | 44 -- tests/benchmark/scripts/run-benchmark.sh | 226 -------- .../benchmark/scripts/run-corpus-benchmark.sh | 514 ------------------ tests/benchmark/scripts/run-full-benchmark.sh | 317 ----------- .../scripts/run-recovery-benchmark.sh | 42 -- tests/benchmark/scripts/tune-weights.sh | 167 ------ tests/benchmark/scripts/update-baseline.sh | 70 --- 12 files changed, 2258 deletions(-) delete mode 100755 tests/benchmark/scripts/calibrate-thresholds.sh delete mode 100755 tests/benchmark/scripts/check-baseline.sh delete mode 100755 tests/benchmark/scripts/create-baseline.sh delete mode 100755 tests/benchmark/scripts/finalize-report.sh delete mode 100755 tests/benchmark/scripts/lint-corpus.sh delete mode 100755 tests/benchmark/scripts/record-result.sh delete mode 100755 tests/benchmark/scripts/run-benchmark.sh delete mode 100755 tests/benchmark/scripts/run-corpus-benchmark.sh delete mode 100755 tests/benchmark/scripts/run-full-benchmark.sh delete mode 100755 tests/benchmark/scripts/run-recovery-benchmark.sh delete mode 100755 tests/benchmark/scripts/tune-weights.sh delete mode 100755 tests/benchmark/scripts/update-baseline.sh diff --git a/tests/benchmark/scripts/calibrate-thresholds.sh b/tests/benchmark/scripts/calibrate-thresholds.sh deleted file mode 100755 index 84d68d1..0000000 --- a/tests/benchmark/scripts/calibrate-thresholds.sh +++ /dev/null @@ -1,340 +0,0 @@ -#!/bin/bash -# -# Threshold Calibration Benchmark -# -# Calculates optimal thresholds for semantic matching by evaluating -# recall, precision, and false-positive rates across threshold levels. 
-# -# Usage: -# ./calibrate-thresholds.sh [--corpus ] -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -CORPUS_DIR="${BENCHMARK_DIR}/corpus" -CASES_DIR="${BENCHMARK_DIR}/cases" -RESULTS_DIR="${BENCHMARK_DIR}/results" - -SPECIFIC_CORPUS="" -while [[ $# -gt 0 ]]; do - case "$1" in - --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/threshold_calibration_${TIMESTAMP}.json" - -# Thresholds to test -THRESHOLDS=(0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60) - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --argjson thresholds "$(printf '%s\n' "${THRESHOLDS[@]}" | jq -s '.')" \ - '{ - calibration: { - timestamp: $ts, - thresholds_tested: $thresholds - }, - by_threshold: {}, - by_tag: {}, - recommendations: {} - }' > "${REPORT_FILE}" - -echo "" -echo "=== Threshold Calibration ===" -echo "Testing thresholds: ${THRESHOLDS[*]}" -echo "" - -# Collect all test cases -declare -a ALL_QUERIES=() -declare -a ALL_SNAPSHOTS=() -declare -a ALL_RELEVANT=() -declare -a ALL_EXPECT_NO_MATCH=() -declare -a ALL_IDS=() - -load_corpus() { - local corpus_path="$1" - local snapshot="${corpus_path}/snapshot.json" - local queries="${corpus_path}/queries.json" - - if [[ ! -f "$snapshot" ]] || [[ ! 
-f "$queries" ]]; then - return - fi - - local count - count=$(jq length "$queries") - - for i in $(seq 0 $((count - 1))); do - local query relevant id expect_no_match - id=$(jq -r ".[$i].id" "$queries") - query=$(jq -r ".[$i].query" "$queries") - relevant=$(jq -c ".[$i].relevant_refs // []" "$queries") - expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$queries") - - ALL_IDS+=("$id") - ALL_QUERIES+=("$query") - ALL_SNAPSHOTS+=("$snapshot") - ALL_RELEVANT+=("$relevant") - ALL_EXPECT_NO_MATCH+=("$expect_no_match") - done -} - -load_cases() { - local cases_file="$1" - local snapshots_dir="${BENCHMARK_DIR}/../e2e/assets/snapshots" - - if [[ ! -f "$cases_file" ]]; then - return - fi - - local count - count=$(jq length "$cases_file") - - for i in $(seq 0 $((count - 1))); do - local id query snapshot_name expect_no_match expect_ref expect_ref_alt relevant - id=$(jq -r ".[$i].id" "$cases_file") - query=$(jq -r ".[$i].query" "$cases_file") - snapshot_name=$(jq -r ".[$i].snapshot" "$cases_file") - expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$cases_file") - expect_ref=$(jq -r ".[$i].expect_ref // \"\"" "$cases_file") - expect_ref_alt=$(jq -c ".[$i].expect_ref_alt // []" "$cases_file") - - if [[ -n "$expect_ref" && "$expect_ref" != "null" ]]; then - relevant=$(echo "$expect_ref_alt" | jq --arg r "$expect_ref" '. + [$r]') - else - relevant="[]" - fi - - local snapshot="${snapshots_dir}/${snapshot_name}" - if [[ ! -f "$snapshot" ]]; then - continue - fi - - ALL_IDS+=("$id") - ALL_QUERIES+=("$query") - ALL_SNAPSHOTS+=("$snapshot") - ALL_RELEVANT+=("$relevant") - ALL_EXPECT_NO_MATCH+=("$expect_no_match") - done -} - -echo "Loading test cases..." 
-if [[ -n "${SPECIFIC_CORPUS}" ]]; then - load_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}" -else - for corpus in "${CORPUS_DIR}"/*/; do - [[ -d "$corpus" ]] || continue - load_corpus "$corpus" - done -fi - -load_cases "${CASES_DIR}/negative-threshold.json" - -TOTAL_CASES=${#ALL_QUERIES[@]} -echo "Loaded ${TOTAL_CASES} test cases" -echo "" - -for threshold in "${THRESHOLDS[@]}"; do - echo "Testing threshold ${threshold}..." - - tp=0 fp=0 fn=0 tn=0 - - for i in $(seq 0 $((TOTAL_CASES - 1))); do - query="${ALL_QUERIES[$i]}" - snapshot="${ALL_SNAPSHOTS[$i]}" - relevant="${ALL_RELEVANT[$i]}" - expect_no_match="${ALL_EXPECT_NO_MATCH[$i]}" - - result=$("${SEMANTIC}" find "${query}" \ - --snapshot "${snapshot}" \ - --strategy combined \ - --threshold "${threshold}" \ - --top-k 5 \ - --format json 2>/dev/null) || result='{"matches":[]}' - - match_count=$(echo "$result" | jq '.matches | length') - best_ref=$(echo "$result" | jq -r '.best_ref // ""') - - if [[ "$expect_no_match" == "true" ]]; then - if [[ $match_count -eq 0 ]]; then - tn=$((tn + 1)) - else - fp=$((fp + 1)) - fi - else - relevant_count=$(echo "$relevant" | jq 'length') - if [[ $relevant_count -eq 0 ]]; then - continue - fi - - if [[ $match_count -eq 0 ]]; then - fn=$((fn + 1)) - elif echo "$relevant" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then - tp=$((tp + 1)) - else - fp=$((fp + 1)) - fi - fi - done - - total_positive=$((tp + fn)) - total_negative=$((tn + fp)) - - if [[ $total_positive -gt 0 ]]; then - recall=$(echo "scale=4; $tp / $total_positive" | bc) - else - recall="0" - fi - - if [[ $((tp + fp)) -gt 0 ]]; then - precision=$(echo "scale=4; $tp / ($tp + $fp)" | bc) - else - precision="1" - fi - - if [[ $total_negative -gt 0 ]]; then - fpr=$(echo "scale=4; $fp / $total_negative" | bc) - else - fpr="0" - fi - - if [[ $(echo "$precision + $recall > 0" | bc) -eq 1 ]]; then - f1=$(echo "scale=4; 2 * $precision * $recall / ($precision + $recall)" | bc) - else - f1="0" - fi - - printf " threshold=%.2f 
| TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f FPR=%.3f F1=%.3f\n" \ - "$threshold" "$tp" "$fp" "$fn" "$tn" "$recall" "$precision" "$fpr" "$f1" - - tmp=$(mktemp) - jq --arg t "$threshold" \ - --argjson tp "$tp" --argjson fp "$fp" --argjson fn "$fn" --argjson tn "$tn" \ - --argjson recall "$recall" --argjson precision "$precision" \ - --argjson fpr "$fpr" --argjson f1 "$f1" \ - '.by_threshold[$t] = { - tp: $tp, fp: $fp, fn: $fn, tn: $tn, - recall: $recall, precision: $precision, - false_positive_rate: $fpr, f1: $f1 - }' "$REPORT_FILE" > "$tmp" - mv "$tmp" "$REPORT_FILE" -done - -echo "" -echo "Calculating recommendations..." - -best_f1_threshold="" best_f1=0 -best_recall_threshold="" best_recall=0 - -for threshold in "${THRESHOLDS[@]}"; do - metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE") - f1=$(echo "$metrics" | jq -r '.f1') - recall=$(echo "$metrics" | jq -r '.recall') - - if (( $(echo "$f1 > $best_f1" | bc -l) )); then - best_f1=$f1 - best_f1_threshold=$threshold - fi - if (( $(echo "$recall > $best_recall" | bc -l) )); then - best_recall=$recall - best_recall_threshold=$threshold - fi -done - -recovery_threshold="" -recovery_precision=0 -for threshold in "${THRESHOLDS[@]}"; do - metrics=$(jq -r ".by_threshold[\"$threshold\"]" "$REPORT_FILE") - recall=$(echo "$metrics" | jq -r '.recall') - precision=$(echo "$metrics" | jq -r '.precision') - - if (( $(echo "$recall >= 0.85" | bc -l) )); then - if (( $(echo "$precision > $recovery_precision" | bc -l) )); then - recovery_precision=$precision - recovery_threshold=$threshold - fi - fi -done - -if [[ -z "$recovery_threshold" ]]; then - recovery_threshold="${THRESHOLDS[0]}" -fi - -default_threshold="$best_f1_threshold" - -tmp=$(mktemp) -jq --arg default "$default_threshold" \ - --arg recovery "$recovery_threshold" \ - --arg best_f1 "$best_f1_threshold" \ - --argjson best_f1_val "$best_f1" \ - '.recommendations = { - default_threshold: $default, - recovery_threshold: $recovery, - best_f1: { 
threshold: $best_f1, value: $best_f1_val }, - notes: "default_threshold optimizes F1. recovery_threshold prioritizes recall (>=85%)." - }' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -cat > "${SUMMARY_FILE}" << EOF -# Threshold Calibration Report - -Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ) - -## Recommendations - -| Use Case | Threshold | Rationale | -|----------|-----------|-----------| -| **Default (find)** | **${default_threshold}** | Best F1 score (${best_f1}) | -| **Recovery** | **${recovery_threshold}** | High recall for element recovery | - -## Metrics by Threshold - -| Threshold | TP | FP | FN | TN | Recall | Precision | FPR | F1 | -|-----------|----|----|----|----|--------|-----------|-----|-----| -$(for t in "${THRESHOLDS[@]}"; do - m=$(jq -r ".by_threshold[\"$t\"]" "$REPORT_FILE") - printf "| %.2f | %d | %d | %d | %d | %.3f | %.3f | %.3f | %.3f |\n" \ - "$t" \ - "$(echo "$m" | jq -r '.tp')" \ - "$(echo "$m" | jq -r '.fp')" \ - "$(echo "$m" | jq -r '.fn')" \ - "$(echo "$m" | jq -r '.tn')" \ - "$(echo "$m" | jq -r '.recall')" \ - "$(echo "$m" | jq -r '.precision')" \ - "$(echo "$m" | jq -r '.false_positive_rate')" \ - "$(echo "$m" | jq -r '.f1')" -done) - -## Trade-offs - -- **Lower threshold** (0.10-0.20): High recall, more false positives. Good for recovery. -- **Medium threshold** (0.25-0.35): Balanced. Good default for find operations. -- **Higher threshold** (0.40+): High precision, misses weaker matches. 
-EOF - -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo "================================================" -echo " THRESHOLD CALIBRATION COMPLETE" -echo "================================================" -echo " Test cases: ${TOTAL_CASES}" -echo " Default threshold: ${default_threshold} (F1=${best_f1})" -echo " Recovery threshold: ${recovery_threshold}" -echo "================================================" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/check-baseline.sh b/tests/benchmark/scripts/check-baseline.sh deleted file mode 100755 index f6e95ae..0000000 --- a/tests/benchmark/scripts/check-baseline.sh +++ /dev/null @@ -1,140 +0,0 @@ -#!/bin/bash -# -# Check current benchmark results against a baseline. -# -# Usage: -# ./check-baseline.sh [--baseline ] [--fail-on-regression] -# -# Exit codes: -# 0 - No regressions detected -# 1 - Regressions detected (if --fail-on-regression) -# 2 - Error (missing files, invalid config) -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -BASELINES_DIR="${BENCHMARK_DIR}/baselines" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" - -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[0;33m' -NC='\033[0m' - -# Read config -if [[ ! 
-f "$CONFIG_FILE" ]]; then - echo "ERROR: Config file not found: $CONFIG_FILE" >&2 - exit 2 -fi - -STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") -MAX_P1_DROP=$(jq -r '.baseline.quality.max_overall_p_at_1_drop // 0.02' "$CONFIG_FILE") -MAX_MRR_DROP=$(jq -r '.baseline.quality.max_overall_mrr_drop // 0.02' "$CONFIG_FILE") -MAX_HIT3_DROP=$(jq -r '.baseline.quality.max_overall_hit_at_3_drop // 0.02' "$CONFIG_FILE") -MAX_CORPUS_P1_DROP=$(jq -r '.baseline.quality.max_corpus_p_at_1_drop // 0.08' "$CONFIG_FILE") -MAX_MARGIN_DROP=$(jq -r '.baseline.quality.max_margin_drop_report // 0.15' "$CONFIG_FILE") - -# Parse args -BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json" -FAIL_ON_REGRESSION=false -while [[ $# -gt 0 ]]; do - case "$1" in - --baseline) BASELINE_FILE="$2"; shift 2 ;; - --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;; - *) echo "Unknown option: $1"; exit 2 ;; - esac -done - -if [[ ! -f "$BASELINE_FILE" ]]; then - echo "ERROR: Baseline not found: $BASELINE_FILE" >&2 - echo "Run ./create-baseline.sh first" >&2 - exit 2 -fi - -echo "Checking against baseline: ${BASELINE_FILE}" -echo "Tolerances: P@1=${MAX_P1_DROP}, MRR=${MAX_MRR_DROP}, Hit@3=${MAX_HIT3_DROP}" -echo "" - -# Run current benchmark -TEMP_DIR=$(mktemp -d) -trap 'rm -rf "$TEMP_DIR"' EXIT - -"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" > "${TEMP_DIR}/output.log" 2>&1 - -# Find the latest report -LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1) - -if [[ -z "$LATEST_REPORT" ]] || [[ ! 
-f "$LATEST_REPORT" ]]; then - echo "ERROR: Could not find benchmark report" >&2 - exit 2 -fi - -# Compare metrics -REGRESSIONS=0 -WARNINGS=0 - -compare_metric() { - local name="$1" - local baseline_val="$2" - local current_val="$3" - local max_drop="$4" - - local diff - diff=$(echo "scale=4; $current_val - $baseline_val" | bc) - local drop - drop=$(echo "scale=4; $baseline_val - $current_val" | bc) - - if (( $(echo "$drop > $max_drop" | bc -l) )); then - echo -e "${RED}REGRESSION${NC} $name: $baseline_val -> $current_val (drop: $drop, max: $max_drop)" - REGRESSIONS=$((REGRESSIONS + 1)) - elif (( $(echo "$drop > 0" | bc -l) )); then - echo -e "${YELLOW}WARNING${NC} $name: $baseline_val -> $current_val (drop: $drop)" - WARNINGS=$((WARNINGS + 1)) - else - echo -e "${GREEN}OK${NC} $name: $baseline_val -> $current_val (${diff:0:6})" - fi -} - -echo "=== Overall Metrics ===" -echo "" - -BASELINE_MRR=$(jq -r '.metrics.mrr' "$BASELINE_FILE") -CURRENT_MRR=$(jq -r '.metrics.mrr' "$LATEST_REPORT") -compare_metric "MRR" "$BASELINE_MRR" "$CURRENT_MRR" "$MAX_MRR_DROP" - -BASELINE_P1=$(jq -r '.metrics.p_at_1' "$BASELINE_FILE") -CURRENT_P1=$(jq -r '.metrics.p_at_1' "$LATEST_REPORT") -compare_metric "P@1" "$BASELINE_P1" "$CURRENT_P1" "$MAX_P1_DROP" - -BASELINE_HIT3=$(jq -r '.metrics.hit_at_3' "$BASELINE_FILE") -CURRENT_HIT3=$(jq -r '.metrics.hit_at_3' "$LATEST_REPORT") -compare_metric "Hit@3" "$BASELINE_HIT3" "$CURRENT_HIT3" "$MAX_HIT3_DROP" - -BASELINE_MARGIN=$(jq -r '.metrics.avg_margin' "$BASELINE_FILE") -CURRENT_MARGIN=$(jq -r '.metrics.avg_margin' "$LATEST_REPORT") -compare_metric "Margin" "$BASELINE_MARGIN" "$CURRENT_MARGIN" "$MAX_MARGIN_DROP" - -echo "" -echo "=== Per-Corpus ===" -echo "" - -for corpus in $(jq -r '.by_corpus | keys[]' "$BASELINE_FILE"); do - BASELINE_CORPUS_P1=$(jq -r ".by_corpus[\"$corpus\"].p_at_1 // 0" "$BASELINE_FILE") - CURRENT_CORPUS_P1=$(jq -r ".metrics.by_corpus[\"$corpus\"].p_at_1 // 0" "$LATEST_REPORT") - compare_metric "$corpus P@1" 
"$BASELINE_CORPUS_P1" "$CURRENT_CORPUS_P1" "$MAX_CORPUS_P1_DROP" -done - -echo "" -echo "================================================" -if [[ $REGRESSIONS -gt 0 ]]; then - echo -e "${RED}REGRESSIONS: $REGRESSIONS${NC}" - if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then - exit 1 - fi -elif [[ $WARNINGS -gt 0 ]]; then - echo -e "${YELLOW}WARNINGS: $WARNINGS (no regressions)${NC}" -else - echo -e "${GREEN}ALL CHECKS PASSED${NC}" -fi -echo "================================================" diff --git a/tests/benchmark/scripts/create-baseline.sh b/tests/benchmark/scripts/create-baseline.sh deleted file mode 100755 index cd4696a..0000000 --- a/tests/benchmark/scripts/create-baseline.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash -# -# Create a quality baseline from current corpus benchmark results. -# -# Usage: -# ./create-baseline.sh [--name ] -# -# This runs run-corpus-benchmark.sh and saves the results as a baseline. -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -BASELINES_DIR="${BENCHMARK_DIR}/baselines" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" - -# Read defaults from config -if [[ ! 
-f "$CONFIG_FILE" ]]; then - echo "ERROR: Config file not found: $CONFIG_FILE" >&2 - exit 1 -fi - -STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") - -# Parse args -BASELINE_NAME="${STRATEGY}" -while [[ $# -gt 0 ]]; do - case "$1" in - --name) BASELINE_NAME="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -mkdir -p "${BASELINES_DIR}" - -BASELINE_FILE="${BASELINES_DIR}/${BASELINE_NAME}.json" - -echo "Creating baseline: ${BASELINE_NAME}" -echo "Strategy: ${STRATEGY}" -echo "" - -# Run corpus benchmark -TEMP_DIR=$(mktemp -d) -trap 'rm -rf "$TEMP_DIR"' EXIT - -"${SCRIPT_DIR}/run-corpus-benchmark.sh" --strategy "${STRATEGY}" 2>&1 | tee "${TEMP_DIR}/output.log" - -# Find the latest report -LATEST_REPORT=$(ls -t "${BENCHMARK_DIR}/results"/corpus_${STRATEGY}_*.json 2>/dev/null | head -1) - -if [[ -z "$LATEST_REPORT" ]] || [[ ! -f "$LATEST_REPORT" ]]; then - echo "ERROR: Could not find benchmark report" >&2 - exit 1 -fi - -# Extract baseline data -jq '{ - created_at: .benchmark.timestamp, - strategy: .benchmark.strategy, - threshold: .benchmark.threshold, - top_k: .benchmark.top_k, - weights: .benchmark.weights, - metrics: { - total: .metrics.total, - mrr: .metrics.mrr, - p_at_1: .metrics.p_at_1, - p_at_3: .metrics.p_at_3, - hit_at_3: .metrics.hit_at_3, - hit_at_5: .metrics.hit_at_5, - avg_margin: .metrics.avg_margin, - latency_p50_ms: .metrics.latency_p50_ms, - latency_p95_ms: .metrics.latency_p95_ms - }, - by_difficulty: .metrics.by_difficulty, - by_corpus: .metrics.by_corpus, - per_query: [.results[] | {id, corpus, difficulty, p_at_1, rr, margin}] -}' "$LATEST_REPORT" > "$BASELINE_FILE" - -echo "" -echo "================================================" -echo " BASELINE CREATED" -echo "================================================" -echo " File: ${BASELINE_FILE}" -echo "" -jq -r '" MRR: \(.metrics.mrr)\n P@1: \(.metrics.p_at_1)\n Hit@3: \(.metrics.hit_at_3)\n Margin: \(.metrics.avg_margin)"' "$BASELINE_FILE" -echo 
"================================================" diff --git a/tests/benchmark/scripts/finalize-report.sh b/tests/benchmark/scripts/finalize-report.sh deleted file mode 100755 index 38d314f..0000000 --- a/tests/benchmark/scripts/finalize-report.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -# -# Finalize benchmark report and generate summary -# -# Usage: -# ./finalize-report.sh -# -set -euo pipefail - -if [[ $# -lt 1 ]]; then - echo "Usage: $0 " - exit 1 -fi - -REPORT_FILE="$1" -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -# Calculate final metrics -TMP_FILE=$(mktemp) -jq ' - .summary.accuracy = (if .summary.total > 0 then (.summary.passed / .summary.total * 10000 | floor / 100) else 0 end) | - .summary.avg_score = (if (.results | length) > 0 then ([.results[].score] | add / length | . * 1000 | floor / 1000) else 0 end) | - .summary.avg_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | add / length | floor) else 0 end) | - .summary.min_score = (if (.results | length) > 0 then ([.results[].score] | min) else 0 end) | - .summary.max_score = (if (.results | length) > 0 then ([.results[].score] | max) else 0 end) | - .summary.min_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | min) else 0 end) | - .summary.max_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | max) else 0 end) -' "${REPORT_FILE}" > "${TMP_FILE}" -mv "${TMP_FILE}" "${REPORT_FILE}" - -# Generate markdown summary -TIMESTAMP=$(jq -r '.benchmark.timestamp' "${REPORT_FILE}") -STRATEGY=$(jq -r '.benchmark.strategy' "${REPORT_FILE}") -VERSION=$(jq -r '.benchmark.version' "${REPORT_FILE}") -TOTAL=$(jq -r '.summary.total' "${REPORT_FILE}") -PASSED=$(jq -r '.summary.passed' "${REPORT_FILE}") -FAILED=$(jq -r '.summary.failed' "${REPORT_FILE}") -SKIPPED=$(jq -r '.summary.skipped' "${REPORT_FILE}") -ACCURACY=$(jq -r '.summary.accuracy' "${REPORT_FILE}") -AVG_SCORE=$(jq -r '.summary.avg_score' "${REPORT_FILE}") -AVG_LATENCY=$(jq -r 
'.summary.avg_latency_ms' "${REPORT_FILE}") -MIN_SCORE=$(jq -r '.summary.min_score' "${REPORT_FILE}") -MAX_SCORE=$(jq -r '.summary.max_score' "${REPORT_FILE}") -MIN_LATENCY=$(jq -r '.summary.min_latency_ms' "${REPORT_FILE}") -MAX_LATENCY=$(jq -r '.summary.max_latency_ms' "${REPORT_FILE}") - -cat > "${SUMMARY_FILE}" << EOF -# Semantic Matching Benchmark Results - -## Benchmark Info - -| Field | Value | -|-------|-------| -| Timestamp | ${TIMESTAMP} | -| Strategy | ${STRATEGY} | -| Version | ${VERSION} | - -## Results Summary - -| Metric | Value | -|--------|-------| -| Total Cases | ${TOTAL} | -| Passed | ${PASSED} | -| Failed | ${FAILED} | -| Skipped | ${SKIPPED} | -| **Accuracy** | **${ACCURACY}%** | - -## Score Distribution - -| Metric | Value | -|--------|-------| -| Average Score | ${AVG_SCORE} | -| Min Score | ${MIN_SCORE} | -| Max Score | ${MAX_SCORE} | - -## Latency - -| Metric | Value | -|--------|-------| -| Average | ${AVG_LATENCY} ms | -| Min | ${MIN_LATENCY} ms | -| Max | ${MAX_LATENCY} ms | - -## Failed Cases - -EOF - -# Add failed cases -jq -r '.results[] | select(.status == "fail") | "| \(.id) | \(.notes) |"' "${REPORT_FILE}" >> "${SUMMARY_FILE}" - -if [[ $(jq '[.results[] | select(.status == "fail")] | length' "${REPORT_FILE}") -eq 0 ]]; then - echo "_No failures_" >> "${SUMMARY_FILE}" -else - # Add header - sed -i.bak '/## Failed Cases/a\ -| ID | Notes |\ -|-----|-------|' "${SUMMARY_FILE}" - rm -f "${SUMMARY_FILE}.bak" -fi - -echo "" -echo "================================================" -echo " BENCHMARK SUMMARY" -echo "================================================" -echo " Strategy: ${STRATEGY}" -echo " Total: ${TOTAL}" -echo " Passed: ${PASSED}" -echo " Failed: ${FAILED}" -echo " Accuracy: ${ACCURACY}%" -echo " Avg Score: ${AVG_SCORE}" -echo " Avg Latency: ${AVG_LATENCY} ms" -echo "================================================" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git 
a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh deleted file mode 100755 index 783e546..0000000 --- a/tests/benchmark/scripts/lint-corpus.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -CORPUS_DIR="${BENCHMARK_DIR}/corpus" -CASES_DIR="${BENCHMARK_DIR}/cases" -SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots" - -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[0;33m' -NC='\033[0m' - -ERRORS=0 -WARNINGS=0 - -error() { - echo -e "${RED}ERROR:${NC} $1" - ERRORS=$((ERRORS + 1)) -} - -warn() { - echo -e "${YELLOW}WARN:${NC} $1" - WARNINGS=$((WARNINGS + 1)) -} - -ok() { - echo -e "${GREEN}βœ“${NC} $1" -} - -echo "=== Corpus Lint ===" -echo "" - -# 1. Check for invalid JSON in all benchmark files -echo "Checking JSON validity..." -for f in "${CORPUS_DIR}"/*/*.json "${CASES_DIR}"/*.json; do - if [[ -f "$f" ]]; then - if ! jq . "$f" >/dev/null 2>&1; then - error "Invalid JSON: $f" - fi - fi -done - -# 2. Check for duplicate query IDs across corpus files -echo "Checking for duplicate query IDs..." -declare -A QUERY_IDS -for f in "${CORPUS_DIR}"/*/queries.json; do - if [[ -f "$f" ]]; then - while IFS= read -r id; do - if [[ -n "$id" && "$id" != "null" ]]; then - if [[ -n "${QUERY_IDS[$id]:-}" ]]; then - error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})" - else - QUERY_IDS[$id]="$f" - fi - fi - done < <(jq -r '.[].id // empty' "$f" 2>/dev/null) - fi -done - -# Also check cases files -for f in "${CASES_DIR}"/*.json; do - if [[ -f "$f" ]]; then - while IFS= read -r id; do - if [[ -n "$id" && "$id" != "null" ]]; then - if [[ -n "${QUERY_IDS[$id]:-}" ]]; then - error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})" - else - QUERY_IDS[$id]="$f" - fi - fi - done < <(jq -r '.[].id // empty' "$f" 2>/dev/null) - fi -done - -# 3. 
Check for duplicate refs within snapshots -echo "Checking for duplicate refs in snapshots..." -for f in "${CORPUS_DIR}"/*/snapshot.json; do - if [[ -f "$f" ]]; then - dupes=$(jq -r '.[].ref' "$f" 2>/dev/null | sort | uniq -d) - if [[ -n "$dupes" ]]; then - error "Duplicate refs in $f: $dupes" - fi - fi -done - -# 4. Check that relevant_refs exist in snapshot -echo "Checking relevant_refs exist in snapshots..." -for corpus_dir in "${CORPUS_DIR}"/*/; do - corpus_name=$(basename "$corpus_dir") - snapshot="${corpus_dir}snapshot.json" - queries="${corpus_dir}queries.json" - - if [[ -f "$snapshot" && -f "$queries" ]]; then - # Get all refs from snapshot - refs=$(jq -r '.[].ref' "$snapshot" 2>/dev/null | sort | uniq) - - # Check relevant_refs - while IFS= read -r ref; do - if [[ -n "$ref" && "$ref" != "null" ]]; then - if ! echo "$refs" | grep -qx "$ref"; then - error "[$corpus_name] relevant_ref '$ref' not found in snapshot" - fi - fi - done < <(jq -r '.[].relevant_refs[]? // empty' "$queries" 2>/dev/null) - - # Check partially_relevant_refs - while IFS= read -r ref; do - if [[ -n "$ref" && "$ref" != "null" ]]; then - if ! echo "$refs" | grep -qx "$ref"; then - error "[$corpus_name] partially_relevant_ref '$ref' not found in snapshot" - fi - fi - done < <(jq -r '.[].partially_relevant_refs[]? // empty' "$queries" 2>/dev/null) - fi -done - -# 5. Check for empty relevant_refs (except no-match cases) -echo "Checking for empty relevant_refs..." -for f in "${CORPUS_DIR}"/*/queries.json; do - if [[ -f "$f" ]]; then - empty_relevant=$(jq -r '.[] | select(.relevant_refs | length == 0) | select(.partially_relevant_refs | length == 0) | select(.expect_no_match != true) | .id' "$f" 2>/dev/null) - for id in $empty_relevant; do - if [[ -n "$id" ]]; then - warn "Query '$id' in $f has empty relevant_refs" - fi - done - fi -done - -# 6. Check difficulty values -echo "Checking difficulty values..." 
-VALID_DIFFICULTIES="easy medium hard" -for f in "${CORPUS_DIR}"/*/queries.json; do - if [[ -f "$f" ]]; then - while IFS= read -r line; do - id=$(echo "$line" | cut -d'|' -f1) - diff=$(echo "$line" | cut -d'|' -f2) - if [[ -n "$diff" && "$diff" != "null" ]]; then - if ! echo "$VALID_DIFFICULTIES" | grep -qw "$diff"; then - error "Invalid difficulty '$diff' for query '$id' in $f" - fi - fi - done < <(jq -r '.[] | "\(.id)|\(.difficulty // "null")"' "$f" 2>/dev/null) - fi -done - -# 7. Check for known tags (warn on unknown) -echo "Checking tags..." -KNOWN_TAGS="absent-control accessibility action action-synonym action-verb adversarial alertdialog all-stopwords auth basket-cart bulk-action button cell checkbox combobox compound context-exclusion conversational dashboard description descriptive dialog directional disambiguation domain-intent download-export duplicate-labels ecommerce empty-query empty-snapshot exact exact-match filter find-search generic-verb github guard icon implicit input interactive-boost keyboard-mash legal link literal-text login login-signin long-query lookup-search media menu menuitem missing-letter name-match natural-language navigation negative-context no-match noise-tokens nonsense option ordinal pagination parent-context partial position preferences-settings purchase-buy question-form radio register-create registration repeated-word row-context search searchbox section section-context signout-logout single-char social special-chars spinbutton stale-ref state switch synonym synonym-chain tab table textbox threshold toggle transposition typo vague-query visual weak-match wikipedia" -for f in "${CORPUS_DIR}"/*/queries.json "${CASES_DIR}"/*.json; do - if [[ -f "$f" ]]; then - while IFS= read -r tag; do - if [[ -n "$tag" && "$tag" != "null" ]]; then - if ! echo "$KNOWN_TAGS" | grep -qw "$tag"; then - warn "Unknown tag '$tag' in $f" - fi - fi - done < <(jq -r '.[].tags[]? // empty' "$f" 2>/dev/null) - fi -done - -# 8. 
Check case files reference existing snapshots -echo "Checking case file snapshot references..." -for f in "${CASES_DIR}"/*.json; do - if [[ -f "$f" ]]; then - while IFS= read -r snapshot; do - if [[ -n "$snapshot" && "$snapshot" != "null" ]]; then - if [[ ! -f "${SNAPSHOTS_DIR}/${snapshot}" ]]; then - error "Case file $f references missing snapshot: $snapshot" - fi - fi - done < <(jq -r '.[].snapshot // empty' "$f" 2>/dev/null) - fi -done - -# 9. Check for generated result files in source tree -echo "Checking for generated result files..." -if ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | grep -v '.gitkeep' | head -1 >/dev/null 2>&1; then - result_count=$(ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | wc -l | tr -d ' ') - warn "Found $result_count generated result files in tests/benchmark/results/ (should be gitignored)" -fi - -echo "" -echo "=== Summary ===" -if [[ $ERRORS -eq 0 && $WARNINGS -eq 0 ]]; then - ok "All checks passed" - exit 0 -elif [[ $ERRORS -eq 0 ]]; then - echo -e "${YELLOW}Warnings: $WARNINGS${NC}" - exit 0 -else - echo -e "${RED}Errors: $ERRORS${NC}" - echo -e "${YELLOW}Warnings: $WARNINGS${NC}" - exit 1 -fi diff --git a/tests/benchmark/scripts/record-result.sh b/tests/benchmark/scripts/record-result.sh deleted file mode 100755 index 2288f7c..0000000 --- a/tests/benchmark/scripts/record-result.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# -# Record a benchmark result -# -# Usage: -# ./record-result.sh "notes" -# -set -euo pipefail - -if [[ $# -lt 5 ]]; then - echo "Usage: $0 [notes]" - exit 1 -fi - -REPORT_FILE="$1" -ID="$2" -STATUS="$3" -SCORE="$4" -LATENCY_MS="$5" -NOTES="${6:-}" -TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) - -# Create result entry -RESULT_JSON=$(jq -n \ - --arg id "${ID}" \ - --arg status "${STATUS}" \ - --argjson score "${SCORE}" \ - --argjson latency "${LATENCY_MS}" \ - --arg notes "${NOTES}" \ - --arg ts "${TIMESTAMP}" \ - '{id: $id, status: $status, score: $score, latency_ms: $latency, notes: $notes, timestamp: 
$ts}') - -# Append to report -TMP_FILE=$(mktemp) -jq --argjson result "${RESULT_JSON}" \ - --arg status "${STATUS}" \ - '.results += [$result] | - .summary.total += 1 | - if $status == "pass" then .summary.passed += 1 - elif $status == "fail" then .summary.failed += 1 - else .summary.skipped += 1 end' \ - "${REPORT_FILE}" > "${TMP_FILE}" - -mv "${TMP_FILE}" "${REPORT_FILE}" diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh deleted file mode 100755 index 29c8a22..0000000 --- a/tests/benchmark/scripts/run-benchmark.sh +++ /dev/null @@ -1,226 +0,0 @@ -#!/bin/bash -# -# Run semantic matching benchmark -# -# Usage: -# ./run-benchmark.sh [--strategy ] [--cases ] -# -# Options: -# --strategy Strategy to benchmark (lexical, embedding, combined) -# --cases Specific case file to run (default: all) -# --output Output directory (default: ../results) -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -CASES_DIR="${BENCHMARK_DIR}/cases" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" -SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots" -RESULTS_DIR="${BENCHMARK_DIR}/results" - -# Read defaults from config -if [[ ! -f "$CONFIG_FILE" ]]; then - echo "ERROR: Config file not found: $CONFIG_FILE" >&2 - exit 1 -fi - -STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") -THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE") -TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE") -CASE_FILE="" - -# Parse args (override config) -while [[ $# -gt 0 ]]; do - case "$1" in - --strategy) STRATEGY="$2"; shift 2 ;; - --cases) CASE_FILE="$2"; shift 2 ;; - --output) RESULTS_DIR="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -case "${STRATEGY}" in - lexical|embedding|combined) ;; - *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;; -esac - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary -echo "Building semantic..." 
-(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.json" - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg strategy "${STRATEGY}" \ - --arg version "$(${SEMANTIC} --version 2>/dev/null || echo 'dev')" \ - '{ - benchmark: { - timestamp: $ts, - strategy: $strategy, - version: $version - }, - results: [], - summary: { - total: 0, - passed: 0, - failed: 0, - skipped: 0, - accuracy: 0, - avg_score: 0, - avg_latency_ms: 0 - } - }' > "${REPORT_FILE}" - -# Run cases -score_at_least() { - local score="$1" - local min_score="$2" - awk -v score="${score}" -v min_score="${min_score}" 'BEGIN { exit (score + 0 >= min_score + 0) ? 0 : 1 }' -} - -run_case() { - local case_file="$1" - local case_name - case_name=$(basename "$case_file" .json) - - echo "" - echo "=== Running: ${case_name} ===" - - local count - count=$(jq length "$case_file") - - for i in $(seq 0 $((count - 1))); do - local id query snapshot expect_ref expect_ref_alt expect_no_match expect_no_crash expect_has_matches threshold min_score - - id=$(jq -r ".[$i].id" "$case_file") - query=$(jq -r ".[$i].query" "$case_file") - snapshot=$(jq -r ".[$i].snapshot" "$case_file") - expect_ref=$(jq -r ".[$i].expect_ref // empty" "$case_file") - expect_ref_alt=$(jq -r ".[$i].expect_ref_alt // [] | join(\",\")" "$case_file") - expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$case_file") - expect_no_crash=$(jq -r ".[$i].expect_no_crash // false" "$case_file") - expect_has_matches=$(jq -r ".[$i].expect_has_matches // false" "$case_file") - threshold=$(jq -r ".[$i].threshold // 0.3" "$case_file") - min_score=$(jq -r ".[$i].min_score // 0" "$case_file") - - local snapshot_path="${SNAPSHOTS_DIR}/${snapshot}" - if [[ ! 
-f "${snapshot_path}" ]]; then - echo " [${id}] SKIP: snapshot not found: ${snapshot}" - "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "skip" 0 0 "snapshot not found" - continue - fi - - # Run query and measure time - local start_ms end_ms duration_ms result exit_code - start_ms=$(python3 -c 'import time; print(int(time.time() * 1000))') - - set +e - result=$("${SEMANTIC}" find "${query}" \ - --snapshot "${snapshot_path}" \ - --strategy "${STRATEGY}" \ - --threshold "${threshold}" \ - --format json 2>&1) - exit_code=$? - set -e - - end_ms=$(python3 -c 'import time; print(int(time.time() * 1000))') - duration_ms=$((end_ms - start_ms)) - - # Evaluate result - local status="fail" - local got_ref="" - local got_score=0 - local notes="" - - if [[ ${exit_code} -ne 0 ]]; then - if [[ "${expect_no_crash}" == "true" ]]; then - # Some crashes are expected (empty query, etc) - status="pass" - notes="exit ${exit_code} (expected)" - else - notes="exit ${exit_code}: ${result}" - fi - else - got_ref=$(echo "$result" | jq -r '.best_ref // empty') - got_score=$(echo "$result" | jq -r '.best_score // 0') - local match_count - match_count=$(echo "$result" | jq -r '.matches | length') - - if [[ "${expect_no_match}" == "true" ]]; then - if [[ ${match_count} -eq 0 ]]; then - status="pass" - notes="no matches (expected)" - else - notes="expected no matches, got ${match_count}" - fi - elif [[ "${expect_has_matches}" == "true" ]]; then - if [[ ${match_count} -gt 0 ]]; then - if score_at_least "${got_score}" "${min_score}"; then - status="pass" - notes="${match_count} matches, score=${got_score}" - else - notes="${match_count} matches, score=${got_score} below min_score=${min_score}" - fi - else - notes="expected matches, got 0" - fi - elif [[ -n "${expect_ref}" ]]; then - if [[ "${got_ref}" == "${expect_ref}" ]]; then - if score_at_least "${got_score}" "${min_score}"; then - status="pass" - notes="ref=${got_ref}, score=${got_score}" - else - notes="ref=${got_ref}, 
score=${got_score} below min_score=${min_score}" - fi - elif [[ -n "${expect_ref_alt}" ]] && echo ",${expect_ref_alt}," | grep -q ",${got_ref},"; then - if score_at_least "${got_score}" "${min_score}"; then - status="pass" - notes="ref=${got_ref} (alt), score=${got_score}" - else - notes="ref=${got_ref} (alt), score=${got_score} below min_score=${min_score}" - fi - else - notes="got ${got_ref}, want ${expect_ref}" - fi - elif [[ "${expect_no_crash}" == "true" ]]; then - status="pass" - notes="no crash" - fi - fi - - # Record result - "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "${status}" "${got_score}" "${duration_ms}" "${notes}" - - if [[ "${status}" == "pass" ]]; then - echo " [${id}] PASS: ${notes}" - else - echo " [${id}] FAIL: ${notes}" - fi - done -} - -# Find case files -if [[ -n "${CASE_FILE}" ]]; then - run_case "${CASES_DIR}/${CASE_FILE}" -else - for case_file in "${CASES_DIR}"/*.json; do - [[ -f "$case_file" ]] || continue - run_case "$case_file" - done -fi - -# Finalize report -"${SCRIPT_DIR}/finalize-report.sh" "${REPORT_FILE}" - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo "Benchmark complete: ${REPORT_FILE}" diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh deleted file mode 100755 index 53216af..0000000 --- a/tests/benchmark/scripts/run-corpus-benchmark.sh +++ /dev/null @@ -1,514 +0,0 @@ -#!/bin/bash -# -# Run semantic matching benchmark with ranking metrics -# -# Usage: -# ./run-corpus-benchmark.sh [--strategy ] [--corpus ] [--lexical-weight ] [--embedding-weight ] -# -# Metrics: -# - MRR (Mean Reciprocal Rank) -# - P@1 (Precision at 1) -# - P@3 (Precision at 3) -# - Latency distribution (p50, p95, p99) -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." 
-CORPUS_DIR="${BENCHMARK_DIR}/corpus" -RESULTS_DIR="${BENCHMARK_DIR}/results" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" - -# Read defaults from config -if [[ ! -f "$CONFIG_FILE" ]]; then - echo "ERROR: Config file not found: $CONFIG_FILE" >&2 - exit 1 -fi - -STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") -THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE") -TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE") -LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE") -EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE") -SPECIFIC_CORPUS="" - -# Parse args (override config) -while [[ $# -gt 0 ]]; do - case "$1" in - --strategy) STRATEGY="$2"; shift 2 ;; - --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;; - --threshold) THRESHOLD="$2"; shift 2 ;; - --top-k) TOP_K="$2"; shift 2 ;; - --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;; - --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -case "${STRATEGY}" in - lexical|embedding|combined) ;; - *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;; -esac - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." 
&& go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json" - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg strategy "${STRATEGY}" \ - --argjson threshold "${THRESHOLD}" \ - --argjson top_k "${TOP_K}" \ - --argjson lexical_weight "${LEXICAL_WEIGHT}" \ - --argjson embedding_weight "${EMBEDDING_WEIGHT}" \ - --arg config_file "${CONFIG_FILE}" \ - '{ - benchmark: { - timestamp: $ts, - strategy: $strategy, - threshold: $threshold, - top_k: $top_k, - type: "corpus", - config_source: $config_file, - weights: { - lexical: $lexical_weight, - embedding: $embedding_weight - } - }, - results: [], - metrics: { - total: 0, - mrr: 0, - p_at_1: 0, - p_at_3: 0, - latencies_ms: [], - by_difficulty: {}, - by_tag: {} - } - }' > "${REPORT_FILE}" - -# Arrays to collect metrics -declare -a ALL_RRS=() -declare -a ALL_P1=() -declare -a ALL_P3=() -declare -a ALL_HIT3=() -declare -a ALL_HIT5=() -declare -a ALL_MARGINS=() -declare -a ALL_LATENCIES=() - -run_corpus() { - local corpus_path="$1" - local corpus_name - corpus_name=$(basename "$corpus_path") - - local snapshot="${corpus_path}/snapshot.json" - local queries="${corpus_path}/queries.json" - - if [[ ! -f "$snapshot" ]] || [[ ! 
-f "$queries" ]]; then - if [[ -f "${corpus_path}/cases.json" ]] || [[ -f "${corpus_path}/scenarios.json" ]]; then - return - fi - echo " Skipping ${corpus_name}: missing files" - return - fi - - echo "" - echo "=== Corpus: ${corpus_name} ===" - - local count - count=$(jq length "$queries") - - for i in $(seq 0 $((count - 1))); do - local id query relevant_refs partial_refs difficulty tags - - id=$(jq -r ".[$i].id" "$queries") - query=$(jq -r ".[$i].query" "$queries") - relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries") - partial_refs=$(jq -c ".[$i].partially_relevant_refs // []" "$queries") - difficulty=$(jq -r ".[$i].difficulty // \"medium\"" "$queries") - tags=$(jq -c ".[$i].tags // []" "$queries") - - # Run query and measure time - local start_ns end_ns duration_ms result - start_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))') - - if ! result=$("${SEMANTIC}" find "${query}" \ - --snapshot "${snapshot}" \ - --strategy "${STRATEGY}" \ - --threshold "${THRESHOLD}" \ - --top-k "${TOP_K}" \ - --lexical-weight "${LEXICAL_WEIGHT}" \ - --embedding-weight "${EMBEDDING_WEIGHT}" \ - --format json 2>&1); then - echo " [${id}] ERROR: semantic find failed for query: ${query}" >&2 - echo "${result}" >&2 - exit 1 - fi - - if ! 
echo "$result" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then - echo " [${id}] ERROR: semantic find returned invalid JSON" >&2 - echo "${result}" >&2 - exit 1 - fi - - end_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))') - duration_ms=$(( (end_ns - start_ns) / 1000 )) - - # Extract results - local matches best_ref best_score - matches=$(echo "$result" | jq -c '[.matches[].ref]') - best_ref=$(echo "$result" | jq -r '.best_ref // ""') - best_score=$(echo "$result" | jq -r '.best_score // 0') - - # Calculate Reciprocal Rank - local rr=0 - for rank in $(seq 1 ${TOP_K}); do - local ref_at_rank - ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"") - if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - rr=$(echo "scale=4; 1 / ${rank}" | bc) - break - fi - done - - # Calculate P@1 - local p1=0 - if echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then - p1=1 - elif echo "$partial_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then - p1=0.5 - fi - - # Calculate P@3 (count relevant in top 3, partials count as 0.5) - local relevant_in_top3=0 - local partial_in_top3=0 - local hit_at_3=0 - local hit_at_5=0 - local best_relevant_rank="null" - for rank in 1 2 3 4 5; do - local ref_at_rank - ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"") - if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - if [[ "$best_relevant_rank" == "null" ]]; then - best_relevant_rank=$rank - fi - if [[ $rank -le 3 ]]; then - relevant_in_top3=$((relevant_in_top3 + 1)) - hit_at_3=1 - fi - hit_at_5=1 - elif [[ $rank -le 3 ]]; then - if echo "$partial_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - partial_in_top3=$((partial_in_top3 + 1)) - fi - fi - done - local p3 - p3=$(echo "scale=4; (${relevant_in_top3} + ${partial_in_top3} * 0.5) / 3" | bc) - - # Calculate best_relevant_score, best_wrong_score, and margin - 
local best_relevant_score=0 - local best_wrong_score=0 - local num_matches - num_matches=$(echo "$result" | jq '.matches | length') - for idx in $(seq 0 $((num_matches - 1))); do - local ref_at_idx score_at_idx - ref_at_idx=$(echo "$result" | jq -r ".matches[$idx].ref // \"\"") - score_at_idx=$(echo "$result" | jq -r ".matches[$idx].score // 0") - if echo "$relevant_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then - if (( $(echo "$score_at_idx > $best_relevant_score" | bc -l) )); then - best_relevant_score=$score_at_idx - fi - elif echo "$partial_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then - : # partials don't count as wrong - else - if (( $(echo "$score_at_idx > $best_wrong_score" | bc -l) )); then - best_wrong_score=$score_at_idx - fi - fi - done - local margin - margin=$(echo "scale=4; $best_relevant_score - $best_wrong_score" | bc) - - # Collect metrics - ALL_RRS+=("$rr") - ALL_P1+=("$p1") - ALL_P3+=("$p3") - ALL_HIT3+=("$hit_at_3") - ALL_HIT5+=("$hit_at_5") - ALL_MARGINS+=("$margin") - ALL_LATENCIES+=("$duration_ms") - - # Status indicator - local status="MISS" - if (( $(echo "$p1 >= 1" | bc -l) )); then - status="HIT " - elif (( $(echo "$p1 >= 0.5" | bc -l) )); then - status="PART" - fi - - printf " [%s] %s | RR=%.2f P@1=%.1f P@3=%.2f | %dms | %s\n" \ - "$id" "$status" "$rr" "$p1" "$p3" "$duration_ms" "$query" - - # Record to report - local result_json - result_json=$(jq -n \ - --arg id "$id" \ - --arg query "$query" \ - --arg corpus "$corpus_name" \ - --arg difficulty "$difficulty" \ - --argjson tags "$tags" \ - --arg best_ref "$best_ref" \ - --argjson best_score "$best_score" \ - --argjson matches "$matches" \ - --argjson relevant "$relevant_refs" \ - --argjson rr "$rr" \ - --argjson p1 "$p1" \ - --argjson p3 "$p3" \ - --argjson hit_at_3 "$hit_at_3" \ - --argjson hit_at_5 "$hit_at_5" \ - --argjson best_relevant_rank "$best_relevant_rank" \ - --argjson best_relevant_score "$best_relevant_score" \ - --argjson 
best_wrong_score "$best_wrong_score" \ - --argjson margin "$margin" \ - --argjson latency "$duration_ms" \ - '{ - id: $id, query: $query, corpus: $corpus, - difficulty: $difficulty, tags: $tags, - best_ref: $best_ref, best_score: $best_score, - matches: $matches, relevant_refs: $relevant, - rr: $rr, p_at_1: $p1, p_at_3: $p3, - hit_at_3: $hit_at_3, hit_at_5: $hit_at_5, - best_relevant_rank: $best_relevant_rank, - best_relevant_score: $best_relevant_score, - best_wrong_score: $best_wrong_score, - margin: $margin, - latency_ms: $latency - }') - - # Append to report - local tmp - tmp=$(mktemp) - jq --argjson r "$result_json" '.results += [$r]' "$REPORT_FILE" > "$tmp" - mv "$tmp" "$REPORT_FILE" - done -} - -# Run benchmarks -if [[ -n "${SPECIFIC_CORPUS}" ]]; then - run_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}" -else - for corpus in "${CORPUS_DIR}"/*/; do - [[ -d "$corpus" ]] || continue - run_corpus "$corpus" - done -fi - -# Calculate aggregate metrics -echo "" -echo "Calculating aggregate metrics..." 
- -TOTAL=${#ALL_RRS[@]} -if [[ $TOTAL -eq 0 ]]; then - echo "No results to aggregate" - exit 1 -fi - -# MRR -MRR=$(printf '%s\n' "${ALL_RRS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# P@1 -P1=$(printf '%s\n' "${ALL_P1[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# P@3 -P3=$(printf '%s\n' "${ALL_P3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Hit@3 -HIT3=$(printf '%s\n' "${ALL_HIT3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Hit@5 -HIT5=$(printf '%s\n' "${ALL_HIT5[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Average margin -AVG_MARGIN=$(printf '%s\n' "${ALL_MARGINS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Latency percentiles -SORTED_LAT=($(printf '%s\n' "${ALL_LATENCIES[@]}" | sort -n)) -P50_IDX=$(( TOTAL * 50 / 100 )) -P95_IDX=$(( TOTAL * 95 / 100 )) -P99_IDX=$(( TOTAL * 99 / 100 )) -LAT_P50=${SORTED_LAT[$P50_IDX]:-0} -LAT_P95=${SORTED_LAT[$P95_IDX]:-0} -LAT_P99=${SORTED_LAT[$P99_IDX]:-0} -LAT_AVG=$(printf '%s\n' "${ALL_LATENCIES[@]}" | awk '{s+=$1} END {printf "%.0f", s/NR}') - -# Update report with aggregates -tmp=$(mktemp) -jq \ - --argjson total "$TOTAL" \ - --argjson mrr "$MRR" \ - --argjson p1 "$P1" \ - --argjson p3 "$P3" \ - --argjson hit3 "$HIT3" \ - --argjson hit5 "$HIT5" \ - --argjson avg_margin "$AVG_MARGIN" \ - --argjson lat_avg "$LAT_AVG" \ - --argjson lat_p50 "$LAT_P50" \ - --argjson lat_p95 "$LAT_P95" \ - --argjson lat_p99 "$LAT_P99" \ - '.metrics = { - total: $total, - mrr: $mrr, - p_at_1: $p1, - p_at_3: $p3, - hit_at_3: $hit3, - hit_at_5: $hit5, - avg_margin: $avg_margin, - latency_avg_ms: $lat_avg, - latency_p50_ms: $lat_p50, - latency_p95_ms: $lat_p95, - latency_p99_ms: $lat_p99 - }' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Add by-difficulty breakdown -tmp=$(mktemp) -jq '.metrics.by_difficulty = ( - .results | group_by(.difficulty) | map({ - key: .[0].difficulty, - value: { - count: length, - mrr: ([.[].rr] | add / length), - p_at_1: ([.[].p_at_1] | add / length), - hit_at_3: 
([.[].hit_at_3] | add / length), - hit_at_5: ([.[].hit_at_5] | add / length), - avg_margin: ([.[].margin] | add / length) - } - }) | from_entries -)' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Add by-corpus breakdown -tmp=$(mktemp) -jq '.metrics.by_corpus = ( - .results | group_by(.corpus) | map({ - key: .[0].corpus, - value: { - count: length, - mrr: ([.[].rr] | add / length), - p_at_1: ([.[].p_at_1] | add / length), - hit_at_3: ([.[].hit_at_3] | add / length), - hit_at_5: ([.[].hit_at_5] | add / length), - avg_margin: ([.[].margin] | add / length) - } - }) | from_entries -)' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Add by-tag breakdown -tmp=$(mktemp) -jq '.metrics.by_tag = ( - [.results[] | {tags: .tags, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}] - | [.[] | .tags[] as $tag | {tag: $tag, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}] - | group_by(.tag) - | map({ - key: .[0].tag, - value: { - count: length, - mrr: ([.[].rr] | add / length), - p_at_1: ([.[].p_at_1] | add / length), - hit_at_3: ([.[].hit_at_3] | add / length), - hit_at_5: ([.[].hit_at_5] | add / length), - avg_margin: ([.[].margin] | add / length) - } - }) - | from_entries -)' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Generate summary -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -cat > "${SUMMARY_FILE}" << EOF -# Semantic Matching Benchmark Results - -## Configuration - -| Field | Value | -|-------|-------| -| Timestamp | $(jq -r '.benchmark.timestamp' "$REPORT_FILE") | -| Strategy | ${STRATEGY} | -| Lexical Weight | ${LEXICAL_WEIGHT} | -| Embedding Weight | ${EMBEDDING_WEIGHT} | -| Top-K | ${TOP_K} | -| Total Queries | ${TOTAL} | - -## Ranking Metrics - -| Metric | Value | Description | -|--------|-------|-------------| -| **MRR** | **${MRR}** | Mean Reciprocal Rank | -| **P@1** | **${P1}** | Precision at rank 1 | -| **P@3** | **${P3}** | Precision at rank 3 | -| **Hit@3** | 
**${HIT3}** | Any relevant in top 3 | -| **Hit@5** | **${HIT5}** | Any relevant in top 5 | -| **Avg Margin** | **${AVG_MARGIN}** | best_relevant - best_wrong | - -## Latency - -| Percentile | Value | -|------------|-------| -| Average | ${LAT_AVG} ms | -| P50 | ${LAT_P50} ms | -| P95 | ${LAT_P95} ms | -| P99 | ${LAT_P99} ms | - -## By Difficulty - -| Difficulty | Count | MRR | P@1 | Hit@3 | Margin | -|------------|-------|-----|-----|-------|--------| -$(jq -r '.metrics.by_difficulty | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE") - -## By Corpus - -| Corpus | Count | MRR | P@1 | Hit@3 | Margin | -|--------|-------|-----|-----|-------|--------| -$(jq -r '.metrics.by_corpus | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . 
* 100 | floor / 100) |"' "$REPORT_FILE") - -## Misses (P@1 = 0) - -| ID | Query | Got | Expected | -|----|-------|-----|----------| -$(jq -r '.results[] | select(.p_at_1 == 0) | "| \(.id) | \(.query) | \(.best_ref) | \(.relevant_refs | join(",")) |"' "$REPORT_FILE") - -EOF - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo "================================================" -echo " CORPUS BENCHMARK RESULTS" -echo "================================================" -echo " Strategy: ${STRATEGY}" -echo " Weights: lexical=${LEXICAL_WEIGHT} embedding=${EMBEDDING_WEIGHT}" -echo " Queries: ${TOTAL}" -echo " MRR: ${MRR}" -echo " P@1: ${P1}" -echo " P@3: ${P3}" -echo " Hit@3: ${HIT3}" -echo " Hit@5: ${HIT5}" -echo " Avg Margin: ${AVG_MARGIN}" -echo " Latency P50: ${LAT_P50} ms" -echo " Latency P95: ${LAT_P95} ms" -echo "================================================" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh deleted file mode 100755 index 5c759dc..0000000 --- a/tests/benchmark/scripts/run-full-benchmark.sh +++ /dev/null @@ -1,317 +0,0 @@ -#!/bin/bash -# -# Full semantic benchmark: Find + Recovery + Classification -# -# Produces a composite score for overall system health. -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -CORPUS_DIR="${BENCHMARK_DIR}/corpus" -RESULTS_DIR="${BENCHMARK_DIR}/results" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" - -# Read defaults from config -if [[ ! 
-f "$CONFIG_FILE" ]]; then - echo "ERROR: Config file not found: $CONFIG_FILE" >&2 - exit 1 -fi - -STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") -THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE") -TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE") -LEXICAL_WEIGHT=$(jq -r '.defaults.weights.lexical // 0.6' "$CONFIG_FILE") -EMBEDDING_WEIGHT=$(jq -r '.defaults.weights.embedding // 0.4' "$CONFIG_FILE") - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary with recovery support -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/full_benchmark_${TIMESTAMP}.json" - -has_role_keyword() { - local query="$1" - echo "$query" | grep -Eiq '(^|[^[:alnum:]])(button|input|link|textbox|checkbox|radio|select|option|tab|menu|form|search)([^[:alnum:]]|$)' -} - -enrich_recovery_query() { - local query="$1" - local role="$2" - - if [[ -z "$query" || -z "$role" ]]; then - printf '%s' "$query" - return - fi - if has_role_keyword "$query"; then - printf '%s' "$query" - return - fi - printf '%s %s' "$query" "$role" -} - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - '{ - timestamp: $ts, - find: { total: 0, mrr: 0, p_at_1: 0, latency_p50: 0 }, - recovery: { total: 0, recovered: 0, rate: 0 }, - classification: { total: 0, correct: 0, accuracy: 0 }, - composite: { score: 0, grade: "" } - }' > "${REPORT_FILE}" - -echo "" -echo "==============================================" -echo " PHASE 1: FIND BENCHMARK" -echo "==============================================" - -# Run corpus benchmark and capture metrics -FIND_OUTPUT=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" 2>&1) -echo "$FIND_OUTPUT" - -# Extract metrics from the corpus report rather than the human-readable output. 
-FIND_REPORT=$(echo "$FIND_OUTPUT" | awk '/^Report:/ {print $2}' | tail -1) -if [[ -z "${FIND_REPORT}" ]] || [[ ! -f "${FIND_REPORT}" ]]; then - echo "error: could not locate corpus benchmark report" >&2 - exit 1 -fi -FIND_MRR=$(jq -r '.metrics.mrr' "$FIND_REPORT") -FIND_P1=$(jq -r '.metrics.p_at_1' "$FIND_REPORT") -FIND_TOTAL=$(jq -r '.metrics.total' "$FIND_REPORT") -FIND_LAT=$(jq -r '.metrics.latency_p50_ms' "$FIND_REPORT") - -# Rebuild semantic binary (corpus benchmark deletes it) -(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -echo "" -echo "==============================================" -echo " PHASE 2: RECOVERY BENCHMARK" -echo "==============================================" - -SCENARIOS_FILE="${CORPUS_DIR}/recovery-scenarios/scenarios.json" -RECOVERY_TOTAL=0 -RECOVERY_SUCCESS=0 - -if [[ -f "$SCENARIOS_FILE" ]]; then - SCENARIO_COUNT=$(jq length "$SCENARIOS_FILE") - - for i in $(seq 0 $((SCENARIO_COUNT - 1))); do - ID=$(jq -r ".[$i].id" "$SCENARIOS_FILE") - NAME=$(jq -r ".[$i].name" "$SCENARIOS_FILE") - RAW_QUERY=$(jq -r ".[$i].original_query" "$SCENARIOS_FILE") - ORIGINAL_REF=$(jq -r ".[$i].original_ref // empty" "$SCENARIOS_FILE") - ORIGINAL_ROLE=$(jq -r ".[$i].before[]? | select(.ref == \"$ORIGINAL_REF\") | .role // empty" "$SCENARIOS_FILE") - QUERY=$(enrich_recovery_query "$RAW_QUERY" "$ORIGINAL_ROLE") - EXPECTED=$(jq -r ".[$i].expected_ref // empty" "$SCENARIOS_FILE") - EXPECTED_ALT=$(jq -r ".[$i].expected_alt // [] | join(\",\")" "$SCENARIOS_FILE") - EXPECT_NO_MATCH=$(jq -r ".[$i].expect_no_match // false" "$SCENARIOS_FILE") - - # Write after snapshot to temp file - AFTER_FILE=$(mktemp) - jq ".[$i].after" "$SCENARIOS_FILE" > "$AFTER_FILE" - - # Run semantic find on after snapshot with the same minimum score - # enforced by DefaultRecoveryConfig in the recovery engine. - if ! 
RESULT=$("${SEMANTIC}" find "$QUERY" --snapshot "$AFTER_FILE" --format json --threshold 0.52 2>&1); then - echo " [$ID] ERROR: semantic find failed during recovery benchmark" >&2 - echo "$RESULT" >&2 - rm -f "$AFTER_FILE" - exit 1 - fi - if ! echo "$RESULT" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then - echo " [$ID] ERROR: semantic find returned invalid JSON during recovery benchmark" >&2 - echo "$RESULT" >&2 - rm -f "$AFTER_FILE" - exit 1 - fi - BEST_REF=$(echo "$RESULT" | jq -r '.best_ref // ""') - - rm -f "$AFTER_FILE" - - RECOVERY_TOTAL=$((RECOVERY_TOTAL + 1)) - STATUS="FAIL" - - if [[ "$EXPECT_NO_MATCH" == "true" ]]; then - if [[ -z "$BEST_REF" ]] || [[ "$BEST_REF" == "null" ]]; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - fi - elif [[ "$BEST_REF" == "$EXPECTED" ]]; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - elif [[ -n "$EXPECTED_ALT" ]] && echo ",$EXPECTED_ALT," | grep -q ",$BEST_REF,"; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - fi - - printf " [%s] %s | %s | got=%s want=%s\n" "$ID" "$STATUS" "$NAME" "$BEST_REF" "$EXPECTED" - done -fi - -RECOVERY_RATE=0 -if [[ $RECOVERY_TOTAL -gt 0 ]]; then - RECOVERY_RATE=$(echo "scale=4; $RECOVERY_SUCCESS / $RECOVERY_TOTAL" | bc) -fi - -echo "" -echo " Recovery: $RECOVERY_SUCCESS / $RECOVERY_TOTAL = $RECOVERY_RATE" - -echo "" -echo "==============================================" -echo " PHASE 3: CLASSIFICATION BENCHMARK" -echo "==============================================" - -CLASS_FILE="${CORPUS_DIR}/classification/cases.json" -CLASS_TOTAL=0 -CLASS_CORRECT=0 - -if [[ -f "$CLASS_FILE" ]]; then - CLASS_COUNT=$(jq length "$CLASS_FILE") - - for i in $(seq 0 $((CLASS_COUNT - 1))); do - ID=$(jq -r ".[$i].id" "$CLASS_FILE") - ERROR=$(jq -r ".[$i].error" "$CLASS_FILE") - EXPECTED=$(jq -r ".[$i].expected_type" "$CLASS_FILE") - - # Run semantic classify (extract just the type, first word) - if ! 
RESULT=$("${SEMANTIC}" classify "$ERROR" 2>&1); then - echo " [$ID] ERROR: semantic classify failed" >&2 - echo "$RESULT" >&2 - exit 1 - fi - GOT=$(echo "$RESULT" | awk '{print $1}') - - CLASS_TOTAL=$((CLASS_TOTAL + 1)) - STATUS="FAIL" - - if [[ "$GOT" == "$EXPECTED" ]]; then - STATUS="PASS" - CLASS_CORRECT=$((CLASS_CORRECT + 1)) - fi - - printf " [%s] %s | \"%s\" β†’ %s (want %s)\n" "$ID" "$STATUS" "${ERROR:0:40}" "$GOT" "$EXPECTED" - done -fi - -CLASS_ACCURACY=0 -if [[ $CLASS_TOTAL -gt 0 ]]; then - CLASS_ACCURACY=$(echo "scale=4; $CLASS_CORRECT / $CLASS_TOTAL" | bc) -fi - -echo "" -echo " Classification: $CLASS_CORRECT / $CLASS_TOTAL = $CLASS_ACCURACY" - -echo "" -echo "==============================================" -echo " COMPOSITE SCORE" -echo "==============================================" - -# Calculate composite score with weights: -# Find P@1: 40% -# Find MRR: 20% -# Recovery Rate: 25% -# Classification: 15% - -COMPOSITE=$(echo "scale=4; \ - ($FIND_P1 * 0.40) + \ - ($FIND_MRR * 0.20) + \ - ($RECOVERY_RATE * 0.25) + \ - ($CLASS_ACCURACY * 0.15)" | bc) -COMPOSITE=$(awk -v value="$COMPOSITE" 'BEGIN { printf "%.4f", value }') - -# Assign grade -GRADE="F" -if (( $(echo "$COMPOSITE >= 0.95" | bc -l) )); then GRADE="A+" -elif (( $(echo "$COMPOSITE >= 0.90" | bc -l) )); then GRADE="A" -elif (( $(echo "$COMPOSITE >= 0.85" | bc -l) )); then GRADE="B+" -elif (( $(echo "$COMPOSITE >= 0.80" | bc -l) )); then GRADE="B" -elif (( $(echo "$COMPOSITE >= 0.75" | bc -l) )); then GRADE="C+" -elif (( $(echo "$COMPOSITE >= 0.70" | bc -l) )); then GRADE="C" -elif (( $(echo "$COMPOSITE >= 0.60" | bc -l) )); then GRADE="D" -fi - -# Update report -TMP=$(mktemp) -jq \ - --argjson find_total "${FIND_TOTAL:-0}" \ - --argjson find_mrr "${FIND_MRR:-0}" \ - --argjson find_p1 "${FIND_P1:-0}" \ - --argjson find_lat "${FIND_LAT:-0}" \ - --argjson rec_total "$RECOVERY_TOTAL" \ - --argjson rec_success "$RECOVERY_SUCCESS" \ - --argjson rec_rate "$RECOVERY_RATE" \ - --argjson class_total 
"$CLASS_TOTAL" \ - --argjson class_correct "$CLASS_CORRECT" \ - --argjson class_acc "$CLASS_ACCURACY" \ - --argjson composite "$COMPOSITE" \ - --arg grade "$GRADE" \ - '.find = { total: $find_total, mrr: $find_mrr, p_at_1: $find_p1, latency_p50: $find_lat } | - .recovery = { total: $rec_total, recovered: $rec_success, rate: $rec_rate } | - .classification = { total: $class_total, correct: $class_correct, accuracy: $class_acc } | - .composite = { score: $composite, grade: $grade }' \ - "$REPORT_FILE" > "$TMP" -mv "$TMP" "$REPORT_FILE" - -# Generate summary -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" -cat > "$SUMMARY_FILE" << EOF -# Semantic Benchmark Report - -## Composite Score: ${COMPOSITE} (${GRADE}) - -| Component | Weight | Score | Weighted | -|-----------|--------|-------|----------| -| Find P@1 | 40% | ${FIND_P1:-0} | $(echo "scale=3; ${FIND_P1:-0} * 0.40" | bc) | -| Find MRR | 20% | ${FIND_MRR:-0} | $(echo "scale=3; ${FIND_MRR:-0} * 0.20" | bc) | -| Recovery | 25% | ${RECOVERY_RATE} | $(echo "scale=3; ${RECOVERY_RATE} * 0.25" | bc) | -| Classification | 15% | ${CLASS_ACCURACY} | $(echo "scale=3; ${CLASS_ACCURACY} * 0.15" | bc) | - -## Find Performance -- Queries: ${FIND_TOTAL:-0} -- MRR: ${FIND_MRR:-0} -- P@1: ${FIND_P1:-0} -- Latency P50: ${FIND_LAT:-0} ms - -## Recovery Performance -- Scenarios: ${RECOVERY_TOTAL} -- Recovered: ${RECOVERY_SUCCESS} -- Rate: ${RECOVERY_RATE} - -## Classification Performance -- Cases: ${CLASS_TOTAL} -- Correct: ${CLASS_CORRECT} -- Accuracy: ${CLASS_ACCURACY} - -## Grade Scale -| Grade | Score | -|-------|-------| -| A+ | >= 0.95 | -| A | >= 0.90 | -| B+ | >= 0.85 | -| B | >= 0.80 | -| C+ | >= 0.75 | -| C | >= 0.70 | -| D | >= 0.60 | -| F | < 0.60 | -EOF - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”" -echo " β”‚ COMPOSITE SCORE: ${COMPOSITE} GRADE: ${GRADE} β”‚" -echo " 
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€" -echo " β”‚ Find P@1: ${FIND_P1:-0} (40%) β”‚" -echo " β”‚ Find MRR: ${FIND_MRR:-0} (20%) β”‚" -echo " β”‚ Recovery: ${RECOVERY_RATE} (25%) β”‚" -echo " β”‚ Classification: ${CLASS_ACCURACY} (15%) β”‚" -echo " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/run-recovery-benchmark.sh b/tests/benchmark/scripts/run-recovery-benchmark.sh deleted file mode 100755 index 93fc88a..0000000 --- a/tests/benchmark/scripts/run-recovery-benchmark.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# -# Recovery Engine Benchmark -# -# Exercises RecoveryEngine directly using before/after snapshots -# and intent cache entries from recovery scenarios. -# -# Usage: -# ./run-recovery-benchmark.sh -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -RESULTS_DIR="${BENCHMARK_DIR}/results" - -mkdir -p "${RESULTS_DIR}" - -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/recovery_benchmark_${TIMESTAMP}.txt" - -echo "=== Recovery Engine Benchmark ===" -echo "" - -cd "${BENCHMARK_DIR}/../.." - -# Run the Go test that exercises RecoveryEngine with scenarios -echo "Running recovery scenarios..." -echo "" - -go test -v -run TestRecoveryBenchmark_Scenarios ./recovery/ 2>&1 | tee "$REPORT_FILE" - -# Also run the Go benchmark for performance -echo "" -echo "Running performance benchmark..." 
-go test -bench=BenchmarkRecoveryEngine_Scenarios -benchmem ./recovery/ 2>&1 | tee -a "$REPORT_FILE" - -echo "" -echo "================================================" -echo " RECOVERY BENCHMARK COMPLETE" -echo "================================================" -echo "Report: $REPORT_FILE" diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh deleted file mode 100755 index 011b1b2..0000000 --- a/tests/benchmark/scripts/tune-weights.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/bin/bash -# -# Grid-search combined matcher lexical/embedding weights against the corpus. -# -# Usage: -# ./tune-weights.sh [--corpus ] [--step ] [--output ] -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -RESULTS_DIR="${BENCHMARK_DIR}/results" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" - -# Read defaults from config (used for threshold/top_k in grid runs) -if [[ -f "$CONFIG_FILE" ]]; then - THRESHOLD=$(jq -r '.defaults.threshold // 0.01' "$CONFIG_FILE") - TOP_K=$(jq -r '.defaults.top_k // 5' "$CONFIG_FILE") -else - THRESHOLD=0.01 - TOP_K=5 -fi - -SPECIFIC_CORPUS="" -STEP="0.1" -while [[ $# -gt 0 ]]; do - case "$1" in - --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;; - --step) STEP="$2"; shift 2 ;; - --output) RESULTS_DIR="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -mkdir -p "${RESULTS_DIR}" - -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/tuning_weights_${TIMESTAMP}.json" -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg step "${STEP}" \ - '{ - benchmark: { - timestamp: $ts, - type: "weight-tuning", - strategy: "combined", - step: ($step | tonumber) - }, - results: [], - best: null - }' > "${REPORT_FILE}" - -weights=$(awk -v step="${STEP}" 'BEGIN { - if (step <= 0 || step > 1) { - exit 1 - } - for (w = 0; w <= 1.000001; w += step) { - printf "%.4f\n", w - } -}') - -if [[ -z 
"${weights}" ]]; then - echo "Invalid step: ${STEP}" >&2 - exit 1 -fi - -echo "Weight tuning: step=${STEP}" -echo "" -printf "%-10s %-10s %-8s %-8s %-8s %-8s %-8s\n" "lexical" "embedding" "MRR" "P@1" "P@3" "P50" "report" - -while IFS= read -r lexical_weight; do - embedding_weight=$(awk -v w="${lexical_weight}" 'BEGIN { printf "%.4f", 1 - w }') - - args=( - --strategy combined - --lexical-weight "${lexical_weight}" - --embedding-weight "${embedding_weight}" - ) - if [[ -n "${SPECIFIC_CORPUS}" ]]; then - args+=(--corpus "${SPECIFIC_CORPUS}") - fi - - if ! output=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" "${args[@]}" 2>&1); then - echo "$output" >&2 - exit 1 - fi - - corpus_report=$(echo "$output" | awk '/^Report:/ {print $2}' | tail -1) - if [[ -z "${corpus_report}" || ! -f "${corpus_report}" ]]; then - echo "Could not find corpus report for lexical=${lexical_weight}" >&2 - echo "$output" >&2 - exit 1 - fi - - mrr=$(jq -r '.metrics.mrr' "$corpus_report") - p1=$(jq -r '.metrics.p_at_1' "$corpus_report") - p3=$(jq -r '.metrics.p_at_3' "$corpus_report") - p50=$(jq -r '.metrics.latency_p50_ms' "$corpus_report") - total=$(jq -r '.metrics.total' "$corpus_report") - - printf "%-10s %-10s %-8s %-8s %-8s %-8s %s\n" \ - "${lexical_weight}" "${embedding_weight}" "${mrr}" "${p1}" "${p3}" "${p50}" "$(basename "$corpus_report")" - - result_json=$(jq -n \ - --argjson lexical_weight "${lexical_weight}" \ - --argjson embedding_weight "${embedding_weight}" \ - --argjson total "${total}" \ - --argjson mrr "${mrr}" \ - --argjson p1 "${p1}" \ - --argjson p3 "${p3}" \ - --argjson p50 "${p50}" \ - --arg report "${corpus_report}" \ - '{ - lexical_weight: $lexical_weight, - embedding_weight: $embedding_weight, - total: $total, - mrr: $mrr, - p_at_1: $p1, - p_at_3: $p3, - latency_p50_ms: $p50, - report: $report - }') - - tmp=$(mktemp) - jq --argjson result "${result_json}" '.results += [$result]' "${REPORT_FILE}" > "$tmp" - mv "$tmp" "${REPORT_FILE}" -done <<< "${weights}" - -tmp=$(mktemp) 
-jq ' - .best = ( - .results - | sort_by(.p_at_1, .mrr, .p_at_3, -(.latency_p50_ms)) - | last - ) -' "${REPORT_FILE}" > "$tmp" -mv "$tmp" "${REPORT_FILE}" - -cat > "${SUMMARY_FILE}" << EOF -# Combined Weight Tuning - -## Best - -| Field | Value | -|-------|-------| -| Lexical Weight | $(jq -r '.best.lexical_weight' "$REPORT_FILE") | -| Embedding Weight | $(jq -r '.best.embedding_weight' "$REPORT_FILE") | -| MRR | $(jq -r '.best.mrr' "$REPORT_FILE") | -| P@1 | $(jq -r '.best.p_at_1' "$REPORT_FILE") | -| P@3 | $(jq -r '.best.p_at_3' "$REPORT_FILE") | -| Latency P50 | $(jq -r '.best.latency_p50_ms' "$REPORT_FILE") ms | - -## All Runs - -| Lexical | Embedding | MRR | P@1 | P@3 | P50 | -|---------|-----------|-----|-----|-----|-----| -$(jq -r '.results | sort_by(-.p_at_1, -.mrr, -.p_at_3, .latency_p50_ms)[] | "| \(.lexical_weight) | \(.embedding_weight) | \(.mrr) | \(.p_at_1) | \(.p_at_3) | \(.latency_p50_ms) ms |"' "$REPORT_FILE") -EOF - -echo "" -echo "Best weights:" -jq '.best' "${REPORT_FILE}" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/update-baseline.sh b/tests/benchmark/scripts/update-baseline.sh deleted file mode 100755 index ba93089..0000000 --- a/tests/benchmark/scripts/update-baseline.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# -# Update baseline after reviewing regressions. -# -# Usage: -# ./update-baseline.sh --accept [--baseline ] -# -# This re-runs the benchmark and overwrites the baseline file. -# Use after reviewing check-baseline.sh output and confirming -# the changes are intentional. -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -BASELINES_DIR="${BENCHMARK_DIR}/baselines" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" - -# Read config -if [[ ! 
-f "$CONFIG_FILE" ]]; then - echo "ERROR: Config file not found: $CONFIG_FILE" >&2 - exit 1 -fi - -STRATEGY=$(jq -r '.defaults.strategy // "combined"' "$CONFIG_FILE") - -# Parse args -BASELINE_FILE="${BASELINES_DIR}/${STRATEGY}.json" -ACCEPT=false -while [[ $# -gt 0 ]]; do - case "$1" in - --accept) ACCEPT=true; shift ;; - --baseline) BASELINE_FILE="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -if [[ "$ACCEPT" != "true" ]]; then - echo "Usage: $0 --accept [--baseline ]" - echo "" - echo "This will overwrite the baseline. Run check-baseline.sh first" - echo "to review changes before accepting." - exit 1 -fi - -if [[ ! -f "$BASELINE_FILE" ]]; then - echo "Baseline not found: $BASELINE_FILE" - echo "Creating new baseline instead..." - exec "${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")" -fi - -# Show what will change -echo "Current baseline: ${BASELINE_FILE}" -echo "" -jq -r '" MRR: \(.metrics.mrr)\n P@1: \(.metrics.p_at_1)\n Hit@3: \(.metrics.hit_at_3)"' "$BASELINE_FILE" -echo "" -echo "Running benchmark to generate new baseline..." -echo "" - -# Backup old baseline -BACKUP_FILE="${BASELINE_FILE%.json}_$(date +%Y%m%d_%H%M%S).backup.json" -cp "$BASELINE_FILE" "$BACKUP_FILE" -echo "Backed up old baseline to: $BACKUP_FILE" - -# Create new baseline (overwrites) -"${SCRIPT_DIR}/create-baseline.sh" --name "$(basename "${BASELINE_FILE%.json}")" - -echo "" -echo "Baseline updated. Old baseline backed up to:" -echo " $BACKUP_FILE" From 96d7142b20e645014105bd2831b62ce19a52b47e Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 17:35:49 +0100 Subject: [PATCH 08/14] feat: move runtime baseline check to Go CLI Add `semantic-bench runtime` command to check Go benchmark performance against baseline. Remove last bash script and the scripts/ directory. 
--- cmd/semantic-bench/main.go | 16 ++ dev | 2 +- internal/benchmark/commands.go | 209 ++++++++++++++++++ internal/benchmark/config.go | 14 ++ .../scripts/check-runtime-baseline.sh | 137 ------------ 5 files changed, 240 insertions(+), 138 deletions(-) delete mode 100755 tests/benchmark/scripts/check-runtime-baseline.sh diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go index 4866601..076d71a 100644 --- a/cmd/semantic-bench/main.go +++ b/cmd/semantic-bench/main.go @@ -21,6 +21,7 @@ Commands: baseline Manage quality baselines (create, update) calibrate Find optimal thresholds via precision/recall analysis tune Grid-search lexical/embedding weights + runtime Check Go benchmark performance against baseline Flags: -h, --help Show help @@ -54,6 +55,8 @@ func main() { runCalibrate(args) case "tune": runTune(args) + case "runtime": + runRuntime(args) case "-h", "--help", "help": fmt.Print(usage) default: @@ -150,3 +153,16 @@ func runTune(args []string) { } benchmark.PrintTuneResult(result, cfg) } + +func runRuntime(args []string) { + cfg := benchmark.ParseRuntimeFlags(args) + result, err := benchmark.RunRuntime(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintRuntimeResult(result, cfg) + if result.Status == "fail" && cfg.FailOnRegression { + os.Exit(1) + } +} diff --git a/dev b/dev index da0f70c..987e04c 100755 --- a/dev +++ b/dev @@ -197,7 +197,7 @@ run_calibrate() { run_runtime() { echo " ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}" - bash tests/benchmark/scripts/check-runtime-baseline.sh "$@" + go run ./cmd/semantic-bench runtime "$@" } run_tune() { diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go index 7f37ed5..f537934 100644 --- a/internal/benchmark/commands.go +++ b/internal/benchmark/commands.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "os" + "os/exec" "path/filepath" "sort" "strings" @@ -871,3 +872,211 @@ func PrintTuneResult(result *TuneResult, cfg 
TuneConfig) { } fmt.Println() } + +// Runtime baseline + +type RuntimeResult struct { + Status string `json:"status"` + Benchmarks []RuntimeBenchmark `json:"benchmarks"` + Regressions int `json:"regressions"` + BaselinePath string `json:"baseline_path"` + Created bool `json:"created"` +} + +type RuntimeBenchmark struct { + Name string `json:"name"` + NsOp float64 `json:"ns_op"` + BytesOp int `json:"bytes_op"` + AllocsOp int `json:"allocs_op"` + BaselineNs float64 `json:"baseline_ns,omitempty"` + Ratio float64 `json:"ratio,omitempty"` + Status string `json:"status"` +} + +type runtimeBaseline struct { + Timestamp string `json:"timestamp"` + Benchmarks []RuntimeBenchmark `json:"benchmarks"` +} + +func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) { + root := FindBenchmarkRoot() + baselinePath := filepath.Join(root, "baselines", "runtime.json") + + benchmarks, err := runGoBenchmarks() + if err != nil { + return nil, err + } + + result := &RuntimeResult{ + Status: "pass", + Benchmarks: benchmarks, + BaselinePath: baselinePath, + } + + if _, err := os.Stat(baselinePath); os.IsNotExist(err) { + if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil { + return nil, err + } + result.Created = true + return result, nil + } + + baseline, err := loadRuntimeBaseline(baselinePath) + if err != nil { + return nil, err + } + + baselineMap := make(map[string]RuntimeBenchmark) + for _, b := range baseline.Benchmarks { + baselineMap[b.Name] = b + } + + maxRatio := 1.25 + for i, b := range result.Benchmarks { + if base, ok := baselineMap[b.Name]; ok { + ratio := b.NsOp / base.NsOp + result.Benchmarks[i].BaselineNs = base.NsOp + result.Benchmarks[i].Ratio = ratio + + if ratio > maxRatio { + result.Benchmarks[i].Status = "regression" + result.Regressions++ + } else if ratio > 1.1 { + result.Benchmarks[i].Status = "warning" + } else { + result.Benchmarks[i].Status = "ok" + } + } else { + result.Benchmarks[i].Status = "new" + } + } + + if result.Regressions > 0 { + 
result.Status = "fail" + } + + return result, nil +} + +func runGoBenchmarks() ([]RuntimeBenchmark, error) { + root := FindBenchmarkRoot() + projectRoot := filepath.Join(root, "..", "..") + + cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...") + cmd.Dir = projectRoot + output, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("go test failed: %w\n%s", err, output) + } + + return parseBenchOutput(string(output)), nil +} + +func parseBenchOutput(output string) []RuntimeBenchmark { + var results []RuntimeBenchmark + lines := strings.Split(output, "\n") + + for _, line := range lines { + if !strings.HasPrefix(line, "Benchmark") { + continue + } + + fields := strings.Fields(line) + if len(fields) < 3 { + continue + } + + name := strings.TrimSuffix(fields[0], "-8") + name = strings.TrimSuffix(name, "-10") + name = strings.TrimSuffix(name, "-12") + name = strings.TrimSuffix(name, "-16") + + var nsOp float64 + var bytesOp, allocsOp int + + for i, f := range fields { + if f == "ns/op" && i > 0 { + fmt.Sscanf(fields[i-1], "%f", &nsOp) + } + if f == "B/op" && i > 0 { + fmt.Sscanf(fields[i-1], "%d", &bytesOp) + } + if f == "allocs/op" && i > 0 { + fmt.Sscanf(fields[i-1], "%d", &allocsOp) + } + } + + if nsOp > 0 { + results = append(results, RuntimeBenchmark{ + Name: name, + NsOp: nsOp, + BytesOp: bytesOp, + AllocsOp: allocsOp, + }) + } + } + + return results +} + +func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error { + baseline := runtimeBaseline{ + Timestamp: time.Now().UTC().Format(time.RFC3339), + Benchmarks: benchmarks, + } + data, err := json.MarshalIndent(baseline, "", " ") + if err != nil { + return err + } + return os.WriteFile(path, data, 0644) +} + +func loadRuntimeBaseline(path string) (*runtimeBaseline, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var baseline runtimeBaseline + if err := json.Unmarshal(data, &baseline); err != nil { + return nil, err + 
} + return &baseline, nil +} + +func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) { + if result.Created { + fmt.Printf("\n Created runtime baseline: %s\n", result.BaselinePath) + fmt.Printf(" Benchmarks: %d\n\n", len(result.Benchmarks)) + return + } + + fmt.Printf("\n Runtime Baseline Check\n\n") + + for _, b := range result.Benchmarks { + var status string + switch b.Status { + case "regression": + status = "\033[31mREGRESSION\033[0m" + case "warning": + status = "\033[33mWARNING\033[0m" + case "ok": + status = "\033[32mOK\033[0m" + case "new": + status = "\033[33mNEW\033[0m" + } + + if b.BaselineNs > 0 { + fmt.Printf(" %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n", + status, b.Name, b.BaselineNs, b.NsOp, b.Ratio) + } else { + fmt.Printf(" %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp) + } + } + + fmt.Println() + if result.Regressions > 0 { + fmt.Printf(" \033[31mRegressions: %d\033[0m\n\n", result.Regressions) + } else { + fmt.Printf(" \033[32mNo regressions\033[0m\n\n") + } +} diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go index eb2fe57..83e3f5c 100644 --- a/internal/benchmark/config.go +++ b/internal/benchmark/config.go @@ -118,6 +118,11 @@ type TuneConfig struct { Verbose bool } +type RuntimeConfig struct { + FailOnRegression bool + Verbose bool +} + func FindBenchmarkRoot() string { cwd, _ := os.Getwd() for d := cwd; d != "/"; d = filepath.Dir(d) { @@ -304,3 +309,12 @@ func ParseTuneFlags(args []string) TuneConfig { fs.Parse(args) return cfg } + +func ParseRuntimeFlags(args []string) RuntimeConfig { + fs := flag.NewFlagSet("runtime", flag.ExitOnError) + cfg := RuntimeConfig{} + fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + fs.Parse(args) + return cfg +} diff --git a/tests/benchmark/scripts/check-runtime-baseline.sh b/tests/benchmark/scripts/check-runtime-baseline.sh deleted file mode 100755 index 75bc4fc..0000000 --- 
a/tests/benchmark/scripts/check-runtime-baseline.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash -# -# Check Go benchmark results against runtime baseline. -# -# Usage: -# ./check-runtime-baseline.sh [--fail-on-regression] -# -# Runs Go benchmarks and compares against saved baseline. -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -BASELINES_DIR="${BENCHMARK_DIR}/baselines" -RESULTS_DIR="${BENCHMARK_DIR}/results" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" -PROJECT_ROOT="${BENCHMARK_DIR}/../.." - -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[0;33m' -NC='\033[0m' - -# Read tolerances from config -if [[ -f "$CONFIG_FILE" ]]; then - MAX_NS_RATIO=$(jq -r '.baseline.runtime.max_ns_op_regression_ratio // 1.25' "$CONFIG_FILE") - MAX_ALLOC_RATIO=$(jq -r '.baseline.runtime.max_alloc_regression_ratio // 1.25' "$CONFIG_FILE") -else - MAX_NS_RATIO=1.25 - MAX_ALLOC_RATIO=1.25 -fi - -# Parse args -FAIL_ON_REGRESSION=false -while [[ $# -gt 0 ]]; do - case "$1" in - --fail-on-regression) FAIL_ON_REGRESSION=true; shift ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -mkdir -p "${RESULTS_DIR}" -mkdir -p "${BASELINES_DIR}" - -BASELINE_FILE="${BASELINES_DIR}/runtime.json" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/runtime_${TIMESTAMP}.json" - -echo "Running Go benchmarks..." -echo "" - -# Run benchmarks -BENCH_OUTPUT=$(mktemp) -(cd "$PROJECT_ROOT" && go test -bench=. -benchmem ./internal/engine/... 2>&1) | tee "$BENCH_OUTPUT" - -# Parse benchmark output into JSON -echo "" -echo "Parsing results..." 
- -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" '{timestamp: $ts, benchmarks: []}' > "$REPORT_FILE" - -while IFS= read -r line; do - if [[ "$line" =~ ^Benchmark ]]; then - # Parse: BenchmarkName-N iterations ns/op bytes/op allocs/op - name=$(echo "$line" | awk '{print $1}' | sed 's/-[0-9]*$//') - ns_op=$(echo "$line" | grep -oE '[0-9.]+ ns/op' | awk '{print $1}' || echo "0") - bytes_op=$(echo "$line" | grep -oE '[0-9]+ B/op' | awk '{print $1}' || echo "0") - allocs_op=$(echo "$line" | grep -oE '[0-9]+ allocs/op' | awk '{print $1}' || echo "0") - - if [[ -n "$ns_op" ]] && [[ "$ns_op" != "0" ]]; then - tmp=$(mktemp) - jq --arg name "$name" \ - --argjson ns "$ns_op" \ - --argjson bytes "${bytes_op:-0}" \ - --argjson allocs "${allocs_op:-0}" \ - '.benchmarks += [{name: $name, ns_op: $ns, bytes_op: $bytes, allocs_op: $allocs}]' \ - "$REPORT_FILE" > "$tmp" - mv "$tmp" "$REPORT_FILE" - fi - fi -done < "$BENCH_OUTPUT" - -rm -f "$BENCH_OUTPUT" - -# If no baseline exists, create one -if [[ ! -f "$BASELINE_FILE" ]]; then - echo "" - echo "No runtime baseline found. Creating initial baseline..." 
- cp "$REPORT_FILE" "$BASELINE_FILE" - echo "Baseline saved to: $BASELINE_FILE" - exit 0 -fi - -# Compare against baseline -echo "" -echo "=== Comparing against baseline ===" -echo "" - -REGRESSIONS=0 - -for name in $(jq -r '.benchmarks[].name' "$REPORT_FILE"); do - baseline_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$BASELINE_FILE") - current_ns=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .ns_op // 0" "$REPORT_FILE") - - baseline_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$BASELINE_FILE") - current_allocs=$(jq -r ".benchmarks[] | select(.name == \"$name\") | .allocs_op // 0" "$REPORT_FILE") - - if [[ "$baseline_ns" == "0" ]] || [[ "$baseline_ns" == "null" ]]; then - echo -e "${YELLOW}NEW${NC} $name: ${current_ns} ns/op" - continue - fi - - ratio=$(echo "scale=4; $current_ns / $baseline_ns" | bc) - - if (( $(echo "$ratio > $MAX_NS_RATIO" | bc -l) )); then - echo -e "${RED}REGRESSION${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x, max: ${MAX_NS_RATIO}x)" - REGRESSIONS=$((REGRESSIONS + 1)) - elif (( $(echo "$ratio > 1.1" | bc -l) )); then - echo -e "${YELLOW}WARNING${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)" - else - echo -e "${GREEN}OK${NC} $name: ${baseline_ns} -> ${current_ns} ns/op (${ratio}x)" - fi -done - -echo "" -echo "================================================" -if [[ $REGRESSIONS -gt 0 ]]; then - echo -e "${RED}RUNTIME REGRESSIONS: $REGRESSIONS${NC}" - if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then - exit 1 - fi -else - echo -e "${GREEN}NO RUNTIME REGRESSIONS${NC}" -fi -echo "================================================" -echo "" -echo "Report: ${REPORT_FILE}" From b7ee014f3bd8a4fc94c1c7823904c05f64d57c41 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 17:39:48 +0100 Subject: [PATCH 09/14] chore: ignore generated baseline files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore 
b/.gitignore index 9a58d8e..419dfaa 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ cover.out tests/e2e/results/*.txt tests/benchmark/results/*.json tests/benchmark/results/*.md -tests/benchmark/baselines/*.backup.json \ No newline at end of file +tests/benchmark/baselines/*.json \ No newline at end of file From 9ef5b362c52659a3c9a3b9c93a08aa81634401f5 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 17:59:19 +0100 Subject: [PATCH 10/14] chore: simplify dev tool and update SKILL.md - Remove redundant ./dev loop (same as ./dev bench) - Add cmd/semantic-bench to architecture docs - Simplify benchmark improvement loop section --- README.md | 2 +- dev | 7 ---- scripts/check-docs-links.sh | 62 ++++++++++++++++++++++++++++++++++++ skills/semantic-dev/SKILL.md | 54 +++++++------------------------ 4 files changed, 74 insertions(+), 51 deletions(-) create mode 100755 scripts/check-docs-links.sh diff --git a/README.md b/README.md index 57e3053..83fb48e 100644 --- a/README.md +++ b/README.md @@ -204,7 +204,7 @@ The library uses only the Go standard library. No external dependencies, no mode ## Design Trade-offs -See [docs/DESIGN.md](docs/DESIGN.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration. +See [docs/architecture/design-decisions.md](docs/architecture/design-decisions.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration. 
## Origin diff --git a/dev b/dev index 987e04c..5d8c88d 100755 --- a/dev +++ b/dev @@ -33,7 +33,6 @@ commands=( "runtime:⏱️:Check runtime baseline" "tune:πŸŽ›οΈ:Tune combined weights" "e2e:🐳:Run E2E tests (Docker)" - "loop:πŸ”„:Benchmark loop (bench β†’ compare β†’ report)" ) show_help() { @@ -214,11 +213,6 @@ run_e2e() { bash scripts/e2e.sh } -run_loop() { - echo " ${ACCENT}${BOLD}πŸ”„ Benchmark Loop${NC}" - go run ./cmd/semantic-bench check -verbose "$@" -} - case "${1:-help}" in pr) run_pr ;; doctor) exec bash scripts/doctor.sh ;; @@ -258,6 +252,5 @@ case "${1:-help}" in runtime) shift; run_runtime "$@" ;; tune) shift; run_tune "$@" ;; e2e) run_e2e ;; - loop) run_loop ;; help|*) show_help ;; esac diff --git a/scripts/check-docs-links.sh b/scripts/check-docs-links.sh new file mode 100755 index 0000000..90a8738 --- /dev/null +++ b/scripts/check-docs-links.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# +# Check for broken documentation links +# +# Usage: +# ./scripts/check-docs-links.sh +# +set -uo pipefail + +cd "$(dirname "$0")/.." + +RED='\033[0;31m' +GREEN='\033[0;32m' +NC='\033[0m' + +ERRORS=0 + +echo "Checking documentation links..." +echo "" + +# Find all markdown files and check links +while IFS= read -r file; do + dir=$(dirname "$file") + + # Extract markdown links: [text](path) + while IFS= read -r link; do + # Skip URLs and anchors + if [[ "$link" =~ ^https?:// ]] || [[ "$link" =~ ^mailto: ]] || [[ "$link" =~ ^# ]]; then + continue + fi + + # Remove anchor from link + link_path="${link%%#*}" + + # Skip empty paths + if [[ -z "$link_path" ]]; then + continue + fi + + # Resolve relative path + if [[ "$link_path" =~ ^/ ]]; then + target="$link_path" + else + target="$dir/$link_path" + fi + + # Check if target exists + if [[ ! -e "$target" ]]; then + echo -e "${RED}BROKEN:${NC} $file -> $link" + ERRORS=$((ERRORS + 1)) + fi + done < <(grep -oE '\]\([^)]+\)' "$file" 2>/dev/null | sed 's/\](//' | sed 's/)//') +done < <(find . 
-name "*.md" -not -path "./.git/*" -not -path "./node_modules/*") + +echo "" +if [[ $ERRORS -eq 0 ]]; then + echo -e "${GREEN}βœ“${NC} All documentation links valid" + exit 0 +else + echo -e "${RED}Found $ERRORS broken link(s)${NC}" + exit 1 +fi diff --git a/skills/semantic-dev/SKILL.md b/skills/semantic-dev/SKILL.md index 7cbb684..2bea9dd 100644 --- a/skills/semantic-dev/SKILL.md +++ b/skills/semantic-dev/SKILL.md @@ -65,6 +65,7 @@ recovery/ Public subpackage failure.go FailureType classification cmd/semantic/main.go CLI tool (find, match, classify) +cmd/semantic-bench/ Benchmark CLI (check, baseline, calibrate, tune, runtime) ``` ## Key Design Decisions @@ -92,57 +93,24 @@ cmd/semantic/main.go CLI tool (find, match, classify) ## Benchmark Improvement Loop -When implementing changes that affect matching quality, follow this loop: - -### Step 1: Ensure baseline exists - -```bash -./dev baseline -``` - -Creates `tests/benchmark/baselines/combined.json` if missing. - -### Step 2: Implement change - -Make one focused improvement at a time. - -### Step 3: Run benchmark loop +When implementing changes that affect matching quality: ```bash -./dev loop +./dev baseline # create baseline (first time only) +# ... make changes ... +./dev bench # run benchmark, compare to baseline +./dev baseline update # accept new baseline (if improved) ``` -Shows comparison table with deltas: -- **Green (+)** = improved -- **Red (-)** = regressed -- **Gray** = unchanged - -### Step 4: Evaluate and decide - -| Result | Action | -|--------|--------| -| All metrics improved/unchanged | `./dev baseline update` | -| Mixed (some up, some down) | Investigate tradeoff | -| Key metrics regressed | Fix before merging | - -### Step 5: Iterate - -Repeat steps 2-4. Each `baseline update` sets new goalpost. - -### Key metrics - +**Key metrics:** - **MRR** β€” Mean Reciprocal Rank (higher = finds correct element faster) - **P@1** β€” Precision at 1 (is top result correct?) 
- **Hit@3** β€” Any correct result in top 3? -- **Margin** β€” Score gap between best correct and best wrong - -### Adding test cases - -When a query should work better: -1. Add to `tests/benchmark/corpus/*/queries.json` or `cases/*.json` -2. Run `./dev lint corpus` -3. Run `./dev loop` β€” benchmark will show regression until fixed +**Adding test cases:** +1. Add to `tests/benchmark/corpus/*/queries.json` +2. Run `./dev lint corpus` to validate +3. Run `./dev bench` β€” shows regression until fixed ## Public API Surface From 8786b2f430e99ae2b88ec02485234451443bd275 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 18:04:03 +0100 Subject: [PATCH 11/14] refactor: split benchmark commands.go into separate files - types.go: shared result types - check.go: RunCheck, PrintCheckResult - compare.go: RunCompare, PrintCompareResult - lint.go: RunLint, PrintLintResult - catalog.go: RunCatalog, PrintCatalogResult - baseline.go: baseline management - calibrate.go: threshold calibration - tune.go: weight tuning - runtime.go: Go benchmark performance --- internal/benchmark/baseline.go | 110 ++++ internal/benchmark/calibrate.go | 175 +++++ internal/benchmark/catalog.go | 75 +++ internal/benchmark/check.go | 237 +++++++ internal/benchmark/commands.go | 1082 ------------------------------- internal/benchmark/compare.go | 78 +++ internal/benchmark/lint.go | 68 ++ internal/benchmark/runtime.go | 217 +++++++ internal/benchmark/tune.go | 90 +++ internal/benchmark/types.go | 67 ++ 10 files changed, 1117 insertions(+), 1082 deletions(-) create mode 100644 internal/benchmark/baseline.go create mode 100644 internal/benchmark/calibrate.go create mode 100644 internal/benchmark/catalog.go create mode 100644 internal/benchmark/check.go delete mode 100644 internal/benchmark/commands.go create mode 100644 internal/benchmark/compare.go create mode 100644 internal/benchmark/lint.go create mode 100644 internal/benchmark/runtime.go create mode 100644 internal/benchmark/tune.go 
create mode 100644 internal/benchmark/types.go diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go new file mode 100644 index 0000000..de2a371 --- /dev/null +++ b/internal/benchmark/baseline.go @@ -0,0 +1,110 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "time" +) + +type BaselineResult struct { + Action string `json:"action"` + Path string `json:"path"` + Metrics OverallMetrics `json:"metrics"` + Previous *OverallMetrics `json:"previous,omitempty"` +} + +func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) { + root := FindBenchmarkRoot() + baselinesDir := filepath.Join(root, "baselines") + if err := os.MkdirAll(baselinesDir, 0755); err != nil { + return nil, err + } + + baselinePath := filepath.Join(baselinesDir, cfg.Name+".json") + + switch cfg.Action { + case "create": + return createBaseline(root, baselinePath, cfg) + case "update": + if !cfg.Accept { + return nil, fmt.Errorf("use --accept to confirm baseline update") + } + return updateBaseline(root, baselinePath, cfg) + default: + return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action) + } +} + +func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: 0.6, + EmbeddingWeight: 0.4, + Mode: "library", + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run benchmark: %w", err) + } + + data, err := json.MarshalIndent(report, "", " ") + if err != nil { + return nil, err + } + if err := os.WriteFile(baselinePath, data, 0644); err != nil { + return nil, err + } + + return &BaselineResult{ + Action: "create", + Path: baselinePath, + Metrics: report.Metrics.Overall, + }, nil +} + +func 
updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { + var previous *OverallMetrics + if data, err := os.ReadFile(baselinePath); err == nil { + var old Report + if json.Unmarshal(data, &old) == nil { + previous = &old.Metrics.Overall + } + backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json" + os.WriteFile(backupPath, data, 0644) + } + + result, err := createBaseline(root, baselinePath, cfg) + if err != nil { + return nil, err + } + result.Action = "update" + result.Previous = previous + return result, nil +} + +func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) { + fmt.Printf("\n Baseline %sd: %s\n\n", result.Action, result.Path) + fmt.Printf(" MRR: %.4f\n", result.Metrics.MRR) + fmt.Printf(" P@1: %.4f\n", result.Metrics.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Metrics.HitAt3) + + if result.Previous != nil { + fmt.Printf("\n Previous:\n") + fmt.Printf(" MRR: %.4f\n", result.Previous.MRR) + fmt.Printf(" P@1: %.4f\n", result.Previous.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Previous.HitAt3) + } + fmt.Println() +} diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go new file mode 100644 index 0000000..9c9fa33 --- /dev/null +++ b/internal/benchmark/calibrate.go @@ -0,0 +1,175 @@ +package benchmark + +import ( + "fmt" + + "github.com/pinchtab/semantic" +) + +type CalibrateResult struct { + ByThreshold map[string]ThresholdMetrics `json:"by_threshold"` + Recommendations CalibrateRecommendations `json:"recommendations"` + TotalCases int `json:"total_cases"` +} + +type ThresholdMetrics struct { + TP int `json:"tp"` + FP int `json:"fp"` + FN int `json:"fn"` + TN int `json:"tn"` + Recall float64 `json:"recall"` + Precision float64 `json:"precision"` + FPR float64 `json:"false_positive_rate"` + F1 float64 `json:"f1"` +} + +type CalibrateRecommendations struct { + DefaultThreshold float64 `json:"default_threshold"` + 
RecoveryThreshold float64 `json:"recovery_threshold"` + BestF1 float64 `json:"best_f1"` +} + +func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + result := &CalibrateResult{ + ByThreshold: make(map[string]ThresholdMetrics), + } + + type testCase struct { + query Query + corpus *Corpus + } + + var cases []testCase + for i := range ds.Corpora { + corpus := &ds.Corpora[i] + if cfg.Corpus != "" && corpus.ID != cfg.Corpus { + continue + } + for _, q := range corpus.Queries { + cases = append(cases, testCase{query: q, corpus: corpus}) + } + } + result.TotalCases = len(cases) + + if cfg.Verbose { + fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases)) + } + + runCfg := RunConfig{ + Strategy: "combined", + TopK: 5, + LexicalWeight: 0.6, + EmbeddingWeight: 0.4, + } + matcher := createMatcher(runCfg) + + var bestF1, bestF1Threshold float64 + var bestRecallThreshold float64 + var bestRecallWithPrecision float64 + + for _, threshold := range cfg.Thresholds { + tp, fp, fn, tn := 0, 0, 0, 0 + + for _, tc := range cases { + findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{ + Threshold: threshold, + TopK: 5, + }) + + hasMatch := len(findResult.Matches) > 0 + topRef := "" + if hasMatch { + topRef = findResult.Matches[0].Ref + } + + if tc.query.ExpectNoMatch { + if hasMatch { + fp++ + } else { + tn++ + } + } else if len(tc.query.RelevantRefs) > 0 { + if !hasMatch { + fn++ + } else if contains(tc.query.RelevantRefs, topRef) { + tp++ + } else { + fp++ + } + } + } + + totalPos := tp + fn + totalNeg := tn + fp + + var recall, precision, fpr, f1 float64 + if totalPos > 0 { + recall = float64(tp) / float64(totalPos) + } + if tp+fp > 0 { + precision = float64(tp) / float64(tp+fp) + } + if totalNeg > 0 { + fpr = float64(fp) / float64(totalNeg) + } + if 
precision+recall > 0 { + f1 = 2 * precision * recall / (precision + recall) + } + + key := fmt.Sprintf("%.2f", threshold) + result.ByThreshold[key] = ThresholdMetrics{ + TP: tp, FP: fp, FN: fn, TN: tn, + Recall: recall, Precision: precision, FPR: fpr, F1: f1, + } + + if f1 > bestF1 { + bestF1 = f1 + bestF1Threshold = threshold + } + if recall >= 0.85 && precision > bestRecallWithPrecision { + bestRecallWithPrecision = precision + bestRecallThreshold = threshold + } + + if cfg.Verbose { + fmt.Printf(" threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n", + threshold, tp, fp, fn, tn, recall, precision, f1) + } + } + + if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 { + bestRecallThreshold = cfg.Thresholds[0] + } + + result.Recommendations = CalibrateRecommendations{ + DefaultThreshold: bestF1Threshold, + RecoveryThreshold: bestRecallThreshold, + BestF1: bestF1, + } + + return result, nil +} + +func contains(refs []string, ref string) bool { + for _, r := range refs { + if r == ref { + return true + } + } + return false +} + +func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) { + fmt.Printf("\n Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold)) + + fmt.Printf(" Recommendations:\n") + fmt.Printf(" Default (best F1): %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1) + fmt.Printf(" Recovery (recall): %.2f\n", result.Recommendations.RecoveryThreshold) + fmt.Println() +} diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go new file mode 100644 index 0000000..b4c4ec1 --- /dev/null +++ b/internal/benchmark/catalog.go @@ -0,0 +1,75 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "sort" +) + +func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, err + } + + result := &CatalogResult{ + ByTag: 
make(map[string]int), + ByDifficulty: make(map[string]int), + } + + for _, c := range ds.Corpora { + tags := make(map[string]bool) + for _, q := range c.Queries { + result.TotalQueries++ + result.ByDifficulty[q.Difficulty]++ + for _, t := range q.Tags { + tags[t] = true + result.ByTag[t]++ + } + } + var tagList []string + for t := range tags { + tagList = append(tagList, t) + } + sort.Strings(tagList) + result.Corpora = append(result.Corpora, CorpusSummary{ + ID: c.ID, + Queries: len(c.Queries), + Tags: tagList, + }) + } + + return result, nil +} + +func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n Corpora: %d\n", len(result.Corpora)) + fmt.Printf(" Total Queries: %d\n\n", result.TotalQueries) + + fmt.Printf(" %-30s %8s\n", "Corpus", "Queries") + fmt.Printf(" %-30s %8s\n", "------", "-------") + for _, c := range result.Corpora { + fmt.Printf(" %-30s %8d\n", c.ID, c.Queries) + } + + switch cfg.By { + case "difficulty": + fmt.Printf("\n By Difficulty:\n") + for d, n := range result.ByDifficulty { + fmt.Printf(" %-10s %4d\n", d, n) + } + case "tag": + fmt.Printf("\n By Tag:\n") + for t, n := range result.ByTag { + fmt.Printf(" %-20s %4d\n", t, n) + } + } + fmt.Printf("\n") +} diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go new file mode 100644 index 0000000..81171bb --- /dev/null +++ b/internal/benchmark/check.go @@ -0,0 +1,237 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "time" +) + +func RunCheck(cfg CheckConfig) (*CheckResult, error) { + root := FindBenchmarkRoot() + + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + benchCfg, _ := LoadConfig(root) + profile := Profile{ + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + } + if 
benchCfg != nil { + profile = ResolveProfile(benchCfg, cfg.Profile) + } + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: profile.Strategy, + Threshold: profile.Threshold, + TopK: profile.TopK, + LexicalWeight: profile.Weights.Lexical, + EmbeddingWeight: profile.Weights.Embedding, + Profile: cfg.Profile, + Mode: "library", + Verbose: cfg.Verbose, + Explain: cfg.Explain, + OutputDir: cfg.OutputDir, + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run benchmark: %w", err) + } + + result := &CheckResult{ + Status: "pass", + Report: report, + } + result.Summary.PAt1 = report.Metrics.Overall.PAt1 + result.Summary.MRR = report.Metrics.Overall.MRR + result.Summary.HitAt3 = report.Metrics.Overall.HitAt3 + result.Summary.Total = report.Metrics.Overall.Total + + for _, r := range report.Results { + if r.Status == "miss" { + result.TopRegs = append(result.TopRegs, Regression{ + ID: r.ID, + Corpus: r.Corpus, + Query: r.Query, + Expected: r.Expected.RelevantRefs, + CurrentRef: r.Actual.BestRef, + Reason: "miss", + DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID), + }) + } + } + result.Summary.Regressions = len(result.TopRegs) + + baselinePath := cfg.BaselinePath + if baselinePath == "" { + baselinePath = filepath.Join(root, "baselines", "combined.json") + } + if _, err := os.Stat(baselinePath); err == nil { + baseline, err := loadReport(baselinePath) + if err == nil { + result.Delta = &MetricsDelta{ + PAt1: report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1, + MRR: report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, + HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, + } + if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) { + result.Status = "fail" + } + } + } + + os.MkdirAll(cfg.OutputDir, 0755) + ts := time.Now().Format("20060102_150405") + reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts)) + 
summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts)) + + reportJSON, _ := json.MarshalIndent(report, "", " ") + os.WriteFile(reportPath, reportJSON, 0644) + + summaryMD := generateSummaryMD(report, result) + os.WriteFile(summaryPath, []byte(summaryMD), 0644) + + result.Artifacts.ReportJSON = reportPath + result.Artifacts.SummaryMD = summaryPath + + return result, nil +} + +func RunBenchmark(cfg RunConfig) (*Report, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, err + } + return RunCorpusBenchmark(ds, cfg) +} + +func loadReport(path string) (*Report, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var r Report + if err := json.Unmarshal(data, &r); err != nil { + return nil, err + } + return &r, nil +} + +func generateSummaryMD(report *Report, result *CheckResult) string { + var sb strings.Builder + + sb.WriteString("# Benchmark Summary\n\n") + sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp)) + + sb.WriteString("## Overall Metrics\n\n") + sb.WriteString("| Metric | Value |\n") + sb.WriteString("|--------|-------|\n") + sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total)) + sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR)) + sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1)) + sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3)) + sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin)) + + if result.Delta != nil { + sb.WriteString("\n## Delta from Baseline\n\n") + sb.WriteString("| Metric | Delta |\n") + sb.WriteString("|--------|-------|\n") + sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1)) + sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR)) + sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3)) + } + + if len(result.TopRegs) > 0 { + 
sb.WriteString("\n## Misses\n\n") + sb.WriteString("| ID | Corpus | Query | Got | Expected |\n") + sb.WriteString("|----|--------|-------|-----|----------|\n") + for _, r := range result.TopRegs { + if len(result.TopRegs) > 10 { + break + } + sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n", + r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))) + } + } + + return sb.String() +} + +func PrintCheckResult(result *CheckResult, cfg CheckConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n") + if result.Status == "pass" { + fmt.Printf(" \033[32mβœ“\033[0m Benchmark passed\n") + } else { + fmt.Printf(" \033[31mβœ—\033[0m Benchmark failed\n") + } + fmt.Printf("\n") + + fmt.Printf(" %-12s %8.4f\n", "MRR", result.Summary.MRR) + fmt.Printf(" %-12s %8.4f\n", "P@1", result.Summary.PAt1) + fmt.Printf(" %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3) + fmt.Printf(" %-12s %8d\n", "Total", result.Summary.Total) + fmt.Printf(" %-12s %8d\n", "Misses", result.Summary.Regressions) + + if result.Delta != nil { + fmt.Printf("\n Delta from baseline:\n") + printDelta("P@1", result.Delta.PAt1) + printDelta("MRR", result.Delta.MRR) + printDelta("Hit@3", result.Delta.HitAt3) + } + + fmt.Printf("\n Artifacts:\n") + fmt.Printf(" Report: %s\n", result.Artifacts.ReportJSON) + fmt.Printf(" Summary: %s\n", result.Artifacts.SummaryMD) + fmt.Printf("\n") +} + +func printDelta(name string, delta float64) { + color := "\033[0m" + sign := "" + if delta > 0.001 { + color = "\033[32m" + sign = "+" + } else if delta < -0.001 { + color = "\033[31m" + } + fmt.Printf(" %s%-8s %s%.4f\033[0m\n", color, name, sign, delta) +} + +func PrintRunResult(report *Report, cfg RunConfig) { + fmt.Printf("\n") + fmt.Printf(" %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR) + fmt.Printf(" %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1) + fmt.Printf(" %-12s %8.4f\n", "Hit@3", 
report.Metrics.Overall.HitAt3) + fmt.Printf(" %-12s %8d\n", "Total", report.Metrics.Overall.Total) + fmt.Printf("\n") + + if cfg.Verbose { + for _, r := range report.Results { + status := "\033[32mHIT \033[0m" + switch r.Status { + case "miss": + status = "\033[31mMISS\033[0m" + case "partial": + status = "\033[33mPART\033[0m" + } + fmt.Printf(" [%s] %s | %s | got=%s score=%.3f\n", + r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore) + } + } +} diff --git a/internal/benchmark/commands.go b/internal/benchmark/commands.go deleted file mode 100644 index f537934..0000000 --- a/internal/benchmark/commands.go +++ /dev/null @@ -1,1082 +0,0 @@ -package benchmark - -import ( - "encoding/json" - "fmt" - "os" - "os/exec" - "path/filepath" - "sort" - "strings" - "time" - - "github.com/pinchtab/semantic" -) - -type CheckResult struct { - Status string `json:"status"` - Summary CheckSummary `json:"summary"` - Delta *MetricsDelta `json:"delta,omitempty"` - TopRegs []Regression `json:"top_regressions,omitempty"` - Artifacts Artifacts `json:"artifacts"` - Report *Report `json:"-"` -} - -type CheckSummary struct { - PAt1 float64 `json:"p_at_1"` - MRR float64 `json:"mrr"` - HitAt3 float64 `json:"hit_at_3"` - Total int `json:"total"` - Regressions int `json:"regressions"` - Warnings int `json:"warnings"` -} - -type MetricsDelta struct { - PAt1 float64 `json:"p_at_1"` - MRR float64 `json:"mrr"` - HitAt3 float64 `json:"hit_at_3"` -} - -type Regression struct { - ID string `json:"id"` - Corpus string `json:"corpus"` - Query string `json:"query"` - Expected []string `json:"expected"` - BaselineRef string `json:"baseline_ref,omitempty"` - CurrentRef string `json:"current_ref"` - Reason string `json:"reason"` - DebugCommand string `json:"debug_command"` -} - -type Artifacts struct { - ReportJSON string `json:"report_json"` - SummaryMD string `json:"summary_md"` -} - -type CompareResult struct { - Status string `json:"status"` - Delta MetricsDelta `json:"delta"` - Regressions 
[]Regression `json:"regressions"` - Improvements []string `json:"improvements"` -} - -type LintResult struct { - Errors int `json:"errors"` - Warnings int `json:"warnings"` - Messages []string `json:"messages"` -} - -type CatalogResult struct { - Corpora []CorpusSummary `json:"corpora"` - TotalQueries int `json:"total_queries"` - ByTag map[string]int `json:"by_tag,omitempty"` - ByDifficulty map[string]int `json:"by_difficulty,omitempty"` -} - -type CorpusSummary struct { - ID string `json:"id"` - Queries int `json:"queries"` - Tags []string `json:"tags"` -} - -func RunCheck(cfg CheckConfig) (*CheckResult, error) { - root := FindBenchmarkRoot() - - ds, err := LoadDataset(root) - if err != nil { - return nil, fmt.Errorf("load dataset: %w", err) - } - - benchCfg, _ := LoadConfig(root) - profile := Profile{ - Strategy: "combined", - Threshold: 0.01, - TopK: 5, - Weights: Weights{Lexical: 0.6, Embedding: 0.4}, - } - if benchCfg != nil { - profile = ResolveProfile(benchCfg, cfg.Profile) - } - - runCfg := RunConfig{ - Suite: "corpus", - Strategy: profile.Strategy, - Threshold: profile.Threshold, - TopK: profile.TopK, - LexicalWeight: profile.Weights.Lexical, - EmbeddingWeight: profile.Weights.Embedding, - Profile: cfg.Profile, - Mode: "library", - Verbose: cfg.Verbose, - Explain: cfg.Explain, - OutputDir: cfg.OutputDir, - } - - report, err := RunCorpusBenchmark(ds, runCfg) - if err != nil { - return nil, fmt.Errorf("run benchmark: %w", err) - } - - result := &CheckResult{ - Status: "pass", - Report: report, - } - result.Summary.PAt1 = report.Metrics.Overall.PAt1 - result.Summary.MRR = report.Metrics.Overall.MRR - result.Summary.HitAt3 = report.Metrics.Overall.HitAt3 - result.Summary.Total = report.Metrics.Overall.Total - - // Count misses - for _, r := range report.Results { - if r.Status == "miss" { - result.TopRegs = append(result.TopRegs, Regression{ - ID: r.ID, - Corpus: r.Corpus, - Query: r.Query, - Expected: r.Expected.RelevantRefs, - CurrentRef: r.Actual.BestRef, - 
Reason: "miss", - DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID), - }) - } - } - result.Summary.Regressions = len(result.TopRegs) - - // Compare to baseline if exists - baselinePath := cfg.BaselinePath - if baselinePath == "" { - baselinePath = filepath.Join(root, "baselines", "combined.json") - } - if _, err := os.Stat(baselinePath); err == nil { - baseline, err := loadReport(baselinePath) - if err == nil { - result.Delta = &MetricsDelta{ - PAt1: report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1, - MRR: report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, - HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, - } - if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) { - result.Status = "fail" - } - } - } - - // Write artifacts - os.MkdirAll(cfg.OutputDir, 0755) - ts := time.Now().Format("20060102_150405") - reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts)) - summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts)) - - reportJSON, _ := json.MarshalIndent(report, "", " ") - os.WriteFile(reportPath, reportJSON, 0644) - - summaryMD := generateSummaryMD(report, result) - os.WriteFile(summaryPath, []byte(summaryMD), 0644) - - result.Artifacts.ReportJSON = reportPath - result.Artifacts.SummaryMD = summaryPath - - return result, nil -} - -func RunBenchmark(cfg RunConfig) (*Report, error) { - root := FindBenchmarkRoot() - ds, err := LoadDataset(root) - if err != nil { - return nil, err - } - return RunCorpusBenchmark(ds, cfg) -} - -func RunCompare(cfg CompareConfig) (*CompareResult, error) { - baseline, err := loadReport(cfg.BaselinePath) - if err != nil { - return nil, fmt.Errorf("load baseline: %w", err) - } - current, err := loadReport(cfg.CurrentPath) - if err != nil { - return nil, fmt.Errorf("load current: %w", err) - } - - result := &CompareResult{ - Status: "pass", - Delta: MetricsDelta{ - PAt1: current.Metrics.Overall.PAt1 
- baseline.Metrics.Overall.PAt1, - MRR: current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, - HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, - }, - } - - if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 { - result.Status = "fail" - } - - // Find regressions - baselineResults := make(map[string]QueryResult) - for _, r := range baseline.Results { - baselineResults[r.ID] = r - } - for _, r := range current.Results { - if base, ok := baselineResults[r.ID]; ok { - if base.Status == "hit" && r.Status != "hit" { - result.Regressions = append(result.Regressions, Regression{ - ID: r.ID, - Corpus: r.Corpus, - Query: r.Query, - BaselineRef: base.Actual.BestRef, - CurrentRef: r.Actual.BestRef, - Reason: fmt.Sprintf("%s -> %s", base.Status, r.Status), - }) - } - } - } - - return result, nil -} - -func RunLint(cfg LintConfig) (*LintResult, error) { - root := FindBenchmarkRoot() - result := &LintResult{} - - ds, err := LoadDataset(root) - if err != nil { - result.Errors++ - result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err)) - return result, nil - } - - // Check for duplicate IDs - ids := make(map[string]string) - for _, c := range ds.Corpora { - for _, q := range c.Queries { - if existing, ok := ids[q.ID]; ok { - result.Errors++ - result.Messages = append(result.Messages, - fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing)) - } else { - ids[q.ID] = c.ID - } - } - } - - // Check refs exist - for _, c := range ds.Corpora { - refs := make(map[string]bool) - for _, d := range c.Snapshot { - refs[d.Ref] = true - } - for _, q := range c.Queries { - for _, r := range q.RelevantRefs { - if !refs[r] { - result.Errors++ - result.Messages = append(result.Messages, - fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r)) - } - } - } - } - - // Check difficulty values - validDiff := map[string]bool{"easy": true, "medium": true, "hard": true} - for 
_, c := range ds.Corpora { - for _, q := range c.Queries { - if q.Difficulty != "" && !validDiff[q.Difficulty] { - result.Errors++ - result.Messages = append(result.Messages, - fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID)) - } - } - } - - if result.Errors == 0 && result.Warnings == 0 { - result.Messages = append(result.Messages, "All checks passed") - } - - return result, nil -} - -func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) { - root := FindBenchmarkRoot() - ds, err := LoadDataset(root) - if err != nil { - return nil, err - } - - result := &CatalogResult{ - ByTag: make(map[string]int), - ByDifficulty: make(map[string]int), - } - - for _, c := range ds.Corpora { - tags := make(map[string]bool) - for _, q := range c.Queries { - result.TotalQueries++ - result.ByDifficulty[q.Difficulty]++ - for _, t := range q.Tags { - tags[t] = true - result.ByTag[t]++ - } - } - var tagList []string - for t := range tags { - tagList = append(tagList, t) - } - sort.Strings(tagList) - result.Corpora = append(result.Corpora, CorpusSummary{ - ID: c.ID, - Queries: len(c.Queries), - Tags: tagList, - }) - } - - return result, nil -} - -func loadReport(path string) (*Report, error) { - data, err := os.ReadFile(path) - if err != nil { - return nil, err - } - var r Report - if err := json.Unmarshal(data, &r); err != nil { - return nil, err - } - return &r, nil -} - -func generateSummaryMD(report *Report, result *CheckResult) string { - var sb strings.Builder - - sb.WriteString("# Benchmark Summary\n\n") - sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp)) - - sb.WriteString("## Overall Metrics\n\n") - sb.WriteString("| Metric | Value |\n") - sb.WriteString("|--------|-------|\n") - sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total)) - sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR)) - sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1)) - 
sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3)) - sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin)) - - if result.Delta != nil { - sb.WriteString("\n## Delta from Baseline\n\n") - sb.WriteString("| Metric | Delta |\n") - sb.WriteString("|--------|-------|\n") - sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1)) - sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR)) - sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3)) - } - - if len(result.TopRegs) > 0 { - sb.WriteString("\n## Misses\n\n") - sb.WriteString("| ID | Corpus | Query | Got | Expected |\n") - sb.WriteString("|----|--------|-------|-----|----------|\n") - for _, r := range result.TopRegs { - if len(result.TopRegs) > 10 { - break - } - sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n", - r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))) - } - } - - return sb.String() -} - -func PrintCheckResult(result *CheckResult, cfg CheckConfig) { - if cfg.Format == "json" { - data, _ := json.MarshalIndent(result, "", " ") - fmt.Println(string(data)) - return - } - - fmt.Printf("\n") - if result.Status == "pass" { - fmt.Printf(" \033[32mβœ“\033[0m Benchmark passed\n") - } else { - fmt.Printf(" \033[31mβœ—\033[0m Benchmark failed\n") - } - fmt.Printf("\n") - - fmt.Printf(" %-12s %8.4f\n", "MRR", result.Summary.MRR) - fmt.Printf(" %-12s %8.4f\n", "P@1", result.Summary.PAt1) - fmt.Printf(" %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3) - fmt.Printf(" %-12s %8d\n", "Total", result.Summary.Total) - fmt.Printf(" %-12s %8d\n", "Misses", result.Summary.Regressions) - - if result.Delta != nil { - fmt.Printf("\n Delta from baseline:\n") - printDelta("P@1", result.Delta.PAt1) - printDelta("MRR", result.Delta.MRR) - printDelta("Hit@3", result.Delta.HitAt3) - } - - fmt.Printf("\n Artifacts:\n") - fmt.Printf(" Report: %s\n", result.Artifacts.ReportJSON) - fmt.Printf(" Summary: %s\n", 
result.Artifacts.SummaryMD) - fmt.Printf("\n") -} - -func printDelta(name string, delta float64) { - color := "\033[0m" - sign := "" - if delta > 0.001 { - color = "\033[32m" - sign = "+" - } else if delta < -0.001 { - color = "\033[31m" - } - fmt.Printf(" %s%-8s %s%.4f\033[0m\n", color, name, sign, delta) -} - -func PrintRunResult(report *Report, cfg RunConfig) { - fmt.Printf("\n") - fmt.Printf(" %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR) - fmt.Printf(" %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1) - fmt.Printf(" %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3) - fmt.Printf(" %-12s %8d\n", "Total", report.Metrics.Overall.Total) - fmt.Printf("\n") - - if cfg.Verbose { - for _, r := range report.Results { - status := "\033[32mHIT \033[0m" - switch r.Status { - case "miss": - status = "\033[31mMISS\033[0m" - case "partial": - status = "\033[33mPART\033[0m" - } - fmt.Printf(" [%s] %s | %s | got=%s score=%.3f\n", - r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore) - } - } -} - -func PrintCompareResult(result *CompareResult, cfg CompareConfig) { - fmt.Printf("\n") - if result.Status == "pass" { - fmt.Printf(" \033[32mβœ“\033[0m No regression\n") - } else { - fmt.Printf(" \033[31mβœ—\033[0m Regression detected\n") - } - fmt.Printf("\n") - printDelta("P@1", result.Delta.PAt1) - printDelta("MRR", result.Delta.MRR) - printDelta("Hit@3", result.Delta.HitAt3) - - if len(result.Regressions) > 0 { - fmt.Printf("\n Regressions:\n") - for _, r := range result.Regressions { - fmt.Printf(" %s: %s (%s)\n", r.ID, r.Reason, r.Query) - } - } - fmt.Printf("\n") -} - -func PrintLintResult(result *LintResult, cfg LintConfig) { - for _, msg := range result.Messages { - fmt.Println(msg) - } - fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings) -} - -func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) { - if cfg.Format == "json" { - data, _ := json.MarshalIndent(result, "", " ") - fmt.Println(string(data)) - return - } - - 
fmt.Printf("\n Corpora: %d\n", len(result.Corpora)) - fmt.Printf(" Total Queries: %d\n\n", result.TotalQueries) - - fmt.Printf(" %-30s %8s\n", "Corpus", "Queries") - fmt.Printf(" %-30s %8s\n", "------", "-------") - for _, c := range result.Corpora { - fmt.Printf(" %-30s %8d\n", c.ID, c.Queries) - } - - switch cfg.By { - case "difficulty": - fmt.Printf("\n By Difficulty:\n") - for d, n := range result.ByDifficulty { - fmt.Printf(" %-10s %4d\n", d, n) - } - case "tag": - fmt.Printf("\n By Tag:\n") - for t, n := range result.ByTag { - fmt.Printf(" %-20s %4d\n", t, n) - } - } - fmt.Printf("\n") -} - -// Baseline management - -type BaselineResult struct { - Action string `json:"action"` - Path string `json:"path"` - Metrics OverallMetrics `json:"metrics"` - Previous *OverallMetrics `json:"previous,omitempty"` -} - -func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) { - root := FindBenchmarkRoot() - baselinesDir := filepath.Join(root, "baselines") - if err := os.MkdirAll(baselinesDir, 0755); err != nil { - return nil, err - } - - baselinePath := filepath.Join(baselinesDir, cfg.Name+".json") - - switch cfg.Action { - case "create": - return createBaseline(root, baselinePath, cfg) - case "update": - if !cfg.Accept { - return nil, fmt.Errorf("use --accept to confirm baseline update") - } - return updateBaseline(root, baselinePath, cfg) - default: - return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action) - } -} - -func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { - ds, err := LoadDataset(root) - if err != nil { - return nil, fmt.Errorf("load dataset: %w", err) - } - - runCfg := RunConfig{ - Suite: "corpus", - Strategy: "combined", - Threshold: 0.01, - TopK: 5, - LexicalWeight: 0.6, - EmbeddingWeight: 0.4, - Mode: "library", - } - - report, err := RunCorpusBenchmark(ds, runCfg) - if err != nil { - return nil, fmt.Errorf("run benchmark: %w", err) - } - - data, err := 
json.MarshalIndent(report, "", " ") - if err != nil { - return nil, err - } - if err := os.WriteFile(baselinePath, data, 0644); err != nil { - return nil, err - } - - return &BaselineResult{ - Action: "create", - Path: baselinePath, - Metrics: report.Metrics.Overall, - }, nil -} - -func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { - var previous *OverallMetrics - if data, err := os.ReadFile(baselinePath); err == nil { - var old Report - if json.Unmarshal(data, &old) == nil { - previous = &old.Metrics.Overall - } - backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json" - os.WriteFile(backupPath, data, 0644) - } - - result, err := createBaseline(root, baselinePath, cfg) - if err != nil { - return nil, err - } - result.Action = "update" - result.Previous = previous - return result, nil -} - -func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) { - fmt.Printf("\n Baseline %sd: %s\n\n", result.Action, result.Path) - fmt.Printf(" MRR: %.4f\n", result.Metrics.MRR) - fmt.Printf(" P@1: %.4f\n", result.Metrics.PAt1) - fmt.Printf(" Hit@3: %.4f\n", result.Metrics.HitAt3) - - if result.Previous != nil { - fmt.Printf("\n Previous:\n") - fmt.Printf(" MRR: %.4f\n", result.Previous.MRR) - fmt.Printf(" P@1: %.4f\n", result.Previous.PAt1) - fmt.Printf(" Hit@3: %.4f\n", result.Previous.HitAt3) - } - fmt.Println() -} - -// Threshold calibration - -type CalibrateResult struct { - ByThreshold map[string]ThresholdMetrics `json:"by_threshold"` - Recommendations CalibrateRecommendations `json:"recommendations"` - TotalCases int `json:"total_cases"` -} - -type ThresholdMetrics struct { - TP int `json:"tp"` - FP int `json:"fp"` - FN int `json:"fn"` - TN int `json:"tn"` - Recall float64 `json:"recall"` - Precision float64 `json:"precision"` - FPR float64 `json:"false_positive_rate"` - F1 float64 `json:"f1"` -} - -type CalibrateRecommendations struct { - 
DefaultThreshold float64 `json:"default_threshold"` - RecoveryThreshold float64 `json:"recovery_threshold"` - BestF1 float64 `json:"best_f1"` -} - -func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) { - root := FindBenchmarkRoot() - ds, err := LoadDataset(root) - if err != nil { - return nil, fmt.Errorf("load dataset: %w", err) - } - - result := &CalibrateResult{ - ByThreshold: make(map[string]ThresholdMetrics), - } - - type testCase struct { - query Query - corpus *Corpus - } - - var cases []testCase - for i := range ds.Corpora { - corpus := &ds.Corpora[i] - if cfg.Corpus != "" && corpus.ID != cfg.Corpus { - continue - } - for _, q := range corpus.Queries { - cases = append(cases, testCase{query: q, corpus: corpus}) - } - } - result.TotalCases = len(cases) - - if cfg.Verbose { - fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases)) - } - - runCfg := RunConfig{ - Strategy: "combined", - TopK: 5, - LexicalWeight: 0.6, - EmbeddingWeight: 0.4, - } - matcher := createMatcher(runCfg) - - var bestF1, bestF1Threshold float64 - var bestRecallThreshold float64 - var bestRecallWithPrecision float64 - - for _, threshold := range cfg.Thresholds { - tp, fp, fn, tn := 0, 0, 0, 0 - - for _, tc := range cases { - findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{ - Threshold: threshold, - TopK: 5, - }) - - hasMatch := len(findResult.Matches) > 0 - topRef := "" - if hasMatch { - topRef = findResult.Matches[0].Ref - } - - if tc.query.ExpectNoMatch { - if hasMatch { - fp++ - } else { - tn++ - } - } else if len(tc.query.RelevantRefs) > 0 { - if !hasMatch { - fn++ - } else if contains(tc.query.RelevantRefs, topRef) { - tp++ - } else { - fp++ - } - } - } - - totalPos := tp + fn - totalNeg := tn + fp - - var recall, precision, fpr, f1 float64 - if totalPos > 0 { - recall = float64(tp) / float64(totalPos) - } - if tp+fp > 0 { - precision = float64(tp) / float64(tp+fp) - } - if totalNeg > 0 { 
- fpr = float64(fp) / float64(totalNeg) - } - if precision+recall > 0 { - f1 = 2 * precision * recall / (precision + recall) - } - - key := fmt.Sprintf("%.2f", threshold) - result.ByThreshold[key] = ThresholdMetrics{ - TP: tp, FP: fp, FN: fn, TN: tn, - Recall: recall, Precision: precision, FPR: fpr, F1: f1, - } - - if f1 > bestF1 { - bestF1 = f1 - bestF1Threshold = threshold - } - if recall >= 0.85 && precision > bestRecallWithPrecision { - bestRecallWithPrecision = precision - bestRecallThreshold = threshold - } - - if cfg.Verbose { - fmt.Printf(" threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n", - threshold, tp, fp, fn, tn, recall, precision, f1) - } - } - - if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 { - bestRecallThreshold = cfg.Thresholds[0] - } - - result.Recommendations = CalibrateRecommendations{ - DefaultThreshold: bestF1Threshold, - RecoveryThreshold: bestRecallThreshold, - BestF1: bestF1, - } - - return result, nil -} - -func contains(refs []string, ref string) bool { - for _, r := range refs { - if r == ref { - return true - } - } - return false -} - -func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) { - fmt.Printf("\n Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold)) - - fmt.Printf(" Recommendations:\n") - fmt.Printf(" Default (best F1): %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1) - fmt.Printf(" Recovery (recall): %.2f\n", result.Recommendations.RecoveryThreshold) - fmt.Println() -} - -// Weight tuning - -type TuneResult struct { - Results []TuneRun `json:"results"` - Best *TuneRun `json:"best"` -} - -type TuneRun struct { - LexicalWeight float64 `json:"lexical_weight"` - EmbeddingWeight float64 `json:"embedding_weight"` - MRR float64 `json:"mrr"` - PAt1 float64 `json:"p_at_1"` - HitAt3 float64 `json:"hit_at_3"` -} - -func RunTune(cfg TuneConfig) (*TuneResult, error) { - root := FindBenchmarkRoot() - 
ds, err := LoadDataset(root) - if err != nil { - return nil, fmt.Errorf("load dataset: %w", err) - } - - result := &TuneResult{} - - if cfg.Verbose { - fmt.Printf(" %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3") - } - - for w := 0.0; w <= 1.0001; w += cfg.Step { - lexW := w - embW := 1.0 - w - - runCfg := RunConfig{ - Suite: "corpus", - Strategy: "combined", - Threshold: 0.01, - TopK: 5, - LexicalWeight: lexW, - EmbeddingWeight: embW, - Mode: "library", - } - - if cfg.Corpus != "" { - runCfg.Corpus = cfg.Corpus - } - - report, err := RunCorpusBenchmark(ds, runCfg) - if err != nil { - return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err) - } - - run := TuneRun{ - LexicalWeight: lexW, - EmbeddingWeight: embW, - MRR: report.Metrics.Overall.MRR, - PAt1: report.Metrics.Overall.PAt1, - HitAt3: report.Metrics.Overall.HitAt3, - } - result.Results = append(result.Results, run) - - if result.Best == nil || run.PAt1 > result.Best.PAt1 || - (run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) { - best := run - result.Best = &best - } - - if cfg.Verbose { - fmt.Printf(" %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n", - lexW, embW, run.MRR, run.PAt1, run.HitAt3) - } - } - - return result, nil -} - -func PrintTuneResult(result *TuneResult, cfg TuneConfig) { - fmt.Printf("\n Tested %d weight combinations\n\n", len(result.Results)) - - if result.Best != nil { - fmt.Printf(" Best weights:\n") - fmt.Printf(" Lexical: %.2f\n", result.Best.LexicalWeight) - fmt.Printf(" Embedding: %.2f\n", result.Best.EmbeddingWeight) - fmt.Printf(" MRR: %.4f\n", result.Best.MRR) - fmt.Printf(" P@1: %.4f\n", result.Best.PAt1) - fmt.Printf(" Hit@3: %.4f\n", result.Best.HitAt3) - } - fmt.Println() -} - -// Runtime baseline - -type RuntimeResult struct { - Status string `json:"status"` - Benchmarks []RuntimeBenchmark `json:"benchmarks"` - Regressions int `json:"regressions"` - BaselinePath string `json:"baseline_path"` - Created bool `json:"created"` -} - -type 
RuntimeBenchmark struct { - Name string `json:"name"` - NsOp float64 `json:"ns_op"` - BytesOp int `json:"bytes_op"` - AllocsOp int `json:"allocs_op"` - BaselineNs float64 `json:"baseline_ns,omitempty"` - Ratio float64 `json:"ratio,omitempty"` - Status string `json:"status"` -} - -type runtimeBaseline struct { - Timestamp string `json:"timestamp"` - Benchmarks []RuntimeBenchmark `json:"benchmarks"` -} - -func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) { - root := FindBenchmarkRoot() - baselinePath := filepath.Join(root, "baselines", "runtime.json") - - benchmarks, err := runGoBenchmarks() - if err != nil { - return nil, err - } - - result := &RuntimeResult{ - Status: "pass", - Benchmarks: benchmarks, - BaselinePath: baselinePath, - } - - if _, err := os.Stat(baselinePath); os.IsNotExist(err) { - if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil { - return nil, err - } - result.Created = true - return result, nil - } - - baseline, err := loadRuntimeBaseline(baselinePath) - if err != nil { - return nil, err - } - - baselineMap := make(map[string]RuntimeBenchmark) - for _, b := range baseline.Benchmarks { - baselineMap[b.Name] = b - } - - maxRatio := 1.25 - for i, b := range result.Benchmarks { - if base, ok := baselineMap[b.Name]; ok { - ratio := b.NsOp / base.NsOp - result.Benchmarks[i].BaselineNs = base.NsOp - result.Benchmarks[i].Ratio = ratio - - if ratio > maxRatio { - result.Benchmarks[i].Status = "regression" - result.Regressions++ - } else if ratio > 1.1 { - result.Benchmarks[i].Status = "warning" - } else { - result.Benchmarks[i].Status = "ok" - } - } else { - result.Benchmarks[i].Status = "new" - } - } - - if result.Regressions > 0 { - result.Status = "fail" - } - - return result, nil -} - -func runGoBenchmarks() ([]RuntimeBenchmark, error) { - root := FindBenchmarkRoot() - projectRoot := filepath.Join(root, "..", "..") - - cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...") - cmd.Dir = projectRoot 
- output, err := cmd.CombinedOutput() - if err != nil { - return nil, fmt.Errorf("go test failed: %w\n%s", err, output) - } - - return parseBenchOutput(string(output)), nil -} - -func parseBenchOutput(output string) []RuntimeBenchmark { - var results []RuntimeBenchmark - lines := strings.Split(output, "\n") - - for _, line := range lines { - if !strings.HasPrefix(line, "Benchmark") { - continue - } - - fields := strings.Fields(line) - if len(fields) < 3 { - continue - } - - name := strings.TrimSuffix(fields[0], "-8") - name = strings.TrimSuffix(name, "-10") - name = strings.TrimSuffix(name, "-12") - name = strings.TrimSuffix(name, "-16") - - var nsOp float64 - var bytesOp, allocsOp int - - for i, f := range fields { - if f == "ns/op" && i > 0 { - fmt.Sscanf(fields[i-1], "%f", &nsOp) - } - if f == "B/op" && i > 0 { - fmt.Sscanf(fields[i-1], "%d", &bytesOp) - } - if f == "allocs/op" && i > 0 { - fmt.Sscanf(fields[i-1], "%d", &allocsOp) - } - } - - if nsOp > 0 { - results = append(results, RuntimeBenchmark{ - Name: name, - NsOp: nsOp, - BytesOp: bytesOp, - AllocsOp: allocsOp, - }) - } - } - - return results -} - -func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error { - baseline := runtimeBaseline{ - Timestamp: time.Now().UTC().Format(time.RFC3339), - Benchmarks: benchmarks, - } - data, err := json.MarshalIndent(baseline, "", " ") - if err != nil { - return err - } - return os.WriteFile(path, data, 0644) -} - -func loadRuntimeBaseline(path string) (*runtimeBaseline, error) { - data, err := os.ReadFile(path) - if err != nil { - return nil, err - } - var baseline runtimeBaseline - if err := json.Unmarshal(data, &baseline); err != nil { - return nil, err - } - return &baseline, nil -} - -func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) { - if result.Created { - fmt.Printf("\n Created runtime baseline: %s\n", result.BaselinePath) - fmt.Printf(" Benchmarks: %d\n\n", len(result.Benchmarks)) - return - } - - fmt.Printf("\n Runtime 
Baseline Check\n\n") - - for _, b := range result.Benchmarks { - var status string - switch b.Status { - case "regression": - status = "\033[31mREGRESSION\033[0m" - case "warning": - status = "\033[33mWARNING\033[0m" - case "ok": - status = "\033[32mOK\033[0m" - case "new": - status = "\033[33mNEW\033[0m" - } - - if b.BaselineNs > 0 { - fmt.Printf(" %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n", - status, b.Name, b.BaselineNs, b.NsOp, b.Ratio) - } else { - fmt.Printf(" %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp) - } - } - - fmt.Println() - if result.Regressions > 0 { - fmt.Printf(" \033[31mRegressions: %d\033[0m\n\n", result.Regressions) - } else { - fmt.Printf(" \033[32mNo regressions\033[0m\n\n") - } -} diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go new file mode 100644 index 0000000..2b0a3d5 --- /dev/null +++ b/internal/benchmark/compare.go @@ -0,0 +1,78 @@ +package benchmark + +import ( + "encoding/json" + "fmt" +) + +func RunCompare(cfg CompareConfig) (*CompareResult, error) { + baseline, err := loadReport(cfg.BaselinePath) + if err != nil { + return nil, fmt.Errorf("load baseline: %w", err) + } + current, err := loadReport(cfg.CurrentPath) + if err != nil { + return nil, fmt.Errorf("load current: %w", err) + } + + result := &CompareResult{ + Status: "pass", + Delta: MetricsDelta{ + PAt1: current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1, + MRR: current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, + HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, + }, + } + + if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 { + result.Status = "fail" + } + + baselineResults := make(map[string]QueryResult) + for _, r := range baseline.Results { + baselineResults[r.ID] = r + } + for _, r := range current.Results { + if base, ok := baselineResults[r.ID]; ok { + if base.Status == "hit" && r.Status != "hit" { + result.Regressions = append(result.Regressions, Regression{ + ID: r.ID, + Corpus: r.Corpus, + 
Query: r.Query, + BaselineRef: base.Actual.BestRef, + CurrentRef: r.Actual.BestRef, + Reason: fmt.Sprintf("%s -> %s", base.Status, r.Status), + }) + } + } + } + + return result, nil +} + +func PrintCompareResult(result *CompareResult, cfg CompareConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n") + if result.Status == "pass" { + fmt.Printf(" \033[32mβœ“\033[0m No regression\n") + } else { + fmt.Printf(" \033[31mβœ—\033[0m Regression detected\n") + } + fmt.Printf("\n") + printDelta("P@1", result.Delta.PAt1) + printDelta("MRR", result.Delta.MRR) + printDelta("Hit@3", result.Delta.HitAt3) + + if len(result.Regressions) > 0 { + fmt.Printf("\n Regressions:\n") + for _, r := range result.Regressions { + fmt.Printf(" %s: %s (%s)\n", r.ID, r.Reason, r.Query) + } + } + fmt.Printf("\n") +} diff --git a/internal/benchmark/lint.go b/internal/benchmark/lint.go new file mode 100644 index 0000000..20565ce --- /dev/null +++ b/internal/benchmark/lint.go @@ -0,0 +1,68 @@ +package benchmark + +import "fmt" + +func RunLint(cfg LintConfig) (*LintResult, error) { + root := FindBenchmarkRoot() + result := &LintResult{} + + ds, err := LoadDataset(root) + if err != nil { + result.Errors++ + result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err)) + return result, nil + } + + ids := make(map[string]string) + for _, c := range ds.Corpora { + for _, q := range c.Queries { + if existing, ok := ids[q.ID]; ok { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing)) + } else { + ids[q.ID] = c.ID + } + } + } + + for _, c := range ds.Corpora { + refs := make(map[string]bool) + for _, d := range c.Snapshot { + refs[d.Ref] = true + } + for _, q := range c.Queries { + for _, r := range q.RelevantRefs { + if !refs[r] { + result.Errors++ + result.Messages = 
append(result.Messages, + fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r)) + } + } + } + } + + validDiff := map[string]bool{"easy": true, "medium": true, "hard": true} + for _, c := range ds.Corpora { + for _, q := range c.Queries { + if q.Difficulty != "" && !validDiff[q.Difficulty] { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID)) + } + } + } + + if result.Errors == 0 && result.Warnings == 0 { + result.Messages = append(result.Messages, "All checks passed") + } + + return result, nil +} + +func PrintLintResult(result *LintResult, cfg LintConfig) { + for _, msg := range result.Messages { + fmt.Println(msg) + } + fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings) +} diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go new file mode 100644 index 0000000..8e28dcb --- /dev/null +++ b/internal/benchmark/runtime.go @@ -0,0 +1,217 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +type RuntimeResult struct { + Status string `json:"status"` + Benchmarks []RuntimeBenchmark `json:"benchmarks"` + Regressions int `json:"regressions"` + BaselinePath string `json:"baseline_path"` + Created bool `json:"created"` +} + +type RuntimeBenchmark struct { + Name string `json:"name"` + NsOp float64 `json:"ns_op"` + BytesOp int `json:"bytes_op"` + AllocsOp int `json:"allocs_op"` + BaselineNs float64 `json:"baseline_ns,omitempty"` + Ratio float64 `json:"ratio,omitempty"` + Status string `json:"status"` +} + +type runtimeBaseline struct { + Timestamp string `json:"timestamp"` + Benchmarks []RuntimeBenchmark `json:"benchmarks"` +} + +func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) { + root := FindBenchmarkRoot() + baselinePath := filepath.Join(root, "baselines", "runtime.json") + + benchmarks, err := runGoBenchmarks() + if err != nil { + 
return nil, err + } + + result := &RuntimeResult{ + Status: "pass", + Benchmarks: benchmarks, + BaselinePath: baselinePath, + } + + if _, err := os.Stat(baselinePath); os.IsNotExist(err) { + if err := saveRuntimeBaseline(baselinePath, benchmarks); err != nil { + return nil, err + } + result.Created = true + return result, nil + } + + baseline, err := loadRuntimeBaseline(baselinePath) + if err != nil { + return nil, err + } + + baselineMap := make(map[string]RuntimeBenchmark) + for _, b := range baseline.Benchmarks { + baselineMap[b.Name] = b + } + + maxRatio := 1.25 + for i, b := range result.Benchmarks { + if base, ok := baselineMap[b.Name]; ok { + ratio := b.NsOp / base.NsOp + result.Benchmarks[i].BaselineNs = base.NsOp + result.Benchmarks[i].Ratio = ratio + + if ratio > maxRatio { + result.Benchmarks[i].Status = "regression" + result.Regressions++ + } else if ratio > 1.1 { + result.Benchmarks[i].Status = "warning" + } else { + result.Benchmarks[i].Status = "ok" + } + } else { + result.Benchmarks[i].Status = "new" + } + } + + if result.Regressions > 0 { + result.Status = "fail" + } + + return result, nil +} + +func runGoBenchmarks() ([]RuntimeBenchmark, error) { + root := FindBenchmarkRoot() + projectRoot := filepath.Join(root, "..", "..") + + cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...") + cmd.Dir = projectRoot + output, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("go test failed: %w\n%s", err, output) + } + + return parseBenchOutput(string(output)), nil +} + +func parseBenchOutput(output string) []RuntimeBenchmark { + var results []RuntimeBenchmark + lines := strings.Split(output, "\n") + + for _, line := range lines { + if !strings.HasPrefix(line, "Benchmark") { + continue + } + + fields := strings.Fields(line) + if len(fields) < 3 { + continue + } + + name := strings.TrimSuffix(fields[0], "-8") + name = strings.TrimSuffix(name, "-10") + name = strings.TrimSuffix(name, "-12") + name = 
strings.TrimSuffix(name, "-16") + + var nsOp float64 + var bytesOp, allocsOp int + + for i, f := range fields { + if f == "ns/op" && i > 0 { + fmt.Sscanf(fields[i-1], "%f", &nsOp) + } + if f == "B/op" && i > 0 { + fmt.Sscanf(fields[i-1], "%d", &bytesOp) + } + if f == "allocs/op" && i > 0 { + fmt.Sscanf(fields[i-1], "%d", &allocsOp) + } + } + + if nsOp > 0 { + results = append(results, RuntimeBenchmark{ + Name: name, + NsOp: nsOp, + BytesOp: bytesOp, + AllocsOp: allocsOp, + }) + } + } + + return results +} + +func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error { + baseline := runtimeBaseline{ + Timestamp: time.Now().UTC().Format(time.RFC3339), + Benchmarks: benchmarks, + } + data, err := json.MarshalIndent(baseline, "", " ") + if err != nil { + return err + } + return os.WriteFile(path, data, 0644) +} + +func loadRuntimeBaseline(path string) (*runtimeBaseline, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var baseline runtimeBaseline + if err := json.Unmarshal(data, &baseline); err != nil { + return nil, err + } + return &baseline, nil +} + +func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) { + if result.Created { + fmt.Printf("\n Created runtime baseline: %s\n", result.BaselinePath) + fmt.Printf(" Benchmarks: %d\n\n", len(result.Benchmarks)) + return + } + + fmt.Printf("\n Runtime Baseline Check\n\n") + + for _, b := range result.Benchmarks { + var status string + switch b.Status { + case "regression": + status = "\033[31mREGRESSION\033[0m" + case "warning": + status = "\033[33mWARNING\033[0m" + case "ok": + status = "\033[32mOK\033[0m" + case "new": + status = "\033[33mNEW\033[0m" + } + + if b.BaselineNs > 0 { + fmt.Printf(" %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n", + status, b.Name, b.BaselineNs, b.NsOp, b.Ratio) + } else { + fmt.Printf(" %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp) + } + } + + fmt.Println() + if result.Regressions > 0 { + fmt.Printf(" \033[31mRegressions: 
%d\033[0m\n\n", result.Regressions) + } else { + fmt.Printf(" \033[32mNo regressions\033[0m\n\n") + } +} diff --git a/internal/benchmark/tune.go b/internal/benchmark/tune.go new file mode 100644 index 0000000..7db259b --- /dev/null +++ b/internal/benchmark/tune.go @@ -0,0 +1,90 @@ +package benchmark + +import "fmt" + +type TuneResult struct { + Results []TuneRun `json:"results"` + Best *TuneRun `json:"best"` +} + +type TuneRun struct { + LexicalWeight float64 `json:"lexical_weight"` + EmbeddingWeight float64 `json:"embedding_weight"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + HitAt3 float64 `json:"hit_at_3"` +} + +func RunTune(cfg TuneConfig) (*TuneResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + result := &TuneResult{} + + if cfg.Verbose { + fmt.Printf(" %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3") + } + + for w := 0.0; w <= 1.0001; w += cfg.Step { + lexW := w + embW := 1.0 - w + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: lexW, + EmbeddingWeight: embW, + Mode: "library", + } + + if cfg.Corpus != "" { + runCfg.Corpus = cfg.Corpus + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err) + } + + run := TuneRun{ + LexicalWeight: lexW, + EmbeddingWeight: embW, + MRR: report.Metrics.Overall.MRR, + PAt1: report.Metrics.Overall.PAt1, + HitAt3: report.Metrics.Overall.HitAt3, + } + result.Results = append(result.Results, run) + + if result.Best == nil || run.PAt1 > result.Best.PAt1 || + (run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) { + best := run + result.Best = &best + } + + if cfg.Verbose { + fmt.Printf(" %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n", + lexW, embW, run.MRR, run.PAt1, run.HitAt3) + } + } + + return result, nil +} + +func PrintTuneResult(result 
*TuneResult, cfg TuneConfig) { + fmt.Printf("\n Tested %d weight combinations\n\n", len(result.Results)) + + if result.Best != nil { + fmt.Printf(" Best weights:\n") + fmt.Printf(" Lexical: %.2f\n", result.Best.LexicalWeight) + fmt.Printf(" Embedding: %.2f\n", result.Best.EmbeddingWeight) + fmt.Printf(" MRR: %.4f\n", result.Best.MRR) + fmt.Printf(" P@1: %.4f\n", result.Best.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Best.HitAt3) + } + fmt.Println() +} diff --git a/internal/benchmark/types.go b/internal/benchmark/types.go new file mode 100644 index 0000000..916978a --- /dev/null +++ b/internal/benchmark/types.go @@ -0,0 +1,67 @@ +package benchmark + +type CheckResult struct { + Status string `json:"status"` + Summary CheckSummary `json:"summary"` + Delta *MetricsDelta `json:"delta,omitempty"` + TopRegs []Regression `json:"top_regressions,omitempty"` + Artifacts Artifacts `json:"artifacts"` + Report *Report `json:"-"` +} + +type CheckSummary struct { + PAt1 float64 `json:"p_at_1"` + MRR float64 `json:"mrr"` + HitAt3 float64 `json:"hit_at_3"` + Total int `json:"total"` + Regressions int `json:"regressions"` + Warnings int `json:"warnings"` +} + +type MetricsDelta struct { + PAt1 float64 `json:"p_at_1"` + MRR float64 `json:"mrr"` + HitAt3 float64 `json:"hit_at_3"` +} + +type Regression struct { + ID string `json:"id"` + Corpus string `json:"corpus"` + Query string `json:"query"` + Expected []string `json:"expected"` + BaselineRef string `json:"baseline_ref,omitempty"` + CurrentRef string `json:"current_ref"` + Reason string `json:"reason"` + DebugCommand string `json:"debug_command"` +} + +type Artifacts struct { + ReportJSON string `json:"report_json"` + SummaryMD string `json:"summary_md"` +} + +type CompareResult struct { + Status string `json:"status"` + Delta MetricsDelta `json:"delta"` + Regressions []Regression `json:"regressions"` + Improvements []string `json:"improvements"` +} + +type LintResult struct { + Errors int `json:"errors"` + Warnings int 
`json:"warnings"` + Messages []string `json:"messages"` +} + +type CatalogResult struct { + Corpora []CorpusSummary `json:"corpora"` + TotalQueries int `json:"total_queries"` + ByTag map[string]int `json:"by_tag,omitempty"` + ByDifficulty map[string]int `json:"by_difficulty,omitempty"` +} + +type CorpusSummary struct { + ID string `json:"id"` + Queries int `json:"queries"` + Tags []string `json:"tags"` +} From 4f166857d1cfb1818b07c7651b5c9fd1579622f9 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 18:50:28 +0100 Subject: [PATCH 12/14] fix: resolve golangci-lint errors in benchmark package - Fix unchecked error returns (errcheck) - Convert if-else chains to switch statements (gocritic) - Use context.Background() instead of nil context (staticcheck) - Replace WriteString(fmt.Sprintf) with fmt.Fprintf (staticcheck) --- dev | 14 +++++++-- internal/benchmark/baseline.go | 2 +- internal/benchmark/calibrate.go | 28 +++++++++--------- internal/benchmark/check.go | 28 +++++++++--------- internal/benchmark/config.go | 50 ++++++++++++++++----------------- internal/benchmark/dataset.go | 8 +++--- internal/benchmark/runner.go | 50 +++++++++++++++++---------------- internal/benchmark/runtime.go | 13 +++++---- recovery/benchmark_test.go | 2 +- 9 files changed, 103 insertions(+), 92 deletions(-) diff --git a/dev b/dev index 5d8c88d..11d53d9 100755 --- a/dev +++ b/dev @@ -128,9 +128,19 @@ run_check() { if [ -n "$unformatted" ]; then echo " ${ERROR}βœ—${NC} Unformatted files:" echo "$unformatted" - exit 1 + echo "" + printf " Fix formatting now? (Y/n) " + read -r answer + if [ "$answer" != "n" ] && [ "$answer" != "N" ]; then + gofmt -w . + echo " ${SUCCESS}βœ“${NC} Format (fixed)" + else + echo " ${MUTED}Run: gofmt -w .${NC}" + exit 1 + fi + else + echo " ${SUCCESS}βœ“${NC} Format" fi - echo " ${SUCCESS}βœ“${NC} Format" echo " ${MUTED}2/4 Vet${NC}" go vet ./... 
diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go index de2a371..07cc418 100644 --- a/internal/benchmark/baseline.go +++ b/internal/benchmark/baseline.go @@ -82,7 +82,7 @@ func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*Baseline previous = &old.Metrics.Overall } backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json" - os.WriteFile(backupPath, data, 0644) + _ = os.WriteFile(backupPath, data, 0644) } result, err := createBaseline(root, baselinePath, cfg) diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go index 9c9fa33..48ec06e 100644 --- a/internal/benchmark/calibrate.go +++ b/internal/benchmark/calibrate.go @@ -1,6 +1,7 @@ package benchmark import ( + "context" "fmt" "github.com/pinchtab/semantic" @@ -77,7 +78,7 @@ func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) { tp, fp, fn, tn := 0, 0, 0, 0 for _, tc := range cases { - findResult, _ := matcher.Find(nil, tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{ + findResult, _ := matcher.Find(context.Background(), tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{ Threshold: threshold, TopK: 5, }) @@ -88,20 +89,17 @@ func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) { topRef = findResult.Matches[0].Ref } - if tc.query.ExpectNoMatch { - if hasMatch { - fp++ - } else { - tn++ - } - } else if len(tc.query.RelevantRefs) > 0 { - if !hasMatch { - fn++ - } else if contains(tc.query.RelevantRefs, topRef) { - tp++ - } else { - fp++ - } + switch { + case tc.query.ExpectNoMatch && hasMatch: + fp++ + case tc.query.ExpectNoMatch && !hasMatch: + tn++ + case len(tc.query.RelevantRefs) > 0 && !hasMatch: + fn++ + case len(tc.query.RelevantRefs) > 0 && contains(tc.query.RelevantRefs, topRef): + tp++ + case len(tc.query.RelevantRefs) > 0: + fp++ } } diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go index 81171bb..e2ceedc 100644 
--- a/internal/benchmark/check.go +++ b/internal/benchmark/check.go @@ -89,16 +89,16 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) { } } - os.MkdirAll(cfg.OutputDir, 0755) + _ = os.MkdirAll(cfg.OutputDir, 0755) ts := time.Now().Format("20060102_150405") reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts)) summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts)) reportJSON, _ := json.MarshalIndent(report, "", " ") - os.WriteFile(reportPath, reportJSON, 0644) + _ = os.WriteFile(reportPath, reportJSON, 0644) summaryMD := generateSummaryMD(report, result) - os.WriteFile(summaryPath, []byte(summaryMD), 0644) + _ = os.WriteFile(summaryPath, []byte(summaryMD), 0644) result.Artifacts.ReportJSON = reportPath result.Artifacts.SummaryMD = summaryPath @@ -131,24 +131,24 @@ func generateSummaryMD(report *Report, result *CheckResult) string { var sb strings.Builder sb.WriteString("# Benchmark Summary\n\n") - sb.WriteString(fmt.Sprintf("Generated: %s\n\n", report.Run.Timestamp)) + fmt.Fprintf(&sb, "Generated: %s\n\n", report.Run.Timestamp) sb.WriteString("## Overall Metrics\n\n") sb.WriteString("| Metric | Value |\n") sb.WriteString("|--------|-------|\n") - sb.WriteString(fmt.Sprintf("| Total | %d |\n", report.Metrics.Overall.Total)) - sb.WriteString(fmt.Sprintf("| MRR | %.4f |\n", report.Metrics.Overall.MRR)) - sb.WriteString(fmt.Sprintf("| P@1 | %.4f |\n", report.Metrics.Overall.PAt1)) - sb.WriteString(fmt.Sprintf("| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3)) - sb.WriteString(fmt.Sprintf("| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin)) + fmt.Fprintf(&sb, "| Total | %d |\n", report.Metrics.Overall.Total) + fmt.Fprintf(&sb, "| MRR | %.4f |\n", report.Metrics.Overall.MRR) + fmt.Fprintf(&sb, "| P@1 | %.4f |\n", report.Metrics.Overall.PAt1) + fmt.Fprintf(&sb, "| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3) + fmt.Fprintf(&sb, "| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin) if result.Delta != 
nil { sb.WriteString("\n## Delta from Baseline\n\n") sb.WriteString("| Metric | Delta |\n") sb.WriteString("|--------|-------|\n") - sb.WriteString(fmt.Sprintf("| P@1 | %+.4f |\n", result.Delta.PAt1)) - sb.WriteString(fmt.Sprintf("| MRR | %+.4f |\n", result.Delta.MRR)) - sb.WriteString(fmt.Sprintf("| Hit@3 | %+.4f |\n", result.Delta.HitAt3)) + fmt.Fprintf(&sb, "| P@1 | %+.4f |\n", result.Delta.PAt1) + fmt.Fprintf(&sb, "| MRR | %+.4f |\n", result.Delta.MRR) + fmt.Fprintf(&sb, "| Hit@3 | %+.4f |\n", result.Delta.HitAt3) } if len(result.TopRegs) > 0 { @@ -159,8 +159,8 @@ func generateSummaryMD(report *Report, result *CheckResult) string { if len(result.TopRegs) > 10 { break } - sb.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n", - r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ","))) + fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n", + r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")) } } diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go index 83e3f5c..e41fe1c 100644 --- a/internal/benchmark/config.go +++ b/internal/benchmark/config.go @@ -19,16 +19,16 @@ type DefaultsConfig struct { } type Profile struct { - Strategy string `json:"strategy"` - Threshold float64 `json:"threshold"` - TopK int `json:"top_k"` - Weights Weights `json:"weights"` - Suites []string `json:"suites"` - Mode string `json:"mode"` - Inherits string `json:"inherits"` - Verbose bool `json:"verbose"` - Explain bool `json:"explain"` - FailOnReg bool `json:"fail_on_regression"` + Strategy string `json:"strategy"` + Threshold float64 `json:"threshold"` + TopK int `json:"top_k"` + Weights Weights `json:"weights"` + Suites []string `json:"suites"` + Mode string `json:"mode"` + Inherits string `json:"inherits"` + Verbose bool `json:"verbose"` + Explain bool `json:"explain"` + FailOnReg bool `json:"fail_on_regression"` } type Weights struct { @@ -42,16 +42,16 @@ type BaselineConfig struct { } type BaselineQuality struct { - 
MaxOverallPAt1Drop float64 `json:"max_overall_p_at_1_drop"` - MaxOverallMRRDrop float64 `json:"max_overall_mrr_drop"` + MaxOverallPAt1Drop float64 `json:"max_overall_p_at_1_drop"` + MaxOverallMRRDrop float64 `json:"max_overall_mrr_drop"` MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"` - MaxCorpusPAt1Drop float64 `json:"max_corpus_p_at_1_drop"` - MaxTagPAt1Drop float64 `json:"max_tag_p_at_1_drop"` + MaxCorpusPAt1Drop float64 `json:"max_corpus_p_at_1_drop"` + MaxTagPAt1Drop float64 `json:"max_tag_p_at_1_drop"` } type BaselineRuntime struct { - MaxNsOpRegressionRatio float64 `json:"max_ns_op_regression_ratio"` - MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"` + MaxNsOpRegressionRatio float64 `json:"max_ns_op_regression_ratio"` + MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"` } type CheckConfig struct { @@ -200,7 +200,7 @@ func ParseCheckFlags(args []string) CheckConfig { fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks") fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details") fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations") - fs.Parse(args) + _ = fs.Parse(args) return cfg } @@ -231,7 +231,7 @@ func ParseRunFlags(args []string) RunConfig { fs.BoolVar(&cfg.Explain, "explain", false, "include explanations") fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory") fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name") - fs.Parse(args) + _ = fs.Parse(args) return cfg } @@ -244,7 +244,7 @@ func ParseCompareFlags(args []string) CompareConfig { fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)") fs.StringVar(&cfg.Format, "format", cfg.Format, "output format") fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") - fs.Parse(args) + _ = fs.Parse(args) return cfg } @@ -255,7 +255,7 @@ func ParseLintFlags(args []string) LintConfig { } fs.StringVar(&cfg.Format, "format", cfg.Format, "output format") 
fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") - fs.Parse(args) + _ = fs.Parse(args) return cfg } @@ -266,7 +266,7 @@ func ParseCatalogFlags(args []string) CatalogConfig { } fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)") fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)") - fs.Parse(args) + _ = fs.Parse(args) return cfg } @@ -279,7 +279,7 @@ func ParseBaselineFlags(args []string) BaselineCmdConfig { fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name") fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)") fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") - fs.Parse(args) + _ = fs.Parse(args) if len(fs.Args()) > 0 { cfg.Action = fs.Args()[0] @@ -294,7 +294,7 @@ func ParseCalibrateFlags(args []string) CalibrateConfig { } fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test") fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") - fs.Parse(args) + _ = fs.Parse(args) return cfg } @@ -306,7 +306,7 @@ func ParseTuneFlags(args []string) TuneConfig { fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against") fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)") fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") - fs.Parse(args) + _ = fs.Parse(args) return cfg } @@ -315,6 +315,6 @@ func ParseRuntimeFlags(args []string) RuntimeConfig { cfg := RuntimeConfig{} fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression") fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") - fs.Parse(args) + _ = fs.Parse(args) return cfg } diff --git a/internal/benchmark/dataset.go b/internal/benchmark/dataset.go index 555b503..86c5014 100644 --- a/internal/benchmark/dataset.go +++ b/internal/benchmark/dataset.go @@ -25,10 +25,10 @@ type Query struct { } type Corpus struct { - ID string - Path string - Snapshot []semantic.ElementDescriptor - Queries []Query + ID string + Path string + Snapshot 
[]semantic.ElementDescriptor + Queries []Query } type Dataset struct { diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go index 391cc0a..f5b3a7d 100644 --- a/internal/benchmark/runner.go +++ b/internal/benchmark/runner.go @@ -8,14 +8,14 @@ import ( ) type QueryResult struct { - ID string `json:"id"` - Corpus string `json:"corpus"` - Query string `json:"query"` - Difficulty string `json:"difficulty"` - Tags []string `json:"tags"` - Intent string `json:"intent,omitempty"` - PageType string `json:"page_type,omitempty"` - Expected struct { + ID string `json:"id"` + Corpus string `json:"corpus"` + Query string `json:"query"` + Difficulty string `json:"difficulty"` + Tags []string `json:"tags"` + Intent string `json:"intent,omitempty"` + PageType string `json:"page_type,omitempty"` + Expected struct { RelevantRefs []string `json:"relevant_refs"` PartiallyRelevantRefs []string `json:"partially_relevant_refs"` } `json:"expected"` @@ -36,7 +36,7 @@ type QueryResult struct { Margin float64 `json:"margin"` } `json:"metrics"` Latency struct { - LibraryMs int64 `json:"library_ms"` + LibraryMs int64 `json:"library_ms"` CLIMs *int64 `json:"cli_ms,omitempty"` } `json:"latency"` Status string `json:"status"` @@ -60,10 +60,10 @@ type Report struct { Command string `json:"command"` } `json:"run"` Dataset struct { - Name string `json:"name"` - Version string `json:"version,omitempty"` - QueryCount int `json:"query_count"` - CorpusCount int `json:"corpus_count"` + Name string `json:"name"` + Version string `json:"version,omitempty"` + QueryCount int `json:"query_count"` + CorpusCount int `json:"corpus_count"` } `json:"dataset"` Config struct { Profile string `json:"profile"` @@ -74,11 +74,11 @@ type Report struct { } `json:"config"` Status string `json:"status"` Metrics struct { - Overall OverallMetrics `json:"overall"` - Latency LatencyMetrics `json:"latency"` - ByCorpus map[string]CorpusMetrics `json:"by_corpus"` + Overall OverallMetrics `json:"overall"` + 
Latency LatencyMetrics `json:"latency"` + ByCorpus map[string]CorpusMetrics `json:"by_corpus"` ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"` - ByTag map[string]CorpusMetrics `json:"by_tag"` + ByTag map[string]CorpusMetrics `json:"by_tag"` } `json:"metrics"` Results []QueryResult `json:"results"` } @@ -243,7 +243,8 @@ func computeQueryMetrics(result *QueryResult, query Query) { if i >= 5 { break } - if relevantSet[m.Ref] { + switch { + case relevantSet[m.Ref]: if result.Metrics.BestRelevantRank == nil { rank := i + 1 result.Metrics.BestRelevantRank = &rank @@ -256,11 +257,11 @@ func computeQueryMetrics(result *QueryResult, query Query) { result.Metrics.HitAt3 = 1 } result.Metrics.HitAt5 = 1 - } else if partialSet[m.Ref] { + case partialSet[m.Ref]: if i < 3 { partialInTop3++ } - } else { + default: if m.Score > result.Metrics.BestWrongScore { result.Metrics.BestWrongScore = m.Score } @@ -270,17 +271,18 @@ func computeQueryMetrics(result *QueryResult, query Query) { result.Metrics.Margin = result.Metrics.BestRelevantScore - result.Metrics.BestWrongScore // Status - if query.ExpectNoMatch { + switch { + case query.ExpectNoMatch: if len(result.Actual.Matches) == 0 { result.Status = "no_match_expected" } else { result.Status = "unexpected_match" } - } else if result.Metrics.PAt1 >= 1.0 { + case result.Metrics.PAt1 >= 1.0: result.Status = "hit" - } else if result.Metrics.PAt1 >= 0.5 { + case result.Metrics.PAt1 >= 0.5: result.Status = "partial" - } else { + default: result.Status = "miss" } } diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go index 8e28dcb..e7622f1 100644 --- a/internal/benchmark/runtime.go +++ b/internal/benchmark/runtime.go @@ -73,12 +73,13 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) { result.Benchmarks[i].BaselineNs = base.NsOp result.Benchmarks[i].Ratio = ratio - if ratio > maxRatio { + switch { + case ratio > maxRatio: result.Benchmarks[i].Status = "regression" result.Regressions++ - } else if 
ratio > 1.1 { + case ratio > 1.1: result.Benchmarks[i].Status = "warning" - } else { + default: result.Benchmarks[i].Status = "ok" } } else { @@ -131,13 +132,13 @@ func parseBenchOutput(output string) []RuntimeBenchmark { for i, f := range fields { if f == "ns/op" && i > 0 { - fmt.Sscanf(fields[i-1], "%f", &nsOp) + _, _ = fmt.Sscanf(fields[i-1], "%f", &nsOp) } if f == "B/op" && i > 0 { - fmt.Sscanf(fields[i-1], "%d", &bytesOp) + _, _ = fmt.Sscanf(fields[i-1], "%d", &bytesOp) } if f == "allocs/op" && i > 0 { - fmt.Sscanf(fields[i-1], "%d", &allocsOp) + _, _ = fmt.Sscanf(fields[i-1], "%d", &allocsOp) } } diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go index 9670a68..1261dd6 100644 --- a/recovery/benchmark_test.go +++ b/recovery/benchmark_test.go @@ -237,7 +237,7 @@ func runBenchmarkScenarioB(b *testing.B, matcher semantic.ElementMatcher, sc Ben err := fmt.Errorf("could not find node with id %s", sc.OriginalRef) - re.AttemptWithClassification( + _, _, _ = re.AttemptWithClassification( context.Background(), "test-tab", sc.OriginalRef, From b37cd438aa18e035fbb24754cfbb9125b8ca224a Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 22:52:43 +0100 Subject: [PATCH 13/14] feat: config-driven thresholds, validation, and deterministic output --- internal/benchmark/catalog.go | 19 ++- internal/benchmark/check.go | 45 +++++- internal/benchmark/compare.go | 11 ++ internal/benchmark/config.go | 239 ++++++++++++++++++++++++++++-- internal/benchmark/config_test.go | 147 ++++++++++++++++++ internal/benchmark/runner.go | 84 ++++++++++- internal/benchmark/runtime.go | 42 +++++- internal/engine/benchmark_test.go | 119 ++++++++++++++- 8 files changed, 673 insertions(+), 33 deletions(-) create mode 100644 internal/benchmark/config_test.go diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go index b4c4ec1..69a3091 100644 --- a/internal/benchmark/catalog.go +++ b/internal/benchmark/catalog.go @@ -62,14 +62,25 @@ func 
PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) { switch cfg.By { case "difficulty": fmt.Printf("\n By Difficulty:\n") - for d, n := range result.ByDifficulty { - fmt.Printf(" %-10s %4d\n", d, n) + diffs := sortedKeys(result.ByDifficulty) + for _, d := range diffs { + fmt.Printf(" %-10s %4d\n", d, result.ByDifficulty[d]) } case "tag": fmt.Printf("\n By Tag:\n") - for t, n := range result.ByTag { - fmt.Printf(" %-20s %4d\n", t, n) + tags := sortedKeys(result.ByTag) + for _, t := range tags { + fmt.Printf(" %-20s %4d\n", t, result.ByTag[t]) } } fmt.Printf("\n") } + +func sortedKeys(m map[string]int) []string { + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go index e2ceedc..0528059 100644 --- a/internal/benchmark/check.go +++ b/internal/benchmark/check.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "path/filepath" + "sort" "strings" "time" ) @@ -40,6 +41,7 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) { Verbose: cfg.Verbose, Explain: cfg.Explain, OutputDir: cfg.OutputDir, + Quick: cfg.Quick, } report, err := RunCorpusBenchmark(ds, runCfg) @@ -71,10 +73,28 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) { } result.Summary.Regressions = len(result.TopRegs) + // Determine baseline path from config baselinePath := cfg.BaselinePath if baselinePath == "" { - baselinePath = filepath.Join(root, "baselines", "combined.json") + if benchCfg != nil { + baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json") + } else { + baselinePath = filepath.Join(root, "baselines", "combined.json") + } + } + + // Get quality thresholds from config + var thresholds BaselineQuality + if benchCfg != nil { + thresholds = benchCfg.QualityThresholds() + } else { + thresholds = BaselineQuality{ + MaxOverallPAt1Drop: 0.02, + MaxOverallMRRDrop: 0.02, + MaxOverallHitAt3Drop: 0.02, + } } + if _, err := 
os.Stat(baselinePath); err == nil { baseline, err := loadReport(baselinePath) if err == nil { @@ -83,12 +103,24 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) { MRR: report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, } - if cfg.FailOnReg && (result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02) { - result.Status = "fail" + if cfg.FailOnReg { + if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop || + result.Delta.MRR < -thresholds.MaxOverallMRRDrop || + result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop { + result.Status = "fail" + } } } } + // Sort regressions for deterministic output + sort.Slice(result.TopRegs, func(i, j int) bool { + if result.TopRegs[i].Corpus != result.TopRegs[j].Corpus { + return result.TopRegs[i].Corpus < result.TopRegs[j].Corpus + } + return result.TopRegs[i].ID < result.TopRegs[j].ID + }) + _ = os.MkdirAll(cfg.OutputDir, 0755) ts := time.Now().Format("20060102_150405") reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts)) @@ -155,13 +187,16 @@ func generateSummaryMD(report *Report, result *CheckResult) string { sb.WriteString("\n## Misses\n\n") sb.WriteString("| ID | Corpus | Query | Got | Expected |\n") sb.WriteString("|----|--------|-------|-----|----------|\n") - for _, r := range result.TopRegs { - if len(result.TopRegs) > 10 { + for i, r := range result.TopRegs { + if i >= 10 { break } fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n", r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")) } + if len(result.TopRegs) > 10 { + fmt.Fprintf(&sb, "\n*Showing 10 of %d misses.*\n", len(result.TopRegs)) + } } return sb.String() diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go index 2b0a3d5..f0e6ccf 100644 --- a/internal/benchmark/compare.go +++ b/internal/benchmark/compare.go @@ -3,6 +3,7 @@ package benchmark import ( "encoding/json" "fmt" + "sort" ) func RunCompare(cfg CompareConfig) 
(*CompareResult, error) { @@ -70,9 +71,19 @@ func PrintCompareResult(result *CompareResult, cfg CompareConfig) { if len(result.Regressions) > 0 { fmt.Printf("\n Regressions:\n") + sortRegressions(result.Regressions) for _, r := range result.Regressions { fmt.Printf(" %s: %s (%s)\n", r.ID, r.Reason, r.Query) } } fmt.Printf("\n") } + +func sortRegressions(regs []Regression) { + sort.Slice(regs, func(i, j int) bool { + if regs[i].Corpus != regs[j].Corpus { + return regs[i].Corpus < regs[j].Corpus + } + return regs[i].ID < regs[j].ID + }) +} diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go index e41fe1c..cd0bbec 100644 --- a/internal/benchmark/config.go +++ b/internal/benchmark/config.go @@ -2,20 +2,35 @@ package benchmark import ( "encoding/json" + "errors" "flag" + "fmt" "os" "path/filepath" ) type Config struct { - Version string `json:"version"` - Defaults DefaultsConfig `json:"defaults"` - Profiles map[string]Profile `json:"profiles"` - Baseline BaselineConfig `json:"baseline"` + Version string `json:"version"` + Defaults DefaultsConfig `json:"defaults"` + Profiles map[string]Profile `json:"profiles"` + Baseline BaselineConfig `json:"baseline"` + Results ResultsConfig `json:"results"` + Strategies []string `json:"strategies"` + SnapshotsDir string `json:"snapshots_dir"` } type DefaultsConfig struct { - Profile string `json:"profile"` + Profile string `json:"profile"` + Strategy string `json:"strategy"` + Threshold float64 `json:"threshold"` + TopK int `json:"top_k"` + Weights Weights `json:"weights"` +} + +type ResultsConfig struct { + Dir string `json:"dir"` + BaselinesDir string `json:"baselines_dir"` + GeneratedFilesPolicy string `json:"generated_files_policy"` } type Profile struct { @@ -42,16 +57,20 @@ type BaselineConfig struct { } type BaselineQuality struct { - MaxOverallPAt1Drop float64 `json:"max_overall_p_at_1_drop"` - MaxOverallMRRDrop float64 `json:"max_overall_mrr_drop"` - MaxOverallHitAt3Drop float64 
`json:"max_overall_hit_at_3_drop"` - MaxCorpusPAt1Drop float64 `json:"max_corpus_p_at_1_drop"` - MaxTagPAt1Drop float64 `json:"max_tag_p_at_1_drop"` + MaxOverallPAt1Drop float64 `json:"max_overall_p_at_1_drop"` + MaxOverallMRRDrop float64 `json:"max_overall_mrr_drop"` + MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"` + MaxCorpusPAt1Drop float64 `json:"max_corpus_p_at_1_drop"` + MaxDifficultyPAt1Drop float64 `json:"max_difficulty_p_at_1_drop"` + MaxTagPAt1Drop float64 `json:"max_tag_p_at_1_drop"` + MaxMarginDropReport float64 `json:"max_margin_drop_report"` } type BaselineRuntime struct { MaxNsOpRegressionRatio float64 `json:"max_ns_op_regression_ratio"` MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"` + MaxCorpusLatencyP50MS int `json:"max_corpus_latency_p50_ms"` + MaxCorpusLatencyP95MS int `json:"max_corpus_latency_p95_ms"` } type CheckConfig struct { @@ -80,6 +99,7 @@ type RunConfig struct { Explain bool OutputDir string ReportName string + Quick bool } type CompareConfig struct { @@ -152,11 +172,28 @@ func LoadConfig(benchmarkRoot string) (*Config, error) { func ResolveProfile(cfg *Config, name string) Profile { p, ok := cfg.Profiles[name] if !ok { + // Use defaults from config, falling back to hardcoded values + strategy := cfg.Defaults.Strategy + if strategy == "" { + strategy = "combined" + } + threshold := cfg.Defaults.Threshold + if threshold == 0 { + threshold = 0.01 + } + topK := cfg.Defaults.TopK + if topK == 0 { + topK = 5 + } + weights := cfg.Defaults.Weights + if weights.Lexical == 0 && weights.Embedding == 0 { + weights = Weights{Lexical: 0.6, Embedding: 0.4} + } return Profile{ - Strategy: "combined", - Threshold: 0.01, - TopK: 5, - Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + Strategy: strategy, + Threshold: threshold, + TopK: topK, + Weights: weights, Suites: []string{"corpus"}, Mode: "library", } @@ -185,6 +222,180 @@ func ResolveProfile(cfg *Config, name string) Profile { return p } +// projectRoot returns 
the project root (parent of tests/benchmark). +func projectRoot(benchmarkRoot string) string { + return filepath.Dir(filepath.Dir(benchmarkRoot)) +} + +// ResultsDir returns the configured results directory. +func (c *Config) ResultsDir(benchmarkRoot string) string { + if c.Results.Dir != "" { + if filepath.IsAbs(c.Results.Dir) { + return c.Results.Dir + } + return filepath.Join(projectRoot(benchmarkRoot), c.Results.Dir) + } + return filepath.Join(benchmarkRoot, "results") +} + +// BaselinesDir returns the configured baselines directory. +func (c *Config) BaselinesDir(benchmarkRoot string) string { + if c.Results.BaselinesDir != "" { + if filepath.IsAbs(c.Results.BaselinesDir) { + return c.Results.BaselinesDir + } + return filepath.Join(projectRoot(benchmarkRoot), c.Results.BaselinesDir) + } + return filepath.Join(benchmarkRoot, "baselines") +} + +// QualityThresholds returns quality thresholds with fallback defaults. +func (c *Config) QualityThresholds() BaselineQuality { + q := c.Baseline.Quality + if q.MaxOverallPAt1Drop == 0 { + q.MaxOverallPAt1Drop = 0.02 + } + if q.MaxOverallMRRDrop == 0 { + q.MaxOverallMRRDrop = 0.02 + } + if q.MaxOverallHitAt3Drop == 0 { + q.MaxOverallHitAt3Drop = 0.02 + } + if q.MaxCorpusPAt1Drop == 0 { + q.MaxCorpusPAt1Drop = 0.08 + } + if q.MaxDifficultyPAt1Drop == 0 { + q.MaxDifficultyPAt1Drop = 0.08 + } + if q.MaxTagPAt1Drop == 0 { + q.MaxTagPAt1Drop = 0.08 + } + if q.MaxMarginDropReport == 0 { + q.MaxMarginDropReport = 0.15 + } + return q +} + +// RuntimeThresholds returns runtime thresholds with fallback defaults. +func (c *Config) RuntimeThresholds() BaselineRuntime { + r := c.Baseline.Runtime + if r.MaxNsOpRegressionRatio == 0 { + r.MaxNsOpRegressionRatio = 1.25 + } + if r.MaxAllocRegressionRatio == 0 { + r.MaxAllocRegressionRatio = 1.25 + } + return r +} + +// ValidateConfig checks the config for errors and returns a descriptive error if invalid. 
+func ValidateConfig(cfg *Config) error { + var errs []error + + // Validate strategies + if len(cfg.Strategies) == 0 { + errs = append(errs, errors.New("strategies list is empty")) + } else { + validStrategies := make(map[string]bool) + for _, s := range cfg.Strategies { + validStrategies[s] = true + } + // Check default strategy is in list + if cfg.Defaults.Strategy != "" && !validStrategies[cfg.Defaults.Strategy] { + errs = append(errs, fmt.Errorf("default strategy %q not in strategies list", cfg.Defaults.Strategy)) + } + // Check profile strategies + for name, p := range cfg.Profiles { + if p.Strategy != "" && !validStrategies[p.Strategy] { + errs = append(errs, fmt.Errorf("profile %q uses strategy %q not in strategies list", name, p.Strategy)) + } + } + } + + // Validate weights + if cfg.Defaults.Weights.Lexical < 0 { + errs = append(errs, errors.New("defaults.weights.lexical must be non-negative")) + } + if cfg.Defaults.Weights.Embedding < 0 { + errs = append(errs, errors.New("defaults.weights.embedding must be non-negative")) + } + if cfg.Defaults.Weights.Lexical == 0 && cfg.Defaults.Weights.Embedding == 0 { + errs = append(errs, errors.New("defaults.weights: lexical and embedding cannot both be zero")) + } + + // Validate profile weights + for name, p := range cfg.Profiles { + if p.Weights.Lexical < 0 { + errs = append(errs, fmt.Errorf("profile %q: weights.lexical must be non-negative", name)) + } + if p.Weights.Embedding < 0 { + errs = append(errs, fmt.Errorf("profile %q: weights.embedding must be non-negative", name)) + } + } + + // Validate quality thresholds (should be positive when set) + q := cfg.Baseline.Quality + if q.MaxOverallPAt1Drop < 0 { + errs = append(errs, errors.New("baseline.quality.max_overall_p_at_1_drop must be non-negative")) + } + if q.MaxOverallMRRDrop < 0 { + errs = append(errs, errors.New("baseline.quality.max_overall_mrr_drop must be non-negative")) + } + if q.MaxOverallHitAt3Drop < 0 { + errs = append(errs, 
errors.New("baseline.quality.max_overall_hit_at_3_drop must be non-negative")) + } + + // Validate runtime thresholds (must be >= 1) + r := cfg.Baseline.Runtime + if r.MaxNsOpRegressionRatio != 0 && r.MaxNsOpRegressionRatio < 1 { + errs = append(errs, errors.New("baseline.runtime.max_ns_op_regression_ratio must be >= 1")) + } + if r.MaxAllocRegressionRatio != 0 && r.MaxAllocRegressionRatio < 1 { + errs = append(errs, errors.New("baseline.runtime.max_alloc_regression_ratio must be >= 1")) + } + + // Validate profile inheritance + if err := validateProfileInheritance(cfg); err != nil { + errs = append(errs, err) + } + + if len(errs) == 0 { + return nil + } + if len(errs) == 1 { + return errs[0] + } + return fmt.Errorf("config has %d errors: %v", len(errs), errs) +} + +// validateProfileInheritance checks for missing references and cycles. +func validateProfileInheritance(cfg *Config) error { + for name, p := range cfg.Profiles { + if p.Inherits == "" { + continue + } + // Check reference exists + if _, ok := cfg.Profiles[p.Inherits]; !ok { + return fmt.Errorf("profile %q inherits from non-existent profile %q", name, p.Inherits) + } + // Check for cycles + visited := map[string]bool{name: true} + current := p.Inherits + for current != "" { + if visited[current] { + return fmt.Errorf("profile inheritance cycle detected: %q -> %q", name, current) + } + visited[current] = true + if parent, ok := cfg.Profiles[current]; ok { + current = parent.Inherits + } else { + break + } + } + } + return nil +} + func ParseCheckFlags(args []string) CheckConfig { fs := flag.NewFlagSet("check", flag.ExitOnError) cfg := CheckConfig{ diff --git a/internal/benchmark/config_test.go b/internal/benchmark/config_test.go new file mode 100644 index 0000000..2590556 --- /dev/null +++ b/internal/benchmark/config_test.go @@ -0,0 +1,147 @@ +package benchmark + +import "testing" + +func TestValidateConfig_Valid(t *testing.T) { + cfg := &Config{ + Strategies: []string{"lexical", "embedding", 
"combined"}, + Defaults: DefaultsConfig{ + Strategy: "combined", + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Quality: BaselineQuality{ + MaxOverallPAt1Drop: 0.02, + }, + Runtime: BaselineRuntime{ + MaxNsOpRegressionRatio: 1.25, + }, + }, + } + if err := ValidateConfig(cfg); err != nil { + t.Errorf("expected valid config, got error: %v", err) + } +} + +func TestValidateConfig_EmptyStrategies(t *testing.T) { + cfg := &Config{ + Strategies: []string{}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for empty strategies") + } +} + +func TestValidateConfig_InvalidDefaultStrategy(t *testing.T) { + cfg := &Config{ + Strategies: []string{"lexical", "embedding"}, + Defaults: DefaultsConfig{ + Strategy: "combined", + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for invalid default strategy") + } +} + +func TestValidateConfig_NegativeWeights(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: -0.5, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for negative weight") + } +} + +func TestValidateConfig_BothWeightsZero(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0, Embedding: 0}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error when both weights are zero") + } +} + +func TestValidateConfig_RuntimeRatioTooLow(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Runtime: BaselineRuntime{ + MaxNsOpRegressionRatio: 0.5, + }, + }, + } + err := ValidateConfig(cfg) + if err == nil { + 
t.Error("expected error for runtime ratio < 1") + } +} + +func TestValidateConfig_ProfileInheritsMissing(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Profiles: map[string]Profile{ + "fast": {Inherits: "nonexistent"}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for missing inherited profile") + } +} + +func TestValidateConfig_ProfileInheritanceCycle(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Profiles: map[string]Profile{ + "a": {Inherits: "b"}, + "b": {Inherits: "c"}, + "c": {Inherits: "a"}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for inheritance cycle") + } +} + +func TestValidateConfig_NegativeQualityThreshold(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Quality: BaselineQuality{ + MaxOverallPAt1Drop: -0.02, + }, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for negative quality threshold") + } +} diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go index f5b3a7d..253a4c3 100644 --- a/internal/benchmark/runner.go +++ b/internal/benchmark/runner.go @@ -2,6 +2,8 @@ package benchmark import ( "context" + "os/exec" + "strings" "time" "github.com/pinchtab/semantic" @@ -118,6 +120,7 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) { report.Run.ID = time.Now().Format("20060102-150405") + "-" + cfg.Profile report.Run.Timestamp = time.Now().UTC().Format(time.RFC3339) report.Run.Tool = "semantic-bench" + report.Run.GitSHA, report.Run.GitDirty = getGitInfo() report.Dataset.Name = "semantic-ui-matching-corpus" report.Dataset.QueryCount = ds.QueryCount() 
report.Dataset.CorpusCount = ds.CorpusCount() @@ -138,7 +141,12 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) { continue } - for _, query := range corpus.Queries { + queries := corpus.Queries + if cfg.Quick { + queries = selectQuickSubset(corpus.Queries) + } + + for _, query := range queries { if cfg.QueryID != "" && query.ID != cfg.QueryID { continue } @@ -153,6 +161,56 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) { return report, nil } +// selectQuickSubset returns a deterministic subset of queries for quick mode. +// It selects at most 3 queries per corpus, preferring a mix of difficulties. +func selectQuickSubset(queries []Query) []Query { + if len(queries) <= 3 { + return queries + } + + // Group by difficulty + byDiff := make(map[string][]Query) + for _, q := range queries { + diff := q.Difficulty + if diff == "" { + diff = "medium" + } + byDiff[diff] = append(byDiff[diff], q) + } + + // Select one from each difficulty level, up to 3 total + var selected []Query + for _, diff := range []string{"easy", "medium", "hard"} { + if qs, ok := byDiff[diff]; ok && len(qs) > 0 { + selected = append(selected, qs[0]) + if len(selected) >= 3 { + break + } + } + } + + // If we don't have 3 yet, fill from remaining + if len(selected) < 3 { + for _, q := range queries { + found := false + for _, s := range selected { + if s.ID == q.ID { + found = true + break + } + } + if !found { + selected = append(selected, q) + if len(selected) >= 3 { + break + } + } + } + } + + return selected +} + func createMatcher(cfg RunConfig) semantic.ElementMatcher { embedder := semantic.NewHashingEmbedder(128) switch cfg.Strategy { @@ -189,8 +247,11 @@ func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg R start := time.Now() findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{ - Threshold: threshold, - TopK: topK, + Threshold: threshold, + TopK: topK, + 
LexicalWeight: cfg.LexicalWeight, + EmbeddingWeight: cfg.EmbeddingWeight, + Explain: cfg.Explain, }) result.Latency.LibraryMs = time.Since(start).Milliseconds() @@ -384,3 +445,20 @@ func sortInt64(s []int64) { } } } + +func getGitInfo() (sha string, dirty bool) { + cmd := exec.Command("git", "rev-parse", "HEAD") + out, err := cmd.Output() + if err != nil { + return "", false + } + sha = strings.TrimSpace(string(out)) + + cmd = exec.Command("git", "status", "--porcelain") + out, err = cmd.Output() + if err != nil { + return sha, false + } + dirty = len(strings.TrimSpace(string(out))) > 0 + return sha, dirty +} diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go index e7622f1..6545913 100644 --- a/internal/benchmark/runtime.go +++ b/internal/benchmark/runtime.go @@ -35,7 +35,26 @@ type runtimeBaseline struct { func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) { root := FindBenchmarkRoot() - baselinePath := filepath.Join(root, "baselines", "runtime.json") + + // Load config for thresholds + benchCfg, _ := LoadConfig(root) + var thresholds BaselineRuntime + if benchCfg != nil { + thresholds = benchCfg.RuntimeThresholds() + } else { + thresholds = BaselineRuntime{ + MaxNsOpRegressionRatio: 1.25, + MaxAllocRegressionRatio: 1.25, + } + } + + // Determine baseline path from config + var baselinePath string + if benchCfg != nil { + baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "runtime.json") + } else { + baselinePath = filepath.Join(root, "baselines", "runtime.json") + } benchmarks, err := runGoBenchmarks() if err != nil { @@ -66,18 +85,29 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) { baselineMap[b.Name] = b } - maxRatio := 1.25 + // Warning threshold is halfway between 1.0 and max ratio + warnRatio := 1.0 + ((thresholds.MaxNsOpRegressionRatio - 1.0) / 2.0) + for i, b := range result.Benchmarks { if base, ok := baselineMap[b.Name]; ok { - ratio := b.NsOp / base.NsOp + nsRatio := b.NsOp / base.NsOp 
result.Benchmarks[i].BaselineNs = base.NsOp - result.Benchmarks[i].Ratio = ratio + result.Benchmarks[i].Ratio = nsRatio + + // Check allocation regression if baseline has allocation data + var allocRatio float64 + if base.AllocsOp > 0 && b.AllocsOp > 0 { + allocRatio = float64(b.AllocsOp) / float64(base.AllocsOp) + } switch { - case ratio > maxRatio: + case nsRatio > thresholds.MaxNsOpRegressionRatio: + result.Benchmarks[i].Status = "regression" + result.Regressions++ + case allocRatio > thresholds.MaxAllocRegressionRatio: result.Benchmarks[i].Status = "regression" result.Regressions++ - case ratio > 1.1: + case nsRatio > warnRatio: result.Benchmarks[i].Status = "warning" default: result.Benchmarks[i].Status = "ok" diff --git a/internal/engine/benchmark_test.go b/internal/engine/benchmark_test.go index c37528c..0ebc2c6 100644 --- a/internal/engine/benchmark_test.go +++ b/internal/engine/benchmark_test.go @@ -2,9 +2,10 @@ package engine import ( "context" - "github.com/pinchtab/semantic/internal/types" "strconv" "testing" + + "github.com/pinchtab/semantic/internal/types" ) // benchElements returns a realistic set of elements for benchmarking. 
@@ -244,3 +245,119 @@ func BenchmarkCombinedFind_Issue24_100Elements(b *testing.B) { }) } } + +// Focused microbenchmarks for individual components + +func BenchmarkParseQueryContext(b *testing.B) { + queries := []string{ + "sign in button", + "the first email textbox in the login form", + "button not submit near the checkout section", + "second item in the dropdown menu", + } + b.ReportAllocs() + + for b.Loop() { + for _, q := range queries { + ParseQueryContext(q) + } + } +} + +func BenchmarkParseQueryContext_Complex(b *testing.B) { + q := "the third blue submit button in the checkout form not disabled" + b.ReportAllocs() + + for b.Loop() { + ParseQueryContext(q) + } +} + +func BenchmarkRemoveStopwords(b *testing.B) { + tokenSets := [][]string{ + {"click", "the", "sign", "in", "button"}, + {"find", "the", "email", "address", "textbox"}, + {"the", "first", "item", "in", "a", "dropdown", "menu"}, + } + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, tokens := range tokenSets { + removeStopwords(tokens) + } + } +} + +func BenchmarkScoreFusion(b *testing.B) { + // Test the score fusion calculation + lexScores := make([]float64, 100) + embScores := make([]float64, 100) + for i := range lexScores { + lexScores[i] = float64(i) / 100.0 + embScores[i] = float64(100-i) / 100.0 + } + lexWeight, embWeight := 0.6, 0.4 + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + for j := range lexScores { + _ = lexWeight*lexScores[j] + embWeight*embScores[j] + } + } +} + +func BenchmarkLexicalScore_Variants(b *testing.B) { + cases := []struct { + name string + query string + desc string + }{ + {"exact", "Sign In", "button: Sign In"}, + {"partial", "sign", "button: Sign In"}, + {"synonym", "login", "button: Sign In"}, + {"mismatch", "checkout", "button: Sign In"}, + {"long_query", "click the sign in button on the login page", "button: Sign In"}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.ReportAllocs() + for i := 0; 
i < b.N; i++ { + LexicalScore(tc.query, tc.desc) + } + }) + } +} + +func BenchmarkCombinedFind_WeightVariants(b *testing.B) { + elements := benchElements() + ctx := context.Background() + + weights := []struct { + name string + lex float64 + emb float64 + }{ + {"lex_only", 1.0, 0.0}, + {"emb_only", 0.0, 1.0}, + {"balanced", 0.5, 0.5}, + {"lex_heavy", 0.8, 0.2}, + {"emb_heavy", 0.2, 0.8}, + } + + for _, w := range weights { + b.Run(w.name, func(b *testing.B) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + opts := types.FindOptions{ + Threshold: 0.3, + TopK: 3, + LexicalWeight: w.lex, + EmbeddingWeight: w.emb, + } + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m.Find(ctx, "sign in button", elements, opts) + } + }) + } +} From b6fadf1d4f35feba339ac7ebdb5ab25dfeae32c0 Mon Sep 17 00:00:00 2001 From: Luigi Agosti Date: Fri, 24 Apr 2026 23:05:00 +0100 Subject: [PATCH 14/14] feat: config-driven thresholds with validation and enforcement --- internal/benchmark/check.go | 55 ++++++++++++++++++++--------------- internal/benchmark/config.go | 5 +++- internal/benchmark/runner.go | 6 ++-- internal/benchmark/runtime.go | 22 ++++---------- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go index 0528059..88234f6 100644 --- a/internal/benchmark/check.go +++ b/internal/benchmark/check.go @@ -18,16 +18,11 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) { return nil, fmt.Errorf("load dataset: %w", err) } - benchCfg, _ := LoadConfig(root) - profile := Profile{ - Strategy: "combined", - Threshold: 0.01, - TopK: 5, - Weights: Weights{Lexical: 0.6, Embedding: 0.4}, - } - if benchCfg != nil { - profile = ResolveProfile(benchCfg, cfg.Profile) + benchCfg, err := LoadConfig(root) + if err != nil { + return nil, fmt.Errorf("load config: %w", err) } + profile := ResolveProfile(benchCfg, cfg.Profile) runCfg := RunConfig{ Suite: "corpus", @@ -76,24 +71,11 @@ func RunCheck(cfg 
CheckConfig) (*CheckResult, error) { // Determine baseline path from config baselinePath := cfg.BaselinePath if baselinePath == "" { - if benchCfg != nil { - baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json") - } else { - baselinePath = filepath.Join(root, "baselines", "combined.json") - } + baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json") } // Get quality thresholds from config - var thresholds BaselineQuality - if benchCfg != nil { - thresholds = benchCfg.QualityThresholds() - } else { - thresholds = BaselineQuality{ - MaxOverallPAt1Drop: 0.02, - MaxOverallMRRDrop: 0.02, - MaxOverallHitAt3Drop: 0.02, - } - } + thresholds := benchCfg.QualityThresholds() if _, err := os.Stat(baselinePath); err == nil { baseline, err := loadReport(baselinePath) @@ -104,11 +86,36 @@ func RunCheck(cfg CheckConfig) (*CheckResult, error) { HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, } if cfg.FailOnReg { + // Check overall thresholds if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop || result.Delta.MRR < -thresholds.MaxOverallMRRDrop || result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop { result.Status = "fail" } + // Check corpus-level thresholds + for corpus, current := range report.Metrics.ByCorpus { + if base, ok := baseline.Metrics.ByCorpus[corpus]; ok { + if current.PAt1-base.PAt1 < -thresholds.MaxCorpusPAt1Drop { + result.Status = "fail" + } + } + } + // Check difficulty-level thresholds + for diff, current := range report.Metrics.ByDifficulty { + if base, ok := baseline.Metrics.ByDifficulty[diff]; ok { + if current.PAt1-base.PAt1 < -thresholds.MaxDifficultyPAt1Drop { + result.Status = "fail" + } + } + } + // Check tag-level thresholds + for tag, current := range report.Metrics.ByTag { + if base, ok := baseline.Metrics.ByTag[tag]; ok { + if current.PAt1-base.PAt1 < -thresholds.MaxTagPAt1Drop { + result.Status = "fail" + } + } + } } } } diff --git a/internal/benchmark/config.go 
b/internal/benchmark/config.go index cd0bbec..2d233e2 100644 --- a/internal/benchmark/config.go +++ b/internal/benchmark/config.go @@ -166,6 +166,9 @@ func LoadConfig(benchmarkRoot string) (*Config, error) { if err := json.Unmarshal(data, &cfg); err != nil { return nil, err } + if err := ValidateConfig(&cfg); err != nil { + return nil, fmt.Errorf("invalid config: %w", err) + } return &cfg, nil } @@ -408,7 +411,7 @@ func ParseCheckFlags(args []string) CheckConfig { fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory") fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (text|json|github)") fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression") - fs.BoolVar(&cfg.Quick, "quick", false, "run subset for fast checks") + fs.BoolVar(&cfg.Quick, "quick", false, "smoke mode: 3 queries per corpus (not representative)") fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details") fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations") _ = fs.Parse(args) diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go index 253a4c3..6f00821 100644 --- a/internal/benchmark/runner.go +++ b/internal/benchmark/runner.go @@ -161,8 +161,10 @@ func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) { return report, nil } -// selectQuickSubset returns a deterministic subset of queries for quick mode. -// It selects at most 3 queries per corpus, preferring a mix of difficulties. +// selectQuickSubset returns a deterministic subset for smoke testing. +// Selects up to 3 queries per corpus by difficulty. This is NOT representative +// of full corpus coverage—edge-case tags may be missed. Use for fast iteration, +// not for final regression checks.
func selectQuickSubset(queries []Query) []Query { if len(queries) <= 3 { return queries diff --git a/internal/benchmark/runtime.go b/internal/benchmark/runtime.go index 6545913..dd68f75 100644 --- a/internal/benchmark/runtime.go +++ b/internal/benchmark/runtime.go @@ -37,24 +37,12 @@ func RunRuntime(cfg RuntimeConfig) (*RuntimeResult, error) { root := FindBenchmarkRoot() // Load config for thresholds - benchCfg, _ := LoadConfig(root) - var thresholds BaselineRuntime - if benchCfg != nil { - thresholds = benchCfg.RuntimeThresholds() - } else { - thresholds = BaselineRuntime{ - MaxNsOpRegressionRatio: 1.25, - MaxAllocRegressionRatio: 1.25, - } - } - - // Determine baseline path from config - var baselinePath string - if benchCfg != nil { - baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "runtime.json") - } else { - baselinePath = filepath.Join(root, "baselines", "runtime.json") + benchCfg, err := LoadConfig(root) + if err != nil { + return nil, fmt.Errorf("load config: %w", err) } + thresholds := benchCfg.RuntimeThresholds() + baselinePath := filepath.Join(benchCfg.BaselinesDir(root), "runtime.json") benchmarks, err := runGoBenchmarks() if err != nil {