diff --git a/.gitignore b/.gitignore index 2f3b5cc..419dfaa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Binary /semantic +/semantic-bench tests/benchmark/semantic tests/e2e/semantic *.exe @@ -21,4 +22,5 @@ cover.out .claude tests/e2e/results/*.txt tests/benchmark/results/*.json -tests/benchmark/results/*.md \ No newline at end of file +tests/benchmark/results/*.md +tests/benchmark/baselines/*.json \ No newline at end of file diff --git a/README.md b/README.md index 57e3053..83fb48e 100644 --- a/README.md +++ b/README.md @@ -204,7 +204,7 @@ The library uses only the Go standard library. No external dependencies, no mode ## Design Trade-offs -See [docs/DESIGN.md](docs/DESIGN.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration. +See [docs/architecture/design-decisions.md](docs/architecture/design-decisions.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration. 
## Origin diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go new file mode 100644 index 0000000..076d71a --- /dev/null +++ b/cmd/semantic-bench/main.go @@ -0,0 +1,168 @@ +package main + +import ( + "fmt" + "os" + + "github.com/pinchtab/semantic/internal/benchmark" +) + +const usage = `semantic-bench - Benchmark runner for semantic matching + +Usage: + semantic-bench [flags] + +Commands: + check Run benchmark and compare against baseline (default) + run Run benchmark suites + compare Compare two reports + lint Validate dataset + catalog Print dataset inventory + baseline Manage quality baselines (create, update) + calibrate Find optimal thresholds via precision/recall analysis + tune Grid-search lexical/embedding weights + runtime Check Go benchmark performance against baseline + +Flags: + -h, --help Show help + +Run 'semantic-bench --help' for command-specific help. +` + +func main() { + if len(os.Args) < 2 { + runCheck(os.Args[1:]) + return + } + + cmd := os.Args[1] + args := os.Args[2:] + + switch cmd { + case "check": + runCheck(args) + case "run": + runRun(args) + case "compare": + runCompare(args) + case "lint": + runLint(args) + case "catalog": + runCatalog(args) + case "baseline": + runBaseline(args) + case "calibrate": + runCalibrate(args) + case "tune": + runTune(args) + case "runtime": + runRuntime(args) + case "-h", "--help", "help": + fmt.Print(usage) + default: + fmt.Fprintf(os.Stderr, "unknown command: %s\n\n%s", cmd, usage) + os.Exit(2) + } +} + +func runCheck(args []string) { + cfg := benchmark.ParseCheckFlags(args) + result, err := benchmark.RunCheck(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCheckResult(result, cfg) + if result.Status == "fail" { + os.Exit(1) + } +} + +func runRun(args []string) { + cfg := benchmark.ParseRunFlags(args) + result, err := benchmark.RunBenchmark(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + 
benchmark.PrintRunResult(result, cfg) +} + +func runCompare(args []string) { + cfg := benchmark.ParseCompareFlags(args) + result, err := benchmark.RunCompare(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCompareResult(result, cfg) + if result.Status == "fail" { + os.Exit(1) + } +} + +func runLint(args []string) { + cfg := benchmark.ParseLintFlags(args) + result, err := benchmark.RunLint(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintLintResult(result, cfg) + if result.Errors > 0 { + os.Exit(1) + } +} + +func runCatalog(args []string) { + cfg := benchmark.ParseCatalogFlags(args) + result, err := benchmark.RunCatalog(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCatalogResult(result, cfg) +} + +func runBaseline(args []string) { + cfg := benchmark.ParseBaselineFlags(args) + result, err := benchmark.RunBaseline(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintBaselineResult(result, cfg) +} + +func runCalibrate(args []string) { + cfg := benchmark.ParseCalibrateFlags(args) + result, err := benchmark.RunCalibrate(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCalibrateResult(result, cfg) +} + +func runTune(args []string) { + cfg := benchmark.ParseTuneFlags(args) + result, err := benchmark.RunTune(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintTuneResult(result, cfg) +} + +func runRuntime(args []string) { + cfg := benchmark.ParseRuntimeFlags(args) + result, err := benchmark.RunRuntime(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintRuntimeResult(result, cfg) + if result.Status == "fail" && cfg.FailOnRegression { + os.Exit(1) + } +} diff --git a/dev b/dev index dc15e75..11d53d9 100755 --- a/dev 
+++ b/dev @@ -11,17 +11,27 @@ ERROR=$'\033[38;2;230;57;70m' NC=$'\033[0m' commands=( + "pr:πŸš€:Pre-PR checks (check + e2e + bench)" "doctor:🩺:Setup dev environment" "test:πŸ§ͺ:Run unit tests" "test verbose:πŸ§ͺ:Run unit tests (verbose)" "test race:πŸ§ͺ:Run unit tests with race detector" "coverage:πŸ“Š:Run tests with coverage report" "lint:πŸ”:Run golangci-lint" + "lint corpus:πŸ”:Lint benchmark corpus" + "lint docs:πŸ”:Check documentation links" "fmt:✨:Format code" "vet:πŸ”¬:Run go vet" "check:βœ…:Run all checks (fmt + vet + lint + test)" "build:πŸ“¦:Build CLI binary" - "bench:πŸ‹:Run corpus benchmark suite" + "bench:πŸ‹:Run corpus benchmark" + "bench full:πŸ‹:Run full benchmark suite" + "baseline:πŸ“:Create quality baseline" + "baseline check:πŸ“:Check against baseline" + "baseline update:πŸ“:Update baseline (--accept)" + "calibrate:🎯:Calibrate threshold recommendations" + "runtime:⏱️:Check runtime baseline" + "tune:πŸŽ›οΈ:Tune combined weights" "e2e:🐳:Run E2E tests (Docker)" ) @@ -36,6 +46,36 @@ show_help() { echo "" } +run_pr() { + echo " ${ACCENT}${BOLD}πŸš€ Pre-PR checks${NC}" + echo "" + + echo " ${MUTED}1/4 All checks (fmt + vet + lint + test)${NC}" + run_check + + echo "" + echo " ${MUTED}2/4 E2E tests${NC}" + if [[ -f tests/e2e/run.sh ]]; then + go build -o /tmp/semantic ./cmd/semantic + PATH="/tmp:$PATH" bash tests/e2e/run.sh + echo " ${SUCCESS}βœ“${NC} E2E passed" + else + echo " ${MUTED}Skipped (no e2e/run.sh)${NC}" + fi + + echo "" + echo " ${MUTED}3/4 Lint corpus${NC}" + run_lint_corpus + + echo "" + echo " ${MUTED}4/4 Corpus benchmark${NC}" + run_bench > /dev/null 2>&1 + echo " ${SUCCESS}βœ“${NC} Benchmark complete" + + echo "" + echo " ${SUCCESS}${BOLD}πŸš€ Ready for PR${NC}" +} + run_test() { echo " ${ACCENT}${BOLD}πŸ§ͺ Running tests${NC}" go test ./... -count=1 @@ -88,9 +128,19 @@ run_check() { if [ -n "$unformatted" ]; then echo " ${ERROR}βœ—${NC} Unformatted files:" echo "$unformatted" - exit 1 + echo "" + printf " Fix formatting now? 
(Y/n) " + read -r answer + if [ "$answer" != "n" ] && [ "$answer" != "N" ]; then + gofmt -w . + echo " ${SUCCESS}βœ“${NC} Format (fixed)" + else + echo " ${MUTED}Run: gofmt -w .${NC}" + exit 1 + fi + else + echo " ${SUCCESS}βœ“${NC} Format" fi - echo " ${SUCCESS}βœ“${NC} Format" echo " ${MUTED}2/4 Vet${NC}" go vet ./... @@ -115,8 +165,53 @@ run_build() { } run_bench() { - echo " ${ACCENT}${BOLD}⏱️ Running corpus benchmark suite${NC}" - bash tests/benchmark/scripts/run-corpus-benchmark.sh + echo " ${ACCENT}${BOLD}πŸ‹ Running corpus benchmark${NC}" + go run ./cmd/semantic-bench check "$@" +} + +run_bench_full() { + echo " ${ACCENT}${BOLD}πŸ‹ Running full benchmark suite${NC}" + go run ./cmd/semantic-bench run -suite=all "$@" +} + +run_lint_corpus() { + echo " ${ACCENT}${BOLD}πŸ” Linting benchmark corpus${NC}" + go run ./cmd/semantic-bench lint "$@" +} + +run_lint_docs() { + echo " ${ACCENT}${BOLD}πŸ” Checking documentation links${NC}" + bash scripts/check-docs-links.sh +} + +run_baseline() { + echo " ${ACCENT}${BOLD}πŸ“ Creating quality baseline${NC}" + go run ./cmd/semantic-bench baseline create "$@" +} + +run_baseline_check() { + echo " ${ACCENT}${BOLD}πŸ“ Checking against baseline${NC}" + go run ./cmd/semantic-bench check "$@" +} + +run_baseline_update() { + echo " ${ACCENT}${BOLD}πŸ“ Updating baseline${NC}" + go run ./cmd/semantic-bench baseline update --accept "$@" +} + +run_calibrate() { + echo " ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}" + go run ./cmd/semantic-bench calibrate -verbose "$@" +} + +run_runtime() { + echo " ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}" + go run ./cmd/semantic-bench runtime "$@" +} + +run_tune() { + echo " ${ACCENT}${BOLD}πŸŽ›οΈ Tuning combined weights${NC}" + go run ./cmd/semantic-bench tune -verbose "$@" } run_e2e() { @@ -129,6 +224,7 @@ run_e2e() { } case "${1:-help}" in + pr) run_pr ;; doctor) exec bash scripts/doctor.sh ;; test) case "${2:-}" in @@ -138,12 +234,33 @@ case "${1:-help}" in esac ;; coverage) 
run_coverage ;; - lint) run_lint ;; + lint) + case "${2:-}" in + corpus) shift 2; run_lint_corpus "$@" ;; + docs) run_lint_docs ;; + *) run_lint ;; + esac + ;; fmt) run_fmt ;; vet) run_vet ;; check) run_check ;; build) run_build ;; - bench|benchmark) run_bench ;; + bench|benchmark) + case "${2:-}" in + full) shift 2; run_bench_full "$@" ;; + *) shift; run_bench "$@" ;; + esac + ;; + baseline) + case "${2:-}" in + check) shift 2; run_baseline_check "$@" ;; + update) shift 2; run_baseline_update "$@" ;; + *) shift; run_baseline "$@" ;; + esac + ;; + calibrate) shift; run_calibrate "$@" ;; + runtime) shift; run_runtime "$@" ;; + tune) shift; run_tune "$@" ;; e2e) run_e2e ;; help|*) show_help ;; esac
diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go
new file mode 100644
index 0000000..07cc418
--- /dev/null
+++ b/internal/benchmark/baseline.go
@@ -0,0 +1,134 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// BaselineResult reports what a baseline command did: the action taken, the
+// file written, its overall metrics and, for an update, the metrics of the
+// baseline that was replaced.
+type BaselineResult struct {
+	Action   string          `json:"action"`
+	Path     string          `json:"path"`
+	Metrics  OverallMetrics  `json:"metrics"`
+	Previous *OverallMetrics `json:"previous,omitempty"`
+}
+
+// RunBaseline creates or updates the baseline file named cfg.Name. An update
+// is refused unless cfg.Accept is set; any other action is an error.
+func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) {
+	root := FindBenchmarkRoot()
+	baselinesDir := filepath.Join(root, "baselines")
+	// Honor results.baselines_dir so the file lands where RunCheck looks for
+	// it; without a loadable config the default location is kept.
+	if benchCfg, err := LoadConfig(root); err == nil {
+		baselinesDir = benchCfg.BaselinesDir(root)
+	}
+	if err := os.MkdirAll(baselinesDir, 0755); err != nil {
+		return nil, err
+	}
+
+	baselinePath := filepath.Join(baselinesDir, cfg.Name+".json")
+
+	switch cfg.Action {
+	case "create":
+		return createBaseline(root, baselinePath, cfg)
+	case "update":
+		if !cfg.Accept {
+			return nil, fmt.Errorf("use --accept to confirm baseline update")
+		}
+		return updateBaseline(root, baselinePath, cfg)
+	default:
+		return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action)
+	}
+}
+
+// createBaseline runs the corpus benchmark and stores its report as indented
+// JSON at baselinePath.
+func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	// NOTE(review): these settings are fixed here, while RunCheck takes them
+	// from the resolved profile; confirm both agree, otherwise the deltas
+	// compare runs made under different settings.
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        "combined",
+		Threshold:       0.01,
+		TopK:            5,
+		LexicalWeight:   0.6,
+		EmbeddingWeight: 0.4,
+		Mode:            "library",
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	data, err := json.MarshalIndent(report, "", " ")
+	if err != nil {
+		return nil, err
+	}
+	if err := os.WriteFile(baselinePath, data, 0644); err != nil {
+		return nil, err
+	}
+
+	return &BaselineResult{
+		Action:  "create",
+		Path:    baselinePath,
+		Metrics: report.Metrics.Overall,
+	}, nil
+}
+
+// updateBaseline replaces the baseline at baselinePath with a fresh run. An
+// existing baseline is first copied to a timestamped .backup.json file next
+// to it, and its overall metrics are returned as Previous.
+func updateBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) {
+	var previous *OverallMetrics
+	if data, err := os.ReadFile(baselinePath); err == nil {
+		var old Report
+		if json.Unmarshal(data, &old) == nil {
+			previous = &old.Metrics.Overall
+		}
+		backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json"
+		// The old baseline is about to be overwritten, so a failed backup
+		// has to stop the update rather than be ignored.
+		if err := os.WriteFile(backupPath, data, 0644); err != nil {
+			return nil, fmt.Errorf("back up baseline to %s: %w", backupPath, err)
+		}
+	}
+
+	result, err := createBaseline(root, baselinePath, cfg)
+	if err != nil {
+		return nil, err
+	}
+	result.Action = "update"
+	result.Previous = previous
+	return result, nil
+}
+
+// PrintBaselineResult prints the new baseline's overall metrics and, when
+// present, those of the baseline it replaced.
+func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) {
+	fmt.Printf("\n Baseline %sd: %s\n\n", result.Action, result.Path) // "create" -> "created", "update" -> "updated"
+	fmt.Printf(" MRR: %.4f\n", result.Metrics.MRR)
+	fmt.Printf(" P@1: %.4f\n", result.Metrics.PAt1)
+	fmt.Printf(" Hit@3: %.4f\n", result.Metrics.HitAt3)
+
+	if result.Previous != nil {
+		fmt.Printf("\n Previous:\n")
+		fmt.Printf(" MRR: %.4f\n", result.Previous.MRR)
+		fmt.Printf(" P@1: %.4f\n", result.Previous.PAt1)
+		fmt.Printf(" Hit@3: %.4f\n", result.Previous.HitAt3)
+	}
+	fmt.Println()
+}
diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go
new file mode 100644
index 0000000..48ec06e
--- /dev/null
+++ b/internal/benchmark/calibrate.go
@@ -0,0 +1,173 @@
+package
benchmark + +import ( + "context" + "fmt" + + "github.com/pinchtab/semantic" +) + +type CalibrateResult struct { + ByThreshold map[string]ThresholdMetrics `json:"by_threshold"` + Recommendations CalibrateRecommendations `json:"recommendations"` + TotalCases int `json:"total_cases"` +} + +type ThresholdMetrics struct { + TP int `json:"tp"` + FP int `json:"fp"` + FN int `json:"fn"` + TN int `json:"tn"` + Recall float64 `json:"recall"` + Precision float64 `json:"precision"` + FPR float64 `json:"false_positive_rate"` + F1 float64 `json:"f1"` +} + +type CalibrateRecommendations struct { + DefaultThreshold float64 `json:"default_threshold"` + RecoveryThreshold float64 `json:"recovery_threshold"` + BestF1 float64 `json:"best_f1"` +} + +func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + result := &CalibrateResult{ + ByThreshold: make(map[string]ThresholdMetrics), + } + + type testCase struct { + query Query + corpus *Corpus + } + + var cases []testCase + for i := range ds.Corpora { + corpus := &ds.Corpora[i] + if cfg.Corpus != "" && corpus.ID != cfg.Corpus { + continue + } + for _, q := range corpus.Queries { + cases = append(cases, testCase{query: q, corpus: corpus}) + } + } + result.TotalCases = len(cases) + + if cfg.Verbose { + fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases)) + } + + runCfg := RunConfig{ + Strategy: "combined", + TopK: 5, + LexicalWeight: 0.6, + EmbeddingWeight: 0.4, + } + matcher := createMatcher(runCfg) + + var bestF1, bestF1Threshold float64 + var bestRecallThreshold float64 + var bestRecallWithPrecision float64 + + for _, threshold := range cfg.Thresholds { + tp, fp, fn, tn := 0, 0, 0, 0 + + for _, tc := range cases { + findResult, _ := matcher.Find(context.Background(), tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{ + Threshold: threshold, 
+ TopK: 5, + }) + + hasMatch := len(findResult.Matches) > 0 + topRef := "" + if hasMatch { + topRef = findResult.Matches[0].Ref + } + + switch { + case tc.query.ExpectNoMatch && hasMatch: + fp++ + case tc.query.ExpectNoMatch && !hasMatch: + tn++ + case len(tc.query.RelevantRefs) > 0 && !hasMatch: + fn++ + case len(tc.query.RelevantRefs) > 0 && contains(tc.query.RelevantRefs, topRef): + tp++ + case len(tc.query.RelevantRefs) > 0: + fp++ + } + } + + totalPos := tp + fn + totalNeg := tn + fp + + var recall, precision, fpr, f1 float64 + if totalPos > 0 { + recall = float64(tp) / float64(totalPos) + } + if tp+fp > 0 { + precision = float64(tp) / float64(tp+fp) + } + if totalNeg > 0 { + fpr = float64(fp) / float64(totalNeg) + } + if precision+recall > 0 { + f1 = 2 * precision * recall / (precision + recall) + } + + key := fmt.Sprintf("%.2f", threshold) + result.ByThreshold[key] = ThresholdMetrics{ + TP: tp, FP: fp, FN: fn, TN: tn, + Recall: recall, Precision: precision, FPR: fpr, F1: f1, + } + + if f1 > bestF1 { + bestF1 = f1 + bestF1Threshold = threshold + } + if recall >= 0.85 && precision > bestRecallWithPrecision { + bestRecallWithPrecision = precision + bestRecallThreshold = threshold + } + + if cfg.Verbose { + fmt.Printf(" threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n", + threshold, tp, fp, fn, tn, recall, precision, f1) + } + } + + if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 { + bestRecallThreshold = cfg.Thresholds[0] + } + + result.Recommendations = CalibrateRecommendations{ + DefaultThreshold: bestF1Threshold, + RecoveryThreshold: bestRecallThreshold, + BestF1: bestF1, + } + + return result, nil +} + +func contains(refs []string, ref string) bool { + for _, r := range refs { + if r == ref { + return true + } + } + return false +} + +func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) { + fmt.Printf("\n Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold)) + + 
fmt.Printf(" Recommendations:\n") + fmt.Printf(" Default (best F1): %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1) + fmt.Printf(" Recovery (recall): %.2f\n", result.Recommendations.RecoveryThreshold) + fmt.Println() +} diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go new file mode 100644 index 0000000..69a3091 --- /dev/null +++ b/internal/benchmark/catalog.go @@ -0,0 +1,86 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "sort" +) + +func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, err + } + + result := &CatalogResult{ + ByTag: make(map[string]int), + ByDifficulty: make(map[string]int), + } + + for _, c := range ds.Corpora { + tags := make(map[string]bool) + for _, q := range c.Queries { + result.TotalQueries++ + result.ByDifficulty[q.Difficulty]++ + for _, t := range q.Tags { + tags[t] = true + result.ByTag[t]++ + } + } + var tagList []string + for t := range tags { + tagList = append(tagList, t) + } + sort.Strings(tagList) + result.Corpora = append(result.Corpora, CorpusSummary{ + ID: c.ID, + Queries: len(c.Queries), + Tags: tagList, + }) + } + + return result, nil +} + +func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n Corpora: %d\n", len(result.Corpora)) + fmt.Printf(" Total Queries: %d\n\n", result.TotalQueries) + + fmt.Printf(" %-30s %8s\n", "Corpus", "Queries") + fmt.Printf(" %-30s %8s\n", "------", "-------") + for _, c := range result.Corpora { + fmt.Printf(" %-30s %8d\n", c.ID, c.Queries) + } + + switch cfg.By { + case "difficulty": + fmt.Printf("\n By Difficulty:\n") + diffs := sortedKeys(result.ByDifficulty) + for _, d := range diffs { + fmt.Printf(" %-10s %4d\n", d, result.ByDifficulty[d]) + } + case "tag": + 
fmt.Printf("\n By Tag:\n")
+		tags := sortedKeys(result.ByTag)
+		for _, t := range tags {
+			fmt.Printf(" %-20s %4d\n", t, result.ByTag[t])
+		}
+	}
+	fmt.Printf("\n")
+}
+
+func sortedKeys(m map[string]int) []string {
+	keys := make([]string, 0, len(m))
+	for k := range m {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	return keys
+}
diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go
new file mode 100644
index 0000000..88234f6
--- /dev/null
+++ b/internal/benchmark/check.go
@@ -0,0 +1,287 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+)
+
+// RunCheck runs the corpus benchmark with the settings of cfg.Profile, records
+// every missed query and, when a baseline report exists, compares the overall
+// metrics against it. With cfg.FailOnReg set, a drop beyond the configured
+// quality thresholds (overall, per corpus, per difficulty or per tag) sets the
+// status to "fail".
+func RunCheck(cfg CheckConfig) (*CheckResult, error) {
+	root := FindBenchmarkRoot()
+
+	ds, err := LoadDataset(root)
+	if err != nil {
+		return nil, fmt.Errorf("load dataset: %w", err)
+	}
+
+	benchCfg, err := LoadConfig(root)
+	if err != nil {
+		return nil, fmt.Errorf("load config: %w", err)
+	}
+	profile := ResolveProfile(benchCfg, cfg.Profile)
+
+	runCfg := RunConfig{
+		Suite:           "corpus",
+		Strategy:        profile.Strategy,
+		Threshold:       profile.Threshold,
+		TopK:            profile.TopK,
+		LexicalWeight:   profile.Weights.Lexical,
+		EmbeddingWeight: profile.Weights.Embedding,
+		Profile:         cfg.Profile,
+		Mode:            "library",
+		Verbose:         cfg.Verbose,
+		Explain:         cfg.Explain,
+		OutputDir:       cfg.OutputDir,
+		Quick:           cfg.Quick,
+	}
+
+	report, err := RunCorpusBenchmark(ds, runCfg)
+	if err != nil {
+		return nil, fmt.Errorf("run benchmark: %w", err)
+	}
+
+	result := &CheckResult{
+		Status: "pass",
+		Report: report,
+	}
+	result.Summary.PAt1 = report.Metrics.Overall.PAt1
+	result.Summary.MRR = report.Metrics.Overall.MRR
+	result.Summary.HitAt3 = report.Metrics.Overall.HitAt3
+	result.Summary.Total = report.Metrics.Overall.Total
+
+	for _, r := range report.Results {
+		if r.Status == "miss" {
+			result.TopRegs = append(result.TopRegs, Regression{
+				ID:           r.ID,
+				Corpus:       r.Corpus,
+				Query:        r.Query,
+				Expected:     r.Expected.RelevantRefs,
+				CurrentRef:   r.Actual.BestRef,
+				Reason:       "miss",
+				DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID),
+			})
+		}
+	}
+	result.Summary.Regressions = len(result.TopRegs)
+
+	// Determine baseline path from config
+	baselinePath := cfg.BaselinePath
+	if baselinePath == "" {
+		baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json")
+	}
+
+	// Get quality thresholds from config
+	thresholds := benchCfg.QualityThresholds()
+
+	if _, statErr := os.Stat(baselinePath); statErr == nil {
+		// A baseline that exists but cannot be read must not silently switch
+		// off the comparison below.
+		baseline, err := loadReport(baselinePath)
+		if err != nil {
+			return nil, fmt.Errorf("load baseline %s: %w", baselinePath, err)
+		}
+		result.Delta = &MetricsDelta{
+			PAt1:   report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1,
+			MRR:    report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR,
+			HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3,
+		}
+		if cfg.FailOnReg {
+			// Check overall thresholds
+			if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop ||
+				result.Delta.MRR < -thresholds.MaxOverallMRRDrop ||
+				result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop {
+				result.Status = "fail"
+			}
+			// Check corpus-level thresholds
+			for corpus, current := range report.Metrics.ByCorpus {
+				if base, ok := baseline.Metrics.ByCorpus[corpus]; ok {
+					if current.PAt1-base.PAt1 < -thresholds.MaxCorpusPAt1Drop {
+						result.Status = "fail"
+					}
+				}
+			}
+			// Check difficulty-level thresholds
+			for diff, current := range report.Metrics.ByDifficulty {
+				if base, ok := baseline.Metrics.ByDifficulty[diff]; ok {
+					if current.PAt1-base.PAt1 < -thresholds.MaxDifficultyPAt1Drop {
+						result.Status = "fail"
+					}
+				}
+			}
+			// Check tag-level thresholds
+			for tag, current := range report.Metrics.ByTag {
+				if base, ok := baseline.Metrics.ByTag[tag]; ok {
+					if current.PAt1-base.PAt1 < -thresholds.MaxTagPAt1Drop {
+						result.Status = "fail"
+					}
+				}
+			}
+		}
+	}
+
+	// Sort regressions for deterministic output
+	sort.Slice(result.TopRegs, func(i, j int) bool {
+		if result.TopRegs[i].Corpus != result.TopRegs[j].Corpus {
+			return result.TopRegs[i].Corpus <
result.TopRegs[j].Corpus + } + return result.TopRegs[i].ID < result.TopRegs[j].ID + }) + + _ = os.MkdirAll(cfg.OutputDir, 0755) + ts := time.Now().Format("20060102_150405") + reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts)) + summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts)) + + reportJSON, _ := json.MarshalIndent(report, "", " ") + _ = os.WriteFile(reportPath, reportJSON, 0644) + + summaryMD := generateSummaryMD(report, result) + _ = os.WriteFile(summaryPath, []byte(summaryMD), 0644) + + result.Artifacts.ReportJSON = reportPath + result.Artifacts.SummaryMD = summaryPath + + return result, nil +} + +func RunBenchmark(cfg RunConfig) (*Report, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, err + } + return RunCorpusBenchmark(ds, cfg) +} + +func loadReport(path string) (*Report, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var r Report + if err := json.Unmarshal(data, &r); err != nil { + return nil, err + } + return &r, nil +} + +func generateSummaryMD(report *Report, result *CheckResult) string { + var sb strings.Builder + + sb.WriteString("# Benchmark Summary\n\n") + fmt.Fprintf(&sb, "Generated: %s\n\n", report.Run.Timestamp) + + sb.WriteString("## Overall Metrics\n\n") + sb.WriteString("| Metric | Value |\n") + sb.WriteString("|--------|-------|\n") + fmt.Fprintf(&sb, "| Total | %d |\n", report.Metrics.Overall.Total) + fmt.Fprintf(&sb, "| MRR | %.4f |\n", report.Metrics.Overall.MRR) + fmt.Fprintf(&sb, "| P@1 | %.4f |\n", report.Metrics.Overall.PAt1) + fmt.Fprintf(&sb, "| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3) + fmt.Fprintf(&sb, "| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin) + + if result.Delta != nil { + sb.WriteString("\n## Delta from Baseline\n\n") + sb.WriteString("| Metric | Delta |\n") + sb.WriteString("|--------|-------|\n") + fmt.Fprintf(&sb, "| P@1 | %+.4f |\n", result.Delta.PAt1) + 
fmt.Fprintf(&sb, "| MRR | %+.4f |\n", result.Delta.MRR) + fmt.Fprintf(&sb, "| Hit@3 | %+.4f |\n", result.Delta.HitAt3) + } + + if len(result.TopRegs) > 0 { + sb.WriteString("\n## Misses\n\n") + sb.WriteString("| ID | Corpus | Query | Got | Expected |\n") + sb.WriteString("|----|--------|-------|-----|----------|\n") + for i, r := range result.TopRegs { + if i >= 10 { + break + } + fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n", + r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")) + } + if len(result.TopRegs) > 10 { + fmt.Fprintf(&sb, "\n*Showing 10 of %d misses.*\n", len(result.TopRegs)) + } + } + + return sb.String() +} + +func PrintCheckResult(result *CheckResult, cfg CheckConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n") + if result.Status == "pass" { + fmt.Printf(" \033[32mβœ“\033[0m Benchmark passed\n") + } else { + fmt.Printf(" \033[31mβœ—\033[0m Benchmark failed\n") + } + fmt.Printf("\n") + + fmt.Printf(" %-12s %8.4f\n", "MRR", result.Summary.MRR) + fmt.Printf(" %-12s %8.4f\n", "P@1", result.Summary.PAt1) + fmt.Printf(" %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3) + fmt.Printf(" %-12s %8d\n", "Total", result.Summary.Total) + fmt.Printf(" %-12s %8d\n", "Misses", result.Summary.Regressions) + + if result.Delta != nil { + fmt.Printf("\n Delta from baseline:\n") + printDelta("P@1", result.Delta.PAt1) + printDelta("MRR", result.Delta.MRR) + printDelta("Hit@3", result.Delta.HitAt3) + } + + fmt.Printf("\n Artifacts:\n") + fmt.Printf(" Report: %s\n", result.Artifacts.ReportJSON) + fmt.Printf(" Summary: %s\n", result.Artifacts.SummaryMD) + fmt.Printf("\n") +} + +func printDelta(name string, delta float64) { + color := "\033[0m" + sign := "" + if delta > 0.001 { + color = "\033[32m" + sign = "+" + } else if delta < -0.001 { + color = "\033[31m" + } + fmt.Printf(" %s%-8s %s%.4f\033[0m\n", color, name, sign, delta) +} + +func PrintRunResult(report 
*Report, cfg RunConfig) { + fmt.Printf("\n") + fmt.Printf(" %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR) + fmt.Printf(" %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1) + fmt.Printf(" %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3) + fmt.Printf(" %-12s %8d\n", "Total", report.Metrics.Overall.Total) + fmt.Printf("\n") + + if cfg.Verbose { + for _, r := range report.Results { + status := "\033[32mHIT \033[0m" + switch r.Status { + case "miss": + status = "\033[31mMISS\033[0m" + case "partial": + status = "\033[33mPART\033[0m" + } + fmt.Printf(" [%s] %s | %s | got=%s score=%.3f\n", + r.ID, status, r.Query, r.Actual.BestRef, r.Actual.BestScore) + } + } +} diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go new file mode 100644 index 0000000..f0e6ccf --- /dev/null +++ b/internal/benchmark/compare.go @@ -0,0 +1,89 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "sort" +) + +func RunCompare(cfg CompareConfig) (*CompareResult, error) { + baseline, err := loadReport(cfg.BaselinePath) + if err != nil { + return nil, fmt.Errorf("load baseline: %w", err) + } + current, err := loadReport(cfg.CurrentPath) + if err != nil { + return nil, fmt.Errorf("load current: %w", err) + } + + result := &CompareResult{ + Status: "pass", + Delta: MetricsDelta{ + PAt1: current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1, + MRR: current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, + HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, + }, + } + + if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 { + result.Status = "fail" + } + + baselineResults := make(map[string]QueryResult) + for _, r := range baseline.Results { + baselineResults[r.ID] = r + } + for _, r := range current.Results { + if base, ok := baselineResults[r.ID]; ok { + if base.Status == "hit" && r.Status != "hit" { + result.Regressions = append(result.Regressions, Regression{ + ID: r.ID, + Corpus: r.Corpus, + Query: r.Query, + 
BaselineRef: base.Actual.BestRef,
+					CurrentRef:  r.Actual.BestRef,
+					Reason:      fmt.Sprintf("%s -> %s", base.Status, r.Status),
+				})
+			}
+		}
+	}
+
+	return result, nil
+}
+
+func PrintCompareResult(result *CompareResult, cfg CompareConfig) {
+	if cfg.Format == "json" {
+		data, _ := json.MarshalIndent(result, "", " ")
+		fmt.Println(string(data))
+		return
+	}
+
+	fmt.Printf("\n")
+	if result.Status == "pass" {
+		fmt.Printf(" \033[32mβœ“\033[0m No regression\n")
+	} else {
+		fmt.Printf(" \033[31mβœ—\033[0m Regression detected\n")
+	}
+	fmt.Printf("\n")
+	printDelta("P@1", result.Delta.PAt1)
+	printDelta("MRR", result.Delta.MRR)
+	printDelta("Hit@3", result.Delta.HitAt3)
+
+	if len(result.Regressions) > 0 {
+		fmt.Printf("\n Regressions:\n")
+		sortRegressions(result.Regressions)
+		for _, r := range result.Regressions {
+			fmt.Printf(" %s: %s (%s)\n", r.ID, r.Reason, r.Query)
+		}
+	}
+	fmt.Printf("\n")
+}
+
+func sortRegressions(regs []Regression) {
+	sort.Slice(regs, func(i, j int) bool {
+		if regs[i].Corpus != regs[j].Corpus {
+			return regs[i].Corpus < regs[j].Corpus
+		}
+		return regs[i].ID < regs[j].ID
+	})
+}
diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go
new file mode 100644
index 0000000..2d233e2
--- /dev/null
+++ b/internal/benchmark/config.go
@@ -0,0 +1,576 @@
+package benchmark
+
+import (
+	"encoding/json"
+	"errors"
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// Config mirrors the JSON document LoadConfig reads from config/benchmark.json.
+type Config struct {
+	Version      string             `json:"version"`
+	Defaults     DefaultsConfig     `json:"defaults"`
+	Profiles     map[string]Profile `json:"profiles"`
+	Baseline     BaselineConfig     `json:"baseline"`
+	Results      ResultsConfig      `json:"results"`
+	Strategies   []string           `json:"strategies"`
+	SnapshotsDir string             `json:"snapshots_dir"`
+}
+
+// DefaultsConfig holds the values ResolveProfile falls back to when the
+// requested profile is not defined.
+type DefaultsConfig struct {
+	Profile   string  `json:"profile"`
+	Strategy  string  `json:"strategy"`
+	Threshold float64 `json:"threshold"`
+	TopK      int     `json:"top_k"`
+	Weights   Weights `json:"weights"`
+}
+
+// ResultsConfig sets where results and baselines are stored; relative paths
+// are resolved against the project root.
+type ResultsConfig struct {
+	Dir                  string `json:"dir"`
+	BaselinesDir         string `json:"baselines_dir"`
+	GeneratedFilesPolicy string `json:"generated_files_policy"`
+}
+
+// Profile is a named set of run settings. When Inherits is set, Strategy,
+// Threshold, TopK, Weights, Suites and Mode are taken from that profile
+// wherever this one leaves them at the zero value.
+type Profile struct {
+	Strategy  string   `json:"strategy"`
+	Threshold float64  `json:"threshold"`
+	TopK      int      `json:"top_k"`
+	Weights   Weights  `json:"weights"`
+	Suites    []string `json:"suites"`
+	Mode      string   `json:"mode"`
+	Inherits  string   `json:"inherits"`
+	Verbose   bool     `json:"verbose"`
+	Explain   bool     `json:"explain"`
+	FailOnReg bool     `json:"fail_on_regression"`
+}
+
+// Weights holds the lexical and embedding weights of a run.
+type Weights struct {
+	Lexical   float64 `json:"lexical"`
+	Embedding float64 `json:"embedding"`
+}
+
+// BaselineConfig groups the quality and runtime regression limits.
+type BaselineConfig struct {
+	Quality BaselineQuality `json:"quality"`
+	Runtime BaselineRuntime `json:"runtime"`
+}
+
+// BaselineQuality lists the largest metric drops, relative to the baseline,
+// that a check tolerates.
+type BaselineQuality struct {
+	MaxOverallPAt1Drop    float64 `json:"max_overall_p_at_1_drop"`
+	MaxOverallMRRDrop     float64 `json:"max_overall_mrr_drop"`
+	MaxOverallHitAt3Drop  float64 `json:"max_overall_hit_at_3_drop"`
+	MaxCorpusPAt1Drop     float64 `json:"max_corpus_p_at_1_drop"`
+	MaxDifficultyPAt1Drop float64 `json:"max_difficulty_p_at_1_drop"`
+	MaxTagPAt1Drop        float64 `json:"max_tag_p_at_1_drop"`
+	MaxMarginDropReport   float64 `json:"max_margin_drop_report"`
+}
+
+// BaselineRuntime holds the runtime regression limits.
+type BaselineRuntime struct {
+	MaxNsOpRegressionRatio  float64 `json:"max_ns_op_regression_ratio"`
+	MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"`
+	MaxCorpusLatencyP50MS   int     `json:"max_corpus_latency_p50_ms"`
+	MaxCorpusLatencyP95MS   int     `json:"max_corpus_latency_p95_ms"`
+}
+
+// CheckConfig holds the options of the check command.
+type CheckConfig struct {
+	Profile      string
+	BaselinePath string
+	OutputDir    string
+	Format       string
+	FailOnReg    bool
+	Quick        bool
+	Verbose      bool
+	Explain      bool
+}
+
+// RunConfig holds the options of a benchmark run.
+type RunConfig struct {
+	Suite           string
+	Corpus          string
+	QueryID         string
+	Strategy        string
+	Threshold       float64
+	TopK            int
+	LexicalWeight   float64
+	EmbeddingWeight float64
+	Profile         string
+	Mode            string
+	Verbose         bool
+	Explain         bool
+	OutputDir       string
+	ReportName      string
+	Quick           bool
+}
+
+// CompareConfig holds the options of the compare command.
+type CompareConfig struct {
+	BaselinePath string
+	CurrentPath  string
+	Format       string
+	Verbose      bool
+}
+
+// LintConfig holds the options of the lint command.
+type LintConfig struct {
+	Format  string
+	Verbose bool
+}
+
+// CatalogConfig holds the options of the catalog command.
+type CatalogConfig struct {
+	Format string
+	By     string
+}
+
+// BaselineCmdConfig holds the options of the baseline command.
+type BaselineCmdConfig struct {
+	Action  string // "create" or "update"
+	Name    string
+	Accept  bool
+	Verbose bool
+}
+
+// CalibrateConfig holds the options of the calibrate command.
+type CalibrateConfig struct {
+	Corpus     string
+	Thresholds []float64
+	Verbose    bool
+}
+
+// TuneConfig holds the options of the tune command.
+type TuneConfig struct {
+	Corpus  string
+	Step    float64
+	Verbose bool
+}
+
+// RuntimeConfig holds the options of the runtime command.
+type RuntimeConfig struct {
+	FailOnRegression bool
+	Verbose          bool
+}
+
+// FindBenchmarkRoot returns the tests/benchmark directory of the first
+// ancestor of the working directory (the working directory included) that
+// contains the benchmark config or a go.mod. When no ancestor qualifies it
+// returns tests/benchmark below the working directory.
+func FindBenchmarkRoot() string {
+	cwd, _ := os.Getwd() // when Getwd fails cwd is empty and the walk is skipped
+	// Walk up until filepath.Dir stops shortening the path. Testing for "/"
+	// alone never ends on a Windows volume root or an empty path, where Dir
+	// keeps returning the same value.
+	for d, prev := cwd, ""; d != prev; d, prev = filepath.Dir(d), d {
+		if _, err := os.Stat(filepath.Join(d, "tests/benchmark/config/benchmark.json")); err == nil {
+			return filepath.Join(d, "tests/benchmark")
+		}
+		if _, err := os.Stat(filepath.Join(d, "go.mod")); err == nil {
+			return filepath.Join(d, "tests/benchmark")
+		}
+	}
+	return filepath.Join(cwd, "tests/benchmark")
+}
+
+// LoadConfig reads and validates config/benchmark.json under benchmarkRoot.
+func LoadConfig(benchmarkRoot string) (*Config, error) {
+	path := filepath.Join(benchmarkRoot, "config/benchmark.json")
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var cfg Config
+	if err := json.Unmarshal(data, &cfg); err != nil {
+		return nil, fmt.Errorf("parse %s: %w", path, err)
+	}
+	if err := ValidateConfig(&cfg); err != nil {
+		return nil, fmt.Errorf("invalid config: %w", err)
+	}
+	return &cfg, nil
+}
+
+// ResolveProfile returns the profile called name with inherited values
+// applied. An unknown name yields a profile built from cfg.Defaults, with
+// built-in values for anything the defaults leave unset.
+func ResolveProfile(cfg *Config, name string) Profile {
+	return resolveProfile(cfg, name, map[string]bool{})
+}
+
+// resolveProfile does the work of ResolveProfile. visiting holds the names on
+// the current inheritance chain, so a profile that inherits from itself,
+// directly or indirectly, resolves against the defaults instead of recursing
+// without end.
+func resolveProfile(cfg *Config, name string, visiting map[string]bool) Profile {
+	p, ok := cfg.Profiles[name]
+	if !ok || visiting[name] {
+		// Use defaults from config, falling back to hardcoded values
+		strategy := cfg.Defaults.Strategy
+		if strategy == "" {
+			strategy = "combined"
+		}
+		threshold := cfg.Defaults.Threshold
+		if threshold == 0 {
+			threshold = 0.01
+		}
+		topK := cfg.Defaults.TopK
+		if topK == 0 {
+			topK = 5
+		}
+		weights := cfg.Defaults.Weights
+		if weights.Lexical == 0 && weights.Embedding == 0 {
+			weights = Weights{Lexical: 0.6, Embedding: 0.4}
+		}
+		return Profile{
+			Strategy:  strategy,
+			Threshold: threshold,
+			TopK:      topK,
+			Weights:   weights,
+			Suites:    []string{"corpus"},
+			Mode:      "library",
+		}
+	}
+	visiting[name] = true
+	if p.Inherits != "" {
+		base := resolveProfile(cfg, p.Inherits, visiting)
+		if p.Strategy == "" {
+			p.Strategy = base.Strategy
+		}
+		if p.Threshold == 0 {
+			p.Threshold = base.Threshold
+		}
+		if p.TopK == 0 {
+			p.TopK = base.TopK
+		}
+		if p.Weights.Lexical == 0 && p.Weights.Embedding == 0 {
+			p.Weights = base.Weights
+		}
+		if len(p.Suites) == 0 {
+			p.Suites = base.Suites
+		}
+		if p.Mode == "" {
+			p.Mode = base.Mode
+		}
+	}
+	return p
+}
+
+// projectRoot returns the project root (parent of tests/benchmark).
+func projectRoot(benchmarkRoot string) string {
+	return filepath.Dir(filepath.Dir(benchmarkRoot))
+}
+
+// ResultsDir returns the configured results directory.
+func (c *Config) ResultsDir(benchmarkRoot string) string {
+	if c.Results.Dir != "" {
+		if filepath.IsAbs(c.Results.Dir) {
+			return c.Results.Dir
+		}
+		return filepath.Join(projectRoot(benchmarkRoot), c.Results.Dir)
+	}
+	return filepath.Join(benchmarkRoot, "results")
+}
+
+// BaselinesDir returns the configured baselines directory.
+func (c *Config) BaselinesDir(benchmarkRoot string) string {
+	if c.Results.BaselinesDir != "" {
+		if filepath.IsAbs(c.Results.BaselinesDir) {
+			return c.Results.BaselinesDir
+		}
+		return filepath.Join(projectRoot(benchmarkRoot), c.Results.BaselinesDir)
+	}
+	return filepath.Join(benchmarkRoot, "baselines")
+}
+
+// QualityThresholds returns quality thresholds with fallback defaults.
+func (c *Config) QualityThresholds() BaselineQuality { + q := c.Baseline.Quality + if q.MaxOverallPAt1Drop == 0 { + q.MaxOverallPAt1Drop = 0.02 + } + if q.MaxOverallMRRDrop == 0 { + q.MaxOverallMRRDrop = 0.02 + } + if q.MaxOverallHitAt3Drop == 0 { + q.MaxOverallHitAt3Drop = 0.02 + } + if q.MaxCorpusPAt1Drop == 0 { + q.MaxCorpusPAt1Drop = 0.08 + } + if q.MaxDifficultyPAt1Drop == 0 { + q.MaxDifficultyPAt1Drop = 0.08 + } + if q.MaxTagPAt1Drop == 0 { + q.MaxTagPAt1Drop = 0.08 + } + if q.MaxMarginDropReport == 0 { + q.MaxMarginDropReport = 0.15 + } + return q +} + +// RuntimeThresholds returns runtime thresholds with fallback defaults. +func (c *Config) RuntimeThresholds() BaselineRuntime { + r := c.Baseline.Runtime + if r.MaxNsOpRegressionRatio == 0 { + r.MaxNsOpRegressionRatio = 1.25 + } + if r.MaxAllocRegressionRatio == 0 { + r.MaxAllocRegressionRatio = 1.25 + } + return r +} + +// ValidateConfig checks the config for errors and returns a descriptive error if invalid. +func ValidateConfig(cfg *Config) error { + var errs []error + + // Validate strategies + if len(cfg.Strategies) == 0 { + errs = append(errs, errors.New("strategies list is empty")) + } else { + validStrategies := make(map[string]bool) + for _, s := range cfg.Strategies { + validStrategies[s] = true + } + // Check default strategy is in list + if cfg.Defaults.Strategy != "" && !validStrategies[cfg.Defaults.Strategy] { + errs = append(errs, fmt.Errorf("default strategy %q not in strategies list", cfg.Defaults.Strategy)) + } + // Check profile strategies + for name, p := range cfg.Profiles { + if p.Strategy != "" && !validStrategies[p.Strategy] { + errs = append(errs, fmt.Errorf("profile %q uses strategy %q not in strategies list", name, p.Strategy)) + } + } + } + + // Validate weights + if cfg.Defaults.Weights.Lexical < 0 { + errs = append(errs, errors.New("defaults.weights.lexical must be non-negative")) + } + if cfg.Defaults.Weights.Embedding < 0 { + errs = append(errs, 
errors.New("defaults.weights.embedding must be non-negative")) + } + if cfg.Defaults.Weights.Lexical == 0 && cfg.Defaults.Weights.Embedding == 0 { + errs = append(errs, errors.New("defaults.weights: lexical and embedding cannot both be zero")) + } + + // Validate profile weights + for name, p := range cfg.Profiles { + if p.Weights.Lexical < 0 { + errs = append(errs, fmt.Errorf("profile %q: weights.lexical must be non-negative", name)) + } + if p.Weights.Embedding < 0 { + errs = append(errs, fmt.Errorf("profile %q: weights.embedding must be non-negative", name)) + } + } + + // Validate quality thresholds (should be positive when set) + q := cfg.Baseline.Quality + if q.MaxOverallPAt1Drop < 0 { + errs = append(errs, errors.New("baseline.quality.max_overall_p_at_1_drop must be non-negative")) + } + if q.MaxOverallMRRDrop < 0 { + errs = append(errs, errors.New("baseline.quality.max_overall_mrr_drop must be non-negative")) + } + if q.MaxOverallHitAt3Drop < 0 { + errs = append(errs, errors.New("baseline.quality.max_overall_hit_at_3_drop must be non-negative")) + } + + // Validate runtime thresholds (must be >= 1) + r := cfg.Baseline.Runtime + if r.MaxNsOpRegressionRatio != 0 && r.MaxNsOpRegressionRatio < 1 { + errs = append(errs, errors.New("baseline.runtime.max_ns_op_regression_ratio must be >= 1")) + } + if r.MaxAllocRegressionRatio != 0 && r.MaxAllocRegressionRatio < 1 { + errs = append(errs, errors.New("baseline.runtime.max_alloc_regression_ratio must be >= 1")) + } + + // Validate profile inheritance + if err := validateProfileInheritance(cfg); err != nil { + errs = append(errs, err) + } + + if len(errs) == 0 { + return nil + } + if len(errs) == 1 { + return errs[0] + } + return fmt.Errorf("config has %d errors: %v", len(errs), errs) +} + +// validateProfileInheritance checks for missing references and cycles. 
+func validateProfileInheritance(cfg *Config) error { + for name, p := range cfg.Profiles { + if p.Inherits == "" { + continue + } + // Check reference exists + if _, ok := cfg.Profiles[p.Inherits]; !ok { + return fmt.Errorf("profile %q inherits from non-existent profile %q", name, p.Inherits) + } + // Check for cycles + visited := map[string]bool{name: true} + current := p.Inherits + for current != "" { + if visited[current] { + return fmt.Errorf("profile inheritance cycle detected: %q -> %q", name, current) + } + visited[current] = true + if parent, ok := cfg.Profiles[current]; ok { + current = parent.Inherits + } else { + break + } + } + } + return nil +} + +func ParseCheckFlags(args []string) CheckConfig { + fs := flag.NewFlagSet("check", flag.ExitOnError) + cfg := CheckConfig{ + Profile: "default", + OutputDir: filepath.Join(FindBenchmarkRoot(), "results"), + Format: "text", + } + fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile") + fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline file path") + fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory") + fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (text|json|github)") + fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression") + fs.BoolVar(&cfg.Quick, "quick", false, "smoke mode: 3 queries per corpus (not representative)") + fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details") + fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations") + _ = fs.Parse(args) + return cfg +} + +func ParseRunFlags(args []string) RunConfig { + fs := flag.NewFlagSet("run", flag.ExitOnError) + cfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: 0.6, + EmbeddingWeight: 0.4, + Profile: "default", + Mode: "library", + OutputDir: filepath.Join(FindBenchmarkRoot(), "results"), + } + fs.StringVar(&cfg.Suite, "suite", cfg.Suite, "suite to run 
(corpus|recovery|classification|runtime|all)") + fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to run") + fs.StringVar(&cfg.QueryID, "query", "", "specific query ID to run") + fs.StringVar(&cfg.Strategy, "strategy", cfg.Strategy, "matching strategy") + fs.Float64Var(&cfg.Threshold, "threshold", cfg.Threshold, "score threshold") + fs.IntVar(&cfg.TopK, "top-k", cfg.TopK, "number of results") + fs.Float64Var(&cfg.LexicalWeight, "lexical-weight", cfg.LexicalWeight, "lexical weight") + fs.Float64Var(&cfg.EmbeddingWeight, "embedding-weight", cfg.EmbeddingWeight, "embedding weight") + fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile") + fs.StringVar(&cfg.Mode, "mode", cfg.Mode, "execution mode (cli|library|both)") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + fs.BoolVar(&cfg.Explain, "explain", false, "include explanations") + fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory") + fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name") + _ = fs.Parse(args) + return cfg +} + +func ParseCompareFlags(args []string) CompareConfig { + fs := flag.NewFlagSet("compare", flag.ExitOnError) + cfg := CompareConfig{ + Format: "text", + } + fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline report path (required)") + fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)") + fs.StringVar(&cfg.Format, "format", cfg.Format, "output format") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} + +func ParseLintFlags(args []string) LintConfig { + fs := flag.NewFlagSet("lint", flag.ExitOnError) + cfg := LintConfig{ + Format: "text", + } + fs.StringVar(&cfg.Format, "format", cfg.Format, "output format") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} + +func ParseCatalogFlags(args []string) CatalogConfig { + fs := flag.NewFlagSet("catalog", flag.ExitOnError) + cfg := CatalogConfig{ 
+ Format: "table", + } + fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)") + fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)") + _ = fs.Parse(args) + return cfg +} + +func ParseBaselineFlags(args []string) BaselineCmdConfig { + fs := flag.NewFlagSet("baseline", flag.ExitOnError) + cfg := BaselineCmdConfig{ + Action: "create", + Name: "combined", + } + fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name") + fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + + if len(fs.Args()) > 0 { + cfg.Action = fs.Args()[0] + } + return cfg +} + +func ParseCalibrateFlags(args []string) CalibrateConfig { + fs := flag.NewFlagSet("calibrate", flag.ExitOnError) + cfg := CalibrateConfig{ + Thresholds: []float64{0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60}, + } + fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} + +func ParseTuneFlags(args []string) TuneConfig { + fs := flag.NewFlagSet("tune", flag.ExitOnError) + cfg := TuneConfig{ + Step: 0.1, + } + fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against") + fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} + +func ParseRuntimeFlags(args []string) RuntimeConfig { + fs := flag.NewFlagSet("runtime", flag.ExitOnError) + cfg := RuntimeConfig{} + fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} diff --git a/internal/benchmark/config_test.go b/internal/benchmark/config_test.go new file mode 100644 index 0000000..2590556 --- /dev/null +++ 
b/internal/benchmark/config_test.go @@ -0,0 +1,147 @@ +package benchmark + +import "testing" + +func TestValidateConfig_Valid(t *testing.T) { + cfg := &Config{ + Strategies: []string{"lexical", "embedding", "combined"}, + Defaults: DefaultsConfig{ + Strategy: "combined", + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Quality: BaselineQuality{ + MaxOverallPAt1Drop: 0.02, + }, + Runtime: BaselineRuntime{ + MaxNsOpRegressionRatio: 1.25, + }, + }, + } + if err := ValidateConfig(cfg); err != nil { + t.Errorf("expected valid config, got error: %v", err) + } +} + +func TestValidateConfig_EmptyStrategies(t *testing.T) { + cfg := &Config{ + Strategies: []string{}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for empty strategies") + } +} + +func TestValidateConfig_InvalidDefaultStrategy(t *testing.T) { + cfg := &Config{ + Strategies: []string{"lexical", "embedding"}, + Defaults: DefaultsConfig{ + Strategy: "combined", + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for invalid default strategy") + } +} + +func TestValidateConfig_NegativeWeights(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: -0.5, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for negative weight") + } +} + +func TestValidateConfig_BothWeightsZero(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0, Embedding: 0}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error when both weights are zero") + } +} + +func TestValidateConfig_RuntimeRatioTooLow(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + 
Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Runtime: BaselineRuntime{ + MaxNsOpRegressionRatio: 0.5, + }, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for runtime ratio < 1") + } +} + +func TestValidateConfig_ProfileInheritsMissing(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Profiles: map[string]Profile{ + "fast": {Inherits: "nonexistent"}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for missing inherited profile") + } +} + +func TestValidateConfig_ProfileInheritanceCycle(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Profiles: map[string]Profile{ + "a": {Inherits: "b"}, + "b": {Inherits: "c"}, + "c": {Inherits: "a"}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for inheritance cycle") + } +} + +func TestValidateConfig_NegativeQualityThreshold(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Quality: BaselineQuality{ + MaxOverallPAt1Drop: -0.02, + }, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for negative quality threshold") + } +} diff --git a/internal/benchmark/dataset.go b/internal/benchmark/dataset.go new file mode 100644 index 0000000..86c5014 --- /dev/null +++ b/internal/benchmark/dataset.go @@ -0,0 +1,117 @@ +package benchmark + +import ( + "encoding/json" + "os" + "path/filepath" + + "github.com/pinchtab/semantic" +) + +type Query struct { + ID string `json:"id"` + QueryText string `json:"query"` + RelevantRefs []string `json:"relevant_refs"` + PartiallyRelevantRefs []string `json:"partially_relevant_refs"` + Difficulty 
string `json:"difficulty"` + Tags []string `json:"tags"` + Intent string `json:"intent,omitempty"` + PageType string `json:"page_type,omitempty"` + Threshold *float64 `json:"threshold,omitempty"` + TopK *int `json:"top_k,omitempty"` + ExpectNoMatch bool `json:"expect_no_match,omitempty"` + MinScore *float64 `json:"min_score,omitempty"` + Notes string `json:"notes,omitempty"` +} + +type Corpus struct { + ID string + Path string + Snapshot []semantic.ElementDescriptor + Queries []Query +} + +type Dataset struct { + Root string + Corpora []Corpus +} + +func LoadDataset(benchmarkRoot string) (*Dataset, error) { + corpusDir := filepath.Join(benchmarkRoot, "corpus") + entries, err := os.ReadDir(corpusDir) + if err != nil { + return nil, err + } + + ds := &Dataset{Root: benchmarkRoot} + + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + corpusPath := filepath.Join(corpusDir, entry.Name()) + snapshotPath := filepath.Join(corpusPath, "snapshot.json") + queriesPath := filepath.Join(corpusPath, "queries.json") + + if _, err := os.Stat(snapshotPath); os.IsNotExist(err) { + continue + } + if _, err := os.Stat(queriesPath); os.IsNotExist(err) { + continue + } + + corpus, err := loadCorpus(entry.Name(), corpusPath) + if err != nil { + return nil, err + } + + ds.Corpora = append(ds.Corpora, *corpus) + } + + return ds, nil +} + +func loadCorpus(id, path string) (*Corpus, error) { + snapshotPath := filepath.Join(path, "snapshot.json") + queriesPath := filepath.Join(path, "queries.json") + + snapshotData, err := os.ReadFile(snapshotPath) + if err != nil { + return nil, err + } + + var snapshot []semantic.ElementDescriptor + if err := json.Unmarshal(snapshotData, &snapshot); err != nil { + return nil, err + } + + queriesData, err := os.ReadFile(queriesPath) + if err != nil { + return nil, err + } + + var queries []Query + if err := json.Unmarshal(queriesData, &queries); err != nil { + return nil, err + } + + return &Corpus{ + ID: id, + Path: path, + Snapshot: 
snapshot, + Queries: queries, + }, nil +} + +func (ds *Dataset) QueryCount() int { + count := 0 + for _, c := range ds.Corpora { + count += len(c.Queries) + } + return count +} + +func (ds *Dataset) CorpusCount() int { + return len(ds.Corpora) +} diff --git a/internal/benchmark/lint.go b/internal/benchmark/lint.go new file mode 100644 index 0000000..20565ce --- /dev/null +++ b/internal/benchmark/lint.go @@ -0,0 +1,68 @@ +package benchmark + +import "fmt" + +func RunLint(cfg LintConfig) (*LintResult, error) { + root := FindBenchmarkRoot() + result := &LintResult{} + + ds, err := LoadDataset(root) + if err != nil { + result.Errors++ + result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err)) + return result, nil + } + + ids := make(map[string]string) + for _, c := range ds.Corpora { + for _, q := range c.Queries { + if existing, ok := ids[q.ID]; ok { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing)) + } else { + ids[q.ID] = c.ID + } + } + } + + for _, c := range ds.Corpora { + refs := make(map[string]bool) + for _, d := range c.Snapshot { + refs[d.Ref] = true + } + for _, q := range c.Queries { + for _, r := range q.RelevantRefs { + if !refs[r] { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r)) + } + } + } + } + + validDiff := map[string]bool{"easy": true, "medium": true, "hard": true} + for _, c := range ds.Corpora { + for _, q := range c.Queries { + if q.Difficulty != "" && !validDiff[q.Difficulty] { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID)) + } + } + } + + if result.Errors == 0 && result.Warnings == 0 { + result.Messages = append(result.Messages, "All checks passed") + } + + return result, nil +} + +func PrintLintResult(result 
*LintResult, cfg LintConfig) { + for _, msg := range result.Messages { + fmt.Println(msg) + } + fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings) +} diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go new file mode 100644 index 0000000..6f00821 --- /dev/null +++ b/internal/benchmark/runner.go @@ -0,0 +1,466 @@ +package benchmark + +import ( + "context" + "os/exec" + "strings" + "time" + + "github.com/pinchtab/semantic" +) + +type QueryResult struct { + ID string `json:"id"` + Corpus string `json:"corpus"` + Query string `json:"query"` + Difficulty string `json:"difficulty"` + Tags []string `json:"tags"` + Intent string `json:"intent,omitempty"` + PageType string `json:"page_type,omitempty"` + Expected struct { + RelevantRefs []string `json:"relevant_refs"` + PartiallyRelevantRefs []string `json:"partially_relevant_refs"` + } `json:"expected"` + Actual struct { + BestRef string `json:"best_ref"` + BestScore float64 `json:"best_score"` + Matches []Match `json:"matches"` + } `json:"actual"` + Metrics struct { + RR float64 `json:"rr"` + PAt1 float64 `json:"p_at_1"` + PAt3 float64 `json:"p_at_3"` + HitAt3 int `json:"hit_at_3"` + HitAt5 int `json:"hit_at_5"` + BestRelevantRank *int `json:"best_relevant_rank"` + BestRelevantScore float64 `json:"best_relevant_score"` + BestWrongScore float64 `json:"best_wrong_score"` + Margin float64 `json:"margin"` + } `json:"metrics"` + Latency struct { + LibraryMs int64 `json:"library_ms"` + CLIMs *int64 `json:"cli_ms,omitempty"` + } `json:"latency"` + Status string `json:"status"` +} + +type Match struct { + Ref string `json:"ref"` + Score float64 `json:"score"` + Role string `json:"role"` + Name string `json:"name"` +} + +type Report struct { + SchemaVersion string `json:"schema_version"` + Run struct { + ID string `json:"id"` + Timestamp string `json:"timestamp"` + Tool string `json:"tool"` + GitSHA string `json:"git_sha,omitempty"` + GitDirty bool `json:"git_dirty,omitempty"` + Command 
string `json:"command"` + } `json:"run"` + Dataset struct { + Name string `json:"name"` + Version string `json:"version,omitempty"` + QueryCount int `json:"query_count"` + CorpusCount int `json:"corpus_count"` + } `json:"dataset"` + Config struct { + Profile string `json:"profile"` + Strategy string `json:"strategy"` + Threshold float64 `json:"threshold"` + TopK int `json:"top_k"` + Weights Weights `json:"weights"` + } `json:"config"` + Status string `json:"status"` + Metrics struct { + Overall OverallMetrics `json:"overall"` + Latency LatencyMetrics `json:"latency"` + ByCorpus map[string]CorpusMetrics `json:"by_corpus"` + ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"` + ByTag map[string]CorpusMetrics `json:"by_tag"` + } `json:"metrics"` + Results []QueryResult `json:"results"` +} + +type OverallMetrics struct { + Total int `json:"total"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + PAt3 float64 `json:"p_at_3"` + HitAt3 float64 `json:"hit_at_3"` + HitAt5 float64 `json:"hit_at_5"` + AvgMargin float64 `json:"avg_margin"` +} + +type LatencyMetrics struct { + LibraryP50Ms int64 `json:"library_p50_ms"` + LibraryP95Ms int64 `json:"library_p95_ms"` + CLIP50Ms *int64 `json:"cli_p50_ms,omitempty"` + CLIP95Ms *int64 `json:"cli_p95_ms,omitempty"` +} + +type CorpusMetrics struct { + Count int `json:"count"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + HitAt3 float64 `json:"hit_at_3"` + AvgMargin float64 `json:"avg_margin"` +} + +func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) { + matcher := createMatcher(cfg) + + report := &Report{ + SchemaVersion: "1.0.0", + Status: "pass", + } + report.Run.ID = time.Now().Format("20060102-150405") + "-" + cfg.Profile + report.Run.Timestamp = time.Now().UTC().Format(time.RFC3339) + report.Run.Tool = "semantic-bench" + report.Run.GitSHA, report.Run.GitDirty = getGitInfo() + report.Dataset.Name = "semantic-ui-matching-corpus" + report.Dataset.QueryCount = ds.QueryCount() + 
report.Dataset.CorpusCount = ds.CorpusCount() + report.Config.Profile = cfg.Profile + report.Config.Strategy = cfg.Strategy + report.Config.Threshold = cfg.Threshold + report.Config.TopK = cfg.TopK + report.Config.Weights = Weights{Lexical: cfg.LexicalWeight, Embedding: cfg.EmbeddingWeight} + + report.Metrics.ByCorpus = make(map[string]CorpusMetrics) + report.Metrics.ByDifficulty = make(map[string]CorpusMetrics) + report.Metrics.ByTag = make(map[string]CorpusMetrics) + + var allLatencies []int64 + + for _, corpus := range ds.Corpora { + if cfg.Corpus != "" && corpus.ID != cfg.Corpus { + continue + } + + queries := corpus.Queries + if cfg.Quick { + queries = selectQuickSubset(corpus.Queries) + } + + for _, query := range queries { + if cfg.QueryID != "" && query.ID != cfg.QueryID { + continue + } + + result := runQuery(matcher, corpus, query, cfg) + report.Results = append(report.Results, result) + allLatencies = append(allLatencies, result.Latency.LibraryMs) + } + } + + aggregateMetrics(report, allLatencies) + return report, nil +} + +// selectQuickSubset returns a deterministic subset for smoke testing. +// Selects up to 3 queries per corpus by difficulty. This is NOT representative +// of full corpus coverage—edge-case tags may be missed. Use for fast iteration, +// not for final regression checks. 
+func selectQuickSubset(queries []Query) []Query { + if len(queries) <= 3 { + return queries + } + + // Group by difficulty + byDiff := make(map[string][]Query) + for _, q := range queries { + diff := q.Difficulty + if diff == "" { + diff = "medium" + } + byDiff[diff] = append(byDiff[diff], q) + } + + // Select one from each difficulty level, up to 3 total + var selected []Query + for _, diff := range []string{"easy", "medium", "hard"} { + if qs, ok := byDiff[diff]; ok && len(qs) > 0 { + selected = append(selected, qs[0]) + if len(selected) >= 3 { + break + } + } + } + + // If we don't have 3 yet, fill from remaining + if len(selected) < 3 { + for _, q := range queries { + found := false + for _, s := range selected { + if s.ID == q.ID { + found = true + break + } + } + if !found { + selected = append(selected, q) + if len(selected) >= 3 { + break + } + } + } + } + + return selected +} + +func createMatcher(cfg RunConfig) semantic.ElementMatcher { + embedder := semantic.NewHashingEmbedder(128) + switch cfg.Strategy { + case "lexical": + return semantic.NewLexicalMatcher() + case "embedding": + return semantic.NewEmbeddingMatcher(embedder) + default: + return semantic.NewCombinedMatcher(embedder) + } +} + +func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg RunConfig) QueryResult { + result := QueryResult{ + ID: query.ID, + Corpus: corpus.ID, + Query: query.QueryText, + Difficulty: query.Difficulty, + Tags: query.Tags, + Intent: query.Intent, + PageType: query.PageType, + } + result.Expected.RelevantRefs = query.RelevantRefs + result.Expected.PartiallyRelevantRefs = query.PartiallyRelevantRefs + + threshold := cfg.Threshold + if query.Threshold != nil { + threshold = *query.Threshold + } + topK := cfg.TopK + if query.TopK != nil { + topK = *query.TopK + } + + start := time.Now() + findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{ + Threshold: threshold, + TopK: topK, + 
LexicalWeight: cfg.LexicalWeight, + EmbeddingWeight: cfg.EmbeddingWeight, + Explain: cfg.Explain, + }) + result.Latency.LibraryMs = time.Since(start).Milliseconds() + + result.Actual.BestRef = findResult.BestRef + result.Actual.BestScore = findResult.BestScore + for _, m := range findResult.Matches { + result.Actual.Matches = append(result.Actual.Matches, Match{ + Ref: m.Ref, + Score: m.Score, + Role: m.Role, + Name: m.Name, + }) + } + + computeQueryMetrics(&result, query) + return result +} + +func computeQueryMetrics(result *QueryResult, query Query) { + relevantSet := make(map[string]bool) + for _, r := range query.RelevantRefs { + relevantSet[r] = true + } + partialSet := make(map[string]bool) + for _, r := range query.PartiallyRelevantRefs { + partialSet[r] = true + } + + // Reciprocal Rank + for i, m := range result.Actual.Matches { + if relevantSet[m.Ref] { + result.Metrics.RR = 1.0 / float64(i+1) + break + } + } + + // P@1 + if len(result.Actual.Matches) > 0 { + if relevantSet[result.Actual.Matches[0].Ref] { + result.Metrics.PAt1 = 1.0 + } else if partialSet[result.Actual.Matches[0].Ref] { + result.Metrics.PAt1 = 0.5 + } + } + + // P@3, Hit@3, Hit@5 + relevantInTop3 := 0 + partialInTop3 := 0 + for i, m := range result.Actual.Matches { + if i >= 5 { + break + } + switch { + case relevantSet[m.Ref]: + if result.Metrics.BestRelevantRank == nil { + rank := i + 1 + result.Metrics.BestRelevantRank = &rank + } + if result.Metrics.BestRelevantScore == 0 || m.Score > result.Metrics.BestRelevantScore { + result.Metrics.BestRelevantScore = m.Score + } + if i < 3 { + relevantInTop3++ + result.Metrics.HitAt3 = 1 + } + result.Metrics.HitAt5 = 1 + case partialSet[m.Ref]: + if i < 3 { + partialInTop3++ + } + default: + if m.Score > result.Metrics.BestWrongScore { + result.Metrics.BestWrongScore = m.Score + } + } + } + result.Metrics.PAt3 = (float64(relevantInTop3) + float64(partialInTop3)*0.5) / 3.0 + result.Metrics.Margin = result.Metrics.BestRelevantScore - 
result.Metrics.BestWrongScore + + // Status + switch { + case query.ExpectNoMatch: + if len(result.Actual.Matches) == 0 { + result.Status = "no_match_expected" + } else { + result.Status = "unexpected_match" + } + case result.Metrics.PAt1 >= 1.0: + result.Status = "hit" + case result.Metrics.PAt1 >= 0.5: + result.Status = "partial" + default: + result.Status = "miss" + } +} + +func aggregateMetrics(report *Report, latencies []int64) { + n := len(report.Results) + if n == 0 { + return + } + + report.Metrics.Overall.Total = n + + var sumRR, sumP1, sumP3, sumHit3, sumHit5, sumMargin float64 + corpusAgg := make(map[string]*aggregator) + diffAgg := make(map[string]*aggregator) + tagAgg := make(map[string]*aggregator) + + for _, r := range report.Results { + sumRR += r.Metrics.RR + sumP1 += r.Metrics.PAt1 + sumP3 += r.Metrics.PAt3 + sumHit3 += float64(r.Metrics.HitAt3) + sumHit5 += float64(r.Metrics.HitAt5) + sumMargin += r.Metrics.Margin + + addToAgg(corpusAgg, r.Corpus, r) + addToAgg(diffAgg, r.Difficulty, r) + for _, t := range r.Tags { + addToAgg(tagAgg, t, r) + } + } + + report.Metrics.Overall.MRR = sumRR / float64(n) + report.Metrics.Overall.PAt1 = sumP1 / float64(n) + report.Metrics.Overall.PAt3 = sumP3 / float64(n) + report.Metrics.Overall.HitAt3 = sumHit3 / float64(n) + report.Metrics.Overall.HitAt5 = sumHit5 / float64(n) + report.Metrics.Overall.AvgMargin = sumMargin / float64(n) + + for k, a := range corpusAgg { + report.Metrics.ByCorpus[k] = a.toMetrics() + } + for k, a := range diffAgg { + report.Metrics.ByDifficulty[k] = a.toMetrics() + } + for k, a := range tagAgg { + report.Metrics.ByTag[k] = a.toMetrics() + } + + // Latency percentiles + if len(latencies) > 0 { + sorted := make([]int64, len(latencies)) + copy(sorted, latencies) + sortInt64(sorted) + report.Metrics.Latency.LibraryP50Ms = sorted[len(sorted)*50/100] + report.Metrics.Latency.LibraryP95Ms = sorted[len(sorted)*95/100] + } +} + +type aggregator struct { + count int + sumRR float64 + sumP1 
// sortInt64 sorts s in place into ascending order.
//
// It is a plain selection sort: for every position it finds the smallest
// remaining value and swaps it into place. That is quadratic in len(s).
func sortInt64(s []int64) {
	n := len(s)
	for head := 0; head < n-1; head++ {
		smallest := head
		for probe := head + 1; probe < n; probe++ {
			if s[probe] < s[smallest] {
				smallest = probe
			}
		}
		if smallest != head {
			s[head], s[smallest] = s[smallest], s[head]
		}
	}
}
// runtimeBaseline is the JSON document kept on disk as the runtime baseline.
// saveRuntimeBaseline writes it with Timestamp set to the current UTC time in
// RFC 3339 form; loadRuntimeBaseline reads it back so RunRuntime can compare
// fresh benchmark numbers against Benchmarks by name.
type runtimeBaseline struct {
	Timestamp  string             `json:"timestamp"`
	Benchmarks []RuntimeBenchmark `json:"benchmarks"`
}
result.Benchmarks[i].Status = "ok" + } + } else { + result.Benchmarks[i].Status = "new" + } + } + + if result.Regressions > 0 { + result.Status = "fail" + } + + return result, nil +} + +func runGoBenchmarks() ([]RuntimeBenchmark, error) { + root := FindBenchmarkRoot() + projectRoot := filepath.Join(root, "..", "..") + + cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...") + cmd.Dir = projectRoot + output, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("go test failed: %w\n%s", err, output) + } + + return parseBenchOutput(string(output)), nil +} + +func parseBenchOutput(output string) []RuntimeBenchmark { + var results []RuntimeBenchmark + lines := strings.Split(output, "\n") + + for _, line := range lines { + if !strings.HasPrefix(line, "Benchmark") { + continue + } + + fields := strings.Fields(line) + if len(fields) < 3 { + continue + } + + name := strings.TrimSuffix(fields[0], "-8") + name = strings.TrimSuffix(name, "-10") + name = strings.TrimSuffix(name, "-12") + name = strings.TrimSuffix(name, "-16") + + var nsOp float64 + var bytesOp, allocsOp int + + for i, f := range fields { + if f == "ns/op" && i > 0 { + _, _ = fmt.Sscanf(fields[i-1], "%f", &nsOp) + } + if f == "B/op" && i > 0 { + _, _ = fmt.Sscanf(fields[i-1], "%d", &bytesOp) + } + if f == "allocs/op" && i > 0 { + _, _ = fmt.Sscanf(fields[i-1], "%d", &allocsOp) + } + } + + if nsOp > 0 { + results = append(results, RuntimeBenchmark{ + Name: name, + NsOp: nsOp, + BytesOp: bytesOp, + AllocsOp: allocsOp, + }) + } + } + + return results +} + +func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error { + baseline := runtimeBaseline{ + Timestamp: time.Now().UTC().Format(time.RFC3339), + Benchmarks: benchmarks, + } + data, err := json.MarshalIndent(baseline, "", " ") + if err != nil { + return err + } + return os.WriteFile(path, data, 0644) +} + +func loadRuntimeBaseline(path string) (*runtimeBaseline, error) { + data, err := 
os.ReadFile(path) + if err != nil { + return nil, err + } + var baseline runtimeBaseline + if err := json.Unmarshal(data, &baseline); err != nil { + return nil, err + } + return &baseline, nil +} + +func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) { + if result.Created { + fmt.Printf("\n Created runtime baseline: %s\n", result.BaselinePath) + fmt.Printf(" Benchmarks: %d\n\n", len(result.Benchmarks)) + return + } + + fmt.Printf("\n Runtime Baseline Check\n\n") + + for _, b := range result.Benchmarks { + var status string + switch b.Status { + case "regression": + status = "\033[31mREGRESSION\033[0m" + case "warning": + status = "\033[33mWARNING\033[0m" + case "ok": + status = "\033[32mOK\033[0m" + case "new": + status = "\033[33mNEW\033[0m" + } + + if b.BaselineNs > 0 { + fmt.Printf(" %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n", + status, b.Name, b.BaselineNs, b.NsOp, b.Ratio) + } else { + fmt.Printf(" %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp) + } + } + + fmt.Println() + if result.Regressions > 0 { + fmt.Printf(" \033[31mRegressions: %d\033[0m\n\n", result.Regressions) + } else { + fmt.Printf(" \033[32mNo regressions\033[0m\n\n") + } +} diff --git a/internal/benchmark/tune.go b/internal/benchmark/tune.go new file mode 100644 index 0000000..7db259b --- /dev/null +++ b/internal/benchmark/tune.go @@ -0,0 +1,90 @@ +package benchmark + +import "fmt" + +type TuneResult struct { + Results []TuneRun `json:"results"` + Best *TuneRun `json:"best"` +} + +type TuneRun struct { + LexicalWeight float64 `json:"lexical_weight"` + EmbeddingWeight float64 `json:"embedding_weight"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + HitAt3 float64 `json:"hit_at_3"` +} + +func RunTune(cfg TuneConfig) (*TuneResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + result := &TuneResult{} + + if cfg.Verbose { + fmt.Printf(" %-10s %-10s %-8s %-8s %-8s\n", "lexical", 
"embedding", "MRR", "P@1", "Hit@3") + } + + for w := 0.0; w <= 1.0001; w += cfg.Step { + lexW := w + embW := 1.0 - w + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: lexW, + EmbeddingWeight: embW, + Mode: "library", + } + + if cfg.Corpus != "" { + runCfg.Corpus = cfg.Corpus + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err) + } + + run := TuneRun{ + LexicalWeight: lexW, + EmbeddingWeight: embW, + MRR: report.Metrics.Overall.MRR, + PAt1: report.Metrics.Overall.PAt1, + HitAt3: report.Metrics.Overall.HitAt3, + } + result.Results = append(result.Results, run) + + if result.Best == nil || run.PAt1 > result.Best.PAt1 || + (run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) { + best := run + result.Best = &best + } + + if cfg.Verbose { + fmt.Printf(" %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n", + lexW, embW, run.MRR, run.PAt1, run.HitAt3) + } + } + + return result, nil +} + +func PrintTuneResult(result *TuneResult, cfg TuneConfig) { + fmt.Printf("\n Tested %d weight combinations\n\n", len(result.Results)) + + if result.Best != nil { + fmt.Printf(" Best weights:\n") + fmt.Printf(" Lexical: %.2f\n", result.Best.LexicalWeight) + fmt.Printf(" Embedding: %.2f\n", result.Best.EmbeddingWeight) + fmt.Printf(" MRR: %.4f\n", result.Best.MRR) + fmt.Printf(" P@1: %.4f\n", result.Best.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Best.HitAt3) + } + fmt.Println() +} diff --git a/internal/benchmark/types.go b/internal/benchmark/types.go new file mode 100644 index 0000000..916978a --- /dev/null +++ b/internal/benchmark/types.go @@ -0,0 +1,67 @@ +package benchmark + +type CheckResult struct { + Status string `json:"status"` + Summary CheckSummary `json:"summary"` + Delta *MetricsDelta `json:"delta,omitempty"` + TopRegs []Regression `json:"top_regressions,omitempty"` + Artifacts Artifacts `json:"artifacts"` + Report *Report `json:"-"` +} + 
// CheckSummary holds the headline figures of a check run (it is the Summary
// field of CheckResult): the P@1, MRR and Hit@3 metrics plus the total,
// regression and warning counts.
type CheckSummary struct {
	PAt1        float64 `json:"p_at_1"`
	MRR         float64 `json:"mrr"`
	HitAt3      float64 `json:"hit_at_3"`
	Total       int     `json:"total"`
	Regressions int     `json:"regressions"`
	Warnings    int     `json:"warnings"`
}

// MetricsDelta carries a difference in P@1, MRR and Hit@3. It is used for the
// Delta fields of CheckResult and CompareResult.
// NOTE(review): presumably "current minus baseline" — confirm the sign
// convention against the code that fills it in.
type MetricsDelta struct {
	PAt1   float64 `json:"p_at_1"`
	MRR    float64 `json:"mrr"`
	HitAt3 float64 `json:"hit_at_3"`
}

// Regression describes a single regressed query: which query in which corpus,
// the expected refs, the ref returned in the baseline (omitted from JSON when
// empty) and now, a reason, and a command for debugging it.
type Regression struct {
	ID           string   `json:"id"`
	Corpus       string   `json:"corpus"`
	Query        string   `json:"query"`
	Expected     []string `json:"expected"`
	BaselineRef  string   `json:"baseline_ref,omitempty"`
	CurrentRef   string   `json:"current_ref"`
	Reason       string   `json:"reason"`
	DebugCommand string   `json:"debug_command"`
}

// Artifacts names the outputs of a run: a JSON report and a Markdown summary.
// NOTE(review): presumably file paths — confirm against the writer.
type Artifacts struct {
	ReportJSON string `json:"report_json"`
	SummaryMD  string `json:"summary_md"`
}

// CompareResult is the outcome of comparing two reports: an overall status,
// the metric delta, and the lists of regressions and improvements.
type CompareResult struct {
	Status       string       `json:"status"`
	Delta        MetricsDelta `json:"delta"`
	Regressions  []Regression `json:"regressions"`
	Improvements []string     `json:"improvements"`
}

// LintResult is the outcome of dataset validation: error and warning counts
// and the accompanying messages.
type LintResult struct {
	Errors   int      `json:"errors"`
	Warnings int      `json:"warnings"`
	Messages []string `json:"messages"`
}

// CatalogResult is the dataset inventory: one summary per corpus, the total
// number of queries, and query counts keyed by tag and by difficulty (both
// maps are omitted from JSON when empty).
type CatalogResult struct {
	Corpora      []CorpusSummary `json:"corpora"`
	TotalQueries int             `json:"total_queries"`
	ByTag        map[string]int  `json:"by_tag,omitempty"`
	ByDifficulty map[string]int  `json:"by_difficulty,omitempty"`
}

// CorpusSummary is the catalog entry for one corpus: its ID, its number of
// queries and the tags associated with it.
type CorpusSummary struct {
	ID      string   `json:"id"`
	Queries int      `json:"queries"`
	Tags    []string `json:"tags"`
}
// BenchmarkScoreFusion measures the weighted lexical/embedding combination
// (lexWeight*lex + embWeight*emb) over 100 precomputed score pairs.
//
// NOTE(review): this times an inline copy of the formula, not a function from
// the engine, so it will not notice changes to the engine's own fusion code.
func BenchmarkScoreFusion(b *testing.B) {
	lexScores := make([]float64, 100)
	embScores := make([]float64, 100)
	for i := range lexScores {
		lexScores[i] = float64(i) / 100.0
		embScores[i] = float64(100-i) / 100.0
	}
	lexWeight, embWeight := 0.6, 0.4
	b.ReportAllocs()

	// Every fused score is folded into a value that is inspected after the
	// loop. Assigning the result to the blank identifier would leave the
	// compiler free to delete the arithmetic being measured.
	var fused float64
	for b.Loop() {
		for j := range lexScores {
			fused += lexWeight*lexScores[j] + embWeight*embScores[j]
		}
	}
	if fused < 0 {
		b.Fatal("fused score sum is negative")
	}
}
i < b.N; i++ { + LexicalScore(tc.query, tc.desc) + } + }) + } +} + +func BenchmarkCombinedFind_WeightVariants(b *testing.B) { + elements := benchElements() + ctx := context.Background() + + weights := []struct { + name string + lex float64 + emb float64 + }{ + {"lex_only", 1.0, 0.0}, + {"emb_only", 0.0, 1.0}, + {"balanced", 0.5, 0.5}, + {"lex_heavy", 0.8, 0.2}, + {"emb_heavy", 0.2, 0.8}, + } + + for _, w := range weights { + b.Run(w.name, func(b *testing.B) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + opts := types.FindOptions{ + Threshold: 0.3, + TopK: 3, + LexicalWeight: w.lex, + EmbeddingWeight: w.emb, + } + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m.Find(ctx, "sign in button", elements, opts) + } + }) + } +} diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go new file mode 100644 index 0000000..1261dd6 --- /dev/null +++ b/recovery/benchmark_test.go @@ -0,0 +1,250 @@ +package recovery + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "runtime" + "testing" + "time" + + "github.com/pinchtab/semantic" +) + +type BenchmarkScenario struct { + ID string `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + OriginalQuery string `json:"original_query"` + OriginalRef string `json:"original_ref"` + Before []semantic.ElementDescriptor `json:"before"` + After []semantic.ElementDescriptor `json:"after"` + ExpectedRef *string `json:"expected_ref"` + ExpectedAlt []string `json:"expected_alt"` + ExpectNoMatch bool `json:"expect_no_match"` + Difficulty string `json:"difficulty"` +} + +func loadScenarios(t *testing.T) []BenchmarkScenario { + _, thisFile, _, _ := runtime.Caller(0) + repoRoot := filepath.Join(filepath.Dir(thisFile), "..") + scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json") + + data, err := os.ReadFile(scenariosPath) + if err != nil { + t.Fatalf("failed to read scenarios: %v", err) + } + + 
var scenarios []BenchmarkScenario + if err := json.Unmarshal(data, &scenarios); err != nil { + t.Fatalf("failed to parse scenarios: %v", err) + } + + return scenarios +} + +func TestRecoveryBenchmark_Scenarios(t *testing.T) { + scenarios := loadScenarios(t) + matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128)) + + passed, failed := 0, 0 + + for _, sc := range scenarios { + t.Run(sc.ID, func(t *testing.T) { + result := runBenchmarkScenario(t, matcher, sc) + + if result.pass { + passed++ + t.Logf("PASS: recovered=%v got=%s expected=%s score=%.3f", + result.recovered, result.gotRef, result.expectedRef, result.score) + } else { + failed++ + t.Errorf("FAIL: recovered=%v got=%s expected=%s score=%.3f error=%s", + result.recovered, result.gotRef, result.expectedRef, result.score, result.err) + } + }) + } + + t.Logf("Summary: %d passed, %d failed out of %d scenarios", passed, failed, len(scenarios)) +} + +type scenarioResult struct { + pass bool + recovered bool + gotRef string + expectedRef string + score float64 + confidence string + latencyMs int64 + err string +} + +func runBenchmarkScenario(t *testing.T, matcher semantic.ElementMatcher, sc BenchmarkScenario) scenarioResult { + result := scenarioResult{} + + if sc.ExpectedRef != nil { + result.expectedRef = *sc.ExpectedRef + } + + var origDesc semantic.ElementDescriptor + for _, d := range sc.Before { + if d.Ref == sc.OriginalRef { + origDesc = d + break + } + } + + cache := NewIntentCache(100, 5*time.Minute) + cache.Store("test-tab", sc.OriginalRef, IntentEntry{ + Query: sc.OriginalQuery, + Descriptor: origDesc, + Score: 0.95, + Confidence: "high", + Strategy: "combined", + }) + + re := NewRecoveryEngine( + DefaultRecoveryConfig(), + matcher, + cache, + func(_ context.Context, _ string) error { return nil }, + func(_, ref string) (int64, bool) { + for i, d := range sc.After { + if d.Ref == ref { + return int64(1000 + i), true + } + } + return 0, false + }, + func(_ string) 
[]semantic.ElementDescriptor { return sc.After }, + ) + + start := time.Now() + + err := fmt.Errorf("could not find node with id %s", sc.OriginalRef) + + if !re.ShouldAttempt(err, sc.OriginalRef) { + result.err = "ShouldAttempt returned false" + result.pass = sc.ExpectNoMatch + result.latencyMs = time.Since(start).Milliseconds() + return result + } + + rr, _, recErr := re.AttemptWithClassification( + context.Background(), + "test-tab", + sc.OriginalRef, + "click", + ClassifyFailure(err), + func(_ context.Context, kind string, nodeID int64) (map[string]any, error) { + return map[string]any{"clicked": true}, nil + }, + ) + + result.latencyMs = time.Since(start).Milliseconds() + result.recovered = rr.Recovered + result.gotRef = rr.NewRef + result.score = rr.Score + result.confidence = rr.Confidence + + if recErr != nil { + result.err = recErr.Error() + } + + if sc.ExpectNoMatch { + result.pass = !rr.Recovered + } else if sc.ExpectedRef != nil { + if rr.NewRef == *sc.ExpectedRef { + result.pass = true + } else { + for _, alt := range sc.ExpectedAlt { + if rr.NewRef == alt { + result.pass = true + break + } + } + } + } + + return result +} + +func BenchmarkRecoveryEngine_Scenarios(b *testing.B) { + scenarios := loadScenariosB(b) + matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128)) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, sc := range scenarios { + runBenchmarkScenarioB(b, matcher, sc) + } + } +} + +func loadScenariosB(b *testing.B) []BenchmarkScenario { + _, thisFile, _, _ := runtime.Caller(0) + repoRoot := filepath.Join(filepath.Dir(thisFile), "..") + scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json") + + data, err := os.ReadFile(scenariosPath) + if err != nil { + b.Fatalf("failed to read scenarios: %v", err) + } + + var scenarios []BenchmarkScenario + if err := json.Unmarshal(data, &scenarios); err != nil { + b.Fatalf("failed to parse scenarios: %v", err) + } + + return 
#!/bin/bash
#
# Check for broken documentation links
#
# Scans every Markdown file in the repository (excluding .git and
# node_modules) for inline links of the form [text](path) and reports each
# link whose target does not exist on disk. External URLs, mailto: links and
# pure in-page anchors are not checked.
#
# Exit status: 0 when all links resolve, 1 when at least one is broken or the
# repository root cannot be entered.
#
# Usage:
#   ./scripts/check-docs-links.sh
#
set -uo pipefail

# All paths below are resolved from the repository root.
cd "$(dirname "$0")/.." || exit 1

RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m'

ERRORS=0

echo "Checking documentation links..."
echo ""

# Find all markdown files and check links
while IFS= read -r file; do
    dir=$(dirname "$file")

    # Extract markdown links: [text](path)
    while IFS= read -r link; do
        # Skip URLs and anchors
        if [[ "$link" =~ ^https?:// ]] || [[ "$link" =~ ^mailto: ]] || [[ "$link" =~ ^# ]]; then
            continue
        fi

        # Drop an optional link title: [text](path "title")
        link_path="${link%% *}"

        # Remove anchor from link
        link_path="${link_path%%#*}"

        # Skip empty paths
        if [[ -z "$link_path" ]]; then
            continue
        fi

        # Resolve the target. A leading "/" means "from the repository root"
        # (the current directory), not the filesystem root.
        if [[ "$link_path" =~ ^/ ]]; then
            target=".$link_path"
        else
            target="$dir/$link_path"
        fi

        # Check if target exists
        if [[ ! -e "$target" ]]; then
            # printf keeps backslashes in file names and links literal;
            # only the colour codes are interpreted.
            printf '%b %s -> %s\n' "${RED}BROKEN:${NC}" "$file" "$link"
            ERRORS=$((ERRORS + 1))
        fi
    done < <(grep -oE '\]\([^)]+\)' "$file" 2>/dev/null | sed 's/\](//' | sed 's/)//')
done < <(find . -name "*.md" -not -path "./.git/*" -not -path "./node_modules/*")

echo ""
if [[ $ERRORS -eq 0 ]]; then
    echo -e "${GREEN}✓${NC} All documentation links valid"
    exit 0
else
    echo -e "${RED}Found $ERRORS broken link(s)${NC}"
    exit 1
fi
-## Project Location +## Essential Commands +**Before any PR:** ```bash -cd ~/dev/semantic +./dev pr # runs: check + e2e + lint corpus + bench ``` -## Dev Commands - -All development commands run via `./dev`: - -| Command | Description | -|---------|-------------| -| `./dev doctor` | Setup dev environment | -| `./dev test` | Run unit tests | -| `./dev test verbose` | Run unit tests (verbose) | -| `./dev test race` | Run unit tests with race detector | -| `./dev coverage` | Run tests with coverage report | -| `./dev lint` | Run golangci-lint | -| `./dev fmt` | Format code | -| `./dev vet` | Run go vet | -| `./dev check` | All checks (fmt + vet + lint + test) | -| `./dev build` | Build CLI binary | -| `./dev bench` | Run corpus benchmark suite | -| `./dev e2e` | Run E2E tests (Docker) | +**During development:** +```bash +./dev test # unit tests (fast) +./dev check # fmt + vet + lint + test race (full validation) +./dev build # build ./semantic CLI binary +``` + +**Quality regression checks:** +```bash +./dev baseline check # compare quality against baseline +./dev runtime # compare performance against baseline +``` + +**When quality changes intentionally:** +```bash +./dev baseline update # accept new quality baseline (after review) +``` + +## When to Use Each + +| Scenario | Command | +|----------|---------| +| Made code changes, quick sanity | `./dev test` | +| Ready to commit | `./dev check` | +| Before opening PR | `./dev pr` | +| Changed scoring/matching logic | `./dev baseline check` | +| Performance-sensitive changes | `./dev runtime` | +| Tuning weights | `./dev tune` then `./dev bench` | ## Architecture @@ -54,6 +65,7 @@ recovery/ Public subpackage failure.go FailureType classification cmd/semantic/main.go CLI tool (find, match, classify) +cmd/semantic-bench/ Benchmark CLI (check, baseline, calibrate, tune, runtime) ``` ## Key Design Decisions @@ -79,6 +91,27 @@ cmd/semantic/main.go CLI tool (find, match, classify) 4. 
**Pre-commit hook** runs gofmt + golangci-lint automatically on staged files. +## Benchmark Improvement Loop + +When implementing changes that affect matching quality: + +```bash +./dev baseline # create baseline (first time only) +# ... make changes ... +./dev bench # run benchmark, compare to baseline +./dev baseline update # accept new baseline (if improved) +``` + +**Key metrics:** +- **MRR** — Mean Reciprocal Rank (higher = finds correct element faster) +- **P@1** — Precision at 1 (is top result correct?) +- **Hit@3** — Any correct result in top 3? + +**Adding test cases:** +1. Add to `tests/benchmark/corpus/*/queries.json` +2. Run `./dev lint corpus` to validate +3. Run `./dev bench` — shows regression until fixed + ## Public API Surface Only these symbols are visible to consumers: diff --git a/tests/benchmark/baselines/.gitkeep b/tests/benchmark/baselines/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/benchmark/config/benchmark.json b/tests/benchmark/config/benchmark.json index 23b5661..7b06060 100644 --- a/tests/benchmark/config/benchmark.json +++ b/tests/benchmark/config/benchmark.json @@ -1,13 +1,35 @@ { - "version": "1.0.0", - "strategies": ["lexical", "embedding", "combined"], - "default_strategy": "combined", - "default_threshold": 0.3, - "default_top_k": 3, - "metrics": { - "min_accuracy": 0.85, - "min_avg_score": 0.5, - "max_latency_ms": 100 + "version": "1.1.0", + "defaults": { + "strategy": "combined", + "threshold": 0.01, + "top_k": 5, + "weights": { + "lexical": 0.6, + "embedding": 0.4 + } + }, + "baseline": { + "quality": { + "max_overall_p_at_1_drop": 0.02, + "max_overall_mrr_drop": 0.02, + "max_overall_hit_at_3_drop": 0.02, + "max_corpus_p_at_1_drop": 0.08, + "max_difficulty_p_at_1_drop": 0.08, + "max_margin_drop_report": 0.15 + }, + "runtime": { + "max_ns_op_regression_ratio": 1.25, + "max_alloc_regression_ratio": 1.25, + "max_corpus_latency_p50_ms": 75, + "max_corpus_latency_p95_ms": 200 + } }, + "results": { 
+ "dir": "tests/benchmark/results", + "baselines_dir": "tests/benchmark/baselines", + "generated_files_policy": "warn" + }, + "strategies": ["lexical", "embedding", "combined"], "snapshots_dir": "../e2e/assets/snapshots" } diff --git a/tests/benchmark/scripts/finalize-report.sh b/tests/benchmark/scripts/finalize-report.sh deleted file mode 100755 index 38d314f..0000000 --- a/tests/benchmark/scripts/finalize-report.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -# -# Finalize benchmark report and generate summary -# -# Usage: -# ./finalize-report.sh -# -set -euo pipefail - -if [[ $# -lt 1 ]]; then - echo "Usage: $0 " - exit 1 -fi - -REPORT_FILE="$1" -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -# Calculate final metrics -TMP_FILE=$(mktemp) -jq ' - .summary.accuracy = (if .summary.total > 0 then (.summary.passed / .summary.total * 10000 | floor / 100) else 0 end) | - .summary.avg_score = (if (.results | length) > 0 then ([.results[].score] | add / length | . * 1000 | floor / 1000) else 0 end) | - .summary.avg_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | add / length | floor) else 0 end) | - .summary.min_score = (if (.results | length) > 0 then ([.results[].score] | min) else 0 end) | - .summary.max_score = (if (.results | length) > 0 then ([.results[].score] | max) else 0 end) | - .summary.min_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | min) else 0 end) | - .summary.max_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | max) else 0 end) -' "${REPORT_FILE}" > "${TMP_FILE}" -mv "${TMP_FILE}" "${REPORT_FILE}" - -# Generate markdown summary -TIMESTAMP=$(jq -r '.benchmark.timestamp' "${REPORT_FILE}") -STRATEGY=$(jq -r '.benchmark.strategy' "${REPORT_FILE}") -VERSION=$(jq -r '.benchmark.version' "${REPORT_FILE}") -TOTAL=$(jq -r '.summary.total' "${REPORT_FILE}") -PASSED=$(jq -r '.summary.passed' "${REPORT_FILE}") -FAILED=$(jq -r '.summary.failed' "${REPORT_FILE}") -SKIPPED=$(jq -r 
'.summary.skipped' "${REPORT_FILE}") -ACCURACY=$(jq -r '.summary.accuracy' "${REPORT_FILE}") -AVG_SCORE=$(jq -r '.summary.avg_score' "${REPORT_FILE}") -AVG_LATENCY=$(jq -r '.summary.avg_latency_ms' "${REPORT_FILE}") -MIN_SCORE=$(jq -r '.summary.min_score' "${REPORT_FILE}") -MAX_SCORE=$(jq -r '.summary.max_score' "${REPORT_FILE}") -MIN_LATENCY=$(jq -r '.summary.min_latency_ms' "${REPORT_FILE}") -MAX_LATENCY=$(jq -r '.summary.max_latency_ms' "${REPORT_FILE}") - -cat > "${SUMMARY_FILE}" << EOF -# Semantic Matching Benchmark Results - -## Benchmark Info - -| Field | Value | -|-------|-------| -| Timestamp | ${TIMESTAMP} | -| Strategy | ${STRATEGY} | -| Version | ${VERSION} | - -## Results Summary - -| Metric | Value | -|--------|-------| -| Total Cases | ${TOTAL} | -| Passed | ${PASSED} | -| Failed | ${FAILED} | -| Skipped | ${SKIPPED} | -| **Accuracy** | **${ACCURACY}%** | - -## Score Distribution - -| Metric | Value | -|--------|-------| -| Average Score | ${AVG_SCORE} | -| Min Score | ${MIN_SCORE} | -| Max Score | ${MAX_SCORE} | - -## Latency - -| Metric | Value | -|--------|-------| -| Average | ${AVG_LATENCY} ms | -| Min | ${MIN_LATENCY} ms | -| Max | ${MAX_LATENCY} ms | - -## Failed Cases - -EOF - -# Add failed cases -jq -r '.results[] | select(.status == "fail") | "| \(.id) | \(.notes) |"' "${REPORT_FILE}" >> "${SUMMARY_FILE}" - -if [[ $(jq '[.results[] | select(.status == "fail")] | length' "${REPORT_FILE}") -eq 0 ]]; then - echo "_No failures_" >> "${SUMMARY_FILE}" -else - # Add header - sed -i.bak '/## Failed Cases/a\ -| ID | Notes |\ -|-----|-------|' "${SUMMARY_FILE}" - rm -f "${SUMMARY_FILE}.bak" -fi - -echo "" -echo "================================================" -echo " BENCHMARK SUMMARY" -echo "================================================" -echo " Strategy: ${STRATEGY}" -echo " Total: ${TOTAL}" -echo " Passed: ${PASSED}" -echo " Failed: ${FAILED}" -echo " Accuracy: ${ACCURACY}%" -echo " Avg Score: ${AVG_SCORE}" -echo " Avg Latency: ${AVG_LATENCY} 
ms" -echo "================================================" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/lint-corpus.sh b/tests/benchmark/scripts/lint-corpus.sh deleted file mode 100755 index 29f81b2..0000000 --- a/tests/benchmark/scripts/lint-corpus.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -CORPUS_DIR="${BENCHMARK_DIR}/corpus" -CASES_DIR="${BENCHMARK_DIR}/cases" -SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots" - -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[0;33m' -NC='\033[0m' - -ERRORS=0 -WARNINGS=0 - -error() { - echo -e "${RED}ERROR:${NC} $1" - ((ERRORS++)) -} - -warn() { - echo -e "${YELLOW}WARN:${NC} $1" - ((WARNINGS++)) -} - -ok() { - echo -e "${GREEN}βœ“${NC} $1" -} - -echo "=== Corpus Lint ===" -echo "" - -# 1. Check for invalid JSON in all benchmark files -echo "Checking JSON validity..." -for f in "${CORPUS_DIR}"/*/*.json "${CASES_DIR}"/*.json; do - if [[ -f "$f" ]]; then - if ! jq . "$f" >/dev/null 2>&1; then - error "Invalid JSON: $f" - fi - fi -done - -# 2. Check for duplicate query IDs across corpus files -echo "Checking for duplicate query IDs..." 
-declare -A QUERY_IDS -for f in "${CORPUS_DIR}"/*/queries.json; do - if [[ -f "$f" ]]; then - while IFS= read -r id; do - if [[ -n "$id" && "$id" != "null" ]]; then - if [[ -n "${QUERY_IDS[$id]:-}" ]]; then - error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})" - else - QUERY_IDS[$id]="$f" - fi - fi - done < <(jq -r '.[].id // empty' "$f" 2>/dev/null) - fi -done - -# Also check cases files -for f in "${CASES_DIR}"/*.json; do - if [[ -f "$f" ]]; then - while IFS= read -r id; do - if [[ -n "$id" && "$id" != "null" ]]; then - if [[ -n "${QUERY_IDS[$id]:-}" ]]; then - error "Duplicate query ID '$id' in $f (first seen in ${QUERY_IDS[$id]})" - else - QUERY_IDS[$id]="$f" - fi - fi - done < <(jq -r '.[].id // empty' "$f" 2>/dev/null) - fi -done - -# 3. Check for duplicate refs within snapshots -echo "Checking for duplicate refs in snapshots..." -for f in "${CORPUS_DIR}"/*/snapshot.json; do - if [[ -f "$f" ]]; then - dupes=$(jq -r '.[].ref' "$f" 2>/dev/null | sort | uniq -d) - if [[ -n "$dupes" ]]; then - error "Duplicate refs in $f: $dupes" - fi - fi -done - -# 4. Check that relevant_refs exist in snapshot -echo "Checking relevant_refs exist in snapshots..." -for corpus_dir in "${CORPUS_DIR}"/*/; do - corpus_name=$(basename "$corpus_dir") - snapshot="${corpus_dir}snapshot.json" - queries="${corpus_dir}queries.json" - - if [[ -f "$snapshot" && -f "$queries" ]]; then - # Get all refs from snapshot - refs=$(jq -r '.[].ref' "$snapshot" 2>/dev/null | sort | uniq) - - # Check relevant_refs - while IFS= read -r ref; do - if [[ -n "$ref" && "$ref" != "null" ]]; then - if ! echo "$refs" | grep -qx "$ref"; then - error "[$corpus_name] relevant_ref '$ref' not found in snapshot" - fi - fi - done < <(jq -r '.[].relevant_refs[]? // empty' "$queries" 2>/dev/null) - - # Check partially_relevant_refs - while IFS= read -r ref; do - if [[ -n "$ref" && "$ref" != "null" ]]; then - if ! 
echo "$refs" | grep -qx "$ref"; then - error "[$corpus_name] partially_relevant_ref '$ref' not found in snapshot" - fi - fi - done < <(jq -r '.[].partially_relevant_refs[]? // empty' "$queries" 2>/dev/null) - fi -done - -# 5. Check for empty relevant_refs (except no-match cases) -echo "Checking for empty relevant_refs..." -for f in "${CORPUS_DIR}"/*/queries.json; do - if [[ -f "$f" ]]; then - empty_relevant=$(jq -r '.[] | select(.relevant_refs | length == 0) | select(.partially_relevant_refs | length == 0) | select(.expect_no_match != true) | .id' "$f" 2>/dev/null) - for id in $empty_relevant; do - if [[ -n "$id" ]]; then - warn "Query '$id' in $f has empty relevant_refs" - fi - done - fi -done - -# 6. Check difficulty values -echo "Checking difficulty values..." -VALID_DIFFICULTIES="easy medium hard" -for f in "${CORPUS_DIR}"/*/queries.json; do - if [[ -f "$f" ]]; then - while IFS= read -r line; do - id=$(echo "$line" | cut -d'|' -f1) - diff=$(echo "$line" | cut -d'|' -f2) - if [[ -n "$diff" && "$diff" != "null" ]]; then - if ! echo "$VALID_DIFFICULTIES" | grep -qw "$diff"; then - error "Invalid difficulty '$diff' for query '$id' in $f" - fi - fi - done < <(jq -r '.[] | "\(.id)|\(.difficulty // "null")"' "$f" 2>/dev/null) - fi -done - -# 7. Check for known tags (warn on unknown) -echo "Checking tags..." 
-KNOWN_TAGS="absent-control accessibility action action-synonym action-verb adversarial alertdialog all-stopwords auth basket-cart bulk-action button cell checkbox combobox compound context-exclusion conversational dashboard description descriptive dialog directional disambiguation domain-intent download-export duplicate-labels ecommerce empty-query empty-snapshot exact exact-match filter find-search generic-verb github guard icon implicit input interactive-boost keyboard-mash legal link literal-text login login-signin long-query lookup-search media menu menuitem missing-letter name-match natural-language navigation negative-context no-match noise-tokens nonsense option ordinal pagination parent-context partial position preferences-settings purchase-buy question-form radio register-create registration repeated-word row-context search searchbox section section-context signout-logout single-char social special-chars spinbutton stale-ref state switch synonym synonym-chain tab table textbox threshold toggle transposition typo vague-query visual weak-match wikipedia" -for f in "${CORPUS_DIR}"/*/queries.json "${CASES_DIR}"/*.json; do - if [[ -f "$f" ]]; then - while IFS= read -r tag; do - if [[ -n "$tag" && "$tag" != "null" ]]; then - if ! echo "$KNOWN_TAGS" | grep -qw "$tag"; then - warn "Unknown tag '$tag' in $f" - fi - fi - done < <(jq -r '.[].tags[]? // empty' "$f" 2>/dev/null) - fi -done - -# 8. Check case files reference existing snapshots -echo "Checking case file snapshot references..." -for f in "${CASES_DIR}"/*.json; do - if [[ -f "$f" ]]; then - while IFS= read -r snapshot; do - if [[ -n "$snapshot" && "$snapshot" != "null" ]]; then - if [[ ! -f "${SNAPSHOTS_DIR}/${snapshot}" ]]; then - error "Case file $f references missing snapshot: $snapshot" - fi - fi - done < <(jq -r '.[].snapshot // empty' "$f" 2>/dev/null) - fi -done - -# 9. Check for generated result files in source tree -echo "Checking for generated result files..." 
-if ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | grep -v '.gitkeep' | head -1 >/dev/null 2>&1; then - result_count=$(ls "${BENCHMARK_DIR}"/results/*.json 2>/dev/null | wc -l | tr -d ' ') - warn "Found $result_count generated result files in tests/benchmark/results/ (should be gitignored)" -fi - -echo "" -echo "=== Summary ===" -if [[ $ERRORS -eq 0 && $WARNINGS -eq 0 ]]; then - ok "All checks passed" - exit 0 -elif [[ $ERRORS -eq 0 ]]; then - echo -e "${YELLOW}Warnings: $WARNINGS${NC}" - exit 0 -else - echo -e "${RED}Errors: $ERRORS${NC}" - echo -e "${YELLOW}Warnings: $WARNINGS${NC}" - exit 1 -fi diff --git a/tests/benchmark/scripts/record-result.sh b/tests/benchmark/scripts/record-result.sh deleted file mode 100755 index 2288f7c..0000000 --- a/tests/benchmark/scripts/record-result.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# -# Record a benchmark result -# -# Usage: -# ./record-result.sh "notes" -# -set -euo pipefail - -if [[ $# -lt 5 ]]; then - echo "Usage: $0 [notes]" - exit 1 -fi - -REPORT_FILE="$1" -ID="$2" -STATUS="$3" -SCORE="$4" -LATENCY_MS="$5" -NOTES="${6:-}" -TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) - -# Create result entry -RESULT_JSON=$(jq -n \ - --arg id "${ID}" \ - --arg status "${STATUS}" \ - --argjson score "${SCORE}" \ - --argjson latency "${LATENCY_MS}" \ - --arg notes "${NOTES}" \ - --arg ts "${TIMESTAMP}" \ - '{id: $id, status: $status, score: $score, latency_ms: $latency, notes: $notes, timestamp: $ts}') - -# Append to report -TMP_FILE=$(mktemp) -jq --argjson result "${RESULT_JSON}" \ - --arg status "${STATUS}" \ - '.results += [$result] | - .summary.total += 1 | - if $status == "pass" then .summary.passed += 1 - elif $status == "fail" then .summary.failed += 1 - else .summary.skipped += 1 end' \ - "${REPORT_FILE}" > "${TMP_FILE}" - -mv "${TMP_FILE}" "${REPORT_FILE}" diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh deleted file mode 100755 index 4ce67d6..0000000 --- 
a/tests/benchmark/scripts/run-benchmark.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash -# -# Run semantic matching benchmark -# -# Usage: -# ./run-benchmark.sh [--strategy ] [--cases ] -# -# Options: -# --strategy Strategy to benchmark (lexical, embedding, combined) -# --cases Specific case file to run (default: all) -# --output Output directory (default: ../results) -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -CASES_DIR="${BENCHMARK_DIR}/cases" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" -SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots" -RESULTS_DIR="${BENCHMARK_DIR}/results" - -# Parse args -STRATEGY="combined" -CASE_FILE="" -while [[ $# -gt 0 ]]; do - case "$1" in - --strategy) STRATEGY="$2"; shift 2 ;; - --cases) CASE_FILE="$2"; shift 2 ;; - --output) RESULTS_DIR="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -case "${STRATEGY}" in - lexical|embedding|combined) ;; - *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;; -esac - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.json" - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg strategy "${STRATEGY}" \ - --arg version "$(${SEMANTIC} --version 2>/dev/null || echo 'dev')" \ - '{ - benchmark: { - timestamp: $ts, - strategy: $strategy, - version: $version - }, - results: [], - summary: { - total: 0, - passed: 0, - failed: 0, - skipped: 0, - accuracy: 0, - avg_score: 0, - avg_latency_ms: 0 - } - }' > "${REPORT_FILE}" - -# Run cases -score_at_least() { - local score="$1" - local min_score="$2" - awk -v score="${score}" -v min_score="${min_score}" 'BEGIN { exit (score + 0 >= min_score + 0) ? 
0 : 1 }' -} - -run_case() { - local case_file="$1" - local case_name - case_name=$(basename "$case_file" .json) - - echo "" - echo "=== Running: ${case_name} ===" - - local count - count=$(jq length "$case_file") - - for i in $(seq 0 $((count - 1))); do - local id query snapshot expect_ref expect_ref_alt expect_no_match expect_no_crash expect_has_matches threshold min_score - - id=$(jq -r ".[$i].id" "$case_file") - query=$(jq -r ".[$i].query" "$case_file") - snapshot=$(jq -r ".[$i].snapshot" "$case_file") - expect_ref=$(jq -r ".[$i].expect_ref // empty" "$case_file") - expect_ref_alt=$(jq -r ".[$i].expect_ref_alt // [] | join(\",\")" "$case_file") - expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$case_file") - expect_no_crash=$(jq -r ".[$i].expect_no_crash // false" "$case_file") - expect_has_matches=$(jq -r ".[$i].expect_has_matches // false" "$case_file") - threshold=$(jq -r ".[$i].threshold // 0.3" "$case_file") - min_score=$(jq -r ".[$i].min_score // 0" "$case_file") - - local snapshot_path="${SNAPSHOTS_DIR}/${snapshot}" - if [[ ! -f "${snapshot_path}" ]]; then - echo " [${id}] SKIP: snapshot not found: ${snapshot}" - "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "skip" 0 0 "snapshot not found" - continue - fi - - # Run query and measure time - local start_ms end_ms duration_ms result exit_code - start_ms=$(python3 -c 'import time; print(int(time.time() * 1000))') - - set +e - result=$("${SEMANTIC}" find "${query}" \ - --snapshot "${snapshot_path}" \ - --strategy "${STRATEGY}" \ - --threshold "${threshold}" \ - --format json 2>&1) - exit_code=$? 
- set -e - - end_ms=$(python3 -c 'import time; print(int(time.time() * 1000))') - duration_ms=$((end_ms - start_ms)) - - # Evaluate result - local status="fail" - local got_ref="" - local got_score=0 - local notes="" - - if [[ ${exit_code} -ne 0 ]]; then - if [[ "${expect_no_crash}" == "true" ]]; then - # Some crashes are expected (empty query, etc) - status="pass" - notes="exit ${exit_code} (expected)" - else - notes="exit ${exit_code}: ${result}" - fi - else - got_ref=$(echo "$result" | jq -r '.best_ref // empty') - got_score=$(echo "$result" | jq -r '.best_score // 0') - local match_count - match_count=$(echo "$result" | jq -r '.matches | length') - - if [[ "${expect_no_match}" == "true" ]]; then - if [[ ${match_count} -eq 0 ]]; then - status="pass" - notes="no matches (expected)" - else - notes="expected no matches, got ${match_count}" - fi - elif [[ "${expect_has_matches}" == "true" ]]; then - if [[ ${match_count} -gt 0 ]]; then - if score_at_least "${got_score}" "${min_score}"; then - status="pass" - notes="${match_count} matches, score=${got_score}" - else - notes="${match_count} matches, score=${got_score} below min_score=${min_score}" - fi - else - notes="expected matches, got 0" - fi - elif [[ -n "${expect_ref}" ]]; then - if [[ "${got_ref}" == "${expect_ref}" ]]; then - if score_at_least "${got_score}" "${min_score}"; then - status="pass" - notes="ref=${got_ref}, score=${got_score}" - else - notes="ref=${got_ref}, score=${got_score} below min_score=${min_score}" - fi - elif [[ -n "${expect_ref_alt}" ]] && echo ",${expect_ref_alt}," | grep -q ",${got_ref},"; then - if score_at_least "${got_score}" "${min_score}"; then - status="pass" - notes="ref=${got_ref} (alt), score=${got_score}" - else - notes="ref=${got_ref} (alt), score=${got_score} below min_score=${min_score}" - fi - else - notes="got ${got_ref}, want ${expect_ref}" - fi - elif [[ "${expect_no_crash}" == "true" ]]; then - status="pass" - notes="no crash" - fi - fi - - # Record result - 
"${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "${status}" "${got_score}" "${duration_ms}" "${notes}" - - if [[ "${status}" == "pass" ]]; then - echo " [${id}] PASS: ${notes}" - else - echo " [${id}] FAIL: ${notes}" - fi - done -} - -# Find case files -if [[ -n "${CASE_FILE}" ]]; then - run_case "${CASES_DIR}/${CASE_FILE}" -else - for case_file in "${CASES_DIR}"/*.json; do - [[ -f "$case_file" ]] || continue - run_case "$case_file" - done -fi - -# Finalize report -"${SCRIPT_DIR}/finalize-report.sh" "${REPORT_FILE}" - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo "Benchmark complete: ${REPORT_FILE}" diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh deleted file mode 100755 index b5579bf..0000000 --- a/tests/benchmark/scripts/run-corpus-benchmark.sh +++ /dev/null @@ -1,500 +0,0 @@ -#!/bin/bash -# -# Run semantic matching benchmark with ranking metrics -# -# Usage: -# ./run-corpus-benchmark.sh [--strategy ] [--corpus ] [--lexical-weight ] [--embedding-weight ] -# -# Metrics: -# - MRR (Mean Reciprocal Rank) -# - P@1 (Precision at 1) -# - P@3 (Precision at 3) -# - Latency distribution (p50, p95, p99) -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." 
-CORPUS_DIR="${BENCHMARK_DIR}/corpus" -RESULTS_DIR="${BENCHMARK_DIR}/results" - -# Parse args -STRATEGY="combined" -SPECIFIC_CORPUS="" -TOP_K=5 -LEXICAL_WEIGHT=0.6 -EMBEDDING_WEIGHT=0.4 -while [[ $# -gt 0 ]]; do - case "$1" in - --strategy) STRATEGY="$2"; shift 2 ;; - --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;; - --top-k) TOP_K="$2"; shift 2 ;; - --lexical-weight) LEXICAL_WEIGHT="$2"; shift 2 ;; - --embedding-weight) EMBEDDING_WEIGHT="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -case "${STRATEGY}" in - lexical|embedding|combined) ;; - *) echo "Unknown strategy: ${STRATEGY}"; exit 1 ;; -esac - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json" - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg strategy "${STRATEGY}" \ - --argjson top_k "${TOP_K}" \ - --argjson lexical_weight "${LEXICAL_WEIGHT}" \ - --argjson embedding_weight "${EMBEDDING_WEIGHT}" \ - '{ - benchmark: { - timestamp: $ts, - strategy: $strategy, - top_k: $top_k, - type: "corpus", - weights: { - lexical: $lexical_weight, - embedding: $embedding_weight - } - }, - results: [], - metrics: { - total: 0, - mrr: 0, - p_at_1: 0, - p_at_3: 0, - latencies_ms: [], - by_difficulty: {}, - by_tag: {} - } - }' > "${REPORT_FILE}" - -# Arrays to collect metrics -declare -a ALL_RRS=() -declare -a ALL_P1=() -declare -a ALL_P3=() -declare -a ALL_HIT3=() -declare -a ALL_HIT5=() -declare -a ALL_MARGINS=() -declare -a ALL_LATENCIES=() - -run_corpus() { - local corpus_path="$1" - local corpus_name - corpus_name=$(basename "$corpus_path") - - local snapshot="${corpus_path}/snapshot.json" - local queries="${corpus_path}/queries.json" - - if [[ ! -f "$snapshot" ]] || [[ ! 
-f "$queries" ]]; then - if [[ -f "${corpus_path}/cases.json" ]] || [[ -f "${corpus_path}/scenarios.json" ]]; then - return - fi - echo " Skipping ${corpus_name}: missing files" - return - fi - - echo "" - echo "=== Corpus: ${corpus_name} ===" - - local count - count=$(jq length "$queries") - - for i in $(seq 0 $((count - 1))); do - local id query relevant_refs partial_refs difficulty tags - - id=$(jq -r ".[$i].id" "$queries") - query=$(jq -r ".[$i].query" "$queries") - relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries") - partial_refs=$(jq -c ".[$i].partially_relevant_refs // []" "$queries") - difficulty=$(jq -r ".[$i].difficulty // \"medium\"" "$queries") - tags=$(jq -c ".[$i].tags // []" "$queries") - - # Run query and measure time - local start_ns end_ns duration_ms result - start_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))') - - if ! result=$("${SEMANTIC}" find "${query}" \ - --snapshot "${snapshot}" \ - --strategy "${STRATEGY}" \ - --threshold 0.01 \ - --top-k "${TOP_K}" \ - --lexical-weight "${LEXICAL_WEIGHT}" \ - --embedding-weight "${EMBEDDING_WEIGHT}" \ - --format json 2>&1); then - echo " [${id}] ERROR: semantic find failed for query: ${query}" >&2 - echo "${result}" >&2 - exit 1 - fi - - if ! 
echo "$result" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then - echo " [${id}] ERROR: semantic find returned invalid JSON" >&2 - echo "${result}" >&2 - exit 1 - fi - - end_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))') - duration_ms=$(( (end_ns - start_ns) / 1000 )) - - # Extract results - local matches best_ref best_score - matches=$(echo "$result" | jq -c '[.matches[].ref]') - best_ref=$(echo "$result" | jq -r '.best_ref // ""') - best_score=$(echo "$result" | jq -r '.best_score // 0') - - # Calculate Reciprocal Rank - local rr=0 - for rank in $(seq 1 ${TOP_K}); do - local ref_at_rank - ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"") - if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - rr=$(echo "scale=4; 1 / ${rank}" | bc) - break - fi - done - - # Calculate P@1 - local p1=0 - if echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then - p1=1 - elif echo "$partial_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then - p1=0.5 - fi - - # Calculate P@3 (count relevant in top 3, partials count as 0.5) - local relevant_in_top3=0 - local partial_in_top3=0 - local hit_at_3=0 - local hit_at_5=0 - local best_relevant_rank="null" - for rank in 1 2 3 4 5; do - local ref_at_rank - ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"") - if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - if [[ "$best_relevant_rank" == "null" ]]; then - best_relevant_rank=$rank - fi - if [[ $rank -le 3 ]]; then - relevant_in_top3=$((relevant_in_top3 + 1)) - hit_at_3=1 - fi - hit_at_5=1 - elif [[ $rank -le 3 ]]; then - if echo "$partial_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - partial_in_top3=$((partial_in_top3 + 1)) - fi - fi - done - local p3 - p3=$(echo "scale=4; (${relevant_in_top3} + ${partial_in_top3} * 0.5) / 3" | bc) - - # Calculate best_relevant_score, best_wrong_score, and margin - 
local best_relevant_score=0 - local best_wrong_score=0 - local num_matches - num_matches=$(echo "$result" | jq '.matches | length') - for idx in $(seq 0 $((num_matches - 1))); do - local ref_at_idx score_at_idx - ref_at_idx=$(echo "$result" | jq -r ".matches[$idx].ref // \"\"") - score_at_idx=$(echo "$result" | jq -r ".matches[$idx].score // 0") - if echo "$relevant_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then - if (( $(echo "$score_at_idx > $best_relevant_score" | bc -l) )); then - best_relevant_score=$score_at_idx - fi - elif echo "$partial_refs" | jq -e "index(\"${ref_at_idx}\")" > /dev/null 2>&1; then - : # partials don't count as wrong - else - if (( $(echo "$score_at_idx > $best_wrong_score" | bc -l) )); then - best_wrong_score=$score_at_idx - fi - fi - done - local margin - margin=$(echo "scale=4; $best_relevant_score - $best_wrong_score" | bc) - - # Collect metrics - ALL_RRS+=("$rr") - ALL_P1+=("$p1") - ALL_P3+=("$p3") - ALL_HIT3+=("$hit_at_3") - ALL_HIT5+=("$hit_at_5") - ALL_MARGINS+=("$margin") - ALL_LATENCIES+=("$duration_ms") - - # Status indicator - local status="MISS" - if (( $(echo "$p1 >= 1" | bc -l) )); then - status="HIT " - elif (( $(echo "$p1 >= 0.5" | bc -l) )); then - status="PART" - fi - - printf " [%s] %s | RR=%.2f P@1=%.1f P@3=%.2f | %dms | %s\n" \ - "$id" "$status" "$rr" "$p1" "$p3" "$duration_ms" "$query" - - # Record to report - local result_json - result_json=$(jq -n \ - --arg id "$id" \ - --arg query "$query" \ - --arg corpus "$corpus_name" \ - --arg difficulty "$difficulty" \ - --argjson tags "$tags" \ - --arg best_ref "$best_ref" \ - --argjson best_score "$best_score" \ - --argjson matches "$matches" \ - --argjson relevant "$relevant_refs" \ - --argjson rr "$rr" \ - --argjson p1 "$p1" \ - --argjson p3 "$p3" \ - --argjson hit_at_3 "$hit_at_3" \ - --argjson hit_at_5 "$hit_at_5" \ - --argjson best_relevant_rank "$best_relevant_rank" \ - --argjson best_relevant_score "$best_relevant_score" \ - --argjson 
best_wrong_score "$best_wrong_score" \ - --argjson margin "$margin" \ - --argjson latency "$duration_ms" \ - '{ - id: $id, query: $query, corpus: $corpus, - difficulty: $difficulty, tags: $tags, - best_ref: $best_ref, best_score: $best_score, - matches: $matches, relevant_refs: $relevant, - rr: $rr, p_at_1: $p1, p_at_3: $p3, - hit_at_3: $hit_at_3, hit_at_5: $hit_at_5, - best_relevant_rank: $best_relevant_rank, - best_relevant_score: $best_relevant_score, - best_wrong_score: $best_wrong_score, - margin: $margin, - latency_ms: $latency - }') - - # Append to report - local tmp - tmp=$(mktemp) - jq --argjson r "$result_json" '.results += [$r]' "$REPORT_FILE" > "$tmp" - mv "$tmp" "$REPORT_FILE" - done -} - -# Run benchmarks -if [[ -n "${SPECIFIC_CORPUS}" ]]; then - run_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}" -else - for corpus in "${CORPUS_DIR}"/*/; do - [[ -d "$corpus" ]] || continue - run_corpus "$corpus" - done -fi - -# Calculate aggregate metrics -echo "" -echo "Calculating aggregate metrics..." 
- -TOTAL=${#ALL_RRS[@]} -if [[ $TOTAL -eq 0 ]]; then - echo "No results to aggregate" - exit 1 -fi - -# MRR -MRR=$(printf '%s\n' "${ALL_RRS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# P@1 -P1=$(printf '%s\n' "${ALL_P1[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# P@3 -P3=$(printf '%s\n' "${ALL_P3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Hit@3 -HIT3=$(printf '%s\n' "${ALL_HIT3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Hit@5 -HIT5=$(printf '%s\n' "${ALL_HIT5[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Average margin -AVG_MARGIN=$(printf '%s\n' "${ALL_MARGINS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Latency percentiles -SORTED_LAT=($(printf '%s\n' "${ALL_LATENCIES[@]}" | sort -n)) -P50_IDX=$(( TOTAL * 50 / 100 )) -P95_IDX=$(( TOTAL * 95 / 100 )) -P99_IDX=$(( TOTAL * 99 / 100 )) -LAT_P50=${SORTED_LAT[$P50_IDX]:-0} -LAT_P95=${SORTED_LAT[$P95_IDX]:-0} -LAT_P99=${SORTED_LAT[$P99_IDX]:-0} -LAT_AVG=$(printf '%s\n' "${ALL_LATENCIES[@]}" | awk '{s+=$1} END {printf "%.0f", s/NR}') - -# Update report with aggregates -tmp=$(mktemp) -jq \ - --argjson total "$TOTAL" \ - --argjson mrr "$MRR" \ - --argjson p1 "$P1" \ - --argjson p3 "$P3" \ - --argjson hit3 "$HIT3" \ - --argjson hit5 "$HIT5" \ - --argjson avg_margin "$AVG_MARGIN" \ - --argjson lat_avg "$LAT_AVG" \ - --argjson lat_p50 "$LAT_P50" \ - --argjson lat_p95 "$LAT_P95" \ - --argjson lat_p99 "$LAT_P99" \ - '.metrics = { - total: $total, - mrr: $mrr, - p_at_1: $p1, - p_at_3: $p3, - hit_at_3: $hit3, - hit_at_5: $hit5, - avg_margin: $avg_margin, - latency_avg_ms: $lat_avg, - latency_p50_ms: $lat_p50, - latency_p95_ms: $lat_p95, - latency_p99_ms: $lat_p99 - }' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Add by-difficulty breakdown -tmp=$(mktemp) -jq '.metrics.by_difficulty = ( - .results | group_by(.difficulty) | map({ - key: .[0].difficulty, - value: { - count: length, - mrr: ([.[].rr] | add / length), - p_at_1: ([.[].p_at_1] | add / length), - hit_at_3: 
([.[].hit_at_3] | add / length), - hit_at_5: ([.[].hit_at_5] | add / length), - avg_margin: ([.[].margin] | add / length) - } - }) | from_entries -)' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Add by-corpus breakdown -tmp=$(mktemp) -jq '.metrics.by_corpus = ( - .results | group_by(.corpus) | map({ - key: .[0].corpus, - value: { - count: length, - mrr: ([.[].rr] | add / length), - p_at_1: ([.[].p_at_1] | add / length), - hit_at_3: ([.[].hit_at_3] | add / length), - hit_at_5: ([.[].hit_at_5] | add / length), - avg_margin: ([.[].margin] | add / length) - } - }) | from_entries -)' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Add by-tag breakdown -tmp=$(mktemp) -jq '.metrics.by_tag = ( - [.results[] | {tags: .tags, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}] - | [.[] | .tags[] as $tag | {tag: $tag, rr: .rr, p_at_1: .p_at_1, hit_at_3: .hit_at_3, hit_at_5: .hit_at_5, margin: .margin}] - | group_by(.tag) - | map({ - key: .[0].tag, - value: { - count: length, - mrr: ([.[].rr] | add / length), - p_at_1: ([.[].p_at_1] | add / length), - hit_at_3: ([.[].hit_at_3] | add / length), - hit_at_5: ([.[].hit_at_5] | add / length), - avg_margin: ([.[].margin] | add / length) - } - }) - | from_entries -)' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Generate summary -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -cat > "${SUMMARY_FILE}" << EOF -# Semantic Matching Benchmark Results - -## Configuration - -| Field | Value | -|-------|-------| -| Timestamp | $(jq -r '.benchmark.timestamp' "$REPORT_FILE") | -| Strategy | ${STRATEGY} | -| Lexical Weight | ${LEXICAL_WEIGHT} | -| Embedding Weight | ${EMBEDDING_WEIGHT} | -| Top-K | ${TOP_K} | -| Total Queries | ${TOTAL} | - -## Ranking Metrics - -| Metric | Value | Description | -|--------|-------|-------------| -| **MRR** | **${MRR}** | Mean Reciprocal Rank | -| **P@1** | **${P1}** | Precision at rank 1 | -| **P@3** | **${P3}** | Precision at rank 3 | -| **Hit@3** | 
**${HIT3}** | Any relevant in top 3 | -| **Hit@5** | **${HIT5}** | Any relevant in top 5 | -| **Avg Margin** | **${AVG_MARGIN}** | best_relevant - best_wrong | - -## Latency - -| Percentile | Value | -|------------|-------| -| Average | ${LAT_AVG} ms | -| P50 | ${LAT_P50} ms | -| P95 | ${LAT_P95} ms | -| P99 | ${LAT_P99} ms | - -## By Difficulty - -| Difficulty | Count | MRR | P@1 | Hit@3 | Margin | -|------------|-------|-----|-----|-------|--------| -$(jq -r '.metrics.by_difficulty | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . * 100 | floor / 100) |"' "$REPORT_FILE") - -## By Corpus - -| Corpus | Count | MRR | P@1 | Hit@3 | Margin | -|--------|-------|-----|-----|-------|--------| -$(jq -r '.metrics.by_corpus | to_entries | .[] | "| \(.key) | \(.value.count) | \(.value.mrr | . * 100 | floor / 100) | \(.value.p_at_1 | . * 100 | floor / 100) | \(.value.hit_at_3 | . * 100 | floor / 100) | \(.value.avg_margin | . 
* 100 | floor / 100) |"' "$REPORT_FILE") - -## Misses (P@1 = 0) - -| ID | Query | Got | Expected | -|----|-------|-----|----------| -$(jq -r '.results[] | select(.p_at_1 == 0) | "| \(.id) | \(.query) | \(.best_ref) | \(.relevant_refs | join(",")) |"' "$REPORT_FILE") - -EOF - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo "================================================" -echo " CORPUS BENCHMARK RESULTS" -echo "================================================" -echo " Strategy: ${STRATEGY}" -echo " Weights: lexical=${LEXICAL_WEIGHT} embedding=${EMBEDDING_WEIGHT}" -echo " Queries: ${TOTAL}" -echo " MRR: ${MRR}" -echo " P@1: ${P1}" -echo " P@3: ${P3}" -echo " Hit@3: ${HIT3}" -echo " Hit@5: ${HIT5}" -echo " Avg Margin: ${AVG_MARGIN}" -echo " Latency P50: ${LAT_P50} ms" -echo " Latency P95: ${LAT_P95} ms" -echo "================================================" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh deleted file mode 100755 index eadaad7..0000000 --- a/tests/benchmark/scripts/run-full-benchmark.sh +++ /dev/null @@ -1,304 +0,0 @@ -#!/bin/bash -# -# Full semantic benchmark: Find + Recovery + Classification -# -# Produces a composite score for overall system health. -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." -CORPUS_DIR="${BENCHMARK_DIR}/corpus" -RESULTS_DIR="${BENCHMARK_DIR}/results" - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary with recovery support -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." 
&& go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/full_benchmark_${TIMESTAMP}.json" - -has_role_keyword() { - local query="$1" - echo "$query" | grep -Eiq '(^|[^[:alnum:]])(button|input|link|textbox|checkbox|radio|select|option|tab|menu|form|search)([^[:alnum:]]|$)' -} - -enrich_recovery_query() { - local query="$1" - local role="$2" - - if [[ -z "$query" || -z "$role" ]]; then - printf '%s' "$query" - return - fi - if has_role_keyword "$query"; then - printf '%s' "$query" - return - fi - printf '%s %s' "$query" "$role" -} - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - '{ - timestamp: $ts, - find: { total: 0, mrr: 0, p_at_1: 0, latency_p50: 0 }, - recovery: { total: 0, recovered: 0, rate: 0 }, - classification: { total: 0, correct: 0, accuracy: 0 }, - composite: { score: 0, grade: "" } - }' > "${REPORT_FILE}" - -echo "" -echo "==============================================" -echo " PHASE 1: FIND BENCHMARK" -echo "==============================================" - -# Run corpus benchmark and capture metrics -FIND_OUTPUT=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" 2>&1) -echo "$FIND_OUTPUT" - -# Extract metrics from the corpus report rather than the human-readable output. -FIND_REPORT=$(echo "$FIND_OUTPUT" | awk '/^Report:/ {print $2}' | tail -1) -if [[ -z "${FIND_REPORT}" ]] || [[ ! -f "${FIND_REPORT}" ]]; then - echo "error: could not locate corpus benchmark report" >&2 - exit 1 -fi -FIND_MRR=$(jq -r '.metrics.mrr' "$FIND_REPORT") -FIND_P1=$(jq -r '.metrics.p_at_1' "$FIND_REPORT") -FIND_TOTAL=$(jq -r '.metrics.total' "$FIND_REPORT") -FIND_LAT=$(jq -r '.metrics.latency_p50_ms' "$FIND_REPORT") - -# Rebuild semantic binary (corpus benchmark deletes it) -(cd "${BENCHMARK_DIR}/../.." 
&& go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -echo "" -echo "==============================================" -echo " PHASE 2: RECOVERY BENCHMARK" -echo "==============================================" - -SCENARIOS_FILE="${CORPUS_DIR}/recovery-scenarios/scenarios.json" -RECOVERY_TOTAL=0 -RECOVERY_SUCCESS=0 - -if [[ -f "$SCENARIOS_FILE" ]]; then - SCENARIO_COUNT=$(jq length "$SCENARIOS_FILE") - - for i in $(seq 0 $((SCENARIO_COUNT - 1))); do - ID=$(jq -r ".[$i].id" "$SCENARIOS_FILE") - NAME=$(jq -r ".[$i].name" "$SCENARIOS_FILE") - RAW_QUERY=$(jq -r ".[$i].original_query" "$SCENARIOS_FILE") - ORIGINAL_REF=$(jq -r ".[$i].original_ref // empty" "$SCENARIOS_FILE") - ORIGINAL_ROLE=$(jq -r ".[$i].before[]? | select(.ref == \"$ORIGINAL_REF\") | .role // empty" "$SCENARIOS_FILE") - QUERY=$(enrich_recovery_query "$RAW_QUERY" "$ORIGINAL_ROLE") - EXPECTED=$(jq -r ".[$i].expected_ref // empty" "$SCENARIOS_FILE") - EXPECTED_ALT=$(jq -r ".[$i].expected_alt // [] | join(\",\")" "$SCENARIOS_FILE") - EXPECT_NO_MATCH=$(jq -r ".[$i].expect_no_match // false" "$SCENARIOS_FILE") - - # Write after snapshot to temp file - AFTER_FILE=$(mktemp) - jq ".[$i].after" "$SCENARIOS_FILE" > "$AFTER_FILE" - - # Run semantic find on after snapshot with the same minimum score - # enforced by DefaultRecoveryConfig in the recovery engine. - if ! RESULT=$("${SEMANTIC}" find "$QUERY" --snapshot "$AFTER_FILE" --format json --threshold 0.52 2>&1); then - echo " [$ID] ERROR: semantic find failed during recovery benchmark" >&2 - echo "$RESULT" >&2 - rm -f "$AFTER_FILE" - exit 1 - fi - if ! 
echo "$RESULT" | jq -e '(.matches | type) == "array"' > /dev/null 2>&1; then - echo " [$ID] ERROR: semantic find returned invalid JSON during recovery benchmark" >&2 - echo "$RESULT" >&2 - rm -f "$AFTER_FILE" - exit 1 - fi - BEST_REF=$(echo "$RESULT" | jq -r '.best_ref // ""') - - rm -f "$AFTER_FILE" - - RECOVERY_TOTAL=$((RECOVERY_TOTAL + 1)) - STATUS="FAIL" - - if [[ "$EXPECT_NO_MATCH" == "true" ]]; then - if [[ -z "$BEST_REF" ]] || [[ "$BEST_REF" == "null" ]]; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - fi - elif [[ "$BEST_REF" == "$EXPECTED" ]]; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - elif [[ -n "$EXPECTED_ALT" ]] && echo ",$EXPECTED_ALT," | grep -q ",$BEST_REF,"; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - fi - - printf " [%s] %s | %s | got=%s want=%s\n" "$ID" "$STATUS" "$NAME" "$BEST_REF" "$EXPECTED" - done -fi - -RECOVERY_RATE=0 -if [[ $RECOVERY_TOTAL -gt 0 ]]; then - RECOVERY_RATE=$(echo "scale=4; $RECOVERY_SUCCESS / $RECOVERY_TOTAL" | bc) -fi - -echo "" -echo " Recovery: $RECOVERY_SUCCESS / $RECOVERY_TOTAL = $RECOVERY_RATE" - -echo "" -echo "==============================================" -echo " PHASE 3: CLASSIFICATION BENCHMARK" -echo "==============================================" - -CLASS_FILE="${CORPUS_DIR}/classification/cases.json" -CLASS_TOTAL=0 -CLASS_CORRECT=0 - -if [[ -f "$CLASS_FILE" ]]; then - CLASS_COUNT=$(jq length "$CLASS_FILE") - - for i in $(seq 0 $((CLASS_COUNT - 1))); do - ID=$(jq -r ".[$i].id" "$CLASS_FILE") - ERROR=$(jq -r ".[$i].error" "$CLASS_FILE") - EXPECTED=$(jq -r ".[$i].expected_type" "$CLASS_FILE") - - # Run semantic classify (extract just the type, first word) - if ! 
RESULT=$("${SEMANTIC}" classify "$ERROR" 2>&1); then - echo " [$ID] ERROR: semantic classify failed" >&2 - echo "$RESULT" >&2 - exit 1 - fi - GOT=$(echo "$RESULT" | awk '{print $1}') - - CLASS_TOTAL=$((CLASS_TOTAL + 1)) - STATUS="FAIL" - - if [[ "$GOT" == "$EXPECTED" ]]; then - STATUS="PASS" - CLASS_CORRECT=$((CLASS_CORRECT + 1)) - fi - - printf " [%s] %s | \"%s\" β†’ %s (want %s)\n" "$ID" "$STATUS" "${ERROR:0:40}" "$GOT" "$EXPECTED" - done -fi - -CLASS_ACCURACY=0 -if [[ $CLASS_TOTAL -gt 0 ]]; then - CLASS_ACCURACY=$(echo "scale=4; $CLASS_CORRECT / $CLASS_TOTAL" | bc) -fi - -echo "" -echo " Classification: $CLASS_CORRECT / $CLASS_TOTAL = $CLASS_ACCURACY" - -echo "" -echo "==============================================" -echo " COMPOSITE SCORE" -echo "==============================================" - -# Calculate composite score with weights: -# Find P@1: 40% -# Find MRR: 20% -# Recovery Rate: 25% -# Classification: 15% - -COMPOSITE=$(echo "scale=4; \ - ($FIND_P1 * 0.40) + \ - ($FIND_MRR * 0.20) + \ - ($RECOVERY_RATE * 0.25) + \ - ($CLASS_ACCURACY * 0.15)" | bc) -COMPOSITE=$(awk -v value="$COMPOSITE" 'BEGIN { printf "%.4f", value }') - -# Assign grade -GRADE="F" -if (( $(echo "$COMPOSITE >= 0.95" | bc -l) )); then GRADE="A+" -elif (( $(echo "$COMPOSITE >= 0.90" | bc -l) )); then GRADE="A" -elif (( $(echo "$COMPOSITE >= 0.85" | bc -l) )); then GRADE="B+" -elif (( $(echo "$COMPOSITE >= 0.80" | bc -l) )); then GRADE="B" -elif (( $(echo "$COMPOSITE >= 0.75" | bc -l) )); then GRADE="C+" -elif (( $(echo "$COMPOSITE >= 0.70" | bc -l) )); then GRADE="C" -elif (( $(echo "$COMPOSITE >= 0.60" | bc -l) )); then GRADE="D" -fi - -# Update report -TMP=$(mktemp) -jq \ - --argjson find_total "${FIND_TOTAL:-0}" \ - --argjson find_mrr "${FIND_MRR:-0}" \ - --argjson find_p1 "${FIND_P1:-0}" \ - --argjson find_lat "${FIND_LAT:-0}" \ - --argjson rec_total "$RECOVERY_TOTAL" \ - --argjson rec_success "$RECOVERY_SUCCESS" \ - --argjson rec_rate "$RECOVERY_RATE" \ - --argjson class_total 
"$CLASS_TOTAL" \ - --argjson class_correct "$CLASS_CORRECT" \ - --argjson class_acc "$CLASS_ACCURACY" \ - --argjson composite "$COMPOSITE" \ - --arg grade "$GRADE" \ - '.find = { total: $find_total, mrr: $find_mrr, p_at_1: $find_p1, latency_p50: $find_lat } | - .recovery = { total: $rec_total, recovered: $rec_success, rate: $rec_rate } | - .classification = { total: $class_total, correct: $class_correct, accuracy: $class_acc } | - .composite = { score: $composite, grade: $grade }' \ - "$REPORT_FILE" > "$TMP" -mv "$TMP" "$REPORT_FILE" - -# Generate summary -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" -cat > "$SUMMARY_FILE" << EOF -# Semantic Benchmark Report - -## Composite Score: ${COMPOSITE} (${GRADE}) - -| Component | Weight | Score | Weighted | -|-----------|--------|-------|----------| -| Find P@1 | 40% | ${FIND_P1:-0} | $(echo "scale=3; ${FIND_P1:-0} * 0.40" | bc) | -| Find MRR | 20% | ${FIND_MRR:-0} | $(echo "scale=3; ${FIND_MRR:-0} * 0.20" | bc) | -| Recovery | 25% | ${RECOVERY_RATE} | $(echo "scale=3; ${RECOVERY_RATE} * 0.25" | bc) | -| Classification | 15% | ${CLASS_ACCURACY} | $(echo "scale=3; ${CLASS_ACCURACY} * 0.15" | bc) | - -## Find Performance -- Queries: ${FIND_TOTAL:-0} -- MRR: ${FIND_MRR:-0} -- P@1: ${FIND_P1:-0} -- Latency P50: ${FIND_LAT:-0} ms - -## Recovery Performance -- Scenarios: ${RECOVERY_TOTAL} -- Recovered: ${RECOVERY_SUCCESS} -- Rate: ${RECOVERY_RATE} - -## Classification Performance -- Cases: ${CLASS_TOTAL} -- Correct: ${CLASS_CORRECT} -- Accuracy: ${CLASS_ACCURACY} - -## Grade Scale -| Grade | Score | -|-------|-------| -| A+ | >= 0.95 | -| A | >= 0.90 | -| B+ | >= 0.85 | -| B | >= 0.80 | -| C+ | >= 0.75 | -| C | >= 0.70 | -| D | >= 0.60 | -| F | < 0.60 | -EOF - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”" -echo " β”‚ COMPOSITE SCORE: ${COMPOSITE} GRADE: ${GRADE} β”‚" -echo " 
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€" -echo " β”‚ Find P@1: ${FIND_P1:-0} (40%) β”‚" -echo " β”‚ Find MRR: ${FIND_MRR:-0} (20%) β”‚" -echo " β”‚ Recovery: ${RECOVERY_RATE} (25%) β”‚" -echo " β”‚ Classification: ${CLASS_ACCURACY} (15%) β”‚" -echo " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/tune-weights.sh b/tests/benchmark/scripts/tune-weights.sh deleted file mode 100755 index ef61d88..0000000 --- a/tests/benchmark/scripts/tune-weights.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/bin/bash -# -# Grid-search combined matcher lexical/embedding weights against the corpus. -# -# Usage: -# ./tune-weights.sh [--corpus ] [--step ] [--output ] -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." 
-RESULTS_DIR="${BENCHMARK_DIR}/results" - -SPECIFIC_CORPUS="" -STEP="0.1" -while [[ $# -gt 0 ]]; do - case "$1" in - --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;; - --step) STEP="$2"; shift 2 ;; - --output) RESULTS_DIR="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -mkdir -p "${RESULTS_DIR}" - -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/tuning_weights_${TIMESTAMP}.json" -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg step "${STEP}" \ - '{ - benchmark: { - timestamp: $ts, - type: "weight-tuning", - strategy: "combined", - step: ($step | tonumber) - }, - results: [], - best: null - }' > "${REPORT_FILE}" - -weights=$(awk -v step="${STEP}" 'BEGIN { - if (step <= 0 || step > 1) { - exit 1 - } - for (w = 0; w <= 1.000001; w += step) { - printf "%.4f\n", w - } -}') - -if [[ -z "${weights}" ]]; then - echo "Invalid step: ${STEP}" >&2 - exit 1 -fi - -echo "Weight tuning: step=${STEP}" -echo "" -printf "%-10s %-10s %-8s %-8s %-8s %-8s %-8s\n" "lexical" "embedding" "MRR" "P@1" "P@3" "P50" "report" - -while IFS= read -r lexical_weight; do - embedding_weight=$(awk -v w="${lexical_weight}" 'BEGIN { printf "%.4f", 1 - w }') - - args=( - --strategy combined - --lexical-weight "${lexical_weight}" - --embedding-weight "${embedding_weight}" - ) - if [[ -n "${SPECIFIC_CORPUS}" ]]; then - args+=(--corpus "${SPECIFIC_CORPUS}") - fi - - if ! output=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" "${args[@]}" 2>&1); then - echo "$output" >&2 - exit 1 - fi - - corpus_report=$(echo "$output" | awk '/^Report:/ {print $2}' | tail -1) - if [[ -z "${corpus_report}" || ! 
-f "${corpus_report}" ]]; then - echo "Could not find corpus report for lexical=${lexical_weight}" >&2 - echo "$output" >&2 - exit 1 - fi - - mrr=$(jq -r '.metrics.mrr' "$corpus_report") - p1=$(jq -r '.metrics.p_at_1' "$corpus_report") - p3=$(jq -r '.metrics.p_at_3' "$corpus_report") - p50=$(jq -r '.metrics.latency_p50_ms' "$corpus_report") - total=$(jq -r '.metrics.total' "$corpus_report") - - printf "%-10s %-10s %-8s %-8s %-8s %-8s %s\n" \ - "${lexical_weight}" "${embedding_weight}" "${mrr}" "${p1}" "${p3}" "${p50}" "$(basename "$corpus_report")" - - result_json=$(jq -n \ - --argjson lexical_weight "${lexical_weight}" \ - --argjson embedding_weight "${embedding_weight}" \ - --argjson total "${total}" \ - --argjson mrr "${mrr}" \ - --argjson p1 "${p1}" \ - --argjson p3 "${p3}" \ - --argjson p50 "${p50}" \ - --arg report "${corpus_report}" \ - '{ - lexical_weight: $lexical_weight, - embedding_weight: $embedding_weight, - total: $total, - mrr: $mrr, - p_at_1: $p1, - p_at_3: $p3, - latency_p50_ms: $p50, - report: $report - }') - - tmp=$(mktemp) - jq --argjson result "${result_json}" '.results += [$result]' "${REPORT_FILE}" > "$tmp" - mv "$tmp" "${REPORT_FILE}" -done <<< "${weights}" - -tmp=$(mktemp) -jq ' - .best = ( - .results - | sort_by(.p_at_1, .mrr, .p_at_3, -(.latency_p50_ms)) - | last - ) -' "${REPORT_FILE}" > "$tmp" -mv "$tmp" "${REPORT_FILE}" - -cat > "${SUMMARY_FILE}" << EOF -# Combined Weight Tuning - -## Best - -| Field | Value | -|-------|-------| -| Lexical Weight | $(jq -r '.best.lexical_weight' "$REPORT_FILE") | -| Embedding Weight | $(jq -r '.best.embedding_weight' "$REPORT_FILE") | -| MRR | $(jq -r '.best.mrr' "$REPORT_FILE") | -| P@1 | $(jq -r '.best.p_at_1' "$REPORT_FILE") | -| P@3 | $(jq -r '.best.p_at_3' "$REPORT_FILE") | -| Latency P50 | $(jq -r '.best.latency_p50_ms' "$REPORT_FILE") ms | - -## All Runs - -| Lexical | Embedding | MRR | P@1 | P@3 | P50 | -|---------|-----------|-----|-----|-----|-----| -$(jq -r '.results | sort_by(-.p_at_1, 
-.mrr, -.p_at_3, .latency_p50_ms)[] | "| \(.lexical_weight) | \(.embedding_weight) | \(.mrr) | \(.p_at_1) | \(.p_at_3) | \(.latency_p50_ms) ms |"' "$REPORT_FILE") -EOF - -echo "" -echo "Best weights:" -jq '.best' "${REPORT_FILE}" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}"