diff --git a/.gitignore b/.gitignore index 09584bd..419dfaa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ -# Binary (root only, not cmd/semantic/) +# Binary /semantic +/semantic-bench +tests/benchmark/semantic +tests/e2e/semantic *.exe # Test @@ -19,4 +22,5 @@ cover.out .claude tests/e2e/results/*.txt tests/benchmark/results/*.json -tests/benchmark/results/*.md \ No newline at end of file +tests/benchmark/results/*.md +tests/benchmark/baselines/*.json \ No newline at end of file diff --git a/README.md b/README.md index dfaba84..83fb48e 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,7 @@ Implementations are internal — consumers use the `ElementMatcher` interface an ## Features - **Synonym expansion** — 54 UI synonym groups ("sign in" ↔ "log in", "cart" ↔ "basket", "preferences" ↔ "settings", etc.) +- **Visual position hints** — Understand layout cues like `top`, `bottom`, `left`, `right`, and `above`/`below` anchors - **Confidence calibration** — Scores mapped to high (≥ 0.8) / medium (≥ 0.6) / low labels - **Error classification** — Classify browser errors (CDP, chromedp) as recoverable or not - **Self-healing recovery** — Re-locate stale elements after DOM changes via callback interfaces @@ -184,6 +185,11 @@ semantic find "login" --snapshot page.json --format json # machine-readable semantic find "login" --snapshot page.json --format table # human-readable semantic find "login" --snapshot page.json --format refs # just refs +# Visual position hints +semantic find "button in top right corner" --snapshot page.json +semantic find "link below the search box" --snapshot page.json +semantic find "sidebar on the left" --snapshot page.json + # Score a specific element semantic match "login" e4 --snapshot page.json @@ -198,7 +204,7 @@ The library uses only the Go standard library. 
No external dependencies, no mode ## Design Trade-offs -See [docs/DESIGN.md](docs/DESIGN.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration. +See [docs/architecture/design-decisions.md](docs/architecture/design-decisions.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration. ## Origin diff --git a/cmd/semantic-bench/main.go b/cmd/semantic-bench/main.go new file mode 100644 index 0000000..076d71a --- /dev/null +++ b/cmd/semantic-bench/main.go @@ -0,0 +1,168 @@ +package main + +import ( + "fmt" + "os" + + "github.com/pinchtab/semantic/internal/benchmark" +) + +const usage = `semantic-bench - Benchmark runner for semantic matching + +Usage: + semantic-bench [flags] + +Commands: + check Run benchmark and compare against baseline (default) + run Run benchmark suites + compare Compare two reports + lint Validate dataset + catalog Print dataset inventory + baseline Manage quality baselines (create, update) + calibrate Find optimal thresholds via precision/recall analysis + tune Grid-search lexical/embedding weights + runtime Check Go benchmark performance against baseline + +Flags: + -h, --help Show help + +Run 'semantic-bench --help' for command-specific help. 
+` + +func main() { + if len(os.Args) < 2 { + runCheck(os.Args[1:]) + return + } + + cmd := os.Args[1] + args := os.Args[2:] + + switch cmd { + case "check": + runCheck(args) + case "run": + runRun(args) + case "compare": + runCompare(args) + case "lint": + runLint(args) + case "catalog": + runCatalog(args) + case "baseline": + runBaseline(args) + case "calibrate": + runCalibrate(args) + case "tune": + runTune(args) + case "runtime": + runRuntime(args) + case "-h", "--help", "help": + fmt.Print(usage) + default: + fmt.Fprintf(os.Stderr, "unknown command: %s\n\n%s", cmd, usage) + os.Exit(2) + } +} + +func runCheck(args []string) { + cfg := benchmark.ParseCheckFlags(args) + result, err := benchmark.RunCheck(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCheckResult(result, cfg) + if result.Status == "fail" { + os.Exit(1) + } +} + +func runRun(args []string) { + cfg := benchmark.ParseRunFlags(args) + result, err := benchmark.RunBenchmark(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintRunResult(result, cfg) +} + +func runCompare(args []string) { + cfg := benchmark.ParseCompareFlags(args) + result, err := benchmark.RunCompare(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCompareResult(result, cfg) + if result.Status == "fail" { + os.Exit(1) + } +} + +func runLint(args []string) { + cfg := benchmark.ParseLintFlags(args) + result, err := benchmark.RunLint(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintLintResult(result, cfg) + if result.Errors > 0 { + os.Exit(1) + } +} + +func runCatalog(args []string) { + cfg := benchmark.ParseCatalogFlags(args) + result, err := benchmark.RunCatalog(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCatalogResult(result, cfg) +} + +func runBaseline(args []string) { + cfg 
:= benchmark.ParseBaselineFlags(args) + result, err := benchmark.RunBaseline(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintBaselineResult(result, cfg) +} + +func runCalibrate(args []string) { + cfg := benchmark.ParseCalibrateFlags(args) + result, err := benchmark.RunCalibrate(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintCalibrateResult(result, cfg) +} + +func runTune(args []string) { + cfg := benchmark.ParseTuneFlags(args) + result, err := benchmark.RunTune(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintTuneResult(result, cfg) +} + +func runRuntime(args []string) { + cfg := benchmark.ParseRuntimeFlags(args) + result, err := benchmark.RunRuntime(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(2) + } + benchmark.PrintRuntimeResult(result, cfg) + if result.Status == "fail" && cfg.FailOnRegression { + os.Exit(1) + } +} diff --git a/cmd/semantic/main.go b/cmd/semantic/main.go index ae8bc89..b99815c 100644 --- a/cmd/semantic/main.go +++ b/cmd/semantic/main.go @@ -56,16 +56,24 @@ Flags (find/match): --threshold Minimum score (default: 0.3) --top-k Max results (default: 3) --strategy lexical, embedding, or combined (default: combined) + --lexical-weight Combined strategy lexical weight override + --embedding-weight Combined strategy embedding weight override --format json, table, or refs (default: table) `) } // snapshotElement is the JSON shape from pinchtab's /snapshot endpoint. 
type snapshotPositional struct { - Depth int `json:"depth"` - SiblingIndex int `json:"sibling_index"` - SiblingCount int `json:"sibling_count"` - LabelledBy string `json:"labelled_by"` + Depth int `json:"depth"` + SiblingIndex int `json:"sibling_index"` + SiblingCount int `json:"sibling_count"` + LabelledBy string `json:"labelled_by"` + X float64 `json:"x"` + Y float64 `json:"y"` + Top float64 `json:"top"` + Left float64 `json:"left"` + Width float64 `json:"width"` + Height float64 `json:"height"` } type snapshotElement struct { @@ -80,6 +88,12 @@ type snapshotElement struct { SiblingIdx int `json:"sibling_index"` SiblingCnt int `json:"sibling_count"` LabelledBy string `json:"labelled_by"` + X float64 `json:"x"` + Y float64 `json:"y"` + Top float64 `json:"top"` + Left float64 `json:"left"` + Width float64 `json:"width"` + Height float64 `json:"height"` Positional *snapshotPositional `json:"positional"` } @@ -112,6 +126,16 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) { depth := e.Depth siblingIdx := e.SiblingIdx siblingCnt := e.SiblingCnt + x := e.X + y := e.Y + if x == 0 && e.Left != 0 { + x = e.Left + } + if y == 0 && e.Top != 0 { + y = e.Top + } + width := e.Width + height := e.Height if e.Positional != nil { if e.Positional.Depth != 0 { depth = e.Positional.Depth @@ -125,6 +149,23 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) { if e.Positional.LabelledBy != "" { labelledBy = e.Positional.LabelledBy } + + hasHorizontal := e.Positional.X != 0 || e.Positional.Left != 0 || e.Positional.Width > 0 + hasVertical := e.Positional.Y != 0 || e.Positional.Top != 0 || e.Positional.Height > 0 + if hasHorizontal { + x = e.Positional.X + if x == 0 && e.Positional.Left != 0 { + x = e.Positional.Left + } + width = e.Positional.Width + } + if hasVertical { + y = e.Positional.Y + if y == 0 && e.Positional.Top != 0 { + y = e.Positional.Top + } + height = e.Positional.Height + } } descs[i] = semantic.ElementDescriptor{ @@ -140,6 
+181,10 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) { SiblingIndex: siblingIdx, SiblingCount: siblingCnt, LabelledBy: labelledBy, + X: x, + Y: y, + Width: width, + Height: height, }, } } @@ -166,6 +211,8 @@ func runFind(args []string) { threshold := fs.Float64("threshold", 0.3, "minimum score") topK := fs.Int("top-k", 3, "max results") strategy := fs.String("strategy", "combined", "matching strategy") + lexicalWeight := fs.Float64("lexical-weight", 0, "combined strategy lexical weight override") + embeddingWeight := fs.Float64("embedding-weight", 0, "combined strategy embedding weight override") format := fs.String("format", "table", "output format: json, table, refs") _ = fs.Parse(args) @@ -183,8 +230,10 @@ func runFind(args []string) { matcher := newMatcher(*strategy) result, err := matcher.Find(context.Background(), query, elements, semantic.FindOptions{ - Threshold: *threshold, - TopK: *topK, + Threshold: *threshold, + TopK: *topK, + LexicalWeight: *lexicalWeight, + EmbeddingWeight: *embeddingWeight, }) if err != nil { fmt.Fprintf(os.Stderr, "error: %v\n", err) diff --git a/cmd/semantic/main_test.go b/cmd/semantic/main_test.go index d423841..0a7d117 100644 --- a/cmd/semantic/main_test.go +++ b/cmd/semantic/main_test.go @@ -12,8 +12,8 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) { } json := `[ - {"ref":"e1","role":"button","name":"Submit","interactive":true,"parent":"Login form","section":"Authentication","depth":3,"sibling_index":1,"sibling_count":2,"labelled_by":"Primary Action"}, - {"ref":"e2","role":"text","name":"Submit","interactive":false,"parent":"Payment form","section":"Checkout","positional":{"depth":2,"sibling_index":0,"sibling_count":1,"labelled_by":"Secondary Action"}} + {"ref":"e1","role":"button","name":"Submit","interactive":true,"parent":"Login form","section":"Authentication","depth":3,"sibling_index":1,"sibling_count":2,"labelled_by":"Primary Action","x":20,"y":40,"width":120,"height":30}, + 
{"ref":"e2","role":"text","name":"Submit","interactive":false,"parent":"Payment form","section":"Checkout","positional":{"depth":2,"sibling_index":0,"sibling_count":1,"labelled_by":"Secondary Action","left":300,"top":640,"width":200,"height":44}} ]` if _, err := f.WriteString(json); err != nil { t.Fatalf("WriteString failed: %v", err) @@ -50,6 +50,12 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) { if descs[0].Positional.LabelledBy != "Primary Action" { t.Fatalf("expected first descriptor labelled_by=Primary Action, got %q", descs[0].Positional.LabelledBy) } + if descs[0].Positional.X != 20 || descs[0].Positional.Y != 40 { + t.Fatalf("expected first descriptor x/y=20/40, got %f/%f", descs[0].Positional.X, descs[0].Positional.Y) + } + if descs[0].Positional.Width != 120 || descs[0].Positional.Height != 30 { + t.Fatalf("expected first descriptor width/height=120/30, got %f/%f", descs[0].Positional.Width, descs[0].Positional.Height) + } if descs[1].Interactive { t.Fatalf("expected second descriptor interactive=false") } @@ -71,4 +77,10 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) { if descs[1].Positional.LabelledBy != "Secondary Action" { t.Fatalf("expected second descriptor labelled_by=Secondary Action, got %q", descs[1].Positional.LabelledBy) } + if descs[1].Positional.X != 300 || descs[1].Positional.Y != 640 { + t.Fatalf("expected second descriptor x/y=300/640, got %f/%f", descs[1].Positional.X, descs[1].Positional.Y) + } + if descs[1].Positional.Width != 200 || descs[1].Positional.Height != 44 { + t.Fatalf("expected second descriptor width/height=200/44, got %f/%f", descs[1].Positional.Width, descs[1].Positional.Height) + } } diff --git a/dev b/dev index ec35d82..11d53d9 100755 --- a/dev +++ b/dev @@ -11,17 +11,27 @@ ERROR=$'\033[38;2;230;57;70m' NC=$'\033[0m' commands=( + "pr:🚀:Pre-PR checks (check + e2e + bench)" "doctor:🩺:Setup dev environment" "test:🧪:Run unit tests" "test verbose:🧪:Run unit tests (verbose)" "test 
race:🧪:Run unit tests with race detector" "coverage:📊:Run tests with coverage report" "lint:🔍:Run golangci-lint" + "lint corpus:🔍:Lint benchmark corpus" + "lint docs:🔍:Check documentation links" "fmt:✨:Format code" "vet:🔬:Run go vet" "check:✅:Run all checks (fmt + vet + lint + test)" "build:📦:Build CLI binary" - "benchmark:🏋:Run benchmark study" + "bench:🏋:Run corpus benchmark" + "bench full:🏋:Run full benchmark suite" + "baseline:📏:Create quality baseline" + "baseline check:📏:Check against baseline" + "baseline update:📏:Update baseline (--accept)" + "calibrate:🎯:Calibrate threshold recommendations" + "runtime:⏱️:Check runtime baseline" + "tune:🎛️:Tune combined weights" "e2e:🐳:Run E2E tests (Docker)" ) @@ -36,6 +46,36 @@ show_help() { echo "" } +run_pr() { + echo " ${ACCENT}${BOLD}🚀 Pre-PR checks${NC}" + echo "" + + echo " ${MUTED}1/4 All checks (fmt + vet + lint + test)${NC}" + run_check + + echo "" + echo " ${MUTED}2/4 E2E tests${NC}" + if [[ -f tests/e2e/run.sh ]]; then + go build -o /tmp/semantic ./cmd/semantic + PATH="/tmp:$PATH" bash tests/e2e/run.sh + echo " ${SUCCESS}✓${NC} E2E passed" + else + echo " ${MUTED}Skipped (no e2e/run.sh)${NC}" + fi + + echo "" + echo " ${MUTED}3/4 Lint corpus${NC}" + run_lint_corpus + + echo "" + echo " ${MUTED}4/4 Corpus benchmark${NC}" + run_bench > /dev/null 2>&1 + echo " ${SUCCESS}✓${NC} Benchmark complete" + + echo "" + echo " ${SUCCESS}${BOLD}🚀 Ready for PR${NC}" +} + run_test() { echo " ${ACCENT}${BOLD}🧪 Running tests${NC}" go test ./... -count=1 @@ -88,9 +128,19 @@ run_check() { if [ -n "$unformatted" ]; then echo " ${ERROR}✗${NC} Unformatted files:" echo "$unformatted" - exit 1 + echo "" + printf " Fix formatting now? (Y/n) " + read -r answer + if [ "$answer" != "n" ] && [ "$answer" != "N" ]; then + gofmt -w . 
+ echo " ${SUCCESS}✓${NC} Format (fixed)" + else + echo " ${MUTED}Run: gofmt -w .${NC}" + exit 1 + fi + else + echo " ${SUCCESS}✓${NC} Format" fi - echo " ${SUCCESS}✓${NC} Format" echo " ${MUTED}2/4 Vet${NC}" go vet ./... @@ -114,9 +164,54 @@ run_build() { echo " ${SUCCESS}✓${NC} Built: ./semantic" } -run_benchmark() { - echo " ${ACCENT}${BOLD}⏱️ Running benchmark study${NC}" - go test -run TestBenchmarkStudy -v -count=1 +run_bench() { + echo " ${ACCENT}${BOLD}🏋 Running corpus benchmark${NC}" + go run ./cmd/semantic-bench check "$@" +} + +run_bench_full() { + echo " ${ACCENT}${BOLD}🏋 Running full benchmark suite${NC}" + go run ./cmd/semantic-bench run -suite=all "$@" +} + +run_lint_corpus() { + echo " ${ACCENT}${BOLD}🔍 Linting benchmark corpus${NC}" + go run ./cmd/semantic-bench lint "$@" +} + +run_lint_docs() { + echo " ${ACCENT}${BOLD}🔍 Checking documentation links${NC}" + bash scripts/check-docs-links.sh +} + +run_baseline() { + echo " ${ACCENT}${BOLD}📏 Creating quality baseline${NC}" + go run ./cmd/semantic-bench baseline create "$@" +} + +run_baseline_check() { + echo " ${ACCENT}${BOLD}📏 Checking against baseline${NC}" + go run ./cmd/semantic-bench check "$@" +} + +run_baseline_update() { + echo " ${ACCENT}${BOLD}📏 Updating baseline${NC}" + go run ./cmd/semantic-bench baseline update --accept "$@" +} + +run_calibrate() { + echo " ${ACCENT}${BOLD}🎯 Calibrating thresholds${NC}" + go run ./cmd/semantic-bench calibrate -verbose "$@" +} + +run_runtime() { + echo " ${ACCENT}${BOLD}⏱️ Checking runtime baseline${NC}" + go run ./cmd/semantic-bench runtime "$@" +} + +run_tune() { + echo " ${ACCENT}${BOLD}🎛️ Tuning combined weights${NC}" + go run ./cmd/semantic-bench tune -verbose "$@" } run_e2e() { @@ -129,6 +224,7 @@ run_e2e() { } case "${1:-help}" in + pr) run_pr ;; doctor) exec bash scripts/doctor.sh ;; test) case "${2:-}" in @@ -138,12 +234,33 @@ case "${1:-help}" in esac ;; coverage) run_coverage ;; - lint) run_lint ;; + lint) + case "${2:-}" in + corpus) 
run_lint_corpus ;; + docs) run_lint_docs ;; + *) run_lint ;; + esac + ;; fmt) run_fmt ;; vet) run_vet ;; check) run_check ;; build) run_build ;; - benchmark) run_benchmark ;; + bench|benchmark) + case "${2:-}" in + full) run_bench_full ;; + *) shift; run_bench "$@" ;; + esac + ;; + baseline) + case "${2:-}" in + check) shift 2; run_baseline_check "$@" ;; + update) shift 2; run_baseline_update "$@" ;; + *) shift; run_baseline "$@" ;; + esac + ;; + calibrate) shift; run_calibrate "$@" ;; + runtime) shift; run_runtime "$@" ;; + tune) shift; run_tune "$@" ;; e2e) run_e2e ;; help|*) show_help ;; esac diff --git a/docs/guides/contributing.md b/docs/guides/contributing.md index 81f9736..695a7d6 100644 --- a/docs/guides/contributing.md +++ b/docs/guides/contributing.md @@ -19,7 +19,7 @@ Doctor checks Go version, golangci-lint, dependencies, build, tests, and git hoo ./dev lint # golangci-lint ./dev check # all checks (fmt + vet + lint + test) ./dev build # build CLI binary -./dev benchmark # run benchmark study +./dev bench # run corpus benchmark suite ``` ## Project Structure diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 33055c6..1e7155d 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -16,6 +16,8 @@ semantic find [flags] | `--threshold` | 0.3 | Minimum score | | `--top-k` | 3 | Maximum results | | `--strategy` | combined | `combined`, `lexical`, or `embedding` | +| `--lexical-weight` | 0 | Combined strategy lexical weight override | +| `--embedding-weight` | 0 | Combined strategy embedding weight override | | `--format` | table | `table`, `json`, or `refs` | **Examples:** @@ -31,8 +33,16 @@ curl -s localhost:9999/snapshot | semantic find "search box" # Machine-readable semantic find "login" --snapshot page.json --format json +# Tune combined scoring +semantic find "login" --snapshot page.json --lexical-weight 0.7 --embedding-weight 0.3 + # Just refs (for piping) semantic find "submit" --snapshot page.json --format refs + +# Visual 
layout hints +semantic find "button in top right corner" --snapshot page.json +semantic find "link below the search box" --snapshot page.json +semantic find "sidebar on the left" --snapshot page.json ``` ### `semantic match` @@ -81,8 +91,34 @@ The CLI expects a JSON array of element descriptors: ```json [ - {"ref": "e0", "role": "button", "name": "Sign In"}, - {"ref": "e1", "role": "textbox", "name": "Email"}, - {"ref": "e2", "role": "link", "name": "Forgot Password"} + { + "ref": "e0", + "role": "button", + "name": "Sign In", + "interactive": true, + "parent": "Auth card", + "section": "Header", + "x": 920, + "y": 16, + "width": 96, + "height": 32 + }, + { + "ref": "e1", + "role": "textbox", + "name": "Email", + "positional": { + "depth": 3, + "sibling_index": 1, + "sibling_count": 2, + "labelled_by": "Email", + "left": 120, + "top": 240, + "width": 320, + "height": 36 + } + } ] ``` + +Top-level geometry (`x`, `y`, `top`, `left`, `width`, `height`) and nested `positional` fields are both supported. Supplying coordinates improves results for visual hints such as `top right`, `below`, and `left`. 
diff --git a/internal/benchmark/baseline.go b/internal/benchmark/baseline.go new file mode 100644 index 0000000..07cc418 --- /dev/null +++ b/internal/benchmark/baseline.go @@ -0,0 +1,110 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "time" +) + +type BaselineResult struct { + Action string `json:"action"` + Path string `json:"path"` + Metrics OverallMetrics `json:"metrics"` + Previous *OverallMetrics `json:"previous,omitempty"` +} + +func RunBaseline(cfg BaselineCmdConfig) (*BaselineResult, error) { + root := FindBenchmarkRoot() + baselinesDir := filepath.Join(root, "baselines") + if err := os.MkdirAll(baselinesDir, 0755); err != nil { + return nil, err + } + + baselinePath := filepath.Join(baselinesDir, cfg.Name+".json") + + switch cfg.Action { + case "create": + return createBaseline(root, baselinePath, cfg) + case "update": + if !cfg.Accept { + return nil, fmt.Errorf("use --accept to confirm baseline update") + } + return updateBaseline(root, baselinePath, cfg) + default: + return nil, fmt.Errorf("unknown baseline action: %s (use 'create' or 'update')", cfg.Action) + } +} + +func createBaseline(root, baselinePath string, cfg BaselineCmdConfig) (*BaselineResult, error) { + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: 0.6, + EmbeddingWeight: 0.4, + Mode: "library", + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run benchmark: %w", err) + } + + data, err := json.MarshalIndent(report, "", " ") + if err != nil { + return nil, err + } + if err := os.WriteFile(baselinePath, data, 0644); err != nil { + return nil, err + } + + return &BaselineResult{ + Action: "create", + Path: baselinePath, + Metrics: report.Metrics.Overall, + }, nil +} + +func updateBaseline(root, baselinePath string, cfg 
BaselineCmdConfig) (*BaselineResult, error) { + var previous *OverallMetrics + if data, err := os.ReadFile(baselinePath); err == nil { + var old Report + if json.Unmarshal(data, &old) == nil { + previous = &old.Metrics.Overall + } + backupPath := strings.TrimSuffix(baselinePath, ".json") + "_" + time.Now().Format("20060102_150405") + ".backup.json" + _ = os.WriteFile(backupPath, data, 0644) + } + + result, err := createBaseline(root, baselinePath, cfg) + if err != nil { + return nil, err + } + result.Action = "update" + result.Previous = previous + return result, nil +} + +func PrintBaselineResult(result *BaselineResult, cfg BaselineCmdConfig) { + fmt.Printf("\n Baseline %sd: %s\n\n", result.Action, result.Path) + fmt.Printf(" MRR: %.4f\n", result.Metrics.MRR) + fmt.Printf(" P@1: %.4f\n", result.Metrics.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Metrics.HitAt3) + + if result.Previous != nil { + fmt.Printf("\n Previous:\n") + fmt.Printf(" MRR: %.4f\n", result.Previous.MRR) + fmt.Printf(" P@1: %.4f\n", result.Previous.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Previous.HitAt3) + } + fmt.Println() +} diff --git a/internal/benchmark/calibrate.go b/internal/benchmark/calibrate.go new file mode 100644 index 0000000..48ec06e --- /dev/null +++ b/internal/benchmark/calibrate.go @@ -0,0 +1,173 @@ +package benchmark + +import ( + "context" + "fmt" + + "github.com/pinchtab/semantic" +) + +type CalibrateResult struct { + ByThreshold map[string]ThresholdMetrics `json:"by_threshold"` + Recommendations CalibrateRecommendations `json:"recommendations"` + TotalCases int `json:"total_cases"` +} + +type ThresholdMetrics struct { + TP int `json:"tp"` + FP int `json:"fp"` + FN int `json:"fn"` + TN int `json:"tn"` + Recall float64 `json:"recall"` + Precision float64 `json:"precision"` + FPR float64 `json:"false_positive_rate"` + F1 float64 `json:"f1"` +} + +type CalibrateRecommendations struct { + DefaultThreshold float64 `json:"default_threshold"` + RecoveryThreshold float64 
`json:"recovery_threshold"` + BestF1 float64 `json:"best_f1"` +} + +func RunCalibrate(cfg CalibrateConfig) (*CalibrateResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + result := &CalibrateResult{ + ByThreshold: make(map[string]ThresholdMetrics), + } + + type testCase struct { + query Query + corpus *Corpus + } + + var cases []testCase + for i := range ds.Corpora { + corpus := &ds.Corpora[i] + if cfg.Corpus != "" && corpus.ID != cfg.Corpus { + continue + } + for _, q := range corpus.Queries { + cases = append(cases, testCase{query: q, corpus: corpus}) + } + } + result.TotalCases = len(cases) + + if cfg.Verbose { + fmt.Printf("Testing %d thresholds against %d cases...\n\n", len(cfg.Thresholds), len(cases)) + } + + runCfg := RunConfig{ + Strategy: "combined", + TopK: 5, + LexicalWeight: 0.6, + EmbeddingWeight: 0.4, + } + matcher := createMatcher(runCfg) + + var bestF1, bestF1Threshold float64 + var bestRecallThreshold float64 + var bestRecallWithPrecision float64 + + for _, threshold := range cfg.Thresholds { + tp, fp, fn, tn := 0, 0, 0, 0 + + for _, tc := range cases { + findResult, _ := matcher.Find(context.Background(), tc.query.QueryText, tc.corpus.Snapshot, semantic.FindOptions{ + Threshold: threshold, + TopK: 5, + }) + + hasMatch := len(findResult.Matches) > 0 + topRef := "" + if hasMatch { + topRef = findResult.Matches[0].Ref + } + + switch { + case tc.query.ExpectNoMatch && hasMatch: + fp++ + case tc.query.ExpectNoMatch && !hasMatch: + tn++ + case len(tc.query.RelevantRefs) > 0 && !hasMatch: + fn++ + case len(tc.query.RelevantRefs) > 0 && contains(tc.query.RelevantRefs, topRef): + tp++ + case len(tc.query.RelevantRefs) > 0: + fp++ + } + } + + totalPos := tp + fn + totalNeg := tn + fp + + var recall, precision, fpr, f1 float64 + if totalPos > 0 { + recall = float64(tp) / float64(totalPos) + } + if tp+fp > 0 { + precision = float64(tp) / float64(tp+fp) + } + 
if totalNeg > 0 { + fpr = float64(fp) / float64(totalNeg) + } + if precision+recall > 0 { + f1 = 2 * precision * recall / (precision + recall) + } + + key := fmt.Sprintf("%.2f", threshold) + result.ByThreshold[key] = ThresholdMetrics{ + TP: tp, FP: fp, FN: fn, TN: tn, + Recall: recall, Precision: precision, FPR: fpr, F1: f1, + } + + if f1 > bestF1 { + bestF1 = f1 + bestF1Threshold = threshold + } + if recall >= 0.85 && precision > bestRecallWithPrecision { + bestRecallWithPrecision = precision + bestRecallThreshold = threshold + } + + if cfg.Verbose { + fmt.Printf(" threshold=%.2f | TP=%3d FP=%3d FN=%3d TN=%3d | recall=%.3f precision=%.3f F1=%.3f\n", + threshold, tp, fp, fn, tn, recall, precision, f1) + } + } + + if bestRecallThreshold == 0 && len(cfg.Thresholds) > 0 { + bestRecallThreshold = cfg.Thresholds[0] + } + + result.Recommendations = CalibrateRecommendations{ + DefaultThreshold: bestF1Threshold, + RecoveryThreshold: bestRecallThreshold, + BestF1: bestF1, + } + + return result, nil +} + +func contains(refs []string, ref string) bool { + for _, r := range refs { + if r == ref { + return true + } + } + return false +} + +func PrintCalibrateResult(result *CalibrateResult, cfg CalibrateConfig) { + fmt.Printf("\n Tested %d cases across %d thresholds\n\n", result.TotalCases, len(result.ByThreshold)) + + fmt.Printf(" Recommendations:\n") + fmt.Printf(" Default (best F1): %.2f (F1=%.3f)\n", result.Recommendations.DefaultThreshold, result.Recommendations.BestF1) + fmt.Printf(" Recovery (recall): %.2f\n", result.Recommendations.RecoveryThreshold) + fmt.Println() +} diff --git a/internal/benchmark/catalog.go b/internal/benchmark/catalog.go new file mode 100644 index 0000000..69a3091 --- /dev/null +++ b/internal/benchmark/catalog.go @@ -0,0 +1,86 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "sort" +) + +func RunCatalog(cfg CatalogConfig) (*CatalogResult, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return 
nil, err + } + + result := &CatalogResult{ + ByTag: make(map[string]int), + ByDifficulty: make(map[string]int), + } + + for _, c := range ds.Corpora { + tags := make(map[string]bool) + for _, q := range c.Queries { + result.TotalQueries++ + result.ByDifficulty[q.Difficulty]++ + for _, t := range q.Tags { + tags[t] = true + result.ByTag[t]++ + } + } + var tagList []string + for t := range tags { + tagList = append(tagList, t) + } + sort.Strings(tagList) + result.Corpora = append(result.Corpora, CorpusSummary{ + ID: c.ID, + Queries: len(c.Queries), + Tags: tagList, + }) + } + + return result, nil +} + +func PrintCatalogResult(result *CatalogResult, cfg CatalogConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n Corpora: %d\n", len(result.Corpora)) + fmt.Printf(" Total Queries: %d\n\n", result.TotalQueries) + + fmt.Printf(" %-30s %8s\n", "Corpus", "Queries") + fmt.Printf(" %-30s %8s\n", "------", "-------") + for _, c := range result.Corpora { + fmt.Printf(" %-30s %8d\n", c.ID, c.Queries) + } + + switch cfg.By { + case "difficulty": + fmt.Printf("\n By Difficulty:\n") + diffs := sortedKeys(result.ByDifficulty) + for _, d := range diffs { + fmt.Printf(" %-10s %4d\n", d, result.ByDifficulty[d]) + } + case "tag": + fmt.Printf("\n By Tag:\n") + tags := sortedKeys(result.ByTag) + for _, t := range tags { + fmt.Printf(" %-20s %4d\n", t, result.ByTag[t]) + } + } + fmt.Printf("\n") +} + +func sortedKeys(m map[string]int) []string { + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} diff --git a/internal/benchmark/check.go b/internal/benchmark/check.go new file mode 100644 index 0000000..88234f6 --- /dev/null +++ b/internal/benchmark/check.go @@ -0,0 +1,279 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +func RunCheck(cfg CheckConfig) 
(*CheckResult, error) { + root := FindBenchmarkRoot() + + ds, err := LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + benchCfg, err := LoadConfig(root) + if err != nil { + return nil, fmt.Errorf("load config: %w", err) + } + profile := ResolveProfile(benchCfg, cfg.Profile) + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: profile.Strategy, + Threshold: profile.Threshold, + TopK: profile.TopK, + LexicalWeight: profile.Weights.Lexical, + EmbeddingWeight: profile.Weights.Embedding, + Profile: cfg.Profile, + Mode: "library", + Verbose: cfg.Verbose, + Explain: cfg.Explain, + OutputDir: cfg.OutputDir, + Quick: cfg.Quick, + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run benchmark: %w", err) + } + + result := &CheckResult{ + Status: "pass", + Report: report, + } + result.Summary.PAt1 = report.Metrics.Overall.PAt1 + result.Summary.MRR = report.Metrics.Overall.MRR + result.Summary.HitAt3 = report.Metrics.Overall.HitAt3 + result.Summary.Total = report.Metrics.Overall.Total + + for _, r := range report.Results { + if r.Status == "miss" { + result.TopRegs = append(result.TopRegs, Regression{ + ID: r.ID, + Corpus: r.Corpus, + Query: r.Query, + Expected: r.Expected.RelevantRefs, + CurrentRef: r.Actual.BestRef, + Reason: "miss", + DebugCommand: fmt.Sprintf("semantic-bench run --query %s --verbose --explain", r.ID), + }) + } + } + result.Summary.Regressions = len(result.TopRegs) + + // Determine baseline path from config + baselinePath := cfg.BaselinePath + if baselinePath == "" { + baselinePath = filepath.Join(benchCfg.BaselinesDir(root), "combined.json") + } + + // Get quality thresholds from config + thresholds := benchCfg.QualityThresholds() + + if _, err := os.Stat(baselinePath); err == nil { + baseline, err := loadReport(baselinePath) + if err == nil { + result.Delta = &MetricsDelta{ + PAt1: report.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1, + MRR: 
report.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, + HitAt3: report.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, + } + if cfg.FailOnReg { + // Check overall thresholds + if result.Delta.PAt1 < -thresholds.MaxOverallPAt1Drop || + result.Delta.MRR < -thresholds.MaxOverallMRRDrop || + result.Delta.HitAt3 < -thresholds.MaxOverallHitAt3Drop { + result.Status = "fail" + } + // Check corpus-level thresholds + for corpus, current := range report.Metrics.ByCorpus { + if base, ok := baseline.Metrics.ByCorpus[corpus]; ok { + if current.PAt1-base.PAt1 < -thresholds.MaxCorpusPAt1Drop { + result.Status = "fail" + } + } + } + // Check difficulty-level thresholds + for diff, current := range report.Metrics.ByDifficulty { + if base, ok := baseline.Metrics.ByDifficulty[diff]; ok { + if current.PAt1-base.PAt1 < -thresholds.MaxDifficultyPAt1Drop { + result.Status = "fail" + } + } + } + // Check tag-level thresholds + for tag, current := range report.Metrics.ByTag { + if base, ok := baseline.Metrics.ByTag[tag]; ok { + if current.PAt1-base.PAt1 < -thresholds.MaxTagPAt1Drop { + result.Status = "fail" + } + } + } + } + } + } + + // Sort regressions for deterministic output + sort.Slice(result.TopRegs, func(i, j int) bool { + if result.TopRegs[i].Corpus != result.TopRegs[j].Corpus { + return result.TopRegs[i].Corpus < result.TopRegs[j].Corpus + } + return result.TopRegs[i].ID < result.TopRegs[j].ID + }) + + _ = os.MkdirAll(cfg.OutputDir, 0755) + ts := time.Now().Format("20060102_150405") + reportPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.json", ts)) + summaryPath := filepath.Join(cfg.OutputDir, fmt.Sprintf("bench_%s.md", ts)) + + reportJSON, _ := json.MarshalIndent(report, "", " ") + _ = os.WriteFile(reportPath, reportJSON, 0644) + + summaryMD := generateSummaryMD(report, result) + _ = os.WriteFile(summaryPath, []byte(summaryMD), 0644) + + result.Artifacts.ReportJSON = reportPath + result.Artifacts.SummaryMD = summaryPath + + return result, nil +} + 
+func RunBenchmark(cfg RunConfig) (*Report, error) { + root := FindBenchmarkRoot() + ds, err := LoadDataset(root) + if err != nil { + return nil, err + } + return RunCorpusBenchmark(ds, cfg) +} + +func loadReport(path string) (*Report, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var r Report + if err := json.Unmarshal(data, &r); err != nil { + return nil, err + } + return &r, nil +} + +func generateSummaryMD(report *Report, result *CheckResult) string { + var sb strings.Builder + + sb.WriteString("# Benchmark Summary\n\n") + fmt.Fprintf(&sb, "Generated: %s\n\n", report.Run.Timestamp) + + sb.WriteString("## Overall Metrics\n\n") + sb.WriteString("| Metric | Value |\n") + sb.WriteString("|--------|-------|\n") + fmt.Fprintf(&sb, "| Total | %d |\n", report.Metrics.Overall.Total) + fmt.Fprintf(&sb, "| MRR | %.4f |\n", report.Metrics.Overall.MRR) + fmt.Fprintf(&sb, "| P@1 | %.4f |\n", report.Metrics.Overall.PAt1) + fmt.Fprintf(&sb, "| Hit@3 | %.4f |\n", report.Metrics.Overall.HitAt3) + fmt.Fprintf(&sb, "| Avg Margin | %.4f |\n", report.Metrics.Overall.AvgMargin) + + if result.Delta != nil { + sb.WriteString("\n## Delta from Baseline\n\n") + sb.WriteString("| Metric | Delta |\n") + sb.WriteString("|--------|-------|\n") + fmt.Fprintf(&sb, "| P@1 | %+.4f |\n", result.Delta.PAt1) + fmt.Fprintf(&sb, "| MRR | %+.4f |\n", result.Delta.MRR) + fmt.Fprintf(&sb, "| Hit@3 | %+.4f |\n", result.Delta.HitAt3) + } + + if len(result.TopRegs) > 0 { + sb.WriteString("\n## Misses\n\n") + sb.WriteString("| ID | Corpus | Query | Got | Expected |\n") + sb.WriteString("|----|--------|-------|-----|----------|\n") + for i, r := range result.TopRegs { + if i >= 10 { + break + } + fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s |\n", + r.ID, r.Corpus, r.Query, r.CurrentRef, strings.Join(r.Expected, ",")) + } + if len(result.TopRegs) > 10 { + fmt.Fprintf(&sb, "\n*Showing 10 of %d misses.*\n", len(result.TopRegs)) + } + } + + return sb.String() +} + +func 
PrintCheckResult(result *CheckResult, cfg CheckConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n") + if result.Status == "pass" { + fmt.Printf(" \033[32m✓\033[0m Benchmark passed\n") + } else { + fmt.Printf(" \033[31m✗\033[0m Benchmark failed\n") + } + fmt.Printf("\n") + + fmt.Printf(" %-12s %8.4f\n", "MRR", result.Summary.MRR) + fmt.Printf(" %-12s %8.4f\n", "P@1", result.Summary.PAt1) + fmt.Printf(" %-12s %8.4f\n", "Hit@3", result.Summary.HitAt3) + fmt.Printf(" %-12s %8d\n", "Total", result.Summary.Total) + fmt.Printf(" %-12s %8d\n", "Misses", result.Summary.Regressions) + + if result.Delta != nil { + fmt.Printf("\n Delta from baseline:\n") + printDelta("P@1", result.Delta.PAt1) + printDelta("MRR", result.Delta.MRR) + printDelta("Hit@3", result.Delta.HitAt3) + } + + fmt.Printf("\n Artifacts:\n") + fmt.Printf(" Report: %s\n", result.Artifacts.ReportJSON) + fmt.Printf(" Summary: %s\n", result.Artifacts.SummaryMD) + fmt.Printf("\n") +} + +func printDelta(name string, delta float64) { + color := "\033[0m" + sign := "" + if delta > 0.001 { + color = "\033[32m" + sign = "+" + } else if delta < -0.001 { + color = "\033[31m" + } + fmt.Printf(" %s%-8s %s%.4f\033[0m\n", color, name, sign, delta) +} + +func PrintRunResult(report *Report, cfg RunConfig) { + fmt.Printf("\n") + fmt.Printf(" %-12s %8.4f\n", "MRR", report.Metrics.Overall.MRR) + fmt.Printf(" %-12s %8.4f\n", "P@1", report.Metrics.Overall.PAt1) + fmt.Printf(" %-12s %8.4f\n", "Hit@3", report.Metrics.Overall.HitAt3) + fmt.Printf(" %-12s %8d\n", "Total", report.Metrics.Overall.Total) + fmt.Printf("\n") + + if cfg.Verbose { + for _, r := range report.Results { + status := "\033[32mHIT \033[0m" + switch r.Status { + case "miss": + status = "\033[31mMISS\033[0m" + case "partial": + status = "\033[33mPART\033[0m" + } + fmt.Printf(" [%s] %s | %s | got=%s score=%.3f\n", + r.ID, status, r.Query, r.Actual.BestRef, 
r.Actual.BestScore) + } + } +} diff --git a/internal/benchmark/compare.go b/internal/benchmark/compare.go new file mode 100644 index 0000000..f0e6ccf --- /dev/null +++ b/internal/benchmark/compare.go @@ -0,0 +1,89 @@ +package benchmark + +import ( + "encoding/json" + "fmt" + "sort" +) + +func RunCompare(cfg CompareConfig) (*CompareResult, error) { + baseline, err := loadReport(cfg.BaselinePath) + if err != nil { + return nil, fmt.Errorf("load baseline: %w", err) + } + current, err := loadReport(cfg.CurrentPath) + if err != nil { + return nil, fmt.Errorf("load current: %w", err) + } + + result := &CompareResult{ + Status: "pass", + Delta: MetricsDelta{ + PAt1: current.Metrics.Overall.PAt1 - baseline.Metrics.Overall.PAt1, + MRR: current.Metrics.Overall.MRR - baseline.Metrics.Overall.MRR, + HitAt3: current.Metrics.Overall.HitAt3 - baseline.Metrics.Overall.HitAt3, + }, + } + + if result.Delta.PAt1 < -0.02 || result.Delta.MRR < -0.02 { + result.Status = "fail" + } + + baselineResults := make(map[string]QueryResult) + for _, r := range baseline.Results { + baselineResults[r.ID] = r + } + for _, r := range current.Results { + if base, ok := baselineResults[r.ID]; ok { + if base.Status == "hit" && r.Status != "hit" { + result.Regressions = append(result.Regressions, Regression{ + ID: r.ID, + Corpus: r.Corpus, + Query: r.Query, + BaselineRef: base.Actual.BestRef, + CurrentRef: r.Actual.BestRef, + Reason: fmt.Sprintf("%s -> %s", base.Status, r.Status), + }) + } + } + } + + return result, nil +} + +func PrintCompareResult(result *CompareResult, cfg CompareConfig) { + if cfg.Format == "json" { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + return + } + + fmt.Printf("\n") + if result.Status == "pass" { + fmt.Printf(" \033[32m✓\033[0m No regression\n") + } else { + fmt.Printf(" \033[31m✗\033[0m Regression detected\n") + } + fmt.Printf("\n") + printDelta("P@1", result.Delta.PAt1) + printDelta("MRR", result.Delta.MRR) + printDelta("Hit@3", 
result.Delta.HitAt3) + + if len(result.Regressions) > 0 { + fmt.Printf("\n Regressions:\n") + sortRegressions(result.Regressions) + for _, r := range result.Regressions { + fmt.Printf(" %s: %s (%s)\n", r.ID, r.Reason, r.Query) + } + } + fmt.Printf("\n") +} + +func sortRegressions(regs []Regression) { + sort.Slice(regs, func(i, j int) bool { + if regs[i].Corpus != regs[j].Corpus { + return regs[i].Corpus < regs[j].Corpus + } + return regs[i].ID < regs[j].ID + }) +} diff --git a/internal/benchmark/config.go b/internal/benchmark/config.go new file mode 100644 index 0000000..2d233e2 --- /dev/null +++ b/internal/benchmark/config.go @@ -0,0 +1,534 @@ +package benchmark + +import ( + "encoding/json" + "errors" + "flag" + "fmt" + "os" + "path/filepath" +) + +type Config struct { + Version string `json:"version"` + Defaults DefaultsConfig `json:"defaults"` + Profiles map[string]Profile `json:"profiles"` + Baseline BaselineConfig `json:"baseline"` + Results ResultsConfig `json:"results"` + Strategies []string `json:"strategies"` + SnapshotsDir string `json:"snapshots_dir"` +} + +type DefaultsConfig struct { + Profile string `json:"profile"` + Strategy string `json:"strategy"` + Threshold float64 `json:"threshold"` + TopK int `json:"top_k"` + Weights Weights `json:"weights"` +} + +type ResultsConfig struct { + Dir string `json:"dir"` + BaselinesDir string `json:"baselines_dir"` + GeneratedFilesPolicy string `json:"generated_files_policy"` +} + +type Profile struct { + Strategy string `json:"strategy"` + Threshold float64 `json:"threshold"` + TopK int `json:"top_k"` + Weights Weights `json:"weights"` + Suites []string `json:"suites"` + Mode string `json:"mode"` + Inherits string `json:"inherits"` + Verbose bool `json:"verbose"` + Explain bool `json:"explain"` + FailOnReg bool `json:"fail_on_regression"` +} + +type Weights struct { + Lexical float64 `json:"lexical"` + Embedding float64 `json:"embedding"` +} + +type BaselineConfig struct { + Quality BaselineQuality 
`json:"quality"` + Runtime BaselineRuntime `json:"runtime"` +} + +type BaselineQuality struct { + MaxOverallPAt1Drop float64 `json:"max_overall_p_at_1_drop"` + MaxOverallMRRDrop float64 `json:"max_overall_mrr_drop"` + MaxOverallHitAt3Drop float64 `json:"max_overall_hit_at_3_drop"` + MaxCorpusPAt1Drop float64 `json:"max_corpus_p_at_1_drop"` + MaxDifficultyPAt1Drop float64 `json:"max_difficulty_p_at_1_drop"` + MaxTagPAt1Drop float64 `json:"max_tag_p_at_1_drop"` + MaxMarginDropReport float64 `json:"max_margin_drop_report"` +} + +type BaselineRuntime struct { + MaxNsOpRegressionRatio float64 `json:"max_ns_op_regression_ratio"` + MaxAllocRegressionRatio float64 `json:"max_alloc_regression_ratio"` + MaxCorpusLatencyP50MS int `json:"max_corpus_latency_p50_ms"` + MaxCorpusLatencyP95MS int `json:"max_corpus_latency_p95_ms"` +} + +type CheckConfig struct { + Profile string + BaselinePath string + OutputDir string + Format string + FailOnReg bool + Quick bool + Verbose bool + Explain bool +} + +type RunConfig struct { + Suite string + Corpus string + QueryID string + Strategy string + Threshold float64 + TopK int + LexicalWeight float64 + EmbeddingWeight float64 + Profile string + Mode string + Verbose bool + Explain bool + OutputDir string + ReportName string + Quick bool +} + +type CompareConfig struct { + BaselinePath string + CurrentPath string + Format string + Verbose bool +} + +type LintConfig struct { + Format string + Verbose bool +} + +type CatalogConfig struct { + Format string + By string +} + +type BaselineCmdConfig struct { + Action string // "create" or "update" + Name string + Accept bool + Verbose bool +} + +type CalibrateConfig struct { + Corpus string + Thresholds []float64 + Verbose bool +} + +type TuneConfig struct { + Corpus string + Step float64 + Verbose bool +} + +type RuntimeConfig struct { + FailOnRegression bool + Verbose bool +} + +func FindBenchmarkRoot() string { + cwd, _ := os.Getwd() + for d := cwd; d != "/"; d = filepath.Dir(d) { + if _, err 
:= os.Stat(filepath.Join(d, "tests/benchmark/config/benchmark.json")); err == nil { + return filepath.Join(d, "tests/benchmark") + } + if _, err := os.Stat(filepath.Join(d, "go.mod")); err == nil { + return filepath.Join(d, "tests/benchmark") + } + } + return filepath.Join(cwd, "tests/benchmark") +} + +func LoadConfig(benchmarkRoot string) (*Config, error) { + path := filepath.Join(benchmarkRoot, "config/benchmark.json") + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var cfg Config + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, err + } + if err := ValidateConfig(&cfg); err != nil { + return nil, fmt.Errorf("invalid config: %w", err) + } + return &cfg, nil +} + +func ResolveProfile(cfg *Config, name string) Profile { + p, ok := cfg.Profiles[name] + if !ok { + // Use defaults from config, falling back to hardcoded values + strategy := cfg.Defaults.Strategy + if strategy == "" { + strategy = "combined" + } + threshold := cfg.Defaults.Threshold + if threshold == 0 { + threshold = 0.01 + } + topK := cfg.Defaults.TopK + if topK == 0 { + topK = 5 + } + weights := cfg.Defaults.Weights + if weights.Lexical == 0 && weights.Embedding == 0 { + weights = Weights{Lexical: 0.6, Embedding: 0.4} + } + return Profile{ + Strategy: strategy, + Threshold: threshold, + TopK: topK, + Weights: weights, + Suites: []string{"corpus"}, + Mode: "library", + } + } + if p.Inherits != "" { + base := ResolveProfile(cfg, p.Inherits) + if p.Strategy == "" { + p.Strategy = base.Strategy + } + if p.Threshold == 0 { + p.Threshold = base.Threshold + } + if p.TopK == 0 { + p.TopK = base.TopK + } + if p.Weights.Lexical == 0 && p.Weights.Embedding == 0 { + p.Weights = base.Weights + } + if len(p.Suites) == 0 { + p.Suites = base.Suites + } + if p.Mode == "" { + p.Mode = base.Mode + } + } + return p +} + +// projectRoot returns the project root (parent of tests/benchmark). 
+func projectRoot(benchmarkRoot string) string { + return filepath.Dir(filepath.Dir(benchmarkRoot)) +} + +// ResultsDir returns the configured results directory. +func (c *Config) ResultsDir(benchmarkRoot string) string { + if c.Results.Dir != "" { + if filepath.IsAbs(c.Results.Dir) { + return c.Results.Dir + } + return filepath.Join(projectRoot(benchmarkRoot), c.Results.Dir) + } + return filepath.Join(benchmarkRoot, "results") +} + +// BaselinesDir returns the configured baselines directory. +func (c *Config) BaselinesDir(benchmarkRoot string) string { + if c.Results.BaselinesDir != "" { + if filepath.IsAbs(c.Results.BaselinesDir) { + return c.Results.BaselinesDir + } + return filepath.Join(projectRoot(benchmarkRoot), c.Results.BaselinesDir) + } + return filepath.Join(benchmarkRoot, "baselines") +} + +// QualityThresholds returns quality thresholds with fallback defaults. +func (c *Config) QualityThresholds() BaselineQuality { + q := c.Baseline.Quality + if q.MaxOverallPAt1Drop == 0 { + q.MaxOverallPAt1Drop = 0.02 + } + if q.MaxOverallMRRDrop == 0 { + q.MaxOverallMRRDrop = 0.02 + } + if q.MaxOverallHitAt3Drop == 0 { + q.MaxOverallHitAt3Drop = 0.02 + } + if q.MaxCorpusPAt1Drop == 0 { + q.MaxCorpusPAt1Drop = 0.08 + } + if q.MaxDifficultyPAt1Drop == 0 { + q.MaxDifficultyPAt1Drop = 0.08 + } + if q.MaxTagPAt1Drop == 0 { + q.MaxTagPAt1Drop = 0.08 + } + if q.MaxMarginDropReport == 0 { + q.MaxMarginDropReport = 0.15 + } + return q +} + +// RuntimeThresholds returns runtime thresholds with fallback defaults. +func (c *Config) RuntimeThresholds() BaselineRuntime { + r := c.Baseline.Runtime + if r.MaxNsOpRegressionRatio == 0 { + r.MaxNsOpRegressionRatio = 1.25 + } + if r.MaxAllocRegressionRatio == 0 { + r.MaxAllocRegressionRatio = 1.25 + } + return r +} + +// ValidateConfig checks the config for errors and returns a descriptive error if invalid. 
+func ValidateConfig(cfg *Config) error { + var errs []error + + // Validate strategies + if len(cfg.Strategies) == 0 { + errs = append(errs, errors.New("strategies list is empty")) + } else { + validStrategies := make(map[string]bool) + for _, s := range cfg.Strategies { + validStrategies[s] = true + } + // Check default strategy is in list + if cfg.Defaults.Strategy != "" && !validStrategies[cfg.Defaults.Strategy] { + errs = append(errs, fmt.Errorf("default strategy %q not in strategies list", cfg.Defaults.Strategy)) + } + // Check profile strategies + for name, p := range cfg.Profiles { + if p.Strategy != "" && !validStrategies[p.Strategy] { + errs = append(errs, fmt.Errorf("profile %q uses strategy %q not in strategies list", name, p.Strategy)) + } + } + } + + // Validate weights + if cfg.Defaults.Weights.Lexical < 0 { + errs = append(errs, errors.New("defaults.weights.lexical must be non-negative")) + } + if cfg.Defaults.Weights.Embedding < 0 { + errs = append(errs, errors.New("defaults.weights.embedding must be non-negative")) + } + if cfg.Defaults.Weights.Lexical == 0 && cfg.Defaults.Weights.Embedding == 0 { + errs = append(errs, errors.New("defaults.weights: lexical and embedding cannot both be zero")) + } + + // Validate profile weights + for name, p := range cfg.Profiles { + if p.Weights.Lexical < 0 { + errs = append(errs, fmt.Errorf("profile %q: weights.lexical must be non-negative", name)) + } + if p.Weights.Embedding < 0 { + errs = append(errs, fmt.Errorf("profile %q: weights.embedding must be non-negative", name)) + } + } + + // Validate quality thresholds (should be positive when set) + q := cfg.Baseline.Quality + if q.MaxOverallPAt1Drop < 0 { + errs = append(errs, errors.New("baseline.quality.max_overall_p_at_1_drop must be non-negative")) + } + if q.MaxOverallMRRDrop < 0 { + errs = append(errs, errors.New("baseline.quality.max_overall_mrr_drop must be non-negative")) + } + if q.MaxOverallHitAt3Drop < 0 { + errs = append(errs, 
errors.New("baseline.quality.max_overall_hit_at_3_drop must be non-negative")) + } + + // Validate runtime thresholds (must be >= 1) + r := cfg.Baseline.Runtime + if r.MaxNsOpRegressionRatio != 0 && r.MaxNsOpRegressionRatio < 1 { + errs = append(errs, errors.New("baseline.runtime.max_ns_op_regression_ratio must be >= 1")) + } + if r.MaxAllocRegressionRatio != 0 && r.MaxAllocRegressionRatio < 1 { + errs = append(errs, errors.New("baseline.runtime.max_alloc_regression_ratio must be >= 1")) + } + + // Validate profile inheritance + if err := validateProfileInheritance(cfg); err != nil { + errs = append(errs, err) + } + + if len(errs) == 0 { + return nil + } + if len(errs) == 1 { + return errs[0] + } + return fmt.Errorf("config has %d errors: %v", len(errs), errs) +} + +// validateProfileInheritance checks for missing references and cycles. +func validateProfileInheritance(cfg *Config) error { + for name, p := range cfg.Profiles { + if p.Inherits == "" { + continue + } + // Check reference exists + if _, ok := cfg.Profiles[p.Inherits]; !ok { + return fmt.Errorf("profile %q inherits from non-existent profile %q", name, p.Inherits) + } + // Check for cycles + visited := map[string]bool{name: true} + current := p.Inherits + for current != "" { + if visited[current] { + return fmt.Errorf("profile inheritance cycle detected: %q -> %q", name, current) + } + visited[current] = true + if parent, ok := cfg.Profiles[current]; ok { + current = parent.Inherits + } else { + break + } + } + } + return nil +} + +func ParseCheckFlags(args []string) CheckConfig { + fs := flag.NewFlagSet("check", flag.ExitOnError) + cfg := CheckConfig{ + Profile: "default", + OutputDir: filepath.Join(FindBenchmarkRoot(), "results"), + Format: "text", + } + fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile") + fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline file path") + fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory") + fs.StringVar(&cfg.Format, 
"format", cfg.Format, "output format (text|json|github)") + fs.BoolVar(&cfg.FailOnReg, "fail-on-regression", false, "exit 1 on regression") + fs.BoolVar(&cfg.Quick, "quick", false, "smoke mode: 3 queries per corpus (not representative)") + fs.BoolVar(&cfg.Verbose, "verbose", false, "print per-corpus details") + fs.BoolVar(&cfg.Explain, "explain", false, "include matcher explanations") + _ = fs.Parse(args) + return cfg +} + +func ParseRunFlags(args []string) RunConfig { + fs := flag.NewFlagSet("run", flag.ExitOnError) + cfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: 0.6, + EmbeddingWeight: 0.4, + Profile: "default", + Mode: "library", + OutputDir: filepath.Join(FindBenchmarkRoot(), "results"), + } + fs.StringVar(&cfg.Suite, "suite", cfg.Suite, "suite to run (corpus|recovery|classification|runtime|all)") + fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to run") + fs.StringVar(&cfg.QueryID, "query", "", "specific query ID to run") + fs.StringVar(&cfg.Strategy, "strategy", cfg.Strategy, "matching strategy") + fs.Float64Var(&cfg.Threshold, "threshold", cfg.Threshold, "score threshold") + fs.IntVar(&cfg.TopK, "top-k", cfg.TopK, "number of results") + fs.Float64Var(&cfg.LexicalWeight, "lexical-weight", cfg.LexicalWeight, "lexical weight") + fs.Float64Var(&cfg.EmbeddingWeight, "embedding-weight", cfg.EmbeddingWeight, "embedding weight") + fs.StringVar(&cfg.Profile, "profile", cfg.Profile, "benchmark profile") + fs.StringVar(&cfg.Mode, "mode", cfg.Mode, "execution mode (cli|library|both)") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + fs.BoolVar(&cfg.Explain, "explain", false, "include explanations") + fs.StringVar(&cfg.OutputDir, "out", cfg.OutputDir, "output directory") + fs.StringVar(&cfg.ReportName, "report-name", "", "custom report name") + _ = fs.Parse(args) + return cfg +} + +func ParseCompareFlags(args []string) CompareConfig { + fs := flag.NewFlagSet("compare", 
flag.ExitOnError) + cfg := CompareConfig{ + Format: "text", + } + fs.StringVar(&cfg.BaselinePath, "baseline", "", "baseline report path (required)") + fs.StringVar(&cfg.CurrentPath, "current", "", "current report path (required)") + fs.StringVar(&cfg.Format, "format", cfg.Format, "output format") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} + +func ParseLintFlags(args []string) LintConfig { + fs := flag.NewFlagSet("lint", flag.ExitOnError) + cfg := LintConfig{ + Format: "text", + } + fs.StringVar(&cfg.Format, "format", cfg.Format, "output format") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} + +func ParseCatalogFlags(args []string) CatalogConfig { + fs := flag.NewFlagSet("catalog", flag.ExitOnError) + cfg := CatalogConfig{ + Format: "table", + } + fs.StringVar(&cfg.Format, "format", cfg.Format, "output format (table|json)") + fs.StringVar(&cfg.By, "by", "", "group by (tag|difficulty|intent)") + _ = fs.Parse(args) + return cfg +} + +func ParseBaselineFlags(args []string) BaselineCmdConfig { + fs := flag.NewFlagSet("baseline", flag.ExitOnError) + cfg := BaselineCmdConfig{ + Action: "create", + Name: "combined", + } + fs.StringVar(&cfg.Name, "name", cfg.Name, "baseline name") + fs.BoolVar(&cfg.Accept, "accept", false, "accept changes (for update)") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + + if len(fs.Args()) > 0 { + cfg.Action = fs.Args()[0] + } + return cfg +} + +func ParseCalibrateFlags(args []string) CalibrateConfig { + fs := flag.NewFlagSet("calibrate", flag.ExitOnError) + cfg := CalibrateConfig{ + Thresholds: []float64{0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60}, + } + fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to test") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} + +func ParseTuneFlags(args []string) TuneConfig { 
+ fs := flag.NewFlagSet("tune", flag.ExitOnError) + cfg := TuneConfig{ + Step: 0.1, + } + fs.StringVar(&cfg.Corpus, "corpus", "", "specific corpus to tune against") + fs.Float64Var(&cfg.Step, "step", cfg.Step, "weight step size (0.05, 0.1, 0.2)") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} + +func ParseRuntimeFlags(args []string) RuntimeConfig { + fs := flag.NewFlagSet("runtime", flag.ExitOnError) + cfg := RuntimeConfig{} + fs.BoolVar(&cfg.FailOnRegression, "fail-on-regression", false, "exit 1 on regression") + fs.BoolVar(&cfg.Verbose, "verbose", false, "verbose output") + _ = fs.Parse(args) + return cfg +} diff --git a/internal/benchmark/config_test.go b/internal/benchmark/config_test.go new file mode 100644 index 0000000..2590556 --- /dev/null +++ b/internal/benchmark/config_test.go @@ -0,0 +1,147 @@ +package benchmark + +import "testing" + +func TestValidateConfig_Valid(t *testing.T) { + cfg := &Config{ + Strategies: []string{"lexical", "embedding", "combined"}, + Defaults: DefaultsConfig{ + Strategy: "combined", + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Quality: BaselineQuality{ + MaxOverallPAt1Drop: 0.02, + }, + Runtime: BaselineRuntime{ + MaxNsOpRegressionRatio: 1.25, + }, + }, + } + if err := ValidateConfig(cfg); err != nil { + t.Errorf("expected valid config, got error: %v", err) + } +} + +func TestValidateConfig_EmptyStrategies(t *testing.T) { + cfg := &Config{ + Strategies: []string{}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for empty strategies") + } +} + +func TestValidateConfig_InvalidDefaultStrategy(t *testing.T) { + cfg := &Config{ + Strategies: []string{"lexical", "embedding"}, + Defaults: DefaultsConfig{ + Strategy: "combined", + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + 
t.Error("expected error for invalid default strategy") + } +} + +func TestValidateConfig_NegativeWeights(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: -0.5, Embedding: 0.4}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for negative weight") + } +} + +func TestValidateConfig_BothWeightsZero(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0, Embedding: 0}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error when both weights are zero") + } +} + +func TestValidateConfig_RuntimeRatioTooLow(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Runtime: BaselineRuntime{ + MaxNsOpRegressionRatio: 0.5, + }, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for runtime ratio < 1") + } +} + +func TestValidateConfig_ProfileInheritsMissing(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Profiles: map[string]Profile{ + "fast": {Inherits: "nonexistent"}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for missing inherited profile") + } +} + +func TestValidateConfig_ProfileInheritanceCycle(t *testing.T) { + cfg := &Config{ + Strategies: []string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Profiles: map[string]Profile{ + "a": {Inherits: "b"}, + "b": {Inherits: "c"}, + "c": {Inherits: "a"}, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for inheritance cycle") + } +} + +func TestValidateConfig_NegativeQualityThreshold(t *testing.T) { + cfg := &Config{ + Strategies: 
[]string{"combined"}, + Defaults: DefaultsConfig{ + Weights: Weights{Lexical: 0.6, Embedding: 0.4}, + }, + Baseline: BaselineConfig{ + Quality: BaselineQuality{ + MaxOverallPAt1Drop: -0.02, + }, + }, + } + err := ValidateConfig(cfg) + if err == nil { + t.Error("expected error for negative quality threshold") + } +} diff --git a/internal/benchmark/dataset.go b/internal/benchmark/dataset.go new file mode 100644 index 0000000..86c5014 --- /dev/null +++ b/internal/benchmark/dataset.go @@ -0,0 +1,117 @@ +package benchmark + +import ( + "encoding/json" + "os" + "path/filepath" + + "github.com/pinchtab/semantic" +) + +type Query struct { + ID string `json:"id"` + QueryText string `json:"query"` + RelevantRefs []string `json:"relevant_refs"` + PartiallyRelevantRefs []string `json:"partially_relevant_refs"` + Difficulty string `json:"difficulty"` + Tags []string `json:"tags"` + Intent string `json:"intent,omitempty"` + PageType string `json:"page_type,omitempty"` + Threshold *float64 `json:"threshold,omitempty"` + TopK *int `json:"top_k,omitempty"` + ExpectNoMatch bool `json:"expect_no_match,omitempty"` + MinScore *float64 `json:"min_score,omitempty"` + Notes string `json:"notes,omitempty"` +} + +type Corpus struct { + ID string + Path string + Snapshot []semantic.ElementDescriptor + Queries []Query +} + +type Dataset struct { + Root string + Corpora []Corpus +} + +func LoadDataset(benchmarkRoot string) (*Dataset, error) { + corpusDir := filepath.Join(benchmarkRoot, "corpus") + entries, err := os.ReadDir(corpusDir) + if err != nil { + return nil, err + } + + ds := &Dataset{Root: benchmarkRoot} + + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + corpusPath := filepath.Join(corpusDir, entry.Name()) + snapshotPath := filepath.Join(corpusPath, "snapshot.json") + queriesPath := filepath.Join(corpusPath, "queries.json") + + if _, err := os.Stat(snapshotPath); os.IsNotExist(err) { + continue + } + if _, err := os.Stat(queriesPath); os.IsNotExist(err) { + 
continue + } + + corpus, err := loadCorpus(entry.Name(), corpusPath) + if err != nil { + return nil, err + } + + ds.Corpora = append(ds.Corpora, *corpus) + } + + return ds, nil +} + +func loadCorpus(id, path string) (*Corpus, error) { + snapshotPath := filepath.Join(path, "snapshot.json") + queriesPath := filepath.Join(path, "queries.json") + + snapshotData, err := os.ReadFile(snapshotPath) + if err != nil { + return nil, err + } + + var snapshot []semantic.ElementDescriptor + if err := json.Unmarshal(snapshotData, &snapshot); err != nil { + return nil, err + } + + queriesData, err := os.ReadFile(queriesPath) + if err != nil { + return nil, err + } + + var queries []Query + if err := json.Unmarshal(queriesData, &queries); err != nil { + return nil, err + } + + return &Corpus{ + ID: id, + Path: path, + Snapshot: snapshot, + Queries: queries, + }, nil +} + +func (ds *Dataset) QueryCount() int { + count := 0 + for _, c := range ds.Corpora { + count += len(c.Queries) + } + return count +} + +func (ds *Dataset) CorpusCount() int { + return len(ds.Corpora) +} diff --git a/internal/benchmark/lint.go b/internal/benchmark/lint.go new file mode 100644 index 0000000..20565ce --- /dev/null +++ b/internal/benchmark/lint.go @@ -0,0 +1,68 @@ +package benchmark + +import "fmt" + +func RunLint(cfg LintConfig) (*LintResult, error) { + root := FindBenchmarkRoot() + result := &LintResult{} + + ds, err := LoadDataset(root) + if err != nil { + result.Errors++ + result.Messages = append(result.Messages, fmt.Sprintf("ERROR: failed to load dataset: %v", err)) + return result, nil + } + + ids := make(map[string]string) + for _, c := range ds.Corpora { + for _, q := range c.Queries { + if existing, ok := ids[q.ID]; ok { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: duplicate ID '%s' in %s (first seen in %s)", q.ID, c.ID, existing)) + } else { + ids[q.ID] = c.ID + } + } + } + + for _, c := range ds.Corpora { + refs := make(map[string]bool) + for _, d := 
range c.Snapshot { + refs[d.Ref] = true + } + for _, q := range c.Queries { + for _, r := range q.RelevantRefs { + if !refs[r] { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: [%s] relevant_ref '%s' not found in snapshot", q.ID, r)) + } + } + } + } + + validDiff := map[string]bool{"easy": true, "medium": true, "hard": true} + for _, c := range ds.Corpora { + for _, q := range c.Queries { + if q.Difficulty != "" && !validDiff[q.Difficulty] { + result.Errors++ + result.Messages = append(result.Messages, + fmt.Sprintf("ERROR: invalid difficulty '%s' for query '%s'", q.Difficulty, q.ID)) + } + } + } + + if result.Errors == 0 && result.Warnings == 0 { + result.Messages = append(result.Messages, "All checks passed") + } + + return result, nil +} + +func PrintLintResult(result *LintResult, cfg LintConfig) { + for _, msg := range result.Messages { + fmt.Println(msg) + } + fmt.Printf("\nErrors: %d, Warnings: %d\n", result.Errors, result.Warnings) +} diff --git a/internal/benchmark/runner.go b/internal/benchmark/runner.go new file mode 100644 index 0000000..6f00821 --- /dev/null +++ b/internal/benchmark/runner.go @@ -0,0 +1,466 @@ +package benchmark + +import ( + "context" + "os/exec" + "strings" + "time" + + "github.com/pinchtab/semantic" +) + +type QueryResult struct { + ID string `json:"id"` + Corpus string `json:"corpus"` + Query string `json:"query"` + Difficulty string `json:"difficulty"` + Tags []string `json:"tags"` + Intent string `json:"intent,omitempty"` + PageType string `json:"page_type,omitempty"` + Expected struct { + RelevantRefs []string `json:"relevant_refs"` + PartiallyRelevantRefs []string `json:"partially_relevant_refs"` + } `json:"expected"` + Actual struct { + BestRef string `json:"best_ref"` + BestScore float64 `json:"best_score"` + Matches []Match `json:"matches"` + } `json:"actual"` + Metrics struct { + RR float64 `json:"rr"` + PAt1 float64 `json:"p_at_1"` + PAt3 float64 `json:"p_at_3"` + HitAt3 int 
`json:"hit_at_3"` + HitAt5 int `json:"hit_at_5"` + BestRelevantRank *int `json:"best_relevant_rank"` + BestRelevantScore float64 `json:"best_relevant_score"` + BestWrongScore float64 `json:"best_wrong_score"` + Margin float64 `json:"margin"` + } `json:"metrics"` + Latency struct { + LibraryMs int64 `json:"library_ms"` + CLIMs *int64 `json:"cli_ms,omitempty"` + } `json:"latency"` + Status string `json:"status"` +} + +type Match struct { + Ref string `json:"ref"` + Score float64 `json:"score"` + Role string `json:"role"` + Name string `json:"name"` +} + +type Report struct { + SchemaVersion string `json:"schema_version"` + Run struct { + ID string `json:"id"` + Timestamp string `json:"timestamp"` + Tool string `json:"tool"` + GitSHA string `json:"git_sha,omitempty"` + GitDirty bool `json:"git_dirty,omitempty"` + Command string `json:"command"` + } `json:"run"` + Dataset struct { + Name string `json:"name"` + Version string `json:"version,omitempty"` + QueryCount int `json:"query_count"` + CorpusCount int `json:"corpus_count"` + } `json:"dataset"` + Config struct { + Profile string `json:"profile"` + Strategy string `json:"strategy"` + Threshold float64 `json:"threshold"` + TopK int `json:"top_k"` + Weights Weights `json:"weights"` + } `json:"config"` + Status string `json:"status"` + Metrics struct { + Overall OverallMetrics `json:"overall"` + Latency LatencyMetrics `json:"latency"` + ByCorpus map[string]CorpusMetrics `json:"by_corpus"` + ByDifficulty map[string]CorpusMetrics `json:"by_difficulty"` + ByTag map[string]CorpusMetrics `json:"by_tag"` + } `json:"metrics"` + Results []QueryResult `json:"results"` +} + +type OverallMetrics struct { + Total int `json:"total"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + PAt3 float64 `json:"p_at_3"` + HitAt3 float64 `json:"hit_at_3"` + HitAt5 float64 `json:"hit_at_5"` + AvgMargin float64 `json:"avg_margin"` +} + +type LatencyMetrics struct { + LibraryP50Ms int64 `json:"library_p50_ms"` + LibraryP95Ms int64 
`json:"library_p95_ms"` + CLIP50Ms *int64 `json:"cli_p50_ms,omitempty"` + CLIP95Ms *int64 `json:"cli_p95_ms,omitempty"` +} + +type CorpusMetrics struct { + Count int `json:"count"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + HitAt3 float64 `json:"hit_at_3"` + AvgMargin float64 `json:"avg_margin"` +} + +func RunCorpusBenchmark(ds *Dataset, cfg RunConfig) (*Report, error) { + matcher := createMatcher(cfg) + + report := &Report{ + SchemaVersion: "1.0.0", + Status: "pass", + } + report.Run.ID = time.Now().Format("20060102-150405") + "-" + cfg.Profile + report.Run.Timestamp = time.Now().UTC().Format(time.RFC3339) + report.Run.Tool = "semantic-bench" + report.Run.GitSHA, report.Run.GitDirty = getGitInfo() + report.Dataset.Name = "semantic-ui-matching-corpus" + report.Dataset.QueryCount = ds.QueryCount() + report.Dataset.CorpusCount = ds.CorpusCount() + report.Config.Profile = cfg.Profile + report.Config.Strategy = cfg.Strategy + report.Config.Threshold = cfg.Threshold + report.Config.TopK = cfg.TopK + report.Config.Weights = Weights{Lexical: cfg.LexicalWeight, Embedding: cfg.EmbeddingWeight} + + report.Metrics.ByCorpus = make(map[string]CorpusMetrics) + report.Metrics.ByDifficulty = make(map[string]CorpusMetrics) + report.Metrics.ByTag = make(map[string]CorpusMetrics) + + var allLatencies []int64 + + for _, corpus := range ds.Corpora { + if cfg.Corpus != "" && corpus.ID != cfg.Corpus { + continue + } + + queries := corpus.Queries + if cfg.Quick { + queries = selectQuickSubset(corpus.Queries) + } + + for _, query := range queries { + if cfg.QueryID != "" && query.ID != cfg.QueryID { + continue + } + + result := runQuery(matcher, corpus, query, cfg) + report.Results = append(report.Results, result) + allLatencies = append(allLatencies, result.Latency.LibraryMs) + } + } + + aggregateMetrics(report, allLatencies) + return report, nil +} + +// selectQuickSubset returns a deterministic subset for smoke testing. 
+// Selects up to 3 queries per corpus by difficulty. This is NOT representative +// of full corpus coverage—edge-case tags may be missed. Use for fast iteration, +// not for final regression checks. +func selectQuickSubset(queries []Query) []Query { + if len(queries) <= 3 { + return queries + } + + // Group by difficulty + byDiff := make(map[string][]Query) + for _, q := range queries { + diff := q.Difficulty + if diff == "" { + diff = "medium" + } + byDiff[diff] = append(byDiff[diff], q) + } + + // Select one from each difficulty level, up to 3 total + var selected []Query + for _, diff := range []string{"easy", "medium", "hard"} { + if qs, ok := byDiff[diff]; ok && len(qs) > 0 { + selected = append(selected, qs[0]) + if len(selected) >= 3 { + break + } + } + } + + // If we don't have 3 yet, fill from remaining + if len(selected) < 3 { + for _, q := range queries { + found := false + for _, s := range selected { + if s.ID == q.ID { + found = true + break + } + } + if !found { + selected = append(selected, q) + if len(selected) >= 3 { + break + } + } + } + } + + return selected +} + +func createMatcher(cfg RunConfig) semantic.ElementMatcher { + embedder := semantic.NewHashingEmbedder(128) + switch cfg.Strategy { + case "lexical": + return semantic.NewLexicalMatcher() + case "embedding": + return semantic.NewEmbeddingMatcher(embedder) + default: + return semantic.NewCombinedMatcher(embedder) + } +} + +func runQuery(matcher semantic.ElementMatcher, corpus Corpus, query Query, cfg RunConfig) QueryResult { + result := QueryResult{ + ID: query.ID, + Corpus: corpus.ID, + Query: query.QueryText, + Difficulty: query.Difficulty, + Tags: query.Tags, + Intent: query.Intent, + PageType: query.PageType, + } + result.Expected.RelevantRefs = query.RelevantRefs + result.Expected.PartiallyRelevantRefs = query.PartiallyRelevantRefs + + threshold := cfg.Threshold + if query.Threshold != nil { + threshold = *query.Threshold + } + topK := cfg.TopK + if query.TopK != nil { + topK = 
*query.TopK + } + + start := time.Now() + findResult, _ := matcher.Find(context.Background(), query.QueryText, corpus.Snapshot, semantic.FindOptions{ + Threshold: threshold, + TopK: topK, + LexicalWeight: cfg.LexicalWeight, + EmbeddingWeight: cfg.EmbeddingWeight, + Explain: cfg.Explain, + }) + result.Latency.LibraryMs = time.Since(start).Milliseconds() + + result.Actual.BestRef = findResult.BestRef + result.Actual.BestScore = findResult.BestScore + for _, m := range findResult.Matches { + result.Actual.Matches = append(result.Actual.Matches, Match{ + Ref: m.Ref, + Score: m.Score, + Role: m.Role, + Name: m.Name, + }) + } + + computeQueryMetrics(&result, query) + return result +} + +func computeQueryMetrics(result *QueryResult, query Query) { + relevantSet := make(map[string]bool) + for _, r := range query.RelevantRefs { + relevantSet[r] = true + } + partialSet := make(map[string]bool) + for _, r := range query.PartiallyRelevantRefs { + partialSet[r] = true + } + + // Reciprocal Rank + for i, m := range result.Actual.Matches { + if relevantSet[m.Ref] { + result.Metrics.RR = 1.0 / float64(i+1) + break + } + } + + // P@1 + if len(result.Actual.Matches) > 0 { + if relevantSet[result.Actual.Matches[0].Ref] { + result.Metrics.PAt1 = 1.0 + } else if partialSet[result.Actual.Matches[0].Ref] { + result.Metrics.PAt1 = 0.5 + } + } + + // P@3, Hit@3, Hit@5 + relevantInTop3 := 0 + partialInTop3 := 0 + for i, m := range result.Actual.Matches { + if i >= 5 { + break + } + switch { + case relevantSet[m.Ref]: + if result.Metrics.BestRelevantRank == nil { + rank := i + 1 + result.Metrics.BestRelevantRank = &rank + } + if result.Metrics.BestRelevantScore == 0 || m.Score > result.Metrics.BestRelevantScore { + result.Metrics.BestRelevantScore = m.Score + } + if i < 3 { + relevantInTop3++ + result.Metrics.HitAt3 = 1 + } + result.Metrics.HitAt5 = 1 + case partialSet[m.Ref]: + if i < 3 { + partialInTop3++ + } + default: + if m.Score > result.Metrics.BestWrongScore { + 
result.Metrics.BestWrongScore = m.Score + } + } + } + result.Metrics.PAt3 = (float64(relevantInTop3) + float64(partialInTop3)*0.5) / 3.0 + result.Metrics.Margin = result.Metrics.BestRelevantScore - result.Metrics.BestWrongScore + + // Status + switch { + case query.ExpectNoMatch: + if len(result.Actual.Matches) == 0 { + result.Status = "no_match_expected" + } else { + result.Status = "unexpected_match" + } + case result.Metrics.PAt1 >= 1.0: + result.Status = "hit" + case result.Metrics.PAt1 >= 0.5: + result.Status = "partial" + default: + result.Status = "miss" + } +} + +func aggregateMetrics(report *Report, latencies []int64) { + n := len(report.Results) + if n == 0 { + return + } + + report.Metrics.Overall.Total = n + + var sumRR, sumP1, sumP3, sumHit3, sumHit5, sumMargin float64 + corpusAgg := make(map[string]*aggregator) + diffAgg := make(map[string]*aggregator) + tagAgg := make(map[string]*aggregator) + + for _, r := range report.Results { + sumRR += r.Metrics.RR + sumP1 += r.Metrics.PAt1 + sumP3 += r.Metrics.PAt3 + sumHit3 += float64(r.Metrics.HitAt3) + sumHit5 += float64(r.Metrics.HitAt5) + sumMargin += r.Metrics.Margin + + addToAgg(corpusAgg, r.Corpus, r) + addToAgg(diffAgg, r.Difficulty, r) + for _, t := range r.Tags { + addToAgg(tagAgg, t, r) + } + } + + report.Metrics.Overall.MRR = sumRR / float64(n) + report.Metrics.Overall.PAt1 = sumP1 / float64(n) + report.Metrics.Overall.PAt3 = sumP3 / float64(n) + report.Metrics.Overall.HitAt3 = sumHit3 / float64(n) + report.Metrics.Overall.HitAt5 = sumHit5 / float64(n) + report.Metrics.Overall.AvgMargin = sumMargin / float64(n) + + for k, a := range corpusAgg { + report.Metrics.ByCorpus[k] = a.toMetrics() + } + for k, a := range diffAgg { + report.Metrics.ByDifficulty[k] = a.toMetrics() + } + for k, a := range tagAgg { + report.Metrics.ByTag[k] = a.toMetrics() + } + + // Latency percentiles + if len(latencies) > 0 { + sorted := make([]int64, len(latencies)) + copy(sorted, latencies) + sortInt64(sorted) + 
// sortInt64 sorts s in place in ascending order.
//
// It is a plain insertion sort: each element is shifted left past every larger
// predecessor. Quadratic in the worst case, which is acceptable for the short
// latency lists it is used on.
func sortInt64(s []int64) {
	for end := 1; end < len(s); end++ {
		v := s[end]
		pos := end
		for pos > 0 && s[pos-1] > v {
			s[pos] = s[pos-1]
			pos--
		}
		s[pos] = v
	}
}
// runtimeBaseline is the JSON document kept at the runtime baseline path.
// saveRuntimeBaseline writes it and loadRuntimeBaseline reads it back.
type runtimeBaseline struct {
	Timestamp  string             `json:"timestamp"`  // UTC save time, formatted as RFC 3339
	Benchmarks []RuntimeBenchmark `json:"benchmarks"` // benchmark rows captured when the baseline was saved
}
allocRatio > thresholds.MaxAllocRegressionRatio: + result.Benchmarks[i].Status = "regression" + result.Regressions++ + case nsRatio > warnRatio: + result.Benchmarks[i].Status = "warning" + default: + result.Benchmarks[i].Status = "ok" + } + } else { + result.Benchmarks[i].Status = "new" + } + } + + if result.Regressions > 0 { + result.Status = "fail" + } + + return result, nil +} + +func runGoBenchmarks() ([]RuntimeBenchmark, error) { + root := FindBenchmarkRoot() + projectRoot := filepath.Join(root, "..", "..") + + cmd := exec.Command("go", "test", "-bench=.", "-benchmem", "./internal/engine/...") + cmd.Dir = projectRoot + output, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("go test failed: %w\n%s", err, output) + } + + return parseBenchOutput(string(output)), nil +} + +func parseBenchOutput(output string) []RuntimeBenchmark { + var results []RuntimeBenchmark + lines := strings.Split(output, "\n") + + for _, line := range lines { + if !strings.HasPrefix(line, "Benchmark") { + continue + } + + fields := strings.Fields(line) + if len(fields) < 3 { + continue + } + + name := strings.TrimSuffix(fields[0], "-8") + name = strings.TrimSuffix(name, "-10") + name = strings.TrimSuffix(name, "-12") + name = strings.TrimSuffix(name, "-16") + + var nsOp float64 + var bytesOp, allocsOp int + + for i, f := range fields { + if f == "ns/op" && i > 0 { + _, _ = fmt.Sscanf(fields[i-1], "%f", &nsOp) + } + if f == "B/op" && i > 0 { + _, _ = fmt.Sscanf(fields[i-1], "%d", &bytesOp) + } + if f == "allocs/op" && i > 0 { + _, _ = fmt.Sscanf(fields[i-1], "%d", &allocsOp) + } + } + + if nsOp > 0 { + results = append(results, RuntimeBenchmark{ + Name: name, + NsOp: nsOp, + BytesOp: bytesOp, + AllocsOp: allocsOp, + }) + } + } + + return results +} + +func saveRuntimeBaseline(path string, benchmarks []RuntimeBenchmark) error { + baseline := runtimeBaseline{ + Timestamp: time.Now().UTC().Format(time.RFC3339), + Benchmarks: benchmarks, + } + data, err := 
json.MarshalIndent(baseline, "", " ") + if err != nil { + return err + } + return os.WriteFile(path, data, 0644) +} + +func loadRuntimeBaseline(path string) (*runtimeBaseline, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var baseline runtimeBaseline + if err := json.Unmarshal(data, &baseline); err != nil { + return nil, err + } + return &baseline, nil +} + +func PrintRuntimeResult(result *RuntimeResult, cfg RuntimeConfig) { + if result.Created { + fmt.Printf("\n Created runtime baseline: %s\n", result.BaselinePath) + fmt.Printf(" Benchmarks: %d\n\n", len(result.Benchmarks)) + return + } + + fmt.Printf("\n Runtime Baseline Check\n\n") + + for _, b := range result.Benchmarks { + var status string + switch b.Status { + case "regression": + status = "\033[31mREGRESSION\033[0m" + case "warning": + status = "\033[33mWARNING\033[0m" + case "ok": + status = "\033[32mOK\033[0m" + case "new": + status = "\033[33mNEW\033[0m" + } + + if b.BaselineNs > 0 { + fmt.Printf(" %-10s %s: %.0f -> %.0f ns/op (%.2fx)\n", + status, b.Name, b.BaselineNs, b.NsOp, b.Ratio) + } else { + fmt.Printf(" %-10s %s: %.0f ns/op\n", status, b.Name, b.NsOp) + } + } + + fmt.Println() + if result.Regressions > 0 { + fmt.Printf(" \033[31mRegressions: %d\033[0m\n\n", result.Regressions) + } else { + fmt.Printf(" \033[32mNo regressions\033[0m\n\n") + } +} diff --git a/internal/benchmark/tune.go b/internal/benchmark/tune.go new file mode 100644 index 0000000..7db259b --- /dev/null +++ b/internal/benchmark/tune.go @@ -0,0 +1,90 @@ +package benchmark + +import "fmt" + +type TuneResult struct { + Results []TuneRun `json:"results"` + Best *TuneRun `json:"best"` +} + +type TuneRun struct { + LexicalWeight float64 `json:"lexical_weight"` + EmbeddingWeight float64 `json:"embedding_weight"` + MRR float64 `json:"mrr"` + PAt1 float64 `json:"p_at_1"` + HitAt3 float64 `json:"hit_at_3"` +} + +func RunTune(cfg TuneConfig) (*TuneResult, error) { + root := FindBenchmarkRoot() + ds, err 
:= LoadDataset(root) + if err != nil { + return nil, fmt.Errorf("load dataset: %w", err) + } + + result := &TuneResult{} + + if cfg.Verbose { + fmt.Printf(" %-10s %-10s %-8s %-8s %-8s\n", "lexical", "embedding", "MRR", "P@1", "Hit@3") + } + + for w := 0.0; w <= 1.0001; w += cfg.Step { + lexW := w + embW := 1.0 - w + + runCfg := RunConfig{ + Suite: "corpus", + Strategy: "combined", + Threshold: 0.01, + TopK: 5, + LexicalWeight: lexW, + EmbeddingWeight: embW, + Mode: "library", + } + + if cfg.Corpus != "" { + runCfg.Corpus = cfg.Corpus + } + + report, err := RunCorpusBenchmark(ds, runCfg) + if err != nil { + return nil, fmt.Errorf("run at lexical=%.2f: %w", lexW, err) + } + + run := TuneRun{ + LexicalWeight: lexW, + EmbeddingWeight: embW, + MRR: report.Metrics.Overall.MRR, + PAt1: report.Metrics.Overall.PAt1, + HitAt3: report.Metrics.Overall.HitAt3, + } + result.Results = append(result.Results, run) + + if result.Best == nil || run.PAt1 > result.Best.PAt1 || + (run.PAt1 == result.Best.PAt1 && run.MRR > result.Best.MRR) { + best := run + result.Best = &best + } + + if cfg.Verbose { + fmt.Printf(" %-10.2f %-10.2f %-8.4f %-8.4f %-8.4f\n", + lexW, embW, run.MRR, run.PAt1, run.HitAt3) + } + } + + return result, nil +} + +func PrintTuneResult(result *TuneResult, cfg TuneConfig) { + fmt.Printf("\n Tested %d weight combinations\n\n", len(result.Results)) + + if result.Best != nil { + fmt.Printf(" Best weights:\n") + fmt.Printf(" Lexical: %.2f\n", result.Best.LexicalWeight) + fmt.Printf(" Embedding: %.2f\n", result.Best.EmbeddingWeight) + fmt.Printf(" MRR: %.4f\n", result.Best.MRR) + fmt.Printf(" P@1: %.4f\n", result.Best.PAt1) + fmt.Printf(" Hit@3: %.4f\n", result.Best.HitAt3) + } + fmt.Println() +} diff --git a/internal/benchmark/types.go b/internal/benchmark/types.go new file mode 100644 index 0000000..916978a --- /dev/null +++ b/internal/benchmark/types.go @@ -0,0 +1,67 @@ +package benchmark + +type CheckResult struct { + Status string `json:"status"` + Summary 
// LintResult is the outcome of validating the benchmark dataset.
type LintResult struct {
	Errors   int      `json:"errors"`   // number of failed checks; each one also adds an "ERROR: ..." message
	Warnings int      `json:"warnings"` // number of warnings — NOTE(review): the code that increments this is not visible here
	Messages []string `json:"messages"` // human-readable report lines, printed one per line by PrintLintResult
}
"github.com/pinchtab/semantic/internal/types" ) // benchElements returns a realistic set of elements for benchmarking. @@ -169,8 +171,9 @@ func BenchmarkCombinedFind_100Elements(b *testing.B) { elements := make([]types.ElementDescriptor, 0, 100) for len(elements) < 100 { for _, e := range base { - e.Ref = "e" + string(rune('0'+len(elements))) - elements = append(elements, e) + clone := e + clone.Ref = "e" + strconv.Itoa(len(elements)) + elements = append(elements, clone) if len(elements) >= 100 { break } @@ -203,3 +206,158 @@ func BenchmarkCalibrateConfidence(b *testing.B) { types.CalibrateConfidence(0.75) } } + +func benchElementsSized(n int) []types.ElementDescriptor { + base := benchElements() + out := make([]types.ElementDescriptor, 0, n) + for len(out) < n { + for _, e := range base { + clone := e + clone.Ref = "e" + strconv.Itoa(len(out)) + out = append(out, clone) + if len(out) >= n { + break + } + } + } + return out +} + +func BenchmarkCombinedFind_Issue24_100Elements(b *testing.B) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := benchElementsSized(100) + ctx := context.Background() + opts := types.FindOptions{Threshold: 0.3, TopK: 3} + + queries := []string{ + "sign in button", + "button not submit", + "textbox not email", + } + + for _, q := range queries { + b.Run(q, func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m.Find(ctx, q, elements, opts) + } + }) + } +} + +// Focused microbenchmarks for individual components + +func BenchmarkParseQueryContext(b *testing.B) { + queries := []string{ + "sign in button", + "the first email textbox in the login form", + "button not submit near the checkout section", + "second item in the dropdown menu", + } + b.ReportAllocs() + + for b.Loop() { + for _, q := range queries { + ParseQueryContext(q) + } + } +} + +func BenchmarkParseQueryContext_Complex(b *testing.B) { + q := "the third blue submit button in the checkout form not disabled" + b.ReportAllocs() 
// scoreFusionSink receives the value accumulated by BenchmarkScoreFusion so
// the compiler cannot discard the arithmetic being measured.
var scoreFusionSink float64

// BenchmarkScoreFusion measures the weighted-sum fusion of 100 lexical and
// embedding scores (weights 0.6 / 0.4) per iteration.
func BenchmarkScoreFusion(b *testing.B) {
	lexScores := make([]float64, 100)
	embScores := make([]float64, 100)
	for i := range lexScores {
		lexScores[i] = float64(i) / 100.0
		embScores[i] = float64(100-i) / 100.0
	}
	lexWeight, embWeight := 0.6, 0.4

	// Fold every fused score into sum and publish it after the loop; writing
	// the result to the blank identifier would leave the loop body dead.
	var sum float64
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for j := range lexScores {
			sum += lexWeight*lexScores[j] + embWeight*embScores[j]
		}
	}
	scoreFusionSink = sum
}
// Find ranks elements against query by fusing the lexical and the embedding
// matcher, then applies visual-position and ordinal post-processing.
//
// A nil ctx is replaced by context.Background(). An error from either
// underlying matcher is returned with an empty result.
func (c *CombinedMatcher) Find(ctx context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) {
	if ctx == nil {
		ctx = context.Background()
	}

	// NOTE(review): sanitizeFindOptions is defined elsewhere; the trailing 3 is
	// presumably the default TopK that the removed inline check used — confirm.
	opts = sanitizeFindOptions(opts, len(elements), 3)

	parsed := ParseQueryContext(query)
	visualHints := parseVisualQueryHints(query)
	mergeOpts := opts
	internalOpts := opts
	// Ordinal and visual-hint handling run after the merge, so keep every
	// candidate rather than cutting the list at the caller's TopK first.
	if parsed.Ordinal.HasOrdinal || visualHints.hasHints {
		mergeOpts.TopK = len(elements)
		internalOpts.TopK = len(elements)
	}

	// Weights are resolved from the caller's options, not the widened copies.
	lexW, embW := c.weights(opts)

	lexResult, embResult, err := c.runBothParsed(ctx, parsed, elements, internalOpts)
	if err != nil {
		return types.FindResult{}, err
	}

	merged := c.mergeResults(lexResult, embResult, elements, mergeOpts, lexW, embW)
	// NOTE(review): when TopK was widened above, the caller's original TopK is
	// not passed to either post-processing step — verify that
	// applyVisualHintBoost / selectOrdinalMatchInOrder do not need it to trim
	// the final list.
	merged = applyVisualHintBoost(merged, visualHints, elements, mergeOpts.TopK)
	return selectOrdinalMatchInOrder(merged, parsed.Ordinal, elements), nil
}
// sanitizeWeight maps unusable weights to 0 and returns usable ones unchanged.
// A weight is usable when it is a finite, non-negative number; NaN, ±Inf and
// negative values are not.
func sanitizeWeight(weight float64) float64 {
	// NaN and every negative value (including -Inf) fail the >= comparison,
	// which leaves +Inf as the only remaining case to exclude.
	usable := weight >= 0 && !math.IsInf(weight, 1)
	if !usable {
		return 0
	}
	return weight
}
Threshold: opts.Threshold * 0.5, TopK: len(elements), } @@ -81,8 +137,8 @@ func (c *CombinedMatcher) runBoth(ctx context.Context, query string, elements [] lexCh <- matcherResult{err: fmt.Errorf("lexical matcher panic: %v", p)} } }() - r, err := c.lexical.Find(ctx, query, elements, internalOpts) - lexCh <- matcherResult{r, err} + r := c.lexical.findWithParsed(parsed, elements, internalOpts) + lexCh <- matcherResult{result: r} }() go func() { defer func() { @@ -90,12 +146,23 @@ func (c *CombinedMatcher) runBoth(ctx context.Context, query string, elements [] embCh <- matcherResult{err: fmt.Errorf("embedding matcher panic: %v", p)} } }() - r, err := c.embedding.Find(ctx, query, elements, internalOpts) + r, err := c.embedding.findWithParsed(parsed, elements, internalOpts) embCh <- matcherResult{r, err} }() - lexRes := <-lexCh - embRes := <-embCh + var lexRes, embRes matcherResult + gotLex, gotEmb := false, false + + for !gotLex || !gotEmb { + select { + case <-ctx.Done(): + return types.FindResult{}, types.FindResult{}, ctx.Err() + case lexRes = <-lexCh: + gotLex = true + case embRes = <-embCh: + gotEmb = true + } + } if lexRes.err != nil { return types.FindResult{}, types.FindResult{}, lexRes.err @@ -110,6 +177,7 @@ type scored struct { ref string score float64 el types.ElementDescriptor + order int lexScore float64 embScore float64 } @@ -119,8 +187,10 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el embScores := scoreMap(embResult.Matches) refToElem := make(map[string]types.ElementDescriptor, len(elements)) - for _, el := range elements { + refToOrder := make(map[string]int, len(elements)) + for i, el := range elements { refToElem[el.Ref] = el + refToOrder[el.Ref] = i } // Collect all refs from either matcher. 
@@ -135,8 +205,14 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el candidates := make([]scored, 0, len(allRefs)) for ref := range allRefs { combined := lexW*lexScores[ref] + embW*embScores[ref] + if combined < 0 { + combined = 0 + } + if combined > 1 { + combined = 1 + } if combined >= opts.Threshold { - s := scored{ref: ref, score: combined, el: refToElem[ref]} + s := scored{ref: ref, score: combined, el: refToElem[ref], order: refToOrder[ref]} if opts.Explain { s.lexScore = lexW * lexScores[ref] s.embScore = embW * embScores[ref] @@ -146,7 +222,10 @@ func (c *CombinedMatcher) mergeResults(lexResult, embResult types.FindResult, el } sort.Slice(candidates, func(i, j int) bool { - return candidates[i].score > candidates[j].score + return rankedMatchLess( + candidates[i].score, candidates[i].el, candidates[i].order, + candidates[j].score, candidates[j].el, candidates[j].order, + ) }) if len(candidates) > opts.TopK { candidates = candidates[:opts.TopK] diff --git a/internal/engine/combined_test.go b/internal/engine/combined_test.go index 4411eb3..424b609 100644 --- a/internal/engine/combined_test.go +++ b/internal/engine/combined_test.go @@ -2,9 +2,13 @@ package engine import ( "context" + "errors" "fmt" - "github.com/pinchtab/semantic/internal/types" + "math" "testing" + "time" + + "github.com/pinchtab/semantic/internal/types" ) // CATEGORY 6: Role Boost Accumulation Test (Bug Fix) @@ -161,6 +165,93 @@ func TestCombinedMatcher_FusesBothStrategies(t *testing.T) { } } +func TestCombinedMatcher_NegativePenalization(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "submit", Role: "button", Name: "Submit"}, + {Ref: "cancel", Role: "button", Name: "Cancel"}, + } + + res, err := m.Find(context.Background(), "button not cancel", elements, types.FindOptions{Threshold: 0, TopK: 2}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(res.Matches) < 2 { + 
t.Fatalf("expected two matches, got %d", len(res.Matches)) + } + if res.BestRef != "submit" { + t.Fatalf("expected cancel to be penalized, got best=%s", res.BestRef) + } +} + +func TestCombinedMatcher_NegativeSynonymExpansion(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "password", Role: "textbox", Name: "Password"}, + {Ref: "email", Role: "textbox", Name: "Email"}, + } + + res, err := m.Find(context.Background(), "input no pwd", elements, types.FindOptions{Threshold: 0, TopK: 2}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(res.Matches) < 2 { + t.Fatalf("expected two matches, got %d", len(res.Matches)) + } + if res.BestRef != "email" { + t.Fatalf("expected password to be demoted by negative synonym, got best=%s", res.BestRef) + } +} + +func TestCombinedMatcher_NegativeOnlyQuery(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "submit", Role: "button", Name: "Submit"}, + {Ref: "cancel", Role: "button", Name: "Cancel"}, + {Ref: "email", Role: "textbox", Name: "Email"}, + } + + res, err := m.Find(context.Background(), "not submit", elements, types.FindOptions{Threshold: 0.3, TopK: 3}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(res.Matches) == 0 { + t.Fatalf("expected non-empty matches for leading-not query") + } + if res.BestRef != "submit" { + t.Fatalf("expected leading-not query to behave as positive text, got best=%s", res.BestRef) + } +} + +func TestCombinedMatcher_PositiveQueryRegression(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "submit", Role: "button", Name: "Submit"}, + {Ref: "cancel", Role: "button", Name: "Cancel"}, + } + + res, err := m.Find(context.Background(), "submit button", elements, types.FindOptions{Threshold: 0.05, TopK: 2}) + if err != nil { + t.Fatalf("Find returned error: %v", 
err) + } + if res.BestRef != "submit" { + t.Fatalf("expected positive query behavior unchanged, got best=%s", res.BestRef) + } +} + +func TestCombinedMatcher_EmptyQueryReturnsNoResults(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{{Ref: "submit", Role: "button", Name: "Submit"}} + + res, err := m.Find(context.Background(), " ", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(res.Matches) != 0 { + t.Fatalf("expected no matches for empty query, got %d", len(res.Matches)) + } +} + func TestCombinedMatcher_NoElements(t *testing.T) { m := NewCombinedMatcher(NewHashingEmbedder(128)) @@ -466,3 +557,188 @@ func TestCombinedMatcher_WeightsApplied(t *testing.T) { t.Errorf("expected BestRef=e0, got %s", result.BestRef) } } + +func TestCombinedMatcher_ClampsScoreWithCustomWeights(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "e0", Role: "button", Name: "Sign In"}, + {Ref: "e1", Role: "link", Name: "Help"}, + } + + result, err := m.Find(context.Background(), "sign in button", elements, types.FindOptions{ + Threshold: 0, + TopK: 2, + LexicalWeight: 2.0, + EmbeddingWeight: 2.0, + }) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + + for _, match := range result.Matches { + if match.Score < 0 || match.Score > 1 { + t.Fatalf("expected clamped score in [0,1], got %f for ref=%s", match.Score, match.Ref) + } + } +} + +func TestCombinedMatcher_DeterministicTieBreak(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "first", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 2, SiblingIndex: 0}}, + {Ref: "second", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 2, SiblingIndex: 0}}, + } + + for i := 0; i < 100; i++ { + result, err := 
m.Find(context.Background(), "open button", elements, types.FindOptions{Threshold: 0, TopK: 2}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if result.BestRef != "first" { + t.Fatalf("run %d: expected BestRef=first, got %s", i, result.BestRef) + } + } +} + +// Hardening tests + +func TestCombinedMatcher_Weights_AdversarialNormalizationGrid(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + + inputs := []float64{ + -10, -1, -0.25, 0, 0.001, 0.2, 0.5, 0.9, 1, 2, 10, + math.NaN(), math.Inf(1), + } + + for _, baseLex := range inputs { + for _, baseEmb := range inputs { + m.LexicalWeight = baseLex + m.EmbeddingWeight = baseEmb + + for _, reqLex := range inputs { + for _, reqEmb := range inputs { + lex, emb := m.weights(types.FindOptions{LexicalWeight: reqLex, EmbeddingWeight: reqEmb}) + + if math.IsNaN(lex) || math.IsNaN(emb) || math.IsInf(lex, 0) || math.IsInf(emb, 0) { + t.Fatalf("non-finite weights from base=(%v,%v) req=(%v,%v): lexical=%v embedding=%v", + baseLex, baseEmb, reqLex, reqEmb, lex, emb) + } + if lex < 0 || emb < 0 { + t.Fatalf("negative weights from base=(%v,%v) req=(%v,%v): lexical=%v embedding=%v", + baseLex, baseEmb, reqLex, reqEmb, lex, emb) + } + + sum := lex + emb + if math.Abs(sum-1) > 1e-9 { + t.Fatalf("weights do not normalize to 1 from base=(%v,%v) req=(%v,%v): sum=%v", + baseLex, baseEmb, reqLex, reqEmb, sum) + } + } + } + } + } +} + +func TestCombinedMatcher_ScoreBoundedUnderInvalidModelWeights(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + m.LexicalWeight = 9 + m.EmbeddingWeight = 9 + + elements := []types.ElementDescriptor{ + {Ref: "e0", Role: "button", Name: "Sign in"}, + {Ref: "e1", Role: "link", Name: "Register"}, + } + + result, err := m.Find(context.Background(), "sign in button", elements, types.FindOptions{Threshold: 0, TopK: 2}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if result.BestScore < 0 || result.BestScore > 1 { + t.Fatalf("best score out 
of [0,1]: %f", result.BestScore) + } + for _, match := range result.Matches { + if match.Score < 0 || match.Score > 1 { + t.Fatalf("match %s score out of [0,1]: %f", match.Ref, match.Score) + } + } +} + +func TestCombinedMatcher_Find_ContextCanceledWhileEmbeddingBlocked(t *testing.T) { + e := &blockingEmbedder{started: make(chan struct{}), release: make(chan struct{})} + defer close(e.release) + + m := NewCombinedMatcher(e) + elements := []types.ElementDescriptor{ + {Ref: "e1", Role: "button", Name: "Save"}, + {Ref: "e2", Role: "link", Name: "Cancel"}, + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan error, 1) + go func() { + _, err := m.Find(ctx, "save button", elements, types.FindOptions{Threshold: 0, TopK: 2}) + done <- err + }() + + select { + case <-e.started: + case <-time.After(500 * time.Millisecond): + t.Fatal("expected embedding to start") + } + + cancel() + + select { + case err := <-done: + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context canceled error, got %v", err) + } + case <-time.After(500 * time.Millisecond): + t.Fatal("Find did not return promptly after context cancellation") + } +} + +type blockingEmbedder struct { + started chan struct{} + release chan struct{} +} + +func (e *blockingEmbedder) Strategy() string { return "blocking" } + +func (e *blockingEmbedder) Embed(texts []string) ([][]float32, error) { + close(e.started) + <-e.release + return nil, errors.New("released") +} + +func TestSanitizeFindOptions(t *testing.T) { + tests := []struct { + name string + opts types.FindOptions + elemCount int + defaultTopK int + wantTopK int + wantThresh float64 + }{ + {"zero topk uses default", types.FindOptions{TopK: 0}, 10, 3, 3, 0}, + {"negative topk uses default", types.FindOptions{TopK: -5}, 10, 3, 3, 0}, + {"topk exceeds elements clamped", types.FindOptions{TopK: 20}, 5, 3, 5, 0}, + {"NaN threshold becomes 0", types.FindOptions{Threshold: math.NaN()}, 10, 3, 3, 0}, + {"Inf threshold becomes 
0", types.FindOptions{Threshold: math.Inf(1)}, 10, 3, 3, 0}, + {"negative threshold becomes 0", types.FindOptions{Threshold: -0.5}, 10, 3, 3, 0}, + {"threshold > 1 becomes 1", types.FindOptions{Threshold: 1.5}, 10, 3, 3, 1}, + {"valid threshold preserved", types.FindOptions{Threshold: 0.5, TopK: 5}, 10, 3, 5, 0.5}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := sanitizeFindOptions(tt.opts, tt.elemCount, tt.defaultTopK) + if got.TopK != tt.wantTopK { + t.Errorf("TopK = %d, want %d", got.TopK, tt.wantTopK) + } + if got.Threshold != tt.wantThresh { + t.Errorf("Threshold = %f, want %f", got.Threshold, tt.wantThresh) + } + }) + } +} diff --git a/internal/engine/embedding.go b/internal/engine/embedding.go index 6723255..48922dc 100644 --- a/internal/engine/embedding.go +++ b/internal/engine/embedding.go @@ -2,9 +2,12 @@ package engine import ( "context" - "github.com/pinchtab/semantic/internal/types" + "fmt" "math" "sort" + "strings" + + "github.com/pinchtab/semantic/internal/types" ) // Embedder converts text into dense vectors. See NewHashingEmbedder. @@ -47,57 +50,216 @@ func (m *EmbeddingMatcher) Strategy() string { return "embedding:" + m.embedder.Strategy() } -func (m *EmbeddingMatcher) Find(_ context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) { - if opts.TopK <= 0 { - opts.TopK = 3 +// contextAwareEmbedder is an optional interface for embedders that support +// context cancellation during embedding. 
+type contextAwareEmbedder interface { + EmbedContext(ctx context.Context, texts []string) ([][]float32, error) +} + +func (m *EmbeddingMatcher) Find(ctx context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) { + if ctx == nil { + ctx = context.Background() + } + if err := ctx.Err(); err != nil { + return types.FindResult{}, err + } + + queryCtx := ParseQueryContext(query) + return m.findWithParsedContext(ctx, queryCtx, elements, opts) +} + +func (m *EmbeddingMatcher) findWithParsedContext(ctx context.Context, queryCtx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) { + parsed := queryCtx.Base + opts = sanitizeFindOptions(opts, len(elements), 3) + + if len(parsed.Positive) == 0 && len(parsed.Negative) == 0 { + return types.FindResult{ + Strategy: m.Strategy(), + ElementCount: len(elements), + }, nil + } + + filtered := filterContextExcludedElements(elements, queryCtx) + if len(filtered) == 0 { + return types.FindResult{Strategy: m.Strategy(), ElementCount: len(elements)}, nil } - // Build composite descriptions. 
+ vectors, err := m.embedQueryAndElementsWithContext(ctx, parsed, filtered) + if err != nil { + return types.FindResult{}, err + } + + if err := validateEmbeddedVectors(vectors, len(filtered)+countQueryVectors(parsed)); err != nil { + return types.FindResult{}, err + } + + if err := ctx.Err(); err != nil { + return types.FindResult{}, err + } + + candidates := m.scoreCandidatesWithContext(ctx, parsed, filtered, vectors, opts.Threshold) + sort.Slice(candidates, func(i, j int) bool { + return rankedMatchLess( + candidates[i].score, candidates[i].desc, candidates[i].order, + candidates[j].score, candidates[j].desc, candidates[j].order, + ) + }) + + if len(candidates) > opts.TopK { + candidates = candidates[:opts.TopK] + } + + return buildEmbeddingResult(m.Strategy(), len(elements), candidates), nil +} + +func (m *EmbeddingMatcher) findWithParsed(queryCtx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) { + return m.findWithParsedContext(context.Background(), queryCtx, elements, opts) +} + +func filterContextExcludedElements(elements []types.ElementDescriptor, ctx QueryContext) []types.ElementDescriptor { + filtered := make([]types.ElementDescriptor, 0, len(elements)) + for _, el := range elements { + if ctx.HasScope && matchesExcludedContext(el, ctx.Exclude) { + continue + } + filtered = append(filtered, el) + } + return filtered +} + +func (m *EmbeddingMatcher) embedQueryAndElementsWithContext(ctx context.Context, parsed types.ParsedQuery, elements []types.ElementDescriptor) ([][]float32, error) { + positiveQuery := strings.Join(parsed.Positive, " ") + negativeQuery := strings.Join(parsed.Negative, " ") + descs := make([]string, len(elements)) for i, el := range elements { descs[i] = el.Composite() } - // Embed query + all descriptions in a single batch. - texts := append([]string{query}, descs...) 
- vectors, err := m.embedder.Embed(texts) - if err != nil { - return types.FindResult{}, err + texts := make([]string, 0, len(descs)+2) + if len(parsed.Positive) > 0 { + texts = append(texts, positiveQuery) + } + if len(parsed.Negative) > 0 { + texts = append(texts, negativeQuery) + } + texts = append(texts, descs...) + return embedWithContext(ctx, m.embedder, texts) +} + +func embedWithContext(ctx context.Context, embedder Embedder, texts []string) ([][]float32, error) { + if ce, ok := embedder.(contextAwareEmbedder); ok { + return ce.EmbedContext(ctx, texts) + } + if err := ctx.Err(); err != nil { + return nil, err + } + return embedder.Embed(texts) +} + +func countQueryVectors(parsed types.ParsedQuery) int { + count := 0 + if len(parsed.Positive) > 0 { + count++ + } + if len(parsed.Negative) > 0 { + count++ + } + return count +} + +func validateEmbeddedVectors(vectors [][]float32, expected int) error { + if len(vectors) != expected { + return fmt.Errorf("embedder returned %d vectors, expected %d", len(vectors), expected) + } + if len(vectors) == 0 { + return nil + } + dim := len(vectors[0]) + for i := 1; i < len(vectors); i++ { + if len(vectors[i]) != dim { + return fmt.Errorf("embedder returned inconsistent vector dimensions at index %d: %d vs %d", i, len(vectors[i]), dim) + } + } + return nil +} + +type embeddingScored struct { + desc types.ElementDescriptor + score float64 + order int +} + +func (m *EmbeddingMatcher) scoreCandidatesWithContext(ctx context.Context, parsed types.ParsedQuery, elements []types.ElementDescriptor, vectors [][]float32, threshold float64) []embeddingScored { + negativeOnly := len(parsed.Positive) == 0 && len(parsed.Negative) > 0 + idx := 0 + var posVec []float32 + if len(parsed.Positive) > 0 { + posVec = vectors[idx] + idx++ + } + + var negVec []float32 + if len(parsed.Negative) > 0 { + negVec = vectors[idx] + idx++ } - queryVec := vectors[0] - elemVecs := vectors[1:] + elemVecs := vectors[idx:] contextVecs := elemVecs if 
m.neighborWeight > 0 && len(elemVecs) > 1 { contextVecs = m.withNeighborContext(elemVecs) } - type scored struct { - desc types.ElementDescriptor - score float64 - } - - var candidates []scored + var candidates []embeddingScored for i, el := range elements { - sim := CosineSimilarity(queryVec, contextVecs[i]) - if sim >= opts.Threshold { - candidates = append(candidates, scored{desc: el, score: sim}) + if i%64 == 0 { + if ctx.Err() != nil { + return candidates + } + } + score := scoreEmbeddingCandidate(parsed, posVec, negVec, contextVecs[i], elemVecs[i]) + if negativeOnly && score == 0 { + continue + } + if score >= threshold { + candidates = append(candidates, embeddingScored{desc: el, score: score, order: i}) } } + return candidates +} - sort.Slice(candidates, func(i, j int) bool { - return candidates[i].score > candidates[j].score - }) +func scoreEmbeddingCandidate(parsed types.ParsedQuery, posVec, negVec, contextVec, elemVec []float32) float64 { + score := 1.0 + if len(parsed.Positive) > 0 { + score = CosineSimilarity(posVec, contextVec) + } - if len(candidates) > opts.TopK { - candidates = candidates[:opts.TopK] + if len(parsed.Negative) > 0 { + negSim := CosineSimilarity(negVec, elemVec) + if len(parsed.Positive) == 0 { + if negSim > 0.5 { + score = 0 + } + } else if negSim > 0.5 { + score *= 1 - (negSim * 0.8) + } } - result := types.FindResult{ - Strategy: m.Strategy(), - ElementCount: len(elements), + if score < 0 { + return 0 + } + if score > 1 { + return 1 } + return score +} +func buildEmbeddingResult(strategy string, elementCount int, candidates []embeddingScored) types.FindResult { + result := types.FindResult{ + Strategy: strategy, + ElementCount: elementCount, + } for _, c := range candidates { result.Matches = append(result.Matches, types.ElementMatch{ Ref: c.desc.Ref, @@ -106,13 +268,11 @@ func (m *EmbeddingMatcher) Find(_ context.Context, query string, elements []type Name: c.desc.Name, }) } - if len(result.Matches) > 0 { result.BestRef = 
result.Matches[0].Ref result.BestScore = result.Matches[0].Score } - - return result, nil + return result } func (m *EmbeddingMatcher) withNeighborContext(base [][]float32) [][]float32 { diff --git a/internal/engine/embedding_test.go b/internal/engine/embedding_test.go index fde31e0..6c70074 100644 --- a/internal/engine/embedding_test.go +++ b/internal/engine/embedding_test.go @@ -2,10 +2,12 @@ package engine import ( "context" + "errors" "fmt" - "github.com/pinchtab/semantic/internal/types" "math" "testing" + + "github.com/pinchtab/semantic/internal/types" ) // dummyEmbedder tests @@ -187,6 +189,71 @@ func TestEmbeddingMatcher_ThresholdFiltering(t *testing.T) { } } +func TestEmbeddingMatcher_NegativePenalty(t *testing.T) { + e := newScriptedEmbedder(map[string][]float32{ + "button": {1, 0}, + "cancel": {0, 1}, + "button: Submit": {1, 0}, + "button: Cancel": {1, 1}, + }) + m := NewEmbeddingMatcherWithNeighborWeight(e, 0) + + elements := []types.ElementDescriptor{ + {Ref: "submit", Role: "button", Name: "Submit"}, + {Ref: "cancel", Role: "button", Name: "Cancel"}, + } + + res, err := m.Find(context.Background(), "button not cancel", elements, types.FindOptions{Threshold: 0, TopK: 2}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(res.Matches) < 2 { + t.Fatalf("expected two matches, got %d", len(res.Matches)) + } + if res.BestRef != "submit" { + t.Fatalf("expected negative term to demote cancel, got %s", res.BestRef) + } +} + +func TestEmbeddingMatcher_NegativeOnlyQuery(t *testing.T) { + e := newScriptedEmbedder(map[string][]float32{ + "not submit": {1, 0}, + "submit": {1, 0}, + "button: Submit": {1, 0}, + "button: Cancel": {0, 1}, + }) + m := NewEmbeddingMatcherWithNeighborWeight(e, 0) + + elements := []types.ElementDescriptor{ + {Ref: "submit", Role: "button", Name: "Submit"}, + {Ref: "cancel", Role: "button", Name: "Cancel"}, + } + + res, err := m.Find(context.Background(), "not submit", elements, types.FindOptions{Threshold: 0.3, TopK: 
2}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(res.Matches) == 0 { + t.Fatalf("expected non-empty matches for leading-not query") + } + if res.BestRef != "submit" { + t.Fatalf("expected leading-not query to behave as positive text, got %s", res.BestRef) + } +} + +func TestEmbeddingMatcher_EmptyQueryReturnsNoResults(t *testing.T) { + m := NewEmbeddingMatcher(newDummyEmbedder(64)) + res, err := m.Find(context.Background(), " ", []types.ElementDescriptor{ + {Ref: "e1", Role: "button", Name: "Submit"}, + }, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(res.Matches) != 0 { + t.Fatalf("expected no matches for empty query, got %d", len(res.Matches)) + } +} + func TestEmbeddingMatcher_NeighborContextDisambiguatesRealWorldButtons(t *testing.T) { e := newScriptedEmbedder(map[string][]float32{ "laptop add to cart": {1, 1, 0}, @@ -258,6 +325,28 @@ func TestEmbeddingMatcher_SingleElement_WithNeighborWeight(t *testing.T) { } } +func TestEmbeddingMatcher_TieBreaksByPositionalHints(t *testing.T) { + e := newScriptedEmbedder(map[string][]float32{ + "open button": {1, 0, 0}, + "button: Open": {1, 0, 0}, + }) + m := NewEmbeddingMatcherWithNeighborWeight(e, 0) + + elements := []types.ElementDescriptor{ + {Ref: "shallow", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 1, SiblingIndex: 1}}, + {Ref: "deep-left", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 3, SiblingIndex: 0}}, + {Ref: "deep-right", Role: "button", Name: "Open", Positional: types.PositionalHints{Depth: 3, SiblingIndex: 2}}, + } + + res, err := m.Find(context.Background(), "open button", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + if res.BestRef != "deep-left" { + t.Fatalf("expected deep-left to win tie-break, got %s", res.BestRef) + } +} + type scriptedEmbedder struct { vectors map[string][]float32 } @@ 
-295,3 +384,108 @@ func findMatchScore(matches []types.ElementMatch, ref string) (float64, bool) { } // FindResult.ConfidenceLabel tests + +// Hardening tests + +func TestEmbeddingMatcher_Find_ContextCanceledBeforeEmbed(t *testing.T) { + m := NewEmbeddingMatcher(newDummyEmbedder(64)) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + _, err := m.Find(ctx, "submit button", []types.ElementDescriptor{ + {Ref: "e1", Role: "button", Name: "Submit"}, + }, types.FindOptions{Threshold: 0, TopK: 1}) + + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context canceled error, got %v", err) + } +} + +func TestEmbeddingMatcher_Find_EmbedderVectorCountMismatchReturnsError(t *testing.T) { + e := &malformedEmbedder{vectors: fixedVectors(1, 64)} + m := NewEmbeddingMatcher(e) + + _, err := m.Find(context.Background(), "submit", []types.ElementDescriptor{ + {Ref: "e1", Role: "button", Name: "Submit"}, + {Ref: "e2", Role: "button", Name: "Cancel"}, + }, types.FindOptions{Threshold: 0, TopK: 2}) + + if err == nil { + t.Fatal("expected error for vector count mismatch") + } +} + +func TestEmbeddingMatcher_Find_InconsistentVectorDimensionsReturnsError(t *testing.T) { + e := &malformedEmbedder{vectors: [][]float32{ + {1, 0, 0}, + {0, 1}, + {0, 0, 1}, + }} + m := NewEmbeddingMatcher(e) + + _, err := m.Find(context.Background(), "submit", []types.ElementDescriptor{ + {Ref: "e1", Role: "button", Name: "Submit"}, + {Ref: "e2", Role: "button", Name: "Cancel"}, + }, types.FindOptions{Threshold: 0, TopK: 2}) + + if err == nil { + t.Fatal("expected error for inconsistent vector dimensions") + } +} + +type malformedEmbedder struct { + vectors [][]float32 +} + +func (e *malformedEmbedder) Strategy() string { return "malformed" } + +func (e *malformedEmbedder) Embed(texts []string) ([][]float32, error) { + return e.vectors, nil +} + +func TestValidateEmbeddedVectors(t *testing.T) { + tests := []struct { + name string + vectors [][]float32 + expected int + wantErr 
bool + }{ + {"empty valid", [][]float32{}, 0, false}, + {"correct count", fixedVectors(3, 64), 3, false}, + {"wrong count", fixedVectors(2, 64), 3, true}, + {"inconsistent dims", [][]float32{{1, 0}, {0, 1, 0}}, 2, true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateEmbeddedVectors(tt.vectors, tt.expected) + if (err != nil) != tt.wantErr { + t.Errorf("validateEmbeddedVectors() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestEmbedWithContext_UsesContextAwareEmbedder(t *testing.T) { + e := NewHashingEmbedder(64) + ctx := context.Background() + + vecs, err := embedWithContext(ctx, e, []string{"test"}) + if err != nil { + t.Fatalf("embedWithContext error: %v", err) + } + if len(vecs) != 1 { + t.Fatalf("expected 1 vector, got %d", len(vecs)) + } +} + +func TestEmbedWithContext_CanceledBeforeNonContextAware(t *testing.T) { + e := newDummyEmbedder(64) // doesn't implement contextAwareEmbedder + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + _, err := embedWithContext(ctx, e, []string{"test"}) + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context canceled, got %v", err) + } +} diff --git a/internal/engine/hashing.go b/internal/engine/hashing.go index 9963d02..6c02276 100644 --- a/internal/engine/hashing.go +++ b/internal/engine/hashing.go @@ -1,6 +1,7 @@ package engine import ( + "context" "hash/fnv" "math" "strings" @@ -39,8 +40,24 @@ func NewHashingEmbedder(dim int) *HashingEmbedder { func (h *HashingEmbedder) Strategy() string { return "hashing" } func (h *HashingEmbedder) Embed(texts []string) ([][]float32, error) { + return h.EmbedContext(context.Background(), texts) +} + +func (h *HashingEmbedder) EmbedContext(ctx context.Context, texts []string) ([][]float32, error) { + if ctx == nil { + ctx = context.Background() + } + if err := ctx.Err(); err != nil { + return nil, err + } + result := make([][]float32, len(texts)) for i, text := range texts { + if i%64 == 0 { 
+ if err := ctx.Err(); err != nil { + return nil, err + } + } result[i] = h.vectorize(text) } return result, nil diff --git a/internal/engine/hashing_test.go b/internal/engine/hashing_test.go index d8459fa..ed6dcb6 100644 --- a/internal/engine/hashing_test.go +++ b/internal/engine/hashing_test.go @@ -1,6 +1,8 @@ package engine import ( + "context" + "errors" "math" "testing" ) @@ -257,4 +259,29 @@ func TestHashingEmbedder_BatchConsistency(t *testing.T) { } } +// Hardening tests + +func TestHashingEmbedder_EmbedContext_Canceled(t *testing.T) { + e := NewHashingEmbedder(128) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + _, err := e.EmbedContext(ctx, []string{"test"}) + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context canceled error, got %v", err) + } +} + +func TestHashingEmbedder_EmbedContext_NilContext(t *testing.T) { + e := NewHashingEmbedder(128) + //nolint:staticcheck // intentionally testing nil context handling + vecs, err := e.EmbedContext(nil, []string{"test"}) + if err != nil { + t.Fatalf("expected no error with nil context, got %v", err) + } + if len(vecs) != 1 || len(vecs[0]) != 128 { + t.Fatalf("expected 1 vector of dim 128, got %d vectors", len(vecs)) + } +} + // Phase 3: CombinedMatcher tests diff --git a/internal/engine/lexical.go b/internal/engine/lexical.go index ee53a59..ffde616 100644 --- a/internal/engine/lexical.go +++ b/internal/engine/lexical.go @@ -47,10 +47,35 @@ func NewLexicalMatcher() *LexicalMatcher { func (m *LexicalMatcher) Strategy() string { return "lexical" } -func (m *LexicalMatcher) Find(_ context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) { - if opts.TopK <= 0 { - opts.TopK = 3 +func (m *LexicalMatcher) Find(ctx context.Context, query string, elements []types.ElementDescriptor, opts types.FindOptions) (types.FindResult, error) { + if ctx == nil { + ctx = context.Background() } + if err := ctx.Err(); err != nil { + 
return types.FindResult{}, err + } + + queryCtx := ParseQueryContext(query) + return m.findWithParsedContext(ctx, queryCtx, elements, opts), nil +} + +func (m *LexicalMatcher) findWithParsed(queryCtx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) types.FindResult { + return m.findWithParsedContext(context.Background(), queryCtx, elements, opts) +} + +func (m *LexicalMatcher) findWithParsedContext(ctx context.Context, queryCtx QueryContext, elements []types.ElementDescriptor, opts types.FindOptions) types.FindResult { + parsed := queryCtx.Base + opts = sanitizeFindOptions(opts, len(elements), 3) + + if len(parsed.Positive) == 0 && len(parsed.Negative) == 0 { + return types.FindResult{ + Strategy: "lexical", + ElementCount: len(elements), + } + } + + negativeOnly := len(parsed.Positive) == 0 && len(parsed.Negative) > 0 + positiveQuery := strings.Join(parsed.Positive, " ") ef := BuildElementFrequency(elements) @@ -60,13 +85,46 @@ func (m *LexicalMatcher) Find(_ context.Context, query string, elements []types. 
} var candidates []scored - for _, el := range elements { + for i, el := range elements { + if i%64 == 0 { + if ctx.Err() != nil { + break + } + } + + if queryCtx.HasScope && matchesExcludedContext(el, queryCtx.Exclude) { + continue + } + composite := el.Composite() - score := lexicalScore(query, composite, el.Interactive, ef) - score += positionalBoost(query, el.Positional) + descTokens := tokenize(composite) + score := 0.0 + if len(parsed.Positive) == 0 { + score = 1.0 + } else { + score = lexicalScoreTokens(parsed.Positive, descTokens, el.Interactive, ef) + score += positionalBoost(positiveQuery, el.Positional) + } + + if len(parsed.Negative) > 0 { + negativeScore := lexicalScoreTokens(parsed.Negative, descTokens, el.Interactive, ef) + switch { + case hasStrongNegativeHit(parsed.Negative, descTokens) || negativeScore > 0.7: + score = 0 + case negativeScore > 0.4: + score *= 1 - negativeScore + } + } + + if score < 0 { + score = 0 + } if score > 1.0 { score = 1.0 } + if negativeOnly && score == 0 { + continue + } if score >= opts.Threshold { candidates = append(candidates, scored{desc: el, score: score}) } @@ -116,7 +174,7 @@ func (m *LexicalMatcher) Find(_ context.Context, query string, elements []types. 
result.BestScore = result.Matches[0].Score } - return result, nil + return result } func tokenize(s string) []string { @@ -226,7 +284,10 @@ func LexicalScoreWithFrequency(query, desc string, ef *ElementFrequency) float64 func lexicalScore(query, desc string, interactive bool, ef *ElementFrequency) float64 { rawQTokens := tokenize(query) rawDTokens := tokenize(desc) + return lexicalScoreTokens(rawQTokens, rawDTokens, interactive, ef) +} +func lexicalScoreTokens(rawQTokens, rawDTokens []string, interactive bool, ef *ElementFrequency) float64 { qTokens := removeStopwordsContextAware(rawQTokens, rawDTokens) dTokens := removeStopwordsContextAware(rawDTokens, rawQTokens) @@ -561,3 +622,44 @@ func tokenPrefixScore(qTokens, dTokens []string) float64 { return total / float64(len(qTokens)) } + +func hasStrongNegativeHit(negativeTokens, descTokens []string) bool { + if len(negativeTokens) == 0 || len(descTokens) == 0 { + return false + } + + dSet := tokenSet(descTokens) + for _, nt := range negativeTokens { + if isStopword(nt) || isSemanticStopword(nt) { + continue + } + if dSet[nt] { + return true + } + if syns, ok := synonymIndex[nt]; ok { + for syn := range syns { + synTokens := strings.Fields(syn) + if len(synTokens) == 0 { + continue + } + allPresent := true + hasMeaningfulToken := false + for _, st := range synTokens { + if isStopword(st) || isSemanticStopword(st) { + continue + } + hasMeaningfulToken = true + if !dSet[st] { + allPresent = false + break + } + } + if hasMeaningfulToken && allPresent { + return true + } + } + } + } + + return false +} diff --git a/internal/engine/lexical_test.go b/internal/engine/lexical_test.go index fbf47ac..cbe82fa 100644 --- a/internal/engine/lexical_test.go +++ b/internal/engine/lexical_test.go @@ -2,10 +2,12 @@ package engine import ( "context" - "github.com/pinchtab/semantic/internal/types" + "errors" "math" "strconv" "testing" + + "github.com/pinchtab/semantic/internal/types" ) func TestTokenPrefixScore_BtnButton(t *testing.T) { 
@@ -586,4 +588,77 @@ func TestLexicalMatcher_ThresholdFiltering(t *testing.T) { } } +func TestLexicalMatcher_NegativePenalization(t *testing.T) { + m := NewLexicalMatcher() + elements := []types.ElementDescriptor{ + {Ref: "submit", Role: "button", Name: "Submit"}, + {Ref: "cancel", Role: "button", Name: "Cancel"}, + } + + result, err := m.Find(context.Background(), "button not cancel", elements, types.FindOptions{ + Threshold: 0, + TopK: 2, + }) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(result.Matches) < 2 { + t.Fatalf("expected two matches, got %d", len(result.Matches)) + } + if result.BestRef != "submit" { + t.Fatalf("expected submit to rank above canceled element, got %s", result.BestRef) + } +} + +func TestLexicalMatcher_NegativeSynonymExpansion(t *testing.T) { + m := NewLexicalMatcher() + elements := []types.ElementDescriptor{ + {Ref: "password", Role: "textbox", Name: "Password"}, + {Ref: "email", Role: "textbox", Name: "Email"}, + } + + result, err := m.Find(context.Background(), "input no pwd", elements, types.FindOptions{ + Threshold: 0, + TopK: 2, + }) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(result.Matches) < 2 { + t.Fatalf("expected two matches, got %d", len(result.Matches)) + } + if result.BestRef != "email" { + t.Fatalf("expected password synonym to be penalized, got best=%s", result.BestRef) + } +} + +func TestLexicalMatcher_EmptyQueryReturnsNoResults(t *testing.T) { + m := NewLexicalMatcher() + elements := []types.ElementDescriptor{{Ref: "e1", Role: "button", Name: "Submit"}} + + result, err := m.Find(context.Background(), " ", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find returned error: %v", err) + } + if len(result.Matches) != 0 { + t.Fatalf("expected no matches for empty query, got %d", len(result.Matches)) + } +} + +// Hardening tests + +func TestLexicalMatcher_Find_ContextCanceled(t *testing.T) { + m := NewLexicalMatcher() + ctx, cancel := 
context.WithCancel(context.Background()) + cancel() + + _, err := m.Find(ctx, "submit button", []types.ElementDescriptor{ + {Ref: "e1", Role: "button", Name: "Submit"}, + }, types.FindOptions{Threshold: 0, TopK: 1}) + + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context canceled error, got %v", err) + } +} + // dummyEmbedder tests diff --git a/internal/engine/options.go b/internal/engine/options.go new file mode 100644 index 0000000..15a2ec6 --- /dev/null +++ b/internal/engine/options.go @@ -0,0 +1,36 @@ +package engine + +import ( + "math" + + "github.com/pinchtab/semantic/internal/types" +) + +func sanitizeFindOptions(opts types.FindOptions, elementCount int, defaultTopK int) types.FindOptions { + if defaultTopK <= 0 { + defaultTopK = 3 + } + + if opts.TopK <= 0 { + opts.TopK = defaultTopK + } + if elementCount >= 0 && opts.TopK > elementCount { + opts.TopK = elementCount + } + + opts.Threshold = sanitizeThreshold(opts.Threshold) + return opts +} + +func sanitizeThreshold(threshold float64) float64 { + if math.IsNaN(threshold) || math.IsInf(threshold, 0) { + return 0 + } + if threshold < 0 { + return 0 + } + if threshold > 1 { + return 1 + } + return threshold +} diff --git a/internal/engine/query_context.go b/internal/engine/query_context.go new file mode 100644 index 0000000..6d0fc24 --- /dev/null +++ b/internal/engine/query_context.go @@ -0,0 +1,171 @@ +package engine + +import ( + "regexp" + "strings" + + "github.com/pinchtab/semantic/internal/types" +) + +var negativeContextPattern = regexp.MustCompile(`(?i)\b(not|without|exclude|excluding|except|ignore)\b`) + +type QueryContext struct { + Base ParsedQuery + Exclude []string + HasScope bool + Ordinal OrdinalConstraint +} + +func ParseQueryContext(raw string) QueryContext { + ordinal, baseRaw := parseOrdinalConstraint(raw) + parsed := ParseQuery(baseRaw) + cleaned := strings.TrimSpace(baseRaw) + if cleaned == "" { + return QueryContext{Base: parsed, Ordinal: ordinal} + } + + loc := 
negativeContextPattern.FindStringIndex(cleaned) + if loc == nil { + return QueryContext{Base: parsed, Ordinal: ordinal} + } + + contextBaseRaw := strings.TrimSpace(cleaned[:loc[0]]) + remainder := strings.TrimSpace(cleaned[loc[1]:]) + if contextBaseRaw == "" || remainder == "" { + return QueryContext{Base: parsed, Ordinal: ordinal} + } + + baseParsed := ParseQuery(contextBaseRaw) + if len(baseParsed.Positive) == 0 { + return QueryContext{Base: parsed, Ordinal: ordinal} + } + if len(parsed.Negative) == 0 { + return QueryContext{Base: parsed, Ordinal: ordinal} + } + + exclude := normalizeContextPhrase(remainder) + if len(exclude) == 0 { + return QueryContext{Base: parsed, Ordinal: ordinal} + } + + if !looksLikeContextPhrase(exclude) { + return QueryContext{Base: parsed, Ordinal: ordinal} + } + + return QueryContext{ + Base: baseParsed, + Exclude: exclude, + HasScope: true, + Ordinal: ordinal, + } +} + +func normalizeContextPhrase(raw string) []string { + words := tokenize(strings.Trim(raw, ",.;:- ")) + if len(words) == 0 { + return nil + } + + for len(words) > 0 && contextLeadingFillers[words[0]] { + words = words[1:] + } + for len(words) > 0 && contextTrailingFillers[words[len(words)-1]] { + words = words[:len(words)-1] + } + if len(words) == 0 { + return nil + } + return words +} + +func matchesExcludedContext(el types.ElementDescriptor, excludeTokens []string) bool { + if len(excludeTokens) == 0 { + return false + } + + ctxTokens := tokenize(strings.Join([]string{ + el.Parent, + el.Section, + el.Positional.LabelledBy, + el.Role, + el.Name, + el.Value, + }, " ")) + if len(ctxTokens) == 0 { + return false + } + + ctxSet := tokenSet(ctxTokens) + matched := 0 + meaningful := 0 + for _, tok := range excludeTokens { + if isStopword(tok) || isSemanticStopword(tok) { + continue + } + meaningful++ + if ctxSet[tok] { + matched++ + } + } + if meaningful == 0 { + return false + } + if matched == meaningful { + return true + } + if meaningful > 1 && 
float64(matched)/float64(meaningful) >= 0.7 { + return true + } + return false +} + +var contextLeadingFillers = map[string]bool{ + "in": true, + "on": true, + "at": true, + "of": true, + "to": true, + "from": true, + "inside": true, + "within": true, + "the": true, + "a": true, + "an": true, +} + +var contextTrailingFillers = map[string]bool{ + "one": true, +} + +var contextHintTokens = map[string]bool{ + "header": true, + "footer": true, + "sidebar": true, + "nav": true, + "navigation": true, + "menu": true, + "toolbar": true, + "dialog": true, + "modal": true, + "form": true, + "panel": true, + "section": true, + "content": true, + "main": true, + "top": true, + "bottom": true, + "left": true, + "right": true, + "sticky": true, + "primary": true, + "secondary": true, +} + +func looksLikeContextPhrase(tokens []string) bool { + for _, tok := range tokens { + if contextHintTokens[tok] { + return true + } + } + return false +} diff --git a/internal/engine/query_context_test.go b/internal/engine/query_context_test.go new file mode 100644 index 0000000..4314d64 --- /dev/null +++ b/internal/engine/query_context_test.go @@ -0,0 +1,164 @@ +package engine + +import ( + "context" + "testing" + + "github.com/pinchtab/semantic/internal/types" +) + +func TestParseQueryContext_BasicPatterns(t *testing.T) { + tests := []struct { + name string + query string + wantPositive []string + wantNegative []string + wantExclude []string + wantHasScope bool + }{ + { + name: "plain negative tokens", + query: "button not submit", + wantPositive: []string{"button"}, + wantNegative: []string{"submit"}, + }, + { + name: "context exclusion in header", + query: "submit button not in header", + wantPositive: []string{"submit", "button"}, + wantExclude: []string{"header"}, + wantHasScope: true, + }, + { + name: "context exclusion with filler tail", + query: "login link, not the footer one", + wantPositive: []string{"login", "link"}, + wantExclude: []string{"footer"}, + wantHasScope: true, + }, + { 
+ name: "excluding sidebar", + query: "search box excluding sidebar", + wantPositive: []string{"search", "box"}, + wantExclude: []string{"sidebar"}, + wantHasScope: true, + }, + { + name: "leading not stays literal", + query: "not now button", + wantPositive: []string{"not", "now", "button"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ParseQueryContext(tt.query) + assertTokens(t, got.Base.Positive, tt.wantPositive, "positive") + assertTokens(t, got.Base.Negative, tt.wantNegative, "negative") + assertTokens(t, got.Exclude, tt.wantExclude, "exclude") + if got.HasScope != tt.wantHasScope { + t.Fatalf("HasScope mismatch: got=%v want=%v", got.HasScope, tt.wantHasScope) + } + }) + } +} + +func TestMatchesExcludedContext(t *testing.T) { + el := types.ElementDescriptor{ + Ref: "e1", + Role: "button", + Name: "Submit", + Parent: "Account Header", + Section: "Top Header Actions", + Positional: types.PositionalHints{ + LabelledBy: "Header controls", + }, + } + + if !matchesExcludedContext(el, []string{"header"}) { + t.Fatalf("expected header exclusion to match") + } + if !matchesExcludedContext(el, []string{"top", "header"}) { + t.Fatalf("expected multi-token exclusion to match") + } + if matchesExcludedContext(el, []string{"sidebar"}) { + t.Fatalf("did not expect unrelated exclusion to match") + } +} + +func TestNegativeContextAcrossMatchers(t *testing.T) { + elements := []types.ElementDescriptor{ + {Ref: "header-submit", Role: "button", Name: "Submit", Section: "Header"}, + {Ref: "main-submit", Role: "button", Name: "Submit", Section: "Checkout content"}, + {Ref: "footer-submit", Role: "button", Name: "Submit", Section: "Footer"}, + } + + queries := []string{ + "submit button not in header", + "submit button except footer", + } + + matchers := []types.ElementMatcher{ + NewLexicalMatcher(), + NewEmbeddingMatcher(NewHashingEmbedder(128)), + NewCombinedMatcher(NewHashingEmbedder(128)), + } + + for _, matcher := range matchers { + for 
_, query := range queries { + res, err := matcher.Find(context.Background(), query, elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("%s Find failed for %q: %v", matcher.Strategy(), query, err) + } + for _, match := range res.Matches { + if query == "submit button not in header" && match.Ref == "header-submit" { + t.Fatalf("%s should exclude header match for %q", matcher.Strategy(), query) + } + if query == "submit button except footer" && match.Ref == "footer-submit" { + t.Fatalf("%s should exclude footer match for %q", matcher.Strategy(), query) + } + } + } + } +} + +func TestDuplicateRegionDisambiguation(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "login-header", Role: "link", Name: "Log in", Section: "Header"}, + {Ref: "login-footer", Role: "link", Name: "Log in", Section: "Footer"}, + {Ref: "login-sidebar", Role: "link", Name: "Log in", Section: "Sticky Sidebar Quick Actions"}, + } + + res, err := m.Find(context.Background(), "login link, not the footer one", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + for _, match := range res.Matches { + if match.Ref == "login-footer" { + t.Fatalf("expected footer variant to be excluded") + } + } + + res2, err := m.Find(context.Background(), "login link except sticky sidebar quick actions", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + for _, match := range res2.Matches { + if match.Ref == "login-sidebar" { + t.Fatalf("expected sidebar variant to be excluded") + } + } +} + +func assertTokens(t *testing.T, got, want []string, label string) { + t.Helper() + if len(got) != len(want) { + t.Fatalf("%s length mismatch: got=%v want=%v", label, got, want) + } + for i := range got { + if got[i] != want[i] { + t.Fatalf("%s mismatch: got=%v want=%v", label, got, want) + } + } +} diff --git 
a/internal/engine/query_ordinal.go b/internal/engine/query_ordinal.go new file mode 100644 index 0000000..fe65420 --- /dev/null +++ b/internal/engine/query_ordinal.go @@ -0,0 +1,222 @@ +package engine + +import ( + "regexp" + "sort" + "strconv" + "strings" + + "github.com/pinchtab/semantic/internal/types" +) + +type OrdinalConstraint struct { + HasOrdinal bool + Last bool + Position int +} + +var numericOrdinalPattern = regexp.MustCompile(`^(\d+)(st|nd|rd|th)$`) + +var ordinalWords = map[string]int{ + "first": 1, + "second": 2, + "third": 3, + "fourth": 4, + "fifth": 5, + "sixth": 6, + "seventh": 7, + "eighth": 8, + "ninth": 9, + "tenth": 10, +} + +var ordinalTargetWords = map[string]bool{ + "button": true, + "link": true, + "input": true, + "field": true, + "textbox": true, + "searchbox": true, + "item": true, + "menu": true, + "option": true, + "tab": true, + "result": true, + "row": true, + "column": true, + "card": true, + "entry": true, + "element": true, +} + +func parseNumericOrdinal(token string) (int, bool) { + m := numericOrdinalPattern.FindStringSubmatch(token) + if len(m) != 3 { + return 0, false + } + n, err := strconv.Atoi(m[1]) + if err != nil || n <= 0 { + return 0, false + } + return n, true +} + +func normalizeQueryToken(token string) string { + return strings.Trim(strings.ToLower(token), ",.;:-") +} + +func containsOrdinalTarget(words []string, ordIdx int) bool { + for i, w := range words { + if i == ordIdx { + continue + } + if ordinalTargetWords[normalizeQueryToken(w)] { + return true + } + } + return false +} + +func parseOrdinalConstraint(query string) (OrdinalConstraint, string) { + cleaned := strings.TrimSpace(query) + if cleaned == "" { + return OrdinalConstraint{}, cleaned + } + + words := strings.Fields(cleaned) + if len(words) == 0 { + return OrdinalConstraint{}, cleaned + } + + ordIdx := -1 + ordPos := 0 + ordLast := false + + for i, w := range words { + norm := normalizeQueryToken(w) + if norm == "" { + continue + } + if norm == 
"last" || norm == "final" { + ordIdx = i + ordLast = true + break + } + if pos, ok := ordinalWords[norm]; ok { + ordIdx = i + ordPos = pos + break + } + if pos, ok := parseNumericOrdinal(norm); ok { + ordIdx = i + ordPos = pos + break + } + } + + if ordIdx == -1 || !containsOrdinalTarget(words, ordIdx) { + return OrdinalConstraint{}, cleaned + } + + filtered := make([]string, 0, len(words)-1) + for i, w := range words { + if i == ordIdx { + continue + } + filtered = append(filtered, w) + } + + base := strings.Trim(strings.TrimSpace(strings.Join(filtered, " ")), ",.;:-") + if base == "" { + base = cleaned + } + + return OrdinalConstraint{ + HasOrdinal: true, + Last: ordLast, + Position: ordPos, + }, base +} + +func selectOrdinalMatchInOrder(result types.FindResult, constraint OrdinalConstraint, elements []types.ElementDescriptor) types.FindResult { + if !constraint.HasOrdinal || len(result.Matches) == 0 { + return result + } + + filtered := filterOrdinalCandidates(result.Matches) + if len(filtered) == 0 { + result.Matches = nil + result.BestRef = "" + result.BestScore = 0 + return result + } + + refOrder := make(map[string]int, len(elements)) + for idx, el := range elements { + order := idx + if el.DocumentIdx > 0 { + order = el.DocumentIdx + } + refOrder[el.Ref] = order + } + + ordered := make([]types.ElementMatch, len(filtered)) + copy(ordered, filtered) + sort.SliceStable(ordered, func(i, j int) bool { + idxI, okI := refOrder[ordered[i].Ref] + idxJ, okJ := refOrder[ordered[j].Ref] + if okI && okJ { + return idxI < idxJ + } + if okI != okJ { + return okI + } + return ordered[i].Ref < ordered[j].Ref + }) + + idx := -1 + if constraint.Last { + idx = len(ordered) - 1 + } else if constraint.Position > 0 { + idx = constraint.Position - 1 + } + + if idx < 0 || idx >= len(ordered) { + result.Matches = nil + result.BestRef = "" + result.BestScore = 0 + return result + } + + chosen := ordered[idx] + result.Matches = []types.ElementMatch{chosen} + result.BestRef = 
chosen.Ref + result.BestScore = chosen.Score + return result +} + +func filterOrdinalCandidates(matches []types.ElementMatch) []types.ElementMatch { + if len(matches) == 0 { + return nil + } + + bestScore := matches[0].Score + for _, match := range matches[1:] { + if match.Score > bestScore { + bestScore = match.Score + } + } + + floor := bestScore - 0.15 + if floor < 0.2 { + floor = 0.2 + } + + filtered := make([]types.ElementMatch, 0, len(matches)) + for _, match := range matches { + if match.Score >= floor { + filtered = append(filtered, match) + } + } + return filtered +} diff --git a/internal/engine/query_ordinal_test.go b/internal/engine/query_ordinal_test.go new file mode 100644 index 0000000..8cf3462 --- /dev/null +++ b/internal/engine/query_ordinal_test.go @@ -0,0 +1,210 @@ +package engine + +import ( + "context" + "testing" + + "github.com/pinchtab/semantic/internal/types" +) + +func TestParseOrdinalConstraint_BasicPatterns(t *testing.T) { + tests := []struct { + name string + query string + wantBase string + wantHasOrd bool + wantPos int + wantIsLast bool + }{ + { + name: "second button", + query: "second button", + wantBase: "button", + wantHasOrd: true, + wantPos: 2, + }, + { + name: "numeric ordinal", + query: "3rd menu item", + wantBase: "menu item", + wantHasOrd: true, + wantPos: 3, + }, + { + name: "last input field", + query: "last input field", + wantBase: "input field", + wantHasOrd: true, + wantIsLast: true, + }, + { + name: "non ordinal content query", + query: "first name", + wantBase: "first name", + wantHasOrd: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, base := parseOrdinalConstraint(tt.query) + if base != tt.wantBase { + t.Fatalf("base query mismatch: want=%q got=%q", tt.wantBase, base) + } + if got.HasOrdinal != tt.wantHasOrd { + t.Fatalf("HasOrdinal mismatch: want=%v got=%v", tt.wantHasOrd, got.HasOrdinal) + } + if got.Position != tt.wantPos { + t.Fatalf("position mismatch: want=%d 
got=%d", tt.wantPos, got.Position) + } + if got.Last != tt.wantIsLast { + t.Fatalf("last mismatch: want=%v got=%v", tt.wantIsLast, got.Last) + } + }) + } +} + +func TestParseQueryContext_WithOrdinalAndNegativeScope(t *testing.T) { + ctx := ParseQueryContext("second button not in header") + if !ctx.Ordinal.HasOrdinal || ctx.Ordinal.Position != 2 { + t.Fatalf("expected second ordinal, got %+v", ctx.Ordinal) + } + assertTokens(t, ctx.Base.Positive, []string{"button"}, "positive") + assertTokens(t, ctx.Base.Negative, []string{}, "negative") + assertTokens(t, ctx.Exclude, []string{"header"}, "exclude") + if !ctx.HasScope { + t.Fatalf("expected scope exclusion to be detected") + } +} + +func TestCombinedMatcher_OrdinalQuery_SecondButton(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "btn-1", Role: "button", Name: "Action", Positional: types.PositionalHints{SiblingIndex: 0}}, + {Ref: "btn-2", Role: "button", Name: "Action", Positional: types.PositionalHints{SiblingIndex: 1}}, + {Ref: "btn-3", Role: "button", Name: "Action", Positional: types.PositionalHints{SiblingIndex: 2}}, + } + + res, err := m.Find(context.Background(), "second button", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + if len(res.Matches) != 1 { + t.Fatalf("expected one ordinal-selected match, got %d", len(res.Matches)) + } + if res.BestRef != "btn-2" { + t.Fatalf("expected second button btn-2, got %s", res.BestRef) + } +} + +func TestCombinedMatcher_OrdinalQuery_LastInputField(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "input-1", Role: "textbox", Name: "Email", DocumentIdx: 0, Positional: types.PositionalHints{SiblingIndex: 1}}, + {Ref: "input-2", Role: "textbox", Name: "Email", DocumentIdx: 1, Positional: types.PositionalHints{SiblingIndex: 2}}, + {Ref: "input-3", Role: "textbox", Name: "Email", 
DocumentIdx: 2, Positional: types.PositionalHints{SiblingIndex: 3}}, + } + + res, err := m.Find(context.Background(), "last input field", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + if len(res.Matches) != 1 { + t.Fatalf("expected one ordinal-selected match, got %d", len(res.Matches)) + } + if res.BestRef != "input-3" { + t.Fatalf("expected last input field input-3, got %s", res.BestRef) + } +} + +func TestCombinedMatcher_OrdinalQuery_OutOfRangeReturnsNoMatch(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "b1", Role: "button", Name: "Continue", Positional: types.PositionalHints{SiblingIndex: 0}}, + {Ref: "b2", Role: "button", Name: "Continue", Positional: types.PositionalHints{SiblingIndex: 1}}, + {Ref: "b3", Role: "button", Name: "Continue", Positional: types.PositionalHints{SiblingIndex: 2}}, + } + + res, err := m.Find(context.Background(), "fifth button", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + if len(res.Matches) != 0 { + t.Fatalf("expected no matches for out-of-range ordinal, got %d", len(res.Matches)) + } + if res.BestRef != "" || res.BestScore != 0 { + t.Fatalf("expected empty best match for out-of-range ordinal, got ref=%q score=%f", res.BestRef, res.BestScore) + } +} + +func TestCombinedMatcher_OrdinalGuard_DoesNotTreatFirstNameAsOrdinal(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "first-name", Role: "textbox", Name: "First Name", Positional: types.PositionalHints{SiblingIndex: 3}}, + {Ref: "last-name", Role: "textbox", Name: "Last Name", Positional: types.PositionalHints{SiblingIndex: 0}}, + } + + res, err := m.Find(context.Background(), "first name", elements, types.FindOptions{Threshold: 0, TopK: 2}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + if len(res.Matches) == 
0 { + t.Fatalf("expected at least one match") + } + if res.BestRef != "first-name" { + t.Fatalf("expected semantic match for 'first name', got %s", res.BestRef) + } +} + +func TestCombinedMatcher_OrdinalWithContextExclusion(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "header-btn", Role: "button", Name: "Submit", Section: "Header", Positional: types.PositionalHints{SiblingIndex: 0}}, + {Ref: "main-btn-1", Role: "button", Name: "Submit", Section: "Main", Positional: types.PositionalHints{SiblingIndex: 1}}, + {Ref: "main-btn-2", Role: "button", Name: "Submit", Section: "Main", Positional: types.PositionalHints{SiblingIndex: 2}}, + } + + res, err := m.Find(context.Background(), "second button not in header", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + if len(res.Matches) != 1 { + t.Fatalf("expected one ordinal-selected match, got %d", len(res.Matches)) + } + if res.BestRef != "main-btn-2" { + t.Fatalf("expected second non-header button, got %s", res.BestRef) + } +} + +func TestFilterOrdinalCandidates_DropsWeakSemanticTail(t *testing.T) { + matches := []types.ElementMatch{ + {Ref: "e1", Score: 0.92}, + {Ref: "e2", Score: 0.81}, + {Ref: "e3", Score: 0.18}, + } + + filtered := filterOrdinalCandidates(matches) + if len(filtered) != 2 { + t.Fatalf("expected 2 strong candidates, got %d", len(filtered)) + } + if filtered[0].Ref != "e1" || filtered[1].Ref != "e2" { + t.Fatalf("unexpected filtered refs: %+v", filtered) + } +} + +func TestCombinedMatcher_OrdinalQuery_IgnoresWeakNonButtonTail(t *testing.T) { + m := NewCombinedMatcher(NewHashingEmbedder(128)) + elements := []types.ElementDescriptor{ + {Ref: "btn-1", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 0}}, + {Ref: "btn-2", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 1}}, + {Ref: "note", Role: "note", Name: 
"Submission tips", Positional: types.PositionalHints{SiblingIndex: 2}}, + } + + res, err := m.Find(context.Background(), "second submit button", elements, types.FindOptions{Threshold: 0, TopK: 3}) + if err != nil { + t.Fatalf("Find failed: %v", err) + } + if res.BestRef != "btn-2" { + t.Fatalf("expected second submit button btn-2, got %s", res.BestRef) + } +} diff --git a/internal/engine/query_parser.go b/internal/engine/query_parser.go new file mode 100644 index 0000000..a16bbc7 --- /dev/null +++ b/internal/engine/query_parser.go @@ -0,0 +1,47 @@ +package engine + +import "github.com/pinchtab/semantic/internal/types" + +// Query grammar: +// +// [NEGATIVE_TRIGGER ...]+ +// +// A NEGATIVE_TRIGGER is one of: +// not, without, exclude, excluding, except, no, ignore. +// After a trigger, all following tokens are classified as negative until +// another trigger or the end of the query. +type ParsedQuery = types.ParsedQuery + +var negativeTriggers = map[string]bool{ + "not": true, + "without": true, + "exclude": true, + "excluding": true, + "except": true, + "no": true, + "ignore": true, +} + +// ParseQuery tokenizes and classifies tokens into positive and negative terms. 
+func ParseQuery(raw string) ParsedQuery { + tokens := tokenize(raw) + parsed := types.ParsedQuery{ + Positive: make([]string, 0, len(tokens)), + Negative: make([]string, 0, len(tokens)), + } + + inNegative := false + for _, tok := range tokens { + if negativeTriggers[tok] && len(parsed.Positive) > 0 { + inNegative = true + continue + } + if inNegative { + parsed.Negative = append(parsed.Negative, tok) + continue + } + parsed.Positive = append(parsed.Positive, tok) + } + + return parsed +} diff --git a/internal/engine/query_parser_test.go b/internal/engine/query_parser_test.go new file mode 100644 index 0000000..be0b35d --- /dev/null +++ b/internal/engine/query_parser_test.go @@ -0,0 +1,101 @@ +package engine + +import ( + "reflect" + "testing" +) + +func TestParseQuery_TableDriven(t *testing.T) { + tests := []struct { + name string + raw string + positive []string + negative []string + }{ + { + name: "button not submit", + raw: "button not submit", + positive: []string{"button"}, + negative: []string{"submit"}, + }, + { + name: "button not sign in", + raw: "button not sign in", + positive: []string{"button"}, + negative: []string{"sign", "in"}, + }, + { + name: "link without logout", + raw: "link without logout", + positive: []string{"link"}, + negative: []string{"logout"}, + }, + { + name: "input excluding email", + raw: "input excluding email", + positive: []string{"input"}, + negative: []string{"email"}, + }, + { + name: "button except close", + raw: "button except close", + positive: []string{"button"}, + negative: []string{"close"}, + }, + { + name: "sign in button", + raw: "sign in button", + positive: []string{"sign", "in", "button"}, + negative: nil, + }, + { + name: "not button", + raw: "not button", + positive: []string{"not", "button"}, + negative: nil, + }, + { + name: "input no password no username", + raw: "input no password no username", + positive: []string{"input"}, + negative: []string{"password", "username"}, + }, + { + name: "negative segment 
break by trigger", + raw: "button not sign in except submit", + positive: []string{"button"}, + negative: []string{"sign", "in", "submit"}, + }, + { + name: "trailing trigger behaves as positive query", + raw: "button not", + positive: []string{"button"}, + negative: nil, + }, + { + name: "repeated triggers", + raw: "button not submit not cancel", + positive: []string{"button"}, + negative: []string{"submit", "cancel"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ParseQuery(tt.raw) + if !reflect.DeepEqual(normalizeTokens(got.Positive), normalizeTokens(tt.positive)) { + t.Fatalf("positive mismatch: got=%v want=%v", got.Positive, tt.positive) + } + if !reflect.DeepEqual(normalizeTokens(got.Negative), normalizeTokens(tt.negative)) { + t.Fatalf("negative mismatch: got=%v want=%v", got.Negative, tt.negative) + } + }) + } +} + +func normalizeTokens(tokens []string) []string { + if len(tokens) == 0 { + return []string{} + } + return tokens +} diff --git a/internal/engine/query_visual.go b/internal/engine/query_visual.go new file mode 100644 index 0000000..deb11c5 --- /dev/null +++ b/internal/engine/query_visual.go @@ -0,0 +1,374 @@ +package engine + +import ( + "sort" + "strings" + + "github.com/pinchtab/semantic/internal/types" +) + +const ( + visualDirectionalBoost = 0.12 + visualRelativeBoost = 0.16 + visualRelativePenalty = 0.05 + visualBoostCap = 0.30 +) + +type visualQueryHints struct { + hasHints bool + baseQuery string + top bool + bottom bool + left bool + right bool + aboveAnchor string + belowAnchor string +} + +var visualKeywordSet = map[string]bool{ + "top": true, + "bottom": true, + "left": true, + "right": true, + "corner": true, + "above": true, + "below": true, + "under": true, + "over": true, + "in": true, + "on": true, + "at": true, + "the": true, + "a": true, + "an": true, + "of": true, + "page": true, + "side": true, +} + +func parseVisualQueryHints(query string) visualQueryHints { + cleaned := 
strings.TrimSpace(query) + if cleaned == "" { + return visualQueryHints{} + } + + words := tokenSet(tokenize(cleaned)) + hints := visualQueryHints{ + top: words["top"], + bottom: words["bottom"], + left: words["left"], + right: words["right"], + baseQuery: cleaned, + } + hasDirectional := hints.top || hints.bottom || hints.left || hints.right || words["corner"] + hasRelative := false + + lower := strings.ToLower(cleaned) + if idx := strings.Index(lower, " below "); idx >= 0 { + hints.belowAnchor = normalizeVisualAnchor(cleaned[idx+len(" below "):]) + hasRelative = true + if base := strings.TrimSpace(cleaned[:idx]); base != "" { + hints.baseQuery = stripVisualKeywords(base) + } + } + if idx := strings.Index(lower, " under "); idx >= 0 { + hints.belowAnchor = normalizeVisualAnchor(cleaned[idx+len(" under "):]) + hasRelative = true + if base := strings.TrimSpace(cleaned[:idx]); base != "" { + hints.baseQuery = stripVisualKeywords(base) + } + } + if idx := strings.Index(lower, " above "); idx >= 0 { + hints.aboveAnchor = normalizeVisualAnchor(cleaned[idx+len(" above "):]) + hasRelative = true + if base := strings.TrimSpace(cleaned[:idx]); base != "" { + hints.baseQuery = stripVisualKeywords(base) + } + } + if idx := strings.Index(lower, " over "); idx >= 0 { + hints.aboveAnchor = normalizeVisualAnchor(cleaned[idx+len(" over "):]) + hasRelative = true + if base := strings.TrimSpace(cleaned[:idx]); base != "" { + hints.baseQuery = stripVisualKeywords(base) + } + } + + if !hasRelative && hasDirectional { + hints.baseQuery = stripVisualKeywords(cleaned) + } + if strings.TrimSpace(hints.baseQuery) == "" { + hints.baseQuery = cleaned + } + + hints.hasHints = hasDirectional || hints.aboveAnchor != "" || hints.belowAnchor != "" + return hints +} + +func stripVisualKeywords(query string) string { + parts := tokenize(query) + filtered := make([]string, 0, len(parts)) + for _, p := range parts { + if !visualKeywordSet[p] { + filtered = append(filtered, p) + } + } + return 
strings.TrimSpace(strings.Join(filtered, " ")) +} + +func normalizeVisualAnchor(s string) string { + anchor := stripVisualKeywords(s) + if anchor == "" { + anchor = strings.TrimSpace(strings.ToLower(s)) + } + return anchor +} + +type spatialStats struct { + hasX bool + hasY bool + minX float64 + maxX float64 + minY float64 + maxY float64 +} + +func buildSpatialStats(elements []types.ElementDescriptor) spatialStats { + stats := spatialStats{} + for _, el := range elements { + h := el.Positional + if hasHorizontalPosition(h) { + x := horizontalPosition(h) + if !stats.hasX { + stats.hasX = true + stats.minX, stats.maxX = x, x + } else { + if x < stats.minX { + stats.minX = x + } + if x > stats.maxX { + stats.maxX = x + } + } + } + if hasVerticalPosition(h) { + y := verticalPosition(h) + if !stats.hasY { + stats.hasY = true + stats.minY, stats.maxY = y, y + } else { + if y < stats.minY { + stats.minY = y + } + if y > stats.maxY { + stats.maxY = y + } + } + } + } + return stats +} + +func applyVisualHintBoost(result types.FindResult, hints visualQueryHints, elements []types.ElementDescriptor, topK int) types.FindResult { + if !hints.hasHints || len(result.Matches) == 0 { + return result + } + + refToElem := make(map[string]types.ElementDescriptor, len(elements)) + refOrder := make(map[string]int, len(elements)) + for i, el := range elements { + refToElem[el.Ref] = el + refOrder[el.Ref] = i + } + + stats := buildSpatialStats(elements) + anchorRef := "" + if hints.aboveAnchor != "" { + anchorRef = findVisualAnchorRef(hints.aboveAnchor, elements) + } else if hints.belowAnchor != "" { + anchorRef = findVisualAnchorRef(hints.belowAnchor, elements) + } + + type boostedMatch struct { + match types.ElementMatch + order int + } + + boosted := make([]boostedMatch, 0, len(result.Matches)) + for _, match := range result.Matches { + el, ok := refToElem[match.Ref] + if !ok { + continue + } + order := refOrder[match.Ref] + boost := computeVisualBoost(el, order, len(elements), hints, 
// computeVisualBoost returns the score adjustment for a single candidate
// element given the parsed visual hints.
//
// Directional hints (top, bottom, left, right) each contribute up to
// visualDirectionalBoost, scaled by the element's position ratio on the
// matching axis as returned by horizontalRatio / verticalRatio. When
// anchorRef names a different element that exists in refToElem, the
// candidate's vertical ratio is compared with the anchor's: being on the
// requested side adds visualRelativeBoost, otherwise visualRelativePenalty is
// subtracted.
//
// No clamping is applied in this function; the raw sum is returned.
func computeVisualBoost(
	el types.ElementDescriptor,
	order int,
	total int,
	hints visualQueryHints,
	stats spatialStats,
	anchorRef string,
	refToElem map[string]types.ElementDescriptor,
	refOrder map[string]int,
) float64 {
	xRatio := horizontalRatio(el.Positional, stats, order, total)
	yRatio := verticalRatio(el.Positional, stats, order, total)

	boost := 0.0
	// A low ratio favours "top"/"left"; a high ratio favours "bottom"/"right".
	if hints.top {
		boost += visualDirectionalBoost * (1 - yRatio)
	}
	if hints.bottom {
		boost += visualDirectionalBoost * yRatio
	}
	if hints.left {
		boost += visualDirectionalBoost * (1 - xRatio)
	}
	if hints.right {
		boost += visualDirectionalBoost * xRatio
	}

	// The anchor element itself is never boosted or penalised relative to itself.
	if anchorRef != "" && anchorRef != el.Ref {
		anchorEl, ok := refToElem[anchorRef]
		if ok {
			anchorOrder := refOrder[anchorRef]
			anchorY := verticalRatio(anchorEl.Positional, stats, anchorOrder, total)
			// NOTE(review): both checks below compare against the single
			// anchorRef passed in, so if aboveAnchor and belowAnchor are both
			// set they are evaluated against the same anchor element.
			if hints.aboveAnchor != "" {
				if yRatio < anchorY {
					boost += visualRelativeBoost
				} else {
					boost -= visualRelativePenalty
				}
			}
			if hints.belowAnchor != "" {
				if yRatio > anchorY {
					boost += visualRelativeBoost
				} else {
					boost -= visualRelativePenalty
				}
			}
		}
	}

	return boost
}
// verticalPosition returns the vertical midpoint of the element's box:
// Y plus half of Height.
func verticalPosition(h types.PositionalHints) float64 {
	return h.Y + (h.Height / 2)
}
// TestParseVisualQueryHints_DoesNotTreatSignInAsVisualHint checks that the
// query "sign in button" produces no visual hints and that its base query is
// returned unchanged.
func TestParseVisualQueryHints_DoesNotTreatSignInAsVisualHint(t *testing.T) {
	got := parseVisualQueryHints("sign in button")
	if got.hasHints {
		t.Fatalf("expected hasHints=false for non-visual query, got true")
	}
	if got.baseQuery != "sign in button" {
		t.Fatalf("expected base query to stay unchanged, got %q", got.baseQuery)
	}
}
// TestCombinedMatcher_VisualHint_BottomFallbackWithoutCoordinates feeds three
// identically named buttons that carry no X/Y/Width/Height, only sibling
// index and count, and expects a "bottom" query to select the last sibling.
func TestCombinedMatcher_VisualHint_BottomFallbackWithoutCoordinates(t *testing.T) {
	m := NewCombinedMatcher(NewHashingEmbedder(128))
	elements := []types.ElementDescriptor{
		{Ref: "button-1", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 0, SiblingCount: 3}},
		{Ref: "button-2", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 1, SiblingCount: 3}},
		{Ref: "button-3", Role: "button", Name: "Submit", Positional: types.PositionalHints{SiblingIndex: 2, SiblingCount: 3}},
	}

	res, err := m.Find(context.Background(), "button at bottom of page", elements, types.FindOptions{Threshold: 0, TopK: 3})
	if err != nil {
		t.Fatalf("Find failed: %v", err)
	}
	// Role and name are identical, so only the positional fallback can break the tie.
	if res.BestRef != "button-3" {
		t.Fatalf("expected bottom-most fallback button, got %s", res.BestRef)
	}
}
// fixedVectors returns n one-hot vectors of length dim. Vector i holds a
// single 1 at index i%dim, so the hot index wraps around once n exceeds dim.
//
// A non-positive dim is treated as 1. A non-positive n yields an empty,
// non-nil slice rather than panicking inside make.
func fixedVectors(n, dim int) [][]float32 {
	if dim <= 0 {
		dim = 1
	}
	if n <= 0 {
		// make panics on a negative length; mirror the dim guard above.
		return [][]float32{}
	}
	vectors := make([][]float32, n)
	for i := range vectors {
		vec := make([]float32, dim)
		vec[i%dim] = 1
		vectors[i] = vec
	}
	return vectors
}
func (r *FindResult) ConfidenceLabel() string { return CalibrateConfidence(r.BestScore) @@ -82,6 +89,10 @@ type PositionalHints struct { SiblingIndex int SiblingCount int LabelledBy string + X float64 + Y float64 + Width float64 + Height float64 } // ElementDescriptor describes a single accessibility tree node. @@ -93,6 +104,7 @@ type ElementDescriptor struct { Interactive bool Parent string Section string + DocumentIdx int Positional PositionalHints } diff --git a/recovery/benchmark_test.go b/recovery/benchmark_test.go new file mode 100644 index 0000000..1261dd6 --- /dev/null +++ b/recovery/benchmark_test.go @@ -0,0 +1,250 @@ +package recovery + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "runtime" + "testing" + "time" + + "github.com/pinchtab/semantic" +) + +type BenchmarkScenario struct { + ID string `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + OriginalQuery string `json:"original_query"` + OriginalRef string `json:"original_ref"` + Before []semantic.ElementDescriptor `json:"before"` + After []semantic.ElementDescriptor `json:"after"` + ExpectedRef *string `json:"expected_ref"` + ExpectedAlt []string `json:"expected_alt"` + ExpectNoMatch bool `json:"expect_no_match"` + Difficulty string `json:"difficulty"` +} + +func loadScenarios(t *testing.T) []BenchmarkScenario { + _, thisFile, _, _ := runtime.Caller(0) + repoRoot := filepath.Join(filepath.Dir(thisFile), "..") + scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json") + + data, err := os.ReadFile(scenariosPath) + if err != nil { + t.Fatalf("failed to read scenarios: %v", err) + } + + var scenarios []BenchmarkScenario + if err := json.Unmarshal(data, &scenarios); err != nil { + t.Fatalf("failed to parse scenarios: %v", err) + } + + return scenarios +} + +func TestRecoveryBenchmark_Scenarios(t *testing.T) { + scenarios := loadScenarios(t) + matcher := 
// scenarioResult captures the outcome of running one recovery benchmark
// scenario.
type scenarioResult struct {
	pass        bool    // true when the outcome matched the scenario's expectation
	recovered   bool    // Recovered flag from the recovery result
	gotRef      string  // NewRef from the recovery result
	expectedRef string  // dereferenced scenario ExpectedRef; empty when it is nil
	score       float64 // Score from the recovery result
	confidence  string  // Confidence from the recovery result
	latencyMs   int64   // elapsed milliseconds, measured from just before ShouldAttempt
	err         string  // recovery error text, or a note when ShouldAttempt returned false
}
= time.Since(start).Milliseconds() + return result + } + + rr, _, recErr := re.AttemptWithClassification( + context.Background(), + "test-tab", + sc.OriginalRef, + "click", + ClassifyFailure(err), + func(_ context.Context, kind string, nodeID int64) (map[string]any, error) { + return map[string]any{"clicked": true}, nil + }, + ) + + result.latencyMs = time.Since(start).Milliseconds() + result.recovered = rr.Recovered + result.gotRef = rr.NewRef + result.score = rr.Score + result.confidence = rr.Confidence + + if recErr != nil { + result.err = recErr.Error() + } + + if sc.ExpectNoMatch { + result.pass = !rr.Recovered + } else if sc.ExpectedRef != nil { + if rr.NewRef == *sc.ExpectedRef { + result.pass = true + } else { + for _, alt := range sc.ExpectedAlt { + if rr.NewRef == alt { + result.pass = true + break + } + } + } + } + + return result +} + +func BenchmarkRecoveryEngine_Scenarios(b *testing.B) { + scenarios := loadScenariosB(b) + matcher := semantic.NewCombinedMatcher(semantic.NewHashingEmbedder(128)) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, sc := range scenarios { + runBenchmarkScenarioB(b, matcher, sc) + } + } +} + +func loadScenariosB(b *testing.B) []BenchmarkScenario { + _, thisFile, _, _ := runtime.Caller(0) + repoRoot := filepath.Join(filepath.Dir(thisFile), "..") + scenariosPath := filepath.Join(repoRoot, "tests", "benchmark", "corpus", "recovery-scenarios", "scenarios.json") + + data, err := os.ReadFile(scenariosPath) + if err != nil { + b.Fatalf("failed to read scenarios: %v", err) + } + + var scenarios []BenchmarkScenario + if err := json.Unmarshal(data, &scenarios); err != nil { + b.Fatalf("failed to parse scenarios: %v", err) + } + + return scenarios +} + +func runBenchmarkScenarioB(b *testing.B, matcher semantic.ElementMatcher, sc BenchmarkScenario) { + var origDesc semantic.ElementDescriptor + for _, d := range sc.Before { + if d.Ref == sc.OriginalRef { + origDesc = d + break + } + } + + cache := NewIntentCache(100, 
#!/bin/bash
#
# Check for broken documentation links
#
# Scans every Markdown file under the repository root (excluding .git and
# node_modules), extracts inline links of the form [text](target) and verifies
# that each non-URL target exists on disk. Exits 0 when all links resolve and
# 1 otherwise.
#
# Usage:
#   ./scripts/check-docs-links.sh
#
set -uo pipefail

# -e is intentionally not set, so guard the cd explicitly: scanning from the
# wrong directory would report misleading results.
cd "$(dirname "$0")/.." || exit 1

RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m'

ERRORS=0

echo "Checking documentation links..."
echo ""

# Find all markdown files and check links
while IFS= read -r file; do
    dir=$(dirname "$file")

    # Extract markdown links: [text](path)
    while IFS= read -r link; do
        # Skip URLs and anchors
        if [[ "$link" =~ ^https?:// ]] || [[ "$link" =~ ^mailto: ]] || [[ "$link" =~ ^# ]]; then
            continue
        fi

        # Remove anchor from link
        link_path="${link%%#*}"

        # Drop an optional link title: [text](path "title")
        link_path="${link_path%%[[:space:]]*}"

        # Skip empty paths
        if [[ -z "$link_path" ]]; then
            continue
        fi

        # Resolve the target. A leading "/" means "relative to the repository
        # root" (the current directory), not the filesystem root.
        if [[ "$link_path" =~ ^/ ]]; then
            target=".$link_path"
        else
            target="$dir/$link_path"
        fi

        # Check if target exists
        if [[ ! -e "$target" ]]; then
            echo -e "${RED}BROKEN:${NC} $file -> $link"
            ERRORS=$((ERRORS + 1))
        fi
    done < <(grep -oE '\]\([^)]+\)' "$file" 2>/dev/null | sed 's/\](//' | sed 's/)//')
done < <(find . -name "*.md" -not -path "./.git/*" -not -path "./node_modules/*")

echo ""
if [[ $ERRORS -eq 0 ]]; then
    echo -e "${GREEN}✓${NC} All documentation links valid"
    exit 0
else
    echo -e "${RED}Found $ERRORS broken link(s)${NC}"
    exit 1
fi
e2=%.4f", e0, e1, e2) + } + }, + }, + { + name: "button not behaves as button", + query: "button not", + check: func(t *testing.T, result semantic.FindResult) { + if result.BestRef == "e4" || result.BestRef == "e5" { + t.Fatalf("expected a button-like result, got %s", result.BestRef) + } + }, + }, + { + name: "button not cancel penalizes cancel", + query: "button not cancel", + check: func(t *testing.T, result semantic.FindResult) { + e0, ok := findScore(result.Matches, "e0") + if !ok { + t.Fatalf("expected e0 in matches") + } + e1, ok := findScore(result.Matches, "e1") + if !ok { + t.Fatalf("expected e1 in matches") + } + if e0 <= e1 { + t.Fatalf("expected e0 above penalized e1, got e0=%.4f e1=%.4f", e0, e1) + } + }, + }, + { + name: "textbox not email prefers password", + query: "textbox not email", + check: func(t *testing.T, result semantic.FindResult) { + e4, ok := findScore(result.Matches, "e4") + if !ok { + t.Fatalf("expected e4 in matches") + } + e5, ok := findScore(result.Matches, "e5") + if !ok { + t.Fatalf("expected e5 in matches") + } + if e5 <= e4 { + t.Fatalf("expected e5 above penalized e4, got e4=%.4f e5=%.4f", e4, e5) + } + }, + }, + { + name: "button not login penalizes sign in by synonym", + query: "button not login", + check: func(t *testing.T, result semantic.FindResult) { + e0, ok := findScore(result.Matches, "e0") + if !ok { + t.Fatalf("expected e0 in matches") + } + e1, ok := findScore(result.Matches, "e1") + if !ok { + t.Fatalf("expected e1 in matches") + } + e2, ok := findScore(result.Matches, "e2") + if !ok { + t.Fatalf("expected e2 in matches") + } + if !(e0 > e2 && e1 > e2) { + t.Fatalf("expected e2 to be penalized by login/sign in synonym, got e0=%.4f e1=%.4f e2=%.4f", e0, e1, e2) + } + }, + }, + { + name: "button not sign in penalizes sign in", + query: "button not sign in", + check: func(t *testing.T, result semantic.FindResult) { + e0, ok := findScore(result.Matches, "e0") + if !ok { + t.Fatalf("expected e0 in matches") + } + e1, ok 
// TestLexicalMatcher_NegativeOnlyQuery verifies that a query beginning with
// "not" (with no positive term before it) still returns matches and ranks the
// Submit button (e0) first, i.e. the text after a leading "not" is expected
// to be matched as ordinary positive text.
func TestLexicalMatcher_NegativeOnlyQuery(t *testing.T) {
	m := semantic.NewLexicalMatcher()
	elements := negativeMatchingFixture()

	result, err := m.Find(context.Background(), "not submit", elements, semantic.FindOptions{
		Threshold: 0,
		TopK:      len(elements),
	})
	if err != nil {
		t.Fatalf("Find returned error: %v", err)
	}

	if len(result.Matches) == 0 {
		t.Fatalf("expected non-empty matches for leading-not query")
	}
	if result.BestRef != "e0" {
		t.Fatalf("expected leading-not query to behave as positive text, got best=%s", result.BestRef)
	}
}
checks:** +```bash +./dev baseline check # compare quality against baseline +./dev runtime # compare performance against baseline +``` + +**When quality changes intentionally:** +```bash +./dev baseline update # accept new quality baseline (after review) +``` + +## When to Use Each + +| Scenario | Command | +|----------|---------| +| Made code changes, quick sanity | `./dev test` | +| Ready to commit | `./dev check` | +| Before opening PR | `./dev pr` | +| Changed scoring/matching logic | `./dev baseline check` | +| Performance-sensitive changes | `./dev runtime` | +| Tuning weights | `./dev tune` then `./dev bench` | ## Architecture @@ -54,6 +65,7 @@ recovery/ Public subpackage failure.go FailureType classification cmd/semantic/main.go CLI tool (find, match, classify) +cmd/semantic-bench/ Benchmark CLI (check, baseline, calibrate, tune, runtime) ``` ## Key Design Decisions @@ -79,6 +91,27 @@ cmd/semantic/main.go CLI tool (find, match, classify) 4. **Pre-commit hook** runs gofmt + golangci-lint automatically on staged files. +## Benchmark Improvement Loop + +When implementing changes that affect matching quality: + +```bash +./dev baseline # create baseline (first time only) +# ... make changes ... +./dev bench # run benchmark, compare to baseline +./dev baseline update # accept new baseline (if improved) +``` + +**Key metrics:** +- **MRR** — Mean Reciprocal Rank (higher = finds correct element faster) +- **P@1** — Precision at 1 (is top result correct?) +- **Hit@3** — Any correct result in top 3? + +**Adding test cases:** +1. Add to `tests/benchmark/corpus/*/queries.json` +2. Run `./dev lint corpus` to validate +3. 
Run `./dev bench` — shows regression until fixed + ## Public API Surface Only these symbols are visible to consumers: @@ -112,7 +145,7 @@ recovery.ClassifyFailure, recovery.DefaultRecoveryConfig ## Testing - **167 tests** across 3 packages (root, engine, recovery) -- `internal/engine/` has unit tests for all matchers + benchmark study +- `internal/engine/` has unit tests for all matchers + benchmark suite - Root has API-level smoke tests - `recovery/` has scenario tests (SPA re-render, checkout, login, etc.) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index c2526c1..531f679 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -14,6 +14,9 @@ cd tests/benchmark ./scripts/run-corpus-benchmark.sh --strategy lexical ./scripts/run-corpus-benchmark.sh --strategy embedding ./scripts/run-corpus-benchmark.sh --strategy combined + +# Sweep combined lexical/embedding weights +./scripts/tune-weights.sh ``` ## Metrics @@ -55,25 +58,27 @@ corpus/ ## Current Results (combined strategy) ``` -Queries: 50 -MRR: 0.88 -P@1: 0.87 -P@3: 0.34 -Latency P50: 31 ms -Latency P95: 52 ms +Queries: 105 +MRR: 0.8897 +P@1: 0.8762 +P@3: 0.3412 +Latency P50: 23 ms +Latency P95: 28 ms By Difficulty: - easy: 34 queries, P@1 = 0.95 - medium: 14 queries, P@1 = 0.78 - hard: 2 queries, P@1 = 0.00 + easy: 76 queries, P@1 = 0.94 + medium: 25 queries, P@1 = 0.74 + hard: 4 queries, P@1 = 0.50 ``` ## Optimization Targets -The 6 current misses are "hard" cases requiring: +The current misses cluster around: - Synonym expansion (save for later → wishlist) - Implicit actions (clone → Code button) - Domain knowledge (CI status → Actions tab) +- Form/input intent (type new query → search box) +- Accessibility/navigation shortcuts (skip to content, homepage) ## Scripts @@ -81,6 +86,7 @@ The 6 current misses are "hard" cases requiring: |--------|---------| | `run-corpus-benchmark.sh` | Main benchmark with MRR/P@K metrics | | `run-benchmark.sh` | Simple pass/fail test runner 
| +| `tune-weights.sh` | Grid search combined matcher lexical/embedding weights | ## Adding to Corpus @@ -95,6 +101,13 @@ The 6 current misses are "hard" cases requiring: 3. Run benchmark to establish baseline +4. Add several related queries for the same behavior, not one isolated case. + Include easy, medium, hard, and at least one near-miss or partial match where + ambiguity matters. + +5. Re-run `./scripts/tune-weights.sh` after larger corpus changes to see whether + the best combined weights moved. + ## CI Integration ```yaml diff --git a/tests/benchmark/baselines/.gitkeep b/tests/benchmark/baselines/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/benchmark/cases/complex.json b/tests/benchmark/cases/complex.json new file mode 100644 index 0000000..d0d4d0a --- /dev/null +++ b/tests/benchmark/cases/complex.json @@ -0,0 +1,102 @@ +[ + { + "id": "complex-001", + "query": "second submit button not in login", + "snapshot": "multi-form.json", + "expect_ref": "e11", + "min_score": 0.5, + "tags": ["ordinal", "negative-context", "compound"] + }, + { + "id": "complex-002", + "query": "last text field in payment", + "snapshot": "multi-form.json", + "expect_ref": "e6", + "min_score": 0.4, + "tags": ["ordinal", "section-context", "compound"] + }, + { + "id": "complex-003", + "query": "click the submit button in the shipping form", + "snapshot": "multi-form.json", + "expect_ref": "e11", + "min_score": 0.5, + "tags": ["natural-language", "section-context", "action-verb"] + }, + { + "id": "complex-004", + "query": "first input field except login section", + "snapshot": "multi-form.json", + "expect_ref": "e4", + "expect_ref_alt": ["e1"], + "min_score": 0.4, + "tags": ["ordinal", "negative-context", "compound"], + "notes": "Known gap: ordinal applied before negative filter" + }, + { + "id": "complex-005", + "query": "I want to click the sign in button", + "snapshot": "login-page.json", + "expect_ref": "e4", + "min_score": 0.4, + "tags": 
["natural-language", "conversational"] + }, + { + "id": "complex-006", + "query": "the button that says submit", + "snapshot": "multi-form.json", + "expect_ref": "e7", + "expect_ref_alt": ["e3", "e11"], + "min_score": 0.4, + "tags": ["natural-language", "descriptive"], + "notes": "Any Submit button is valid without ordinal" + }, + { + "id": "complex-007", + "query": "where can I type my password", + "snapshot": "login-page.json", + "expect_ref": "e2", + "min_score": 0.3, + "tags": ["natural-language", "question-form"] + }, + { + "id": "complex-008", + "query": "press enter to login", + "snapshot": "login-page.json", + "expect_no_crash": true, + "tags": ["natural-language", "action-synonym"], + "notes": "Known gap: 'press enter' not recognized as submit action" + }, + { + "id": "complex-009", + "query": "add item to my shopping bag", + "snapshot": "ecommerce-product.json", + "expect_ref": "e10", + "min_score": 0.4, + "tags": ["synonym-chain", "ecommerce"] + }, + { + "id": "complex-010", + "query": "go to my account settings", + "snapshot": "dashboard.json", + "expect_ref": "e3", + "min_score": 0.4, + "tags": ["natural-language", "navigation"] + }, + { + "id": "complex-011", + "query": "sign up for a new account", + "snapshot": "login-page.json", + "expect_ref": "e6", + "min_score": 0.4, + "tags": ["synonym", "registration"] + }, + { + "id": "complex-012", + "query": "search for products", + "snapshot": "dashboard.json", + "expect_ref": "e6", + "min_score": 0.4, + "tags": ["natural-language", "search"] + } +] diff --git a/tests/benchmark/cases/negative-threshold.json b/tests/benchmark/cases/negative-threshold.json new file mode 100644 index 0000000..dece15f --- /dev/null +++ b/tests/benchmark/cases/negative-threshold.json @@ -0,0 +1,117 @@ +[ + { + "id": "neg-001", + "query": "xyzzy plugh qwerty", + "snapshot": "login-page.json", + "expect_no_match": true, + "threshold": 0.3, + "tags": ["no-match", "nonsense"] + }, + { + "id": "neg-002", + "query": "upload 
spreadsheet to cloud", + "snapshot": "login-page.json", + "expect_no_match": true, + "threshold": 0.4, + "tags": ["no-match", "absent-control"] + }, + { + "id": "neg-003", + "query": "open video player", + "snapshot": "dashboard.json", + "expect_no_match": true, + "threshold": 0.4, + "tags": ["no-match", "absent-control"] + }, + { + "id": "neg-004", + "query": "print receipt", + "snapshot": "login-page.json", + "expect_no_match": true, + "threshold": 0.4, + "tags": ["no-match", "absent-control"] + }, + { + "id": "neg-005", + "query": "submit button", + "snapshot": "multi-form.json", + "expect_ref": "e11", + "expect_ref_alt": ["e3", "e7"], + "threshold": 0.3, + "min_score": 0.5, + "tags": ["threshold", "duplicate-labels"] + }, + { + "id": "neg-006", + "query": "enter", + "snapshot": "login-page.json", + "expect_has_matches": true, + "threshold": 0.1, + "min_score": 0.15, + "tags": ["threshold", "weak-match"] + }, + { + "id": "neg-007", + "query": "click", + "snapshot": "ecommerce-product.json", + "expect_has_matches": true, + "threshold": 0.1, + "tags": ["threshold", "generic-verb"] + }, + { + "id": "neg-008", + "query": "the thing", + "snapshot": "dashboard.json", + "expect_has_matches": true, + "threshold": 0.05, + "tags": ["threshold", "vague-query"] + }, + { + "id": "neg-009", + "query": "asdfghjkl", + "snapshot": "multi-form.json", + "expect_no_match": true, + "threshold": 0.3, + "tags": ["no-match", "keyboard-mash"] + }, + { + "id": "neg-010", + "query": "stale element e999", + "snapshot": "login-page.json", + "expect_no_match": true, + "threshold": 0.3, + "tags": ["no-match", "stale-ref"] + }, + { + "id": "neg-011", + "query": "a b c d e f", + "snapshot": "dashboard.json", + "expect_no_crash": true, + "threshold": 0.3, + "tags": ["threshold", "noise-tokens"] + }, + { + "id": "neg-012", + "query": "configure webhook endpoint", + "snapshot": "login-page.json", + "expect_no_match": true, + "threshold": 0.4, + "tags": ["no-match", "absent-control", 
"domain-intent"] + }, + { + "id": "neg-013", + "query": "invoice download", + "snapshot": "ecommerce-product.json", + "expect_no_match": true, + "threshold": 0.4, + "tags": ["no-match", "absent-control"] + }, + { + "id": "neg-014", + "query": "share on twitter", + "snapshot": "login-page.json", + "expect_no_match": true, + "threshold": 0.4, + "tags": ["no-match", "absent-control"] + } +] diff --git a/tests/benchmark/cases/visual.json b/tests/benchmark/cases/visual.json new file mode 100644 index 0000000..3a12219 --- /dev/null +++ b/tests/benchmark/cases/visual.json @@ -0,0 +1,50 @@ +[ + { + "id": "vcase-001", + "query": "button in top right", + "snapshot": "visual-layout.json", + "expect_ref": "e1", + "min_score": 0.5, + "tags": ["visual", "position", "directional"] + }, + { + "id": "vcase-002", + "query": "button on the left", + "snapshot": "visual-layout.json", + "expect_ref": "e0", + "min_score": 0.4, + "tags": ["visual", "position", "directional"] + }, + { + "id": "vcase-003", + "query": "button at bottom", + "snapshot": "visual-layout.json", + "expect_ref": "e7", + "min_score": 0.4, + "tags": ["visual", "position", "directional"] + }, + { + "id": "vcase-004", + "query": "link on left side", + "snapshot": "visual-layout.json", + "expect_ref": "e3", + "min_score": 0.4, + "tags": ["visual", "position", "directional"] + }, + { + "id": "vcase-005", + "query": "top left menu button", + "snapshot": "visual-layout.json", + "expect_ref": "e0", + "min_score": 0.5, + "tags": ["visual", "position", "compound"] + }, + { + "id": "vcase-006", + "query": "settings in upper right corner", + "snapshot": "visual-layout.json", + "expect_ref": "e1", + "min_score": 0.5, + "tags": ["visual", "position", "name-match"] + } +] diff --git a/tests/benchmark/config/benchmark.json b/tests/benchmark/config/benchmark.json index 23b5661..7b06060 100644 --- a/tests/benchmark/config/benchmark.json +++ b/tests/benchmark/config/benchmark.json @@ -1,13 +1,35 @@ { - "version": "1.0.0", - 
"strategies": ["lexical", "embedding", "combined"], - "default_strategy": "combined", - "default_threshold": 0.3, - "default_top_k": 3, - "metrics": { - "min_accuracy": 0.85, - "min_avg_score": 0.5, - "max_latency_ms": 100 + "version": "1.1.0", + "defaults": { + "strategy": "combined", + "threshold": 0.01, + "top_k": 5, + "weights": { + "lexical": 0.6, + "embedding": 0.4 + } + }, + "baseline": { + "quality": { + "max_overall_p_at_1_drop": 0.02, + "max_overall_mrr_drop": 0.02, + "max_overall_hit_at_3_drop": 0.02, + "max_corpus_p_at_1_drop": 0.08, + "max_difficulty_p_at_1_drop": 0.08, + "max_margin_drop_report": 0.15 + }, + "runtime": { + "max_ns_op_regression_ratio": 1.25, + "max_alloc_regression_ratio": 1.25, + "max_corpus_latency_p50_ms": 75, + "max_corpus_latency_p95_ms": 200 + } }, + "results": { + "dir": "tests/benchmark/results", + "baselines_dir": "tests/benchmark/baselines", + "generated_files_policy": "warn" + }, + "strategies": ["lexical", "embedding", "combined"], "snapshots_dir": "../e2e/assets/snapshots" } diff --git a/tests/benchmark/corpus/README.md b/tests/benchmark/corpus/README.md index 37c353e..c1defda 100644 --- a/tests/benchmark/corpus/README.md +++ b/tests/benchmark/corpus/README.md @@ -38,6 +38,30 @@ Each corpus entry is a directory containing: - **P@3**: How many of top-3 are relevant? - **Margin**: Score gap between relevant and irrelevant +## Expansion Groups + +### Expansion 1: Complex Query Patterns (2026-04) + +Added corpora for underrepresented query types: + +- **implicit-domain-intent/**: GitHub-like repo page with 56 elements. Tests implicit intents like "clone this repo", "check CI status", "switch branch", "save for later". 18 queries, 8 hard. + +- **form-state-controls/**: Settings page with checkboxes, radios, toggles, comboboxes. Tests stateful controls like "keep me logged in", "enable 2FA", "subscribe to newsletter". 18 queries, 8 hard. 
+ +- **ambiguous-layout-context/**: Multi-section page with duplicate labels (3x Search, 2x Save, 2x Cancel, 2x Login, 2x Home, 2x Help). Tests positional and section disambiguation. 17 queries, 7 hard. + +Also added `tests/benchmark/cases/negative-threshold.json` with 14 no-match and threshold calibration cases. + +### Expansion 2: Enterprise UI Patterns (2026-04) + +Added corpora for complex enterprise UI scenarios: + +- **table-grid/**: Invoice table with 50+ elements. Tests row-level context, repeated buttons (Edit, Delete, More), ordinal references ("second invoice", "last row"), and bulk operations. 24 queries, 8 hard. + +- **overlays-menus-dialogs/**: Multi-layer UI with modal dialogs, dropdown menus, context menus, notifications. Tests duplicate controls across scopes ("cancel in modal", "save on page not dialog"), menu item selection, and overlay disambiguation. 24 queries, 8 hard. + +- **icon-aria-labels/**: Icon-only controls across toolbar, media player, navigation. Tests sparse accessible names, icon descriptions ("kebab menu", "hamburger", "pencil edit"), and section context for repeated icons. 25 queries, 6 hard. 
+ ## Sources Snapshots should be captured from real websites using pinchtab: diff --git a/tests/benchmark/corpus/ambiguous-layout-context/queries.json b/tests/benchmark/corpus/ambiguous-layout-context/queries.json new file mode 100644 index 0000000..458fe8a --- /dev/null +++ b/tests/benchmark/corpus/ambiguous-layout-context/queries.json @@ -0,0 +1,138 @@ +[ + { + "id": "alc-001", + "query": "header search box", + "relevant_refs": ["e1"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "section", "position"] + }, + { + "id": "alc-002", + "query": "sidebar search", + "relevant_refs": ["e6"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "section", "position"] + }, + { + "id": "alc-003", + "query": "search on the left", + "relevant_refs": ["e6"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["duplicate-labels", "visual", "position"] + }, + { + "id": "alc-004", + "query": "search in top right area", + "relevant_refs": ["e1"], + "partially_relevant_refs": ["e2"], + "difficulty": "hard", + "tags": ["duplicate-labels", "visual", "position"] + }, + { + "id": "alc-005", + "query": "save button in profile", + "relevant_refs": ["e18"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "section", "context-exclusion"] + }, + { + "id": "alc-006", + "query": "save in billing section", + "relevant_refs": ["e22"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "section", "context-exclusion"] + }, + { + "id": "alc-007", + "query": "second save button", + "relevant_refs": ["e22"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "ordinal"] + }, + { + "id": "alc-008", + "query": "cancel button below password", + "relevant_refs": ["e19"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["duplicate-labels", "visual", "position"] + }, + { + 
"id": "alc-009", + "query": "login link not in footer", + "relevant_refs": ["e3"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["duplicate-labels", "context-exclusion"] + }, + { + "id": "alc-010", + "query": "footer login link", + "relevant_refs": ["e29"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "section"] + }, + { + "id": "alc-011", + "query": "home link in sidebar", + "relevant_refs": ["e7"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "section"] + }, + { + "id": "alc-012", + "query": "home link at bottom", + "relevant_refs": ["e25"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "visual", "position"] + }, + { + "id": "alc-013", + "query": "help in sidebar not footer", + "relevant_refs": ["e10"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["duplicate-labels", "context-exclusion"] + }, + { + "id": "alc-014", + "query": "main content save button", + "relevant_refs": ["e18"], + "partially_relevant_refs": ["e22"], + "difficulty": "hard", + "tags": ["duplicate-labels", "section"] + }, + { + "id": "alc-015", + "query": "first cancel button", + "relevant_refs": ["e19"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["duplicate-labels", "ordinal"] + }, + { + "id": "alc-016", + "query": "submit at the bottom of the page", + "relevant_refs": ["e32"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["visual", "position"] + }, + { + "id": "alc-017", + "query": "input field below username", + "relevant_refs": ["e16"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["visual", "position"] + } +] diff --git a/tests/benchmark/corpus/ambiguous-layout-context/snapshot.json b/tests/benchmark/corpus/ambiguous-layout-context/snapshot.json new file mode 100644 index 0000000..6d5a5aa --- /dev/null +++ 
b/tests/benchmark/corpus/ambiguous-layout-context/snapshot.json @@ -0,0 +1,35 @@ +[ + {"ref": "e0", "role": "banner", "name": "Header", "interactive": false, "section": "Header"}, + {"ref": "e1", "role": "searchbox", "name": "Search", "interactive": true, "section": "Header", "parent": "Header search", "positional": {"x": 200, "y": 20, "width": 300, "height": 36}}, + {"ref": "e2", "role": "button", "name": "Search", "interactive": true, "section": "Header", "parent": "Header search", "positional": {"x": 510, "y": 20, "width": 80, "height": 36}}, + {"ref": "e3", "role": "link", "name": "Login", "interactive": true, "section": "Header", "parent": "Header actions", "positional": {"x": 850, "y": 20, "width": 60, "height": 36}}, + {"ref": "e4", "role": "link", "name": "Sign up", "interactive": true, "section": "Header", "parent": "Header actions", "positional": {"x": 920, "y": 20, "width": 70, "height": 36}}, + {"ref": "e5", "role": "navigation", "name": "Sidebar", "interactive": false, "section": "Sidebar", "positional": {"x": 0, "y": 80, "width": 200, "height": 600}}, + {"ref": "e6", "role": "searchbox", "name": "Search", "interactive": true, "section": "Sidebar", "parent": "Sidebar search", "positional": {"x": 10, "y": 100, "width": 180, "height": 32}}, + {"ref": "e7", "role": "link", "name": "Home", "interactive": true, "section": "Sidebar", "parent": "Main nav", "positional": {"x": 10, "y": 150, "width": 180, "height": 32}}, + {"ref": "e8", "role": "link", "name": "Products", "interactive": true, "section": "Sidebar", "parent": "Main nav", "positional": {"x": 10, "y": 190, "width": 180, "height": 32}}, + {"ref": "e9", "role": "link", "name": "Settings", "interactive": true, "section": "Sidebar", "parent": "Main nav", "positional": {"x": 10, "y": 230, "width": 180, "height": 32}}, + {"ref": "e10", "role": "link", "name": "Help", "interactive": true, "section": "Sidebar", "parent": "Secondary nav", "positional": {"x": 10, "y": 600, "width": 180, "height": 32}}, + 
{"ref": "e11", "role": "main", "name": "Main content", "interactive": false, "section": "Main", "positional": {"x": 220, "y": 80, "width": 780, "height": 600}}, + {"ref": "e12", "role": "heading", "name": "Welcome", "interactive": false, "section": "Main", "parent": "Hero"}, + {"ref": "e13", "role": "button", "name": "Get Started", "interactive": true, "section": "Main", "parent": "Hero", "positional": {"x": 400, "y": 200, "width": 120, "height": 40}}, + {"ref": "e14", "role": "heading", "name": "Profile Settings", "interactive": false, "section": "Main", "parent": "Profile form"}, + {"ref": "e15", "role": "textbox", "name": "Username", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 300, "y": 300, "width": 300, "height": 36}}, + {"ref": "e16", "role": "textbox", "name": "Email", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 300, "y": 350, "width": 300, "height": 36}}, + {"ref": "e17", "role": "textbox", "name": "Password", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 300, "y": 400, "width": 300, "height": 36}}, + {"ref": "e18", "role": "button", "name": "Save", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 300, "y": 460, "width": 100, "height": 40}}, + {"ref": "e19", "role": "button", "name": "Cancel", "interactive": true, "section": "Main", "parent": "Profile form", "positional": {"x": 420, "y": 460, "width": 100, "height": 40}}, + {"ref": "e20", "role": "heading", "name": "Billing Details", "interactive": false, "section": "Main", "parent": "Billing form"}, + {"ref": "e21", "role": "textbox", "name": "Card number", "interactive": true, "section": "Main", "parent": "Billing form", "positional": {"x": 300, "y": 540, "width": 300, "height": 36}}, + {"ref": "e22", "role": "button", "name": "Save", "interactive": true, "section": "Main", "parent": "Billing form", "positional": {"x": 300, "y": 600, "width": 100, 
"height": 40}}, + {"ref": "e23", "role": "button", "name": "Cancel", "interactive": true, "section": "Main", "parent": "Billing form", "positional": {"x": 420, "y": 600, "width": 100, "height": 40}}, + {"ref": "e24", "role": "contentinfo", "name": "Footer", "interactive": false, "section": "Footer", "positional": {"x": 0, "y": 700, "width": 1000, "height": 100}}, + {"ref": "e25", "role": "link", "name": "Home", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 50, "y": 720, "width": 60, "height": 24}}, + {"ref": "e26", "role": "link", "name": "Privacy", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 130, "y": 720, "width": 60, "height": 24}}, + {"ref": "e27", "role": "link", "name": "Terms", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 210, "y": 720, "width": 60, "height": 24}}, + {"ref": "e28", "role": "link", "name": "Contact", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 290, "y": 720, "width": 70, "height": 24}}, + {"ref": "e29", "role": "link", "name": "Login", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 380, "y": 720, "width": 50, "height": 24}}, + {"ref": "e30", "role": "link", "name": "Help", "interactive": true, "section": "Footer", "parent": "Footer links", "positional": {"x": 450, "y": 720, "width": 50, "height": 24}}, + {"ref": "e31", "role": "searchbox", "name": "Search", "interactive": true, "section": "Footer", "parent": "Footer search", "positional": {"x": 700, "y": 720, "width": 200, "height": 32}}, + {"ref": "e32", "role": "button", "name": "Submit", "interactive": true, "section": "Footer", "parent": "Footer newsletter", "positional": {"x": 700, "y": 760, "width": 100, "height": 32}} +] diff --git a/tests/benchmark/corpus/form-state-controls/queries.json b/tests/benchmark/corpus/form-state-controls/queries.json new file mode 100644 index 
0000000..7fec46c --- /dev/null +++ b/tests/benchmark/corpus/form-state-controls/queries.json @@ -0,0 +1,146 @@ +[ + { + "id": "fsc-001", + "query": "keep me logged in", + "relevant_refs": ["e10"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["implicit", "checkbox", "state"] + }, + { + "id": "fsc-002", + "query": "remember me checkbox", + "relevant_refs": ["e10"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["checkbox", "state"] + }, + { + "id": "fsc-003", + "query": "subscribe to newsletter", + "relevant_refs": ["e15"], + "partially_relevant_refs": ["e14", "e16"], + "difficulty": "hard", + "tags": ["implicit", "checkbox", "state"] + }, + { + "id": "fsc-004", + "query": "opt out of marketing", + "relevant_refs": ["e15"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["implicit", "checkbox", "state"] + }, + { + "id": "fsc-005", + "query": "enable 2FA", + "relevant_refs": ["e11"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["implicit", "checkbox", "state", "domain-intent"] + }, + { + "id": "fsc-006", + "query": "use same address for shipping", + "relevant_refs": ["e28"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["implicit", "checkbox", "state"] + }, + { + "id": "fsc-007", + "query": "select payment method", + "relevant_refs": ["e24", "e25", "e26"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["radio", "state"] + }, + { + "id": "fsc-008", + "query": "pay with PayPal", + "relevant_refs": ["e25"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["radio", "state"] + }, + { + "id": "fsc-009", + "query": "change my country", + "relevant_refs": ["e33"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["combobox", "state"] + }, + { + "id": "fsc-010", + "query": "select language preference", + "relevant_refs": ["e35"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": 
["combobox", "state"] + }, + { + "id": "fsc-011", + "query": "turn on push notifications", + "relevant_refs": ["e17"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["toggle", "switch", "state"] + }, + { + "id": "fsc-012", + "query": "enable text alerts", + "relevant_refs": ["e18"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["implicit", "toggle", "switch", "state"] + }, + { + "id": "fsc-013", + "query": "export my data", + "relevant_refs": ["e42"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["domain-intent", "link"] + }, + { + "id": "fsc-014", + "query": "delete my account", + "relevant_refs": ["e43"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["domain-intent", "link"] + }, + { + "id": "fsc-015", + "query": "show advanced settings", + "relevant_refs": ["e44"], + "partially_relevant_refs": ["e45"], + "difficulty": "medium", + "tags": ["implicit", "button", "state"] + }, + { + "id": "fsc-016", + "query": "enable beta features", + "relevant_refs": ["e47"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["checkbox", "state"] + }, + { + "id": "fsc-017", + "query": "hide my profile from search", + "relevant_refs": ["e40"], + "partially_relevant_refs": ["e39"], + "difficulty": "hard", + "tags": ["implicit", "checkbox", "state"] + }, + { + "id": "fsc-018", + "query": "stop sharing data with partners", + "relevant_refs": ["e41"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["implicit", "checkbox", "state"] + } +] diff --git a/tests/benchmark/corpus/form-state-controls/snapshot.json b/tests/benchmark/corpus/form-state-controls/snapshot.json new file mode 100644 index 0000000..6819953 --- /dev/null +++ b/tests/benchmark/corpus/form-state-controls/snapshot.json @@ -0,0 +1,54 @@ +[ + {"ref": "e0", "role": "heading", "name": "Account Settings", "interactive": false, "section": "Header"}, + {"ref": "e1", "role": "link", "name": "Profile", 
"interactive": true, "section": "Settings nav", "parent": "Navigation"}, + {"ref": "e2", "role": "link", "name": "Security", "interactive": true, "section": "Settings nav", "parent": "Navigation"}, + {"ref": "e3", "role": "link", "name": "Notifications", "interactive": true, "section": "Settings nav", "parent": "Navigation"}, + {"ref": "e4", "role": "link", "name": "Billing", "interactive": true, "section": "Settings nav", "parent": "Navigation"}, + {"ref": "e5", "role": "link", "name": "Privacy", "interactive": true, "section": "Settings nav", "parent": "Navigation"}, + {"ref": "e6", "role": "heading", "name": "Login & Security", "interactive": false, "section": "Security settings"}, + {"ref": "e7", "role": "textbox", "name": "Current password", "interactive": true, "section": "Security settings", "parent": "Change password"}, + {"ref": "e8", "role": "textbox", "name": "New password", "interactive": true, "section": "Security settings", "parent": "Change password"}, + {"ref": "e9", "role": "textbox", "name": "Confirm password", "interactive": true, "section": "Security settings", "parent": "Change password"}, + {"ref": "e10", "role": "checkbox", "name": "Remember me on this device", "interactive": true, "section": "Security settings", "parent": "Session options"}, + {"ref": "e11", "role": "checkbox", "name": "Enable two-factor authentication", "interactive": true, "section": "Security settings", "parent": "Security options"}, + {"ref": "e12", "role": "button", "name": "Update password", "interactive": true, "section": "Security settings", "parent": "Change password"}, + {"ref": "e13", "role": "heading", "name": "Notification Preferences", "interactive": false, "section": "Notification settings"}, + {"ref": "e14", "role": "checkbox", "name": "Email notifications", "interactive": true, "section": "Notification settings", "parent": "Email"}, + {"ref": "e15", "role": "checkbox", "name": "Marketing emails", "interactive": true, "section": "Notification settings", 
"parent": "Email"}, + {"ref": "e16", "role": "checkbox", "name": "Product updates", "interactive": true, "section": "Notification settings", "parent": "Email"}, + {"ref": "e17", "role": "switch", "name": "Push notifications", "interactive": true, "section": "Notification settings", "parent": "Mobile"}, + {"ref": "e18", "role": "switch", "name": "SMS alerts", "interactive": true, "section": "Notification settings", "parent": "Mobile"}, + {"ref": "e19", "role": "heading", "name": "Billing Information", "interactive": false, "section": "Billing settings"}, + {"ref": "e20", "role": "textbox", "name": "Cardholder name", "interactive": true, "section": "Billing settings", "parent": "Payment method"}, + {"ref": "e21", "role": "textbox", "name": "Card number", "interactive": true, "section": "Billing settings", "parent": "Payment method"}, + {"ref": "e22", "role": "textbox", "name": "Expiry date", "interactive": true, "section": "Billing settings", "parent": "Payment method"}, + {"ref": "e23", "role": "textbox", "name": "CVV", "interactive": true, "section": "Billing settings", "parent": "Payment method"}, + {"ref": "e24", "role": "radio", "name": "Credit Card", "interactive": true, "section": "Billing settings", "parent": "Payment type"}, + {"ref": "e25", "role": "radio", "name": "PayPal", "interactive": true, "section": "Billing settings", "parent": "Payment type"}, + {"ref": "e26", "role": "radio", "name": "Bank Transfer", "interactive": true, "section": "Billing settings", "parent": "Payment type"}, + {"ref": "e27", "role": "heading", "name": "Shipping Address", "interactive": false, "section": "Shipping settings"}, + {"ref": "e28", "role": "checkbox", "name": "Same as billing address", "interactive": true, "section": "Shipping settings"}, + {"ref": "e29", "role": "textbox", "name": "Street address", "interactive": true, "section": "Shipping settings", "parent": "Address form"}, + {"ref": "e30", "role": "textbox", "name": "City", "interactive": true, "section": 
"Shipping settings", "parent": "Address form"}, + {"ref": "e31", "role": "textbox", "name": "State/Province", "interactive": true, "section": "Shipping settings", "parent": "Address form"}, + {"ref": "e32", "role": "textbox", "name": "Postal code", "interactive": true, "section": "Shipping settings", "parent": "Address form"}, + {"ref": "e33", "role": "combobox", "name": "Country", "interactive": true, "section": "Shipping settings", "parent": "Address form"}, + {"ref": "e34", "role": "heading", "name": "Language & Region", "interactive": false, "section": "Preferences"}, + {"ref": "e35", "role": "combobox", "name": "Language", "interactive": true, "section": "Preferences", "parent": "Language settings"}, + {"ref": "e36", "role": "combobox", "name": "Timezone", "interactive": true, "section": "Preferences", "parent": "Regional settings"}, + {"ref": "e37", "role": "combobox", "name": "Currency", "interactive": true, "section": "Preferences", "parent": "Regional settings"}, + {"ref": "e38", "role": "heading", "name": "Privacy", "interactive": false, "section": "Privacy settings"}, + {"ref": "e39", "role": "checkbox", "name": "Make profile public", "interactive": true, "section": "Privacy settings", "parent": "Visibility"}, + {"ref": "e40", "role": "checkbox", "name": "Allow search engines to index my profile", "interactive": true, "section": "Privacy settings", "parent": "Visibility"}, + {"ref": "e41", "role": "checkbox", "name": "Share activity with partners", "interactive": true, "section": "Privacy settings", "parent": "Data sharing"}, + {"ref": "e42", "role": "link", "name": "Download my data", "interactive": true, "section": "Privacy settings", "parent": "Data export"}, + {"ref": "e43", "role": "link", "name": "Delete my account", "interactive": true, "section": "Privacy settings", "parent": "Account"}, + {"ref": "e44", "role": "button", "name": "Advanced options", "interactive": true, "section": "Footer", "parent": "Expandable section"}, + {"ref": "e45", 
"role": "region", "name": "Advanced options", "interactive": false, "section": "Footer", "parent": "Expandable content"}, + {"ref": "e46", "role": "checkbox", "name": "Developer mode", "interactive": true, "section": "Advanced", "parent": "Advanced options"}, + {"ref": "e47", "role": "checkbox", "name": "Beta features", "interactive": true, "section": "Advanced", "parent": "Advanced options"}, + {"ref": "e48", "role": "button", "name": "Save changes", "interactive": true, "section": "Footer"}, + {"ref": "e49", "role": "button", "name": "Cancel", "interactive": true, "section": "Footer"}, + {"ref": "e50", "role": "button", "name": "Reset to defaults", "interactive": false, "section": "Footer"}, + {"ref": "e51", "role": "alert", "name": "Please fill in all required fields", "interactive": false, "section": "Form validation"} +] diff --git a/tests/benchmark/corpus/icon-aria-labels/queries.json b/tests/benchmark/corpus/icon-aria-labels/queries.json new file mode 100644 index 0000000..a6daad0 --- /dev/null +++ b/tests/benchmark/corpus/icon-aria-labels/queries.json @@ -0,0 +1,227 @@ +[ + { + "id": "icon-001", + "query": "settings gear", + "relevant_refs": ["e1"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "exact-match", "description"], + "notes": "Settings button with gear icon description" + }, + { + "id": "icon-002", + "query": "delete trash icon", + "relevant_refs": ["e2", "e13"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["icon", "duplicate-labels", "description"], + "notes": "Multiple delete buttons with trash icon" + }, + { + "id": "icon-003", + "query": "more options", + "relevant_refs": ["e3"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "exact-match"], + "notes": "Kebab/three dots menu" + }, + { + "id": "icon-004", + "query": "kebab menu", + "relevant_refs": ["e3"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["icon", "synonym"], + "notes": "Three 
dots menu using slang term" + }, + { + "id": "icon-005", + "query": "notifications bell", + "relevant_refs": ["e4"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "description"], + "notes": "Bell icon for notifications" + }, + { + "id": "icon-006", + "query": "search magnifier", + "relevant_refs": ["e5"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "synonym", "description"], + "notes": "Search with magnifying glass" + }, + { + "id": "icon-007", + "query": "copy link", + "relevant_refs": ["e6"], + "partially_relevant_refs": ["e14"], + "difficulty": "easy", + "tags": ["icon", "exact-match"], + "notes": "Copy link button" + }, + { + "id": "icon-008", + "query": "share", + "relevant_refs": ["e7"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "exact-match"], + "notes": "Share button" + }, + { + "id": "icon-009", + "query": "refresh", + "relevant_refs": ["e8"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "exact-match"], + "notes": "Refresh button with circular arrows" + }, + { + "id": "icon-010", + "query": "download", + "relevant_refs": ["e9"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "exact-match"], + "notes": "Download button" + }, + { + "id": "icon-011", + "query": "upload file", + "relevant_refs": ["e10"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "action"], + "notes": "Upload button" + }, + { + "id": "icon-012", + "query": "calendar picker", + "relevant_refs": ["e20"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["icon", "section"], + "notes": "Calendar button in date picker" + }, + { + "id": "icon-013", + "query": "hamburger menu", + "relevant_refs": ["e30"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["icon", "synonym"], + "notes": "Menu button using slang term" + }, + { + "id": "icon-014", + "query": "play video", + 
"relevant_refs": ["e40"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "action", "media"], + "notes": "Play button in media controls" + }, + { + "id": "icon-015", + "query": "mute audio", + "relevant_refs": ["e46"], + "partially_relevant_refs": ["e45"], + "difficulty": "medium", + "tags": ["icon", "action", "media"], + "notes": "Mute button or volume control" + }, + { + "id": "icon-016", + "query": "fullscreen expand", + "relevant_refs": ["e47"], + "partially_relevant_refs": ["e52"], + "difficulty": "medium", + "tags": ["icon", "synonym"], + "notes": "Fullscreen or expand button" + }, + { + "id": "icon-017", + "query": "plus add button", + "relevant_refs": ["e50"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "description"], + "notes": "Add button with plus icon" + }, + { + "id": "icon-018", + "query": "star favorite", + "relevant_refs": ["e60"], + "partially_relevant_refs": ["e63"], + "difficulty": "medium", + "tags": ["icon", "synonym", "social"], + "notes": "Star or favorite button" + }, + { + "id": "icon-019", + "query": "like heart", + "relevant_refs": ["e61"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "description", "social"], + "notes": "Like button with heart icon" + }, + { + "id": "icon-020", + "query": "undo last action", + "relevant_refs": ["e70"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["icon", "action"], + "notes": "Undo button" + }, + { + "id": "icon-021", + "query": "info circle", + "relevant_refs": ["e80"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["icon", "description"], + "notes": "Info button (i in circle)" + }, + { + "id": "icon-022", + "query": "question help", + "relevant_refs": ["e81"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["icon", "synonym"], + "notes": "Help button" + }, + { + "id": "icon-023", + "query": "trash in toolbar", + "relevant_refs": ["e2"], + 
"partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["icon", "section-context", "description"], + "notes": "Delete in toolbar section, not row actions" + }, + { + "id": "icon-024", + "query": "pencil edit icon", + "relevant_refs": ["e11"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["icon", "description"], + "notes": "Edit button identified by pencil" + }, + { + "id": "icon-025", + "query": "eye view button", + "relevant_refs": ["e12"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["icon", "description"], + "notes": "View button with eye icon" + } +] diff --git a/tests/benchmark/corpus/icon-aria-labels/snapshot.json b/tests/benchmark/corpus/icon-aria-labels/snapshot.json new file mode 100644 index 0000000..f1c9c15 --- /dev/null +++ b/tests/benchmark/corpus/icon-aria-labels/snapshot.json @@ -0,0 +1,59 @@ +[ + {"ref": "e1", "role": "button", "name": "Settings", "parent": null, "section": "Toolbar", "description": "gear icon"}, + {"ref": "e2", "role": "button", "name": "Delete", "parent": null, "section": "Toolbar", "description": "trash icon"}, + {"ref": "e3", "role": "button", "name": "More options", "parent": null, "section": "Toolbar", "description": "three dots vertical"}, + {"ref": "e4", "role": "button", "name": "Notifications", "parent": null, "section": "Toolbar", "description": "bell icon"}, + {"ref": "e5", "role": "button", "name": "Search", "parent": null, "section": "Toolbar", "description": "magnifying glass icon"}, + {"ref": "e6", "role": "button", "name": "Copy link", "parent": null, "section": "Toolbar"}, + {"ref": "e7", "role": "button", "name": "Share", "parent": null, "section": "Toolbar"}, + {"ref": "e8", "role": "button", "name": "Refresh", "parent": null, "section": "Toolbar", "description": "circular arrows"}, + {"ref": "e9", "role": "button", "name": "Download", "parent": null, "section": "Toolbar", "description": "down arrow"}, + {"ref": "e10", "role": "button", "name": "Upload", 
"parent": null, "section": "Toolbar", "description": "up arrow"}, + + {"ref": "e11", "role": "button", "name": "Edit", "parent": null, "section": "Row Actions", "description": "pencil icon"}, + {"ref": "e12", "role": "button", "name": "View", "parent": null, "section": "Row Actions", "description": "eye icon"}, + {"ref": "e13", "role": "button", "name": "Delete", "parent": null, "section": "Row Actions", "description": "trash icon"}, + {"ref": "e14", "role": "button", "name": "Copy", "parent": null, "section": "Row Actions", "description": "two rectangles"}, + {"ref": "e15", "role": "button", "name": "Move", "parent": null, "section": "Row Actions", "description": "folder with arrow"}, + + {"ref": "e20", "role": "button", "name": "Calendar", "parent": null, "section": "Date Picker", "description": "calendar icon"}, + {"ref": "e21", "role": "button", "name": "Previous month", "parent": "e20", "description": "left chevron"}, + {"ref": "e22", "role": "button", "name": "Next month", "parent": "e20", "description": "right chevron"}, + {"ref": "e23", "role": "button", "name": "Today", "parent": "e20"}, + + {"ref": "e30", "role": "button", "name": "Menu", "parent": null, "section": "Navigation", "description": "hamburger icon"}, + {"ref": "e31", "role": "button", "name": "Close", "parent": null, "section": "Navigation", "description": "X icon"}, + {"ref": "e32", "role": "button", "name": "Back", "parent": null, "section": "Navigation", "description": "left arrow"}, + {"ref": "e33", "role": "button", "name": "Forward", "parent": null, "section": "Navigation", "description": "right arrow"}, + {"ref": "e34", "role": "button", "name": "Home", "parent": null, "section": "Navigation", "description": "house icon"}, + + {"ref": "e40", "role": "button", "name": "Play", "parent": null, "section": "Media Controls", "description": "triangle pointing right"}, + {"ref": "e41", "role": "button", "name": "Pause", "parent": null, "section": "Media Controls", "description": "two vertical 
bars"}, + {"ref": "e42", "role": "button", "name": "Stop", "parent": null, "section": "Media Controls", "description": "square"}, + {"ref": "e43", "role": "button", "name": "Skip forward", "parent": null, "section": "Media Controls", "description": "double right arrows"}, + {"ref": "e44", "role": "button", "name": "Skip back", "parent": null, "section": "Media Controls", "description": "double left arrows"}, + {"ref": "e45", "role": "button", "name": "Volume", "parent": null, "section": "Media Controls", "description": "speaker icon"}, + {"ref": "e46", "role": "button", "name": "Mute", "parent": null, "section": "Media Controls", "description": "speaker with X"}, + {"ref": "e47", "role": "button", "name": "Fullscreen", "parent": null, "section": "Media Controls", "description": "expand arrows"}, + + {"ref": "e50", "role": "button", "name": "Add", "parent": null, "section": "Quick Actions", "description": "plus icon"}, + {"ref": "e51", "role": "button", "name": "Remove", "parent": null, "section": "Quick Actions", "description": "minus icon"}, + {"ref": "e52", "role": "button", "name": "Expand", "parent": null, "section": "Quick Actions", "description": "chevron down"}, + {"ref": "e53", "role": "button", "name": "Collapse", "parent": null, "section": "Quick Actions", "description": "chevron up"}, + {"ref": "e54", "role": "button", "name": "Pin", "parent": null, "section": "Quick Actions", "description": "pin icon"}, + {"ref": "e55", "role": "button", "name": "Unpin", "parent": null, "section": "Quick Actions", "description": "pin with slash"}, + + {"ref": "e60", "role": "button", "name": "Star", "parent": null, "section": "Social", "description": "star outline"}, + {"ref": "e61", "role": "button", "name": "Like", "parent": null, "section": "Social", "description": "heart icon"}, + {"ref": "e62", "role": "button", "name": "Comment", "parent": null, "section": "Social", "description": "speech bubble"}, + {"ref": "e63", "role": "button", "name": "Bookmark", "parent": 
null, "section": "Social", "description": "bookmark icon"}, + {"ref": "e64", "role": "button", "name": "Flag", "parent": null, "section": "Social", "description": "flag icon"}, + + {"ref": "e70", "role": "button", "name": "Undo", "parent": null, "section": "History", "description": "curved left arrow"}, + {"ref": "e71", "role": "button", "name": "Redo", "parent": null, "section": "History", "description": "curved right arrow"}, + {"ref": "e72", "role": "button", "name": "History", "parent": null, "section": "History", "description": "clock icon"}, + + {"ref": "e80", "role": "button", "name": "Info", "parent": null, "section": "Help", "description": "i in circle"}, + {"ref": "e81", "role": "button", "name": "Help", "parent": null, "section": "Help", "description": "question mark in circle"}, + {"ref": "e82", "role": "button", "name": "Warning", "parent": null, "section": "Help", "description": "triangle with exclamation"} +] diff --git a/tests/benchmark/corpus/implicit-domain-intent/queries.json b/tests/benchmark/corpus/implicit-domain-intent/queries.json new file mode 100644 index 0000000..19652e9 --- /dev/null +++ b/tests/benchmark/corpus/implicit-domain-intent/queries.json @@ -0,0 +1,146 @@ +[ + { + "id": "idi-001", + "query": "clone this repository", + "relevant_refs": ["e24"], + "partially_relevant_refs": ["e28", "e29"], + "difficulty": "medium", + "tags": ["implicit", "domain-intent", "action"] + }, + { + "id": "idi-002", + "query": "download the source code", + "relevant_refs": ["e30"], + "partially_relevant_refs": ["e24"], + "difficulty": "hard", + "tags": ["implicit", "domain-intent", "action"] + }, + { + "id": "idi-003", + "query": "switch to a different branch", + "relevant_refs": ["e21"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["implicit", "domain-intent", "action"] + }, + { + "id": "idi-004", + "query": "check CI status", + "relevant_refs": ["e15", "e33"], + "partially_relevant_refs": ["e32"], + "difficulty": "hard", + 
"tags": ["implicit", "domain-intent", "navigation"] + }, + { + "id": "idi-005", + "query": "view build results", + "relevant_refs": ["e15", "e33"], + "partially_relevant_refs": ["e32"], + "difficulty": "hard", + "tags": ["implicit", "domain-intent", "navigation"] + }, + { + "id": "idi-006", + "query": "save this project for later", + "relevant_refs": ["e11", "e39"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["implicit", "domain-intent", "action"] + }, + { + "id": "idi-007", + "query": "bookmark this repo", + "relevant_refs": ["e11"], + "partially_relevant_refs": ["e39"], + "difficulty": "hard", + "tags": ["implicit", "domain-intent", "action"] + }, + { + "id": "idi-008", + "query": "compare branches", + "relevant_refs": ["e37"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["implicit", "domain-intent", "action"] + }, + { + "id": "idi-009", + "query": "go to my profile", + "relevant_refs": ["e7"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["implicit", "domain-intent", "navigation"] + }, + { + "id": "idi-010", + "query": "check my notifications", + "relevant_refs": ["e6"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["implicit", "domain-intent", "navigation"] + }, + { + "id": "idi-011", + "query": "contact support", + "relevant_refs": ["e52"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["implicit", "domain-intent", "navigation"] + }, + { + "id": "idi-012", + "query": "privacy policy", + "relevant_refs": ["e48"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["implicit", "domain-intent", "navigation"] + }, + { + "id": "idi-013", + "query": "type a new search query", + "relevant_refs": ["e5"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["implicit", "domain-intent", "input"] + }, + { + "id": "idi-014", + "query": "fork this project", + "relevant_refs": ["e10"], + "partially_relevant_refs": [], + 
"difficulty": "easy", + "tags": ["domain-intent", "action", "button"] + }, + { + "id": "idi-015", + "query": "open pull requests", + "relevant_refs": ["e14"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["domain-intent", "navigation"] + }, + { + "id": "idi-016", + "query": "see who contributed", + "relevant_refs": ["e42"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["implicit", "domain-intent", "navigation"] + }, + { + "id": "idi-017", + "query": "check latest releases", + "relevant_refs": ["e40"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["implicit", "domain-intent", "navigation"] + }, + { + "id": "idi-018", + "query": "create a new file", + "relevant_refs": ["e23"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["implicit", "domain-intent", "action"] + } +] diff --git a/tests/benchmark/corpus/implicit-domain-intent/snapshot.json b/tests/benchmark/corpus/implicit-domain-intent/snapshot.json new file mode 100644 index 0000000..1ff13a7 --- /dev/null +++ b/tests/benchmark/corpus/implicit-domain-intent/snapshot.json @@ -0,0 +1,58 @@ +[ + {"ref": "e0", "role": "banner", "name": "", "interactive": false, "section": "Header"}, + {"ref": "e1", "role": "link", "name": "Dashboard", "interactive": true, "section": "Header", "parent": "Navigation"}, + {"ref": "e2", "role": "link", "name": "Projects", "interactive": true, "section": "Header", "parent": "Navigation"}, + {"ref": "e3", "role": "link", "name": "Settings", "interactive": true, "section": "Header", "parent": "Navigation"}, + {"ref": "e4", "role": "button", "name": "New", "interactive": true, "section": "Header", "parent": "Actions"}, + {"ref": "e5", "role": "searchbox", "name": "Search or jump to...", "interactive": true, "section": "Header"}, + {"ref": "e6", "role": "button", "name": "Notifications", "interactive": true, "section": "Header", "parent": "User menu"}, + {"ref": "e7", "role": "button", "name": "Profile", 
"interactive": true, "section": "Header", "parent": "User menu"}, + {"ref": "e8", "role": "heading", "name": "acme/webapp", "interactive": false, "section": "Repository header"}, + {"ref": "e9", "role": "button", "name": "Watch", "interactive": true, "section": "Repository header", "parent": "Repository actions"}, + {"ref": "e10", "role": "button", "name": "Fork", "interactive": true, "section": "Repository header", "parent": "Repository actions"}, + {"ref": "e11", "role": "button", "name": "Star", "interactive": true, "section": "Repository header", "parent": "Repository actions"}, + {"ref": "e12", "role": "link", "name": "Code", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e13", "role": "link", "name": "Issues", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e14", "role": "link", "name": "Pull requests", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e15", "role": "link", "name": "Actions", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e16", "role": "link", "name": "Projects", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e17", "role": "link", "name": "Wiki", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e18", "role": "link", "name": "Security", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e19", "role": "link", "name": "Insights", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e20", "role": "link", "name": "Settings", "interactive": true, "section": "Repository tabs", "parent": "Tab navigation"}, + {"ref": "e21", "role": "button", "name": "main", "interactive": true, "section": "Code view", "parent": "Branch selector"}, + {"ref": "e22", "role": "button", "name": "Go to file", "interactive": true, "section": 
"Code view", "parent": "File actions"}, + {"ref": "e23", "role": "button", "name": "Add file", "interactive": true, "section": "Code view", "parent": "File actions"}, + {"ref": "e24", "role": "button", "name": "Code", "interactive": true, "section": "Code view", "parent": "Clone dropdown"}, + {"ref": "e25", "role": "link", "name": "HTTPS", "interactive": true, "section": "Clone panel", "parent": "Clone options"}, + {"ref": "e26", "role": "link", "name": "SSH", "interactive": true, "section": "Clone panel", "parent": "Clone options"}, + {"ref": "e27", "role": "link", "name": "GitHub CLI", "interactive": true, "section": "Clone panel", "parent": "Clone options"}, + {"ref": "e28", "role": "textbox", "name": "Clone URL", "value": "https://github.com/acme/webapp.git", "interactive": true, "section": "Clone panel"}, + {"ref": "e29", "role": "button", "name": "Copy URL", "interactive": true, "section": "Clone panel"}, + {"ref": "e30", "role": "link", "name": "Download ZIP", "interactive": true, "section": "Clone panel"}, + {"ref": "e31", "role": "link", "name": "Open with GitHub Desktop", "interactive": true, "section": "Clone panel"}, + {"ref": "e32", "role": "status", "name": "All checks have passed", "interactive": false, "section": "Commit status"}, + {"ref": "e33", "role": "link", "name": "View workflow runs", "interactive": true, "section": "Commit status"}, + {"ref": "e34", "role": "link", "name": "README.md", "interactive": true, "section": "File tree"}, + {"ref": "e35", "role": "link", "name": "package.json", "interactive": true, "section": "File tree"}, + {"ref": "e36", "role": "link", "name": "src", "interactive": true, "section": "File tree"}, + {"ref": "e37", "role": "link", "name": "Compare", "interactive": true, "section": "Branch actions"}, + {"ref": "e38", "role": "link", "name": "Contribute", "interactive": true, "section": "Branch actions"}, + {"ref": "e39", "role": "button", "name": "Add to list", "interactive": true, "section": "Repository sidebar", 
"parent": "Lists"}, + {"ref": "e40", "role": "link", "name": "Releases", "interactive": true, "section": "Repository sidebar"}, + {"ref": "e41", "role": "link", "name": "Packages", "interactive": true, "section": "Repository sidebar"}, + {"ref": "e42", "role": "link", "name": "Contributors", "interactive": true, "section": "Repository sidebar"}, + {"ref": "e43", "role": "link", "name": "Activity", "interactive": true, "section": "Repository sidebar"}, + {"ref": "e44", "role": "link", "name": "Report repository", "interactive": true, "section": "Footer"}, + {"ref": "e45", "role": "link", "name": "About", "interactive": true, "section": "Footer"}, + {"ref": "e46", "role": "link", "name": "Blog", "interactive": true, "section": "Footer"}, + {"ref": "e47", "role": "link", "name": "Terms", "interactive": true, "section": "Footer"}, + {"ref": "e48", "role": "link", "name": "Privacy", "interactive": true, "section": "Footer"}, + {"ref": "e49", "role": "link", "name": "Security", "interactive": true, "section": "Footer"}, + {"ref": "e50", "role": "link", "name": "Status", "interactive": true, "section": "Footer"}, + {"ref": "e51", "role": "link", "name": "Docs", "interactive": true, "section": "Footer"}, + {"ref": "e52", "role": "link", "name": "Contact", "interactive": true, "section": "Footer"}, + {"ref": "e53", "role": "link", "name": "Pricing", "interactive": true, "section": "Footer"}, + {"ref": "e54", "role": "link", "name": "API", "interactive": true, "section": "Footer"}, + {"ref": "e55", "role": "link", "name": "Training", "interactive": true, "section": "Footer"} +] diff --git a/tests/benchmark/corpus/ordinal-context/queries.json b/tests/benchmark/corpus/ordinal-context/queries.json new file mode 100644 index 0000000..a8b4f3c --- /dev/null +++ b/tests/benchmark/corpus/ordinal-context/queries.json @@ -0,0 +1,44 @@ +[ + { + "id": "ordinal-001", + "query": "second submit button", + "relevant_refs": ["e2"], + "partially_relevant_refs": [], + "difficulty": "medium", + 
"tags": ["ordinal", "button", "duplicate-labels"] + }, + { + "id": "ordinal-002", + "query": "last submit button", + "relevant_refs": ["e4"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["ordinal", "button", "duplicate-labels"] + }, + { + "id": "ordinal-003", + "query": "second submit button not in header", + "relevant_refs": ["e3"], + "partially_relevant_refs": ["e4"], + "difficulty": "hard", + "tags": ["ordinal", "context-exclusion", "button", "duplicate-labels"], + "notes": "Header submit should be excluded before ordinal selection" + }, + { + "id": "ordinal-004", + "query": "last login link except footer", + "relevant_refs": ["e6"], + "partially_relevant_refs": ["e5"], + "difficulty": "hard", + "tags": ["ordinal", "context-exclusion", "link", "duplicate-labels"] + }, + { + "id": "ordinal-005", + "query": "first name", + "relevant_refs": [], + "partially_relevant_refs": ["e8", "e9", "e10"], + "difficulty": "medium", + "tags": ["guard", "literal-text", "textbox"], + "notes": "Guard case, should not trigger ordinal parsing just because of the word first" + } +] diff --git a/tests/benchmark/corpus/ordinal-context/snapshot.json b/tests/benchmark/corpus/ordinal-context/snapshot.json new file mode 100644 index 0000000..ceb60df --- /dev/null +++ b/tests/benchmark/corpus/ordinal-context/snapshot.json @@ -0,0 +1,13 @@ +[ + {"ref": "e0", "role": "heading", "name": "Checkout", "interactive": false, "section": "Header"}, + {"ref": "e1", "role": "button", "name": "Submit", "interactive": true, "section": "Header", "parent": "Header actions", "positional": {"siblingIndex": 0}}, + {"ref": "e2", "role": "button", "name": "Submit", "interactive": true, "section": "Login", "parent": "Login form", "positional": {"siblingIndex": 1}}, + {"ref": "e3", "role": "button", "name": "Submit", "interactive": true, "section": "Payment", "parent": "Payment form", "positional": {"siblingIndex": 2}}, + {"ref": "e4", "role": "button", "name": "Submit", "interactive": 
true, "section": "Footer", "parent": "Footer actions", "positional": {"siblingIndex": 3}}, + {"ref": "e5", "role": "link", "name": "Log in", "interactive": true, "section": "Header", "parent": "Header nav", "positional": {"siblingIndex": 4}}, + {"ref": "e6", "role": "link", "name": "Log in", "interactive": true, "section": "Sidebar", "parent": "Quick actions", "positional": {"siblingIndex": 5}}, + {"ref": "e7", "role": "link", "name": "Log in", "interactive": true, "section": "Footer", "parent": "Footer nav", "positional": {"siblingIndex": 6}}, + {"ref": "e8", "role": "textbox", "name": "Email", "interactive": true, "section": "Billing", "parent": "Billing form", "positional": {"siblingIndex": 7}}, + {"ref": "e9", "role": "textbox", "name": "Email", "interactive": true, "section": "Shipping", "parent": "Shipping form", "positional": {"siblingIndex": 8}}, + {"ref": "e10", "role": "textbox", "name": "Email", "interactive": true, "section": "Profile", "parent": "Profile form", "positional": {"siblingIndex": 9}} +] diff --git a/tests/benchmark/corpus/overlays-menus-dialogs/queries.json b/tests/benchmark/corpus/overlays-menus-dialogs/queries.json new file mode 100644 index 0000000..95990d7 --- /dev/null +++ b/tests/benchmark/corpus/overlays-menus-dialogs/queries.json @@ -0,0 +1,218 @@ +[ + { + "id": "overlay-001", + "query": "close dialog", + "relevant_refs": ["e15", "e56"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["dialog", "action", "duplicate-labels"], + "notes": "Multiple dialogs have close buttons" + }, + { + "id": "overlay-002", + "query": "confirm delete", + "relevant_refs": ["e14"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["dialog", "action", "context-exclusion"], + "notes": "Delete button in the Confirm Delete dialog" + }, + { + "id": "overlay-003", + "query": "cancel inside modal", + "relevant_refs": ["e13", "e55"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["dialog", "action", 
"context-exclusion"], + "notes": "Cancel buttons inside modals vs page" + }, + { + "id": "overlay-004", + "query": "cancel on page not modal", + "relevant_refs": ["e1", "e71"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["dialog", "negative-context", "context-exclusion"], + "notes": "Cancel buttons on the page, excluding modals" + }, + { + "id": "overlay-005", + "query": "open account menu", + "relevant_refs": ["e4"], + "partially_relevant_refs": ["e20"], + "difficulty": "easy", + "tags": ["menu", "action"], + "notes": "Account Menu button or the open menu" + }, + { + "id": "overlay-006", + "query": "select export CSV from menu", + "relevant_refs": ["e31"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["menu", "menuitem", "action"], + "notes": "Export CSV menuitem in Repository Options" + }, + { + "id": "overlay-007", + "query": "choose billing from dropdown", + "relevant_refs": ["e23"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["menu", "menuitem", "synonym"], + "notes": "Billing menuitem in Account Menu" + }, + { + "id": "overlay-008", + "query": "more options for repository", + "relevant_refs": ["e6"], + "partially_relevant_refs": ["e30"], + "difficulty": "medium", + "tags": ["menu", "action"], + "notes": "More Options button in Repository Actions" + }, + { + "id": "overlay-009", + "query": "dismiss notification", + "relevant_refs": ["e42"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["alertdialog", "action", "exact-match"], + "notes": "Dismiss button in notification" + }, + { + "id": "overlay-010", + "query": "save changes in dialog", + "relevant_refs": ["e54"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["dialog", "action", "context-exclusion"], + "notes": "Save button inside Settings dialog, not page" + }, + { + "id": "overlay-011", + "query": "sign out from menu", + "relevant_refs": ["e24"], + "partially_relevant_refs": [], + 
"difficulty": "easy", + "tags": ["menu", "menuitem", "action"], + "notes": "Sign Out menuitem" + }, + { + "id": "overlay-012", + "query": "archive repository", + "relevant_refs": ["e34"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["menu", "menuitem", "action"], + "notes": "Archive menuitem in Repository Options" + }, + { + "id": "overlay-013", + "query": "delete in context menu", + "relevant_refs": ["e94"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["menu", "context-exclusion", "duplicate-labels"], + "notes": "Delete in Context Menu vs other menus" + }, + { + "id": "overlay-014", + "query": "copy from right click menu", + "relevant_refs": ["e92"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["menu", "menuitem", "synonym"], + "notes": "Copy in Context Menu (right-click equivalent)" + }, + { + "id": "overlay-015", + "query": "theme dropdown options", + "relevant_refs": ["e60"], + "partially_relevant_refs": ["e61", "e62", "e63"], + "difficulty": "medium", + "tags": ["combobox", "dialog"], + "notes": "Select Theme combobox in Settings" + }, + { + "id": "overlay-016", + "query": "choose dark theme", + "relevant_refs": ["e62"], + "partially_relevant_refs": ["e52"], + "difficulty": "medium", + "tags": ["option", "combobox"], + "notes": "Dark option in theme dropdown" + }, + { + "id": "overlay-017", + "query": "help in footer not menu", + "relevant_refs": ["e72"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["link", "negative-context", "context-exclusion"], + "notes": "Help link in Footer, excluding menu" + }, + { + "id": "overlay-018", + "query": "settings menuitem", + "relevant_refs": ["e22"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["menu", "menuitem", "duplicate-labels"], + "notes": "Settings in Account Menu, not the nav button" + }, + { + "id": "overlay-019", + "query": "undo notification action", + "relevant_refs": ["e43"], + 
"partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["alertdialog", "action"], + "notes": "Undo button in notification" + }, + { + "id": "overlay-020", + "query": "profile in account dropdown", + "relevant_refs": ["e21"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["menu", "menuitem"], + "notes": "Profile menuitem" + }, + { + "id": "overlay-021", + "query": "delete from repository options menu", + "relevant_refs": ["e35"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["menu", "context-exclusion", "duplicate-labels"], + "notes": "Delete in Repository Options, not Context Menu" + }, + { + "id": "overlay-022", + "query": "enable dark mode checkbox", + "relevant_refs": ["e52"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["dialog", "checkbox"], + "notes": "Dark Mode checkbox in Settings" + }, + { + "id": "overlay-023", + "query": "page save button not dialog", + "relevant_refs": ["e2", "e70"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["button", "negative-context", "context-exclusion"], + "notes": "Save buttons on page, excluding dialogs" + }, + { + "id": "overlay-024", + "query": "paste from clipboard", + "relevant_refs": ["e93"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["menu", "menuitem", "action"], + "notes": "Paste menuitem in Context Menu" + } +] diff --git a/tests/benchmark/corpus/overlays-menus-dialogs/snapshot.json b/tests/benchmark/corpus/overlays-menus-dialogs/snapshot.json new file mode 100644 index 0000000..7622665 --- /dev/null +++ b/tests/benchmark/corpus/overlays-menus-dialogs/snapshot.json @@ -0,0 +1,60 @@ +[ + {"ref": "e1", "role": "button", "name": "Cancel", "parent": null, "section": "Page Header"}, + {"ref": "e2", "role": "button", "name": "Save", "parent": null, "section": "Page Header"}, + {"ref": "e3", "role": "button", "name": "Settings", "parent": null, "section": "Navigation"}, + {"ref": "e4", "role": 
"button", "name": "Account Menu", "parent": null, "section": "Navigation", "expanded": false}, + {"ref": "e5", "role": "link", "name": "Home", "parent": null, "section": "Navigation"}, + {"ref": "e6", "role": "button", "name": "More Options", "parent": null, "section": "Repository Actions"}, + {"ref": "e7", "role": "button", "name": "Delete Repository", "parent": null, "section": "Repository Actions"}, + + {"ref": "e10", "role": "dialog", "name": "Confirm Delete", "parent": null, "modal": true}, + {"ref": "e11", "role": "heading", "name": "Delete Repository?", "parent": "e10", "section": "Confirm Delete"}, + {"ref": "e12", "role": "StaticText", "name": "This action cannot be undone.", "parent": "e10", "section": "Confirm Delete"}, + {"ref": "e13", "role": "button", "name": "Cancel", "parent": "e10", "section": "Confirm Delete"}, + {"ref": "e14", "role": "button", "name": "Delete", "parent": "e10", "section": "Confirm Delete"}, + {"ref": "e15", "role": "button", "name": "Close", "parent": "e10", "section": "Confirm Delete", "description": "Close dialog"}, + + {"ref": "e20", "role": "menu", "name": "Account Menu", "parent": null, "expanded": true}, + {"ref": "e21", "role": "menuitem", "name": "Profile", "parent": "e20", "section": "Account Menu"}, + {"ref": "e22", "role": "menuitem", "name": "Settings", "parent": "e20", "section": "Account Menu"}, + {"ref": "e23", "role": "menuitem", "name": "Billing", "parent": "e20", "section": "Account Menu"}, + {"ref": "e24", "role": "menuitem", "name": "Sign Out", "parent": "e20", "section": "Account Menu"}, + {"ref": "e25", "role": "menuitem", "name": "Help", "parent": "e20", "section": "Account Menu"}, + + {"ref": "e30", "role": "menu", "name": "Repository Options", "parent": null, "expanded": true}, + {"ref": "e31", "role": "menuitem", "name": "Export CSV", "parent": "e30", "section": "Repository Options"}, + {"ref": "e32", "role": "menuitem", "name": "Export JSON", "parent": "e30", "section": "Repository Options"}, + {"ref": 
"e33", "role": "menuitem", "name": "Duplicate", "parent": "e30", "section": "Repository Options"}, + {"ref": "e34", "role": "menuitem", "name": "Archive", "parent": "e30", "section": "Repository Options"}, + {"ref": "e35", "role": "menuitem", "name": "Delete", "parent": "e30", "section": "Repository Options"}, + + {"ref": "e40", "role": "alertdialog", "name": "Notification", "parent": null}, + {"ref": "e41", "role": "StaticText", "name": "Changes saved successfully", "parent": "e40", "section": "Notification"}, + {"ref": "e42", "role": "button", "name": "Dismiss", "parent": "e40", "section": "Notification"}, + {"ref": "e43", "role": "button", "name": "Undo", "parent": "e40", "section": "Notification"}, + + {"ref": "e50", "role": "dialog", "name": "Settings", "parent": null, "modal": true}, + {"ref": "e51", "role": "heading", "name": "Settings", "parent": "e50", "section": "Settings Dialog"}, + {"ref": "e52", "role": "checkbox", "name": "Dark Mode", "parent": "e50", "section": "Settings Dialog"}, + {"ref": "e53", "role": "checkbox", "name": "Notifications", "parent": "e50", "section": "Settings Dialog"}, + {"ref": "e54", "role": "button", "name": "Save", "parent": "e50", "section": "Settings Dialog"}, + {"ref": "e55", "role": "button", "name": "Cancel", "parent": "e50", "section": "Settings Dialog"}, + {"ref": "e56", "role": "button", "name": "Close", "parent": "e50", "section": "Settings Dialog"}, + + {"ref": "e60", "role": "combobox", "name": "Select Theme", "parent": "e50", "section": "Settings Dialog", "expanded": true}, + {"ref": "e61", "role": "option", "name": "Light", "parent": "e60"}, + {"ref": "e62", "role": "option", "name": "Dark", "parent": "e60"}, + {"ref": "e63", "role": "option", "name": "System", "parent": "e60"}, + + {"ref": "e70", "role": "button", "name": "Save", "parent": null, "section": "Footer"}, + {"ref": "e71", "role": "button", "name": "Cancel", "parent": null, "section": "Footer"}, + {"ref": "e72", "role": "link", "name": "Help", 
"parent": null, "section": "Footer"}, + + {"ref": "e80", "role": "tooltip", "name": "Save your changes", "parent": null}, + + {"ref": "e90", "role": "menu", "name": "Context Menu", "parent": null, "expanded": true}, + {"ref": "e91", "role": "menuitem", "name": "Cut", "parent": "e90"}, + {"ref": "e92", "role": "menuitem", "name": "Copy", "parent": "e90"}, + {"ref": "e93", "role": "menuitem", "name": "Paste", "parent": "e90"}, + {"ref": "e94", "role": "menuitem", "name": "Delete", "parent": "e90"} +] diff --git a/tests/benchmark/corpus/table-grid/queries.json b/tests/benchmark/corpus/table-grid/queries.json new file mode 100644 index 0000000..f7aeb28 --- /dev/null +++ b/tests/benchmark/corpus/table-grid/queries.json @@ -0,0 +1,219 @@ +[ + { + "id": "table-001", + "query": "edit Alice", + "relevant_refs": ["e15"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "row-context", "action"], + "notes": "Edit button for Alice Johnson's row" + }, + { + "id": "table-002", + "query": "delete second invoice", + "relevant_refs": ["e26"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["table", "ordinal", "action"], + "notes": "Delete button for second row (Acme Corp)" + }, + { + "id": "table-003", + "query": "status for Acme Corp", + "relevant_refs": ["e24"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "row-context", "cell"], + "notes": "Status cell showing Pending for Acme Corp" + }, + { + "id": "table-004", + "query": "open failed payment row", + "relevant_refs": ["e40"], + "partially_relevant_refs": ["e48"], + "difficulty": "hard", + "tags": ["table", "state", "row-context"], + "notes": "Row with Failed status (Maya Chen)" + }, + { + "id": "table-005", + "query": "download invoice INV-1024", + "relevant_refs": [], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["table", "no-match", "absent-control"], + "expect_no_match": true, + "notes": "Invoice doesn't exist" + }, + { + 
"id": "table-006", + "query": "approve pending request from Maya", + "relevant_refs": [], + "partially_relevant_refs": ["e48"], + "difficulty": "hard", + "tags": ["table", "domain-intent", "row-context"], + "notes": "Maya has Failed status, not Pending. Retry Payment is closest." + }, + { + "id": "table-007", + "query": "sort by due date", + "relevant_refs": ["e60"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["table", "action", "exact-match"], + "notes": "Exact match for sort control" + }, + { + "id": "table-008", + "query": "filter unpaid invoices", + "relevant_refs": ["e62"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "filter", "domain-intent"], + "notes": "Filter by Status combobox for non-paid statuses" + }, + { + "id": "table-009", + "query": "open actions menu for third row", + "relevant_refs": ["e82"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["table", "ordinal", "action"], + "notes": "More button for Bob Smith (third row)" + }, + { + "id": "table-010", + "query": "select all overdue invoices", + "relevant_refs": ["e72"], + "partially_relevant_refs": ["e64"], + "difficulty": "hard", + "tags": ["table", "state", "checkbox"], + "notes": "Only INV-1003 is overdue; select all is partial" + }, + { + "id": "table-011", + "query": "download Bob Smith invoice", + "relevant_refs": ["e37"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "row-context", "action"], + "notes": "Download button for Bob Smith's row" + }, + { + "id": "table-012", + "query": "send reminder to Acme", + "relevant_refs": ["e28"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "row-context", "action"], + "notes": "Send Reminder button for Acme Corp" + }, + { + "id": "table-013", + "query": "retry payment Maya Chen", + "relevant_refs": ["e48"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["table", "row-context", "action"], + 
"notes": "Retry Payment button in Maya's row" + }, + { + "id": "table-014", + "query": "mark overdue as bad debt", + "relevant_refs": ["e39"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "state", "action"], + "notes": "Only the overdue invoice has Mark as Bad Debt" + }, + { + "id": "table-015", + "query": "export table to CSV", + "relevant_refs": ["e63"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["table", "action", "synonym"], + "notes": "Export CSV button" + }, + { + "id": "table-016", + "query": "delete all selected", + "relevant_refs": ["e65"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "bulk-action", "synonym"], + "notes": "Bulk Delete button" + }, + { + "id": "table-017", + "query": "edit first invoice", + "relevant_refs": ["e15"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "ordinal", "action"], + "notes": "Edit button for first row (Alice)" + }, + { + "id": "table-018", + "query": "select invoice 1003", + "relevant_refs": ["e72"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "row-context", "checkbox"], + "notes": "Checkbox for INV-1003" + }, + { + "id": "table-019", + "query": "last row more actions", + "relevant_refs": ["e84"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["table", "ordinal", "action"], + "notes": "More button in last row (Delta Inc)" + }, + { + "id": "table-020", + "query": "search invoices", + "relevant_refs": ["e66"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["table", "searchbox", "exact-match"], + "notes": "Search invoices searchbox" + }, + { + "id": "table-021", + "query": "edit Delta Inc invoice", + "relevant_refs": ["e55"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["table", "row-context", "action"], + "notes": "Edit button for Delta Inc row" + }, + { + "id": "table-022", + "query": "invoice with 
highest amount", + "relevant_refs": ["e20"], + "partially_relevant_refs": ["e23"], + "difficulty": "hard", + "tags": ["table", "domain-intent", "implicit"], + "notes": "Acme Corp has highest amount at $1500" + }, + { + "id": "table-023", + "query": "select checkbox for pending invoice", + "relevant_refs": ["e71"], + "partially_relevant_refs": [], + "difficulty": "hard", + "tags": ["table", "state", "checkbox"], + "notes": "Only INV-1002 (Acme) is pending" + }, + { + "id": "table-024", + "query": "sort invoices by amount", + "relevant_refs": ["e61"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["table", "action", "exact-match"], + "notes": "Sort by Amount button" + } +] diff --git a/tests/benchmark/corpus/table-grid/snapshot.json b/tests/benchmark/corpus/table-grid/snapshot.json new file mode 100644 index 0000000..ade656e --- /dev/null +++ b/tests/benchmark/corpus/table-grid/snapshot.json @@ -0,0 +1,78 @@ +[ + {"ref": "e1", "role": "table", "name": "Invoices", "parent": null}, + {"ref": "e2", "role": "row", "name": "", "parent": "e1", "section": "Invoices"}, + {"ref": "e3", "role": "columnheader", "name": "Invoice", "parent": "e2"}, + {"ref": "e4", "role": "columnheader", "name": "Customer", "parent": "e2"}, + {"ref": "e5", "role": "columnheader", "name": "Amount", "parent": "e2"}, + {"ref": "e6", "role": "columnheader", "name": "Status", "parent": "e2"}, + {"ref": "e7", "role": "columnheader", "name": "Actions", "parent": "e2"}, + + {"ref": "e10", "role": "row", "name": "INV-1001 Alice Johnson $250.00 Paid", "parent": "e1"}, + {"ref": "e11", "role": "cell", "name": "INV-1001", "parent": "e10"}, + {"ref": "e12", "role": "cell", "name": "Alice Johnson", "parent": "e10"}, + {"ref": "e13", "role": "cell", "name": "$250.00", "parent": "e10"}, + {"ref": "e14", "role": "cell", "name": "Paid", "parent": "e10"}, + {"ref": "e15", "role": "button", "name": "Edit", "parent": "e10", "description": "Edit invoice INV-1001"}, + {"ref": "e16", "role": 
"button", "name": "Delete", "parent": "e10", "description": "Delete invoice INV-1001"}, + {"ref": "e17", "role": "button", "name": "Download", "parent": "e10", "description": "Download PDF"}, + + {"ref": "e20", "role": "row", "name": "INV-1002 Acme Corp $1,500.00 Pending", "parent": "e1"}, + {"ref": "e21", "role": "cell", "name": "INV-1002", "parent": "e20"}, + {"ref": "e22", "role": "cell", "name": "Acme Corp", "parent": "e20"}, + {"ref": "e23", "role": "cell", "name": "$1,500.00", "parent": "e20"}, + {"ref": "e24", "role": "cell", "name": "Pending", "parent": "e20"}, + {"ref": "e25", "role": "button", "name": "Edit", "parent": "e20", "description": "Edit invoice INV-1002"}, + {"ref": "e26", "role": "button", "name": "Delete", "parent": "e20", "description": "Delete invoice INV-1002"}, + {"ref": "e27", "role": "button", "name": "Download", "parent": "e20", "description": "Download PDF"}, + {"ref": "e28", "role": "button", "name": "Send Reminder", "parent": "e20"}, + + {"ref": "e30", "role": "row", "name": "INV-1003 Bob Smith $75.00 Overdue", "parent": "e1"}, + {"ref": "e31", "role": "cell", "name": "INV-1003", "parent": "e30"}, + {"ref": "e32", "role": "cell", "name": "Bob Smith", "parent": "e30"}, + {"ref": "e33", "role": "cell", "name": "$75.00", "parent": "e30"}, + {"ref": "e34", "role": "cell", "name": "Overdue", "parent": "e30"}, + {"ref": "e35", "role": "button", "name": "Edit", "parent": "e30", "description": "Edit invoice INV-1003"}, + {"ref": "e36", "role": "button", "name": "Delete", "parent": "e30", "description": "Delete invoice INV-1003"}, + {"ref": "e37", "role": "button", "name": "Download", "parent": "e30", "description": "Download PDF"}, + {"ref": "e38", "role": "button", "name": "Send Reminder", "parent": "e30"}, + {"ref": "e39", "role": "button", "name": "Mark as Bad Debt", "parent": "e30"}, + + {"ref": "e40", "role": "row", "name": "INV-1004 Maya Chen $320.00 Failed", "parent": "e1"}, + {"ref": "e41", "role": "cell", "name": "INV-1004", 
"parent": "e40"}, + {"ref": "e42", "role": "cell", "name": "Maya Chen", "parent": "e40"}, + {"ref": "e43", "role": "cell", "name": "$320.00", "parent": "e40"}, + {"ref": "e44", "role": "cell", "name": "Failed", "parent": "e40"}, + {"ref": "e45", "role": "button", "name": "Edit", "parent": "e40", "description": "Edit invoice INV-1004"}, + {"ref": "e46", "role": "button", "name": "Delete", "parent": "e40", "description": "Delete invoice INV-1004"}, + {"ref": "e47", "role": "button", "name": "Download", "parent": "e40", "description": "Download PDF"}, + {"ref": "e48", "role": "button", "name": "Retry Payment", "parent": "e40"}, + + {"ref": "e50", "role": "row", "name": "INV-1005 Delta Inc $890.00 Paid", "parent": "e1"}, + {"ref": "e51", "role": "cell", "name": "INV-1005", "parent": "e50"}, + {"ref": "e52", "role": "cell", "name": "Delta Inc", "parent": "e50"}, + {"ref": "e53", "role": "cell", "name": "$890.00", "parent": "e50"}, + {"ref": "e54", "role": "cell", "name": "Paid", "parent": "e50"}, + {"ref": "e55", "role": "button", "name": "Edit", "parent": "e50", "description": "Edit invoice INV-1005"}, + {"ref": "e56", "role": "button", "name": "Delete", "parent": "e50", "description": "Delete invoice INV-1005"}, + {"ref": "e57", "role": "button", "name": "Download", "parent": "e50", "description": "Download PDF"}, + + {"ref": "e60", "role": "button", "name": "Sort by Due Date", "parent": null, "section": "Table Controls"}, + {"ref": "e61", "role": "button", "name": "Sort by Amount", "parent": null, "section": "Table Controls"}, + {"ref": "e62", "role": "combobox", "name": "Filter by Status", "parent": null, "section": "Table Controls"}, + {"ref": "e63", "role": "button", "name": "Export CSV", "parent": null, "section": "Table Controls"}, + {"ref": "e64", "role": "checkbox", "name": "Select All", "parent": null, "section": "Table Controls"}, + {"ref": "e65", "role": "button", "name": "Bulk Delete", "parent": null, "section": "Table Controls"}, + {"ref": "e66", "role": 
"searchbox", "name": "Search invoices", "parent": null, "section": "Table Controls"}, + + {"ref": "e70", "role": "checkbox", "name": "Select INV-1001", "parent": "e10"}, + {"ref": "e71", "role": "checkbox", "name": "Select INV-1002", "parent": "e20"}, + {"ref": "e72", "role": "checkbox", "name": "Select INV-1003", "parent": "e30"}, + {"ref": "e73", "role": "checkbox", "name": "Select INV-1004", "parent": "e40"}, + {"ref": "e74", "role": "checkbox", "name": "Select INV-1005", "parent": "e50"}, + + {"ref": "e80", "role": "button", "name": "More", "parent": "e10", "description": "More actions"}, + {"ref": "e81", "role": "button", "name": "More", "parent": "e20", "description": "More actions"}, + {"ref": "e82", "role": "button", "name": "More", "parent": "e30", "description": "More actions"}, + {"ref": "e83", "role": "button", "name": "More", "parent": "e40", "description": "More actions"}, + {"ref": "e84", "role": "button", "name": "More", "parent": "e50", "description": "More actions"} +] diff --git a/tests/benchmark/corpus/visual-layout/queries.json b/tests/benchmark/corpus/visual-layout/queries.json new file mode 100644 index 0000000..6cb0ca2 --- /dev/null +++ b/tests/benchmark/corpus/visual-layout/queries.json @@ -0,0 +1,50 @@ +[ + { + "id": "visual-001", + "query": "button in top right corner", + "relevant_refs": ["e1"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["visual", "position", "directional"] + }, + { + "id": "visual-002", + "query": "button on the left side", + "relevant_refs": ["e0"], + "partially_relevant_refs": ["e3", "e4"], + "difficulty": "medium", + "tags": ["visual", "position", "directional"] + }, + { + "id": "visual-003", + "query": "button at the bottom of the page", + "relevant_refs": ["e6", "e7"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["visual", "position", "directional"] + }, + { + "id": "visual-004", + "query": "link on the left", + "relevant_refs": ["e3", "e4"], + 
"partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["visual", "position", "link"] + }, + { + "id": "visual-005", + "query": "search box in the header", + "relevant_refs": ["e2"], + "partially_relevant_refs": [], + "difficulty": "medium", + "tags": ["visual", "section", "search"] + }, + { + "id": "visual-006", + "query": "settings button top right", + "relevant_refs": ["e1"], + "partially_relevant_refs": [], + "difficulty": "easy", + "tags": ["visual", "position", "name-match"] + } +] diff --git a/tests/benchmark/corpus/visual-layout/snapshot.json b/tests/benchmark/corpus/visual-layout/snapshot.json new file mode 100644 index 0000000..5ee983e --- /dev/null +++ b/tests/benchmark/corpus/visual-layout/snapshot.json @@ -0,0 +1,10 @@ +[ + {"ref": "e0", "role": "button", "name": "Menu", "interactive": true, "section": "Header", "positional": {"x": 20, "y": 20, "width": 80, "height": 32}}, + {"ref": "e1", "role": "button", "name": "Settings", "interactive": true, "section": "Header", "positional": {"x": 900, "y": 20, "width": 80, "height": 32}}, + {"ref": "e2", "role": "searchbox", "name": "Search", "interactive": true, "section": "Header", "positional": {"x": 400, "y": 20, "width": 200, "height": 32}}, + {"ref": "e3", "role": "link", "name": "Help", "interactive": true, "section": "Sidebar", "positional": {"x": 20, "y": 300, "width": 100, "height": 24}}, + {"ref": "e4", "role": "link", "name": "Contact", "interactive": true, "section": "Sidebar", "positional": {"x": 20, "y": 340, "width": 100, "height": 24}}, + {"ref": "e5", "role": "button", "name": "Submit", "interactive": true, "section": "Main", "positional": {"x": 500, "y": 400, "width": 120, "height": 40}}, + {"ref": "e6", "role": "button", "name": "Cancel", "interactive": true, "section": "Footer", "positional": {"x": 400, "y": 700, "width": 80, "height": 32}}, + {"ref": "e7", "role": "button", "name": "Save", "interactive": true, "section": "Footer", "positional": {"x": 500, "y": 700, "width": 80, 
"height": 32}} +] diff --git a/tests/benchmark/scripts/finalize-report.sh b/tests/benchmark/scripts/finalize-report.sh deleted file mode 100755 index 632a923..0000000 --- a/tests/benchmark/scripts/finalize-report.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -# -# Finalize benchmark report and generate summary -# -# Usage: -# ./finalize-report.sh -# -set -euo pipefail - -if [[ $# -lt 1 ]]; then - echo "Usage: $0 " - exit 1 -fi - -REPORT_FILE="$1" -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -# Calculate final metrics -TMP_FILE=$(mktemp) -jq ' - .summary.accuracy = (if .summary.total > 0 then (.summary.passed / .summary.total * 100 | floor / 100) else 0 end) | - .summary.avg_score = (if (.results | length) > 0 then ([.results[].score] | add / length | . * 1000 | floor / 1000) else 0 end) | - .summary.avg_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | add / length | floor) else 0 end) | - .summary.min_score = (if (.results | length) > 0 then ([.results[].score] | min) else 0 end) | - .summary.max_score = (if (.results | length) > 0 then ([.results[].score] | max) else 0 end) | - .summary.min_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | min) else 0 end) | - .summary.max_latency_ms = (if (.results | length) > 0 then ([.results[].latency_ms] | max) else 0 end) -' "${REPORT_FILE}" > "${TMP_FILE}" -mv "${TMP_FILE}" "${REPORT_FILE}" - -# Generate markdown summary -TIMESTAMP=$(jq -r '.benchmark.timestamp' "${REPORT_FILE}") -STRATEGY=$(jq -r '.benchmark.strategy' "${REPORT_FILE}") -VERSION=$(jq -r '.benchmark.version' "${REPORT_FILE}") -TOTAL=$(jq -r '.summary.total' "${REPORT_FILE}") -PASSED=$(jq -r '.summary.passed' "${REPORT_FILE}") -FAILED=$(jq -r '.summary.failed' "${REPORT_FILE}") -SKIPPED=$(jq -r '.summary.skipped' "${REPORT_FILE}") -ACCURACY=$(jq -r '.summary.accuracy' "${REPORT_FILE}") -AVG_SCORE=$(jq -r '.summary.avg_score' "${REPORT_FILE}") -AVG_LATENCY=$(jq -r '.summary.avg_latency_ms' "${REPORT_FILE}") 
-MIN_SCORE=$(jq -r '.summary.min_score' "${REPORT_FILE}") -MAX_SCORE=$(jq -r '.summary.max_score' "${REPORT_FILE}") -MIN_LATENCY=$(jq -r '.summary.min_latency_ms' "${REPORT_FILE}") -MAX_LATENCY=$(jq -r '.summary.max_latency_ms' "${REPORT_FILE}") - -cat > "${SUMMARY_FILE}" << EOF -# Semantic Matching Benchmark Results - -## Benchmark Info - -| Field | Value | -|-------|-------| -| Timestamp | ${TIMESTAMP} | -| Strategy | ${STRATEGY} | -| Version | ${VERSION} | - -## Results Summary - -| Metric | Value | -|--------|-------| -| Total Cases | ${TOTAL} | -| Passed | ${PASSED} | -| Failed | ${FAILED} | -| Skipped | ${SKIPPED} | -| **Accuracy** | **${ACCURACY}%** | - -## Score Distribution - -| Metric | Value | -|--------|-------| -| Average Score | ${AVG_SCORE} | -| Min Score | ${MIN_SCORE} | -| Max Score | ${MAX_SCORE} | - -## Latency - -| Metric | Value | -|--------|-------| -| Average | ${AVG_LATENCY} ms | -| Min | ${MIN_LATENCY} ms | -| Max | ${MAX_LATENCY} ms | - -## Failed Cases - -EOF - -# Add failed cases -jq -r '.results[] | select(.status == "fail") | "| \(.id) | \(.notes) |"' "${REPORT_FILE}" >> "${SUMMARY_FILE}" - -if [[ $(jq '[.results[] | select(.status == "fail")] | length' "${REPORT_FILE}") -eq 0 ]]; then - echo "_No failures_" >> "${SUMMARY_FILE}" -else - # Add header - sed -i.bak '/## Failed Cases/a\ -| ID | Notes |\ -|-----|-------|' "${SUMMARY_FILE}" - rm -f "${SUMMARY_FILE}.bak" -fi - -echo "" -echo "================================================" -echo " BENCHMARK SUMMARY" -echo "================================================" -echo " Strategy: ${STRATEGY}" -echo " Total: ${TOTAL}" -echo " Passed: ${PASSED}" -echo " Failed: ${FAILED}" -echo " Accuracy: ${ACCURACY}%" -echo " Avg Score: ${AVG_SCORE}" -echo " Avg Latency: ${AVG_LATENCY} ms" -echo "================================================" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/record-result.sh 
b/tests/benchmark/scripts/record-result.sh deleted file mode 100755 index 2288f7c..0000000 --- a/tests/benchmark/scripts/record-result.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# -# Record a benchmark result -# -# Usage: -# ./record-result.sh "notes" -# -set -euo pipefail - -if [[ $# -lt 5 ]]; then - echo "Usage: $0 [notes]" - exit 1 -fi - -REPORT_FILE="$1" -ID="$2" -STATUS="$3" -SCORE="$4" -LATENCY_MS="$5" -NOTES="${6:-}" -TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) - -# Create result entry -RESULT_JSON=$(jq -n \ - --arg id "${ID}" \ - --arg status "${STATUS}" \ - --argjson score "${SCORE}" \ - --argjson latency "${LATENCY_MS}" \ - --arg notes "${NOTES}" \ - --arg ts "${TIMESTAMP}" \ - '{id: $id, status: $status, score: $score, latency_ms: $latency, notes: $notes, timestamp: $ts}') - -# Append to report -TMP_FILE=$(mktemp) -jq --argjson result "${RESULT_JSON}" \ - --arg status "${STATUS}" \ - '.results += [$result] | - .summary.total += 1 | - if $status == "pass" then .summary.passed += 1 - elif $status == "fail" then .summary.failed += 1 - else .summary.skipped += 1 end' \ - "${REPORT_FILE}" > "${TMP_FILE}" - -mv "${TMP_FILE}" "${REPORT_FILE}" diff --git a/tests/benchmark/scripts/run-benchmark.sh b/tests/benchmark/scripts/run-benchmark.sh deleted file mode 100755 index a8b4492..0000000 --- a/tests/benchmark/scripts/run-benchmark.sh +++ /dev/null @@ -1,194 +0,0 @@ -#!/bin/bash -# -# Run semantic matching benchmark -# -# Usage: -# ./run-benchmark.sh [--strategy ] [--cases ] -# -# Options: -# --strategy Strategy to benchmark (lexical, embedding, combined, all) -# --cases Specific case file to run (default: all) -# --output Output directory (default: ../results) -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." 
-CASES_DIR="${BENCHMARK_DIR}/cases" -CONFIG_FILE="${BENCHMARK_DIR}/config/benchmark.json" -SNAPSHOTS_DIR="${BENCHMARK_DIR}/../e2e/assets/snapshots" -RESULTS_DIR="${BENCHMARK_DIR}/results" - -# Parse args -STRATEGY="combined" -CASE_FILE="" -while [[ $# -gt 0 ]]; do - case "$1" in - --strategy) STRATEGY="$2"; shift 2 ;; - --cases) CASE_FILE="$2"; shift 2 ;; - --output) RESULTS_DIR="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.json" - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg strategy "${STRATEGY}" \ - --arg version "$(${SEMANTIC} --version 2>/dev/null || echo 'dev')" \ - '{ - benchmark: { - timestamp: $ts, - strategy: $strategy, - version: $version - }, - results: [], - summary: { - total: 0, - passed: 0, - failed: 0, - skipped: 0, - accuracy: 0, - avg_score: 0, - avg_latency_ms: 0 - } - }' > "${REPORT_FILE}" - -# Run cases -run_case() { - local case_file="$1" - local case_name - case_name=$(basename "$case_file" .json) - - echo "" - echo "=== Running: ${case_name} ===" - - local count - count=$(jq length "$case_file") - - for i in $(seq 0 $((count - 1))); do - local id query snapshot expect_ref expect_ref_alt expect_no_match expect_no_crash expect_has_matches threshold min_score - - id=$(jq -r ".[$i].id" "$case_file") - query=$(jq -r ".[$i].query" "$case_file") - snapshot=$(jq -r ".[$i].snapshot" "$case_file") - expect_ref=$(jq -r ".[$i].expect_ref // empty" "$case_file") - expect_ref_alt=$(jq -r ".[$i].expect_ref_alt // [] | join(\",\")" "$case_file") - expect_no_match=$(jq -r ".[$i].expect_no_match // false" "$case_file") - expect_no_crash=$(jq -r ".[$i].expect_no_crash // false" "$case_file") - 
expect_has_matches=$(jq -r ".[$i].expect_has_matches // false" "$case_file") - threshold=$(jq -r ".[$i].threshold // 0.3" "$case_file") - min_score=$(jq -r ".[$i].min_score // 0" "$case_file") - - local snapshot_path="${SNAPSHOTS_DIR}/${snapshot}" - if [[ ! -f "${snapshot_path}" ]]; then - echo " [${id}] SKIP: snapshot not found: ${snapshot}" - "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "skip" 0 0 "snapshot not found" - continue - fi - - # Run query and measure time - local start_ms end_ms duration_ms result exit_code - start_ms=$(python3 -c 'import time; print(int(time.time() * 1000))') - - set +e - result=$("${SEMANTIC}" find "${query}" \ - --snapshot "${snapshot_path}" \ - --strategy "${STRATEGY}" \ - --threshold "${threshold}" \ - --format json 2>&1) - exit_code=$? - set -e - - end_ms=$(python3 -c 'import time; print(int(time.time() * 1000))') - duration_ms=$((end_ms - start_ms)) - - # Evaluate result - local status="fail" - local got_ref="" - local got_score=0 - local notes="" - - if [[ ${exit_code} -ne 0 ]]; then - if [[ "${expect_no_crash}" == "true" ]]; then - # Some crashes are expected (empty query, etc) - status="pass" - notes="exit ${exit_code} (expected)" - else - notes="exit ${exit_code}: ${result}" - fi - else - got_ref=$(echo "$result" | jq -r '.best_ref // empty') - got_score=$(echo "$result" | jq -r '.best_score // 0') - local match_count - match_count=$(echo "$result" | jq -r '.matches | length') - - if [[ "${expect_no_match}" == "true" ]]; then - if [[ ${match_count} -eq 0 ]]; then - status="pass" - notes="no matches (expected)" - else - notes="expected no matches, got ${match_count}" - fi - elif [[ "${expect_has_matches}" == "true" ]]; then - if [[ ${match_count} -gt 0 ]]; then - status="pass" - notes="${match_count} matches" - else - notes="expected matches, got 0" - fi - elif [[ -n "${expect_ref}" ]]; then - if [[ "${got_ref}" == "${expect_ref}" ]]; then - status="pass" - notes="ref=${got_ref}, score=${got_score}" - elif [[ -n 
"${expect_ref_alt}" ]] && echo ",${expect_ref_alt}," | grep -q ",${got_ref},"; then - status="pass" - notes="ref=${got_ref} (alt), score=${got_score}" - else - notes="got ${got_ref}, want ${expect_ref}" - fi - elif [[ "${expect_no_crash}" == "true" ]]; then - status="pass" - notes="no crash" - fi - fi - - # Record result - "${SCRIPT_DIR}/record-result.sh" "${REPORT_FILE}" "${id}" "${status}" "${got_score}" "${duration_ms}" "${notes}" - - if [[ "${status}" == "pass" ]]; then - echo " [${id}] PASS: ${notes}" - else - echo " [${id}] FAIL: ${notes}" - fi - done -} - -# Find case files -if [[ -n "${CASE_FILE}" ]]; then - run_case "${CASES_DIR}/${CASE_FILE}" -else - for case_file in "${CASES_DIR}"/*.json; do - [[ -f "$case_file" ]] || continue - run_case "$case_file" - done -fi - -# Finalize report -"${SCRIPT_DIR}/finalize-report.sh" "${REPORT_FILE}" - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo "Benchmark complete: ${REPORT_FILE}" diff --git a/tests/benchmark/scripts/run-corpus-benchmark.sh b/tests/benchmark/scripts/run-corpus-benchmark.sh deleted file mode 100755 index 67aca0c..0000000 --- a/tests/benchmark/scripts/run-corpus-benchmark.sh +++ /dev/null @@ -1,344 +0,0 @@ -#!/bin/bash -# -# Run semantic matching benchmark with ranking metrics -# -# Usage: -# ./run-corpus-benchmark.sh [--strategy ] [--corpus ] -# -# Metrics: -# - MRR (Mean Reciprocal Rank) -# - P@1 (Precision at 1) -# - P@3 (Precision at 3) -# - Latency distribution (p50, p95, p99) -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." 
-CORPUS_DIR="${BENCHMARK_DIR}/corpus" -RESULTS_DIR="${BENCHMARK_DIR}/results" - -# Parse args -STRATEGY="combined" -SPECIFIC_CORPUS="" -TOP_K=5 -while [[ $# -gt 0 ]]; do - case "$1" in - --strategy) STRATEGY="$2"; shift 2 ;; - --corpus) SPECIFIC_CORPUS="$2"; shift 2 ;; - --top-k) TOP_K="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/corpus_${STRATEGY}_${TIMESTAMP}.json" - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --arg strategy "${STRATEGY}" \ - --argjson top_k "${TOP_K}" \ - '{ - benchmark: { - timestamp: $ts, - strategy: $strategy, - top_k: $top_k, - type: "corpus" - }, - results: [], - metrics: { - total: 0, - mrr: 0, - p_at_1: 0, - p_at_3: 0, - latencies_ms: [], - by_difficulty: {}, - by_tag: {} - } - }' > "${REPORT_FILE}" - -# Arrays to collect metrics -declare -a ALL_RRS=() -declare -a ALL_P1=() -declare -a ALL_P3=() -declare -a ALL_LATENCIES=() - -run_corpus() { - local corpus_path="$1" - local corpus_name - corpus_name=$(basename "$corpus_path") - - local snapshot="${corpus_path}/snapshot.json" - local queries="${corpus_path}/queries.json" - - if [[ ! -f "$snapshot" ]] || [[ ! 
-f "$queries" ]]; then - echo " Skipping ${corpus_name}: missing files" - return - fi - - echo "" - echo "=== Corpus: ${corpus_name} ===" - - local count - count=$(jq length "$queries") - - for i in $(seq 0 $((count - 1))); do - local id query relevant_refs partial_refs difficulty tags - - id=$(jq -r ".[$i].id" "$queries") - query=$(jq -r ".[$i].query" "$queries") - relevant_refs=$(jq -c ".[$i].relevant_refs" "$queries") - partial_refs=$(jq -c ".[$i].partially_relevant_refs // []" "$queries") - difficulty=$(jq -r ".[$i].difficulty // \"medium\"" "$queries") - tags=$(jq -c ".[$i].tags // []" "$queries") - - # Run query and measure time - local start_ns end_ns duration_ms result - start_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))') - - result=$("${SEMANTIC}" find "${query}" \ - --snapshot "${snapshot}" \ - --strategy "${STRATEGY}" \ - --threshold 0.01 \ - --top-k "${TOP_K}" \ - --format json 2>/dev/null || echo '{"matches":[]}') - - end_ns=$(python3 -c 'import time; print(int(time.time() * 1000000))') - duration_ms=$(( (end_ns - start_ns) / 1000 )) - - # Extract results - local matches best_ref best_score - matches=$(echo "$result" | jq -c '[.matches[].ref]') - best_ref=$(echo "$result" | jq -r '.best_ref // ""') - best_score=$(echo "$result" | jq -r '.best_score // 0') - - # Calculate Reciprocal Rank - local rr=0 - for rank in $(seq 1 ${TOP_K}); do - local ref_at_rank - ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"") - if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - rr=$(echo "scale=4; 1 / ${rank}" | bc) - break - fi - done - - # Calculate P@1 - local p1=0 - if echo "$relevant_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then - p1=1 - elif echo "$partial_refs" | jq -e "index(\"${best_ref}\")" > /dev/null 2>&1; then - p1=0.5 - fi - - # Calculate P@3 (count relevant in top 3, partials count as 0.5) - local relevant_in_top3=0 - local partial_in_top3=0 - for rank in 1 2 3; do 
- local ref_at_rank - ref_at_rank=$(echo "$result" | jq -r ".matches[$((rank-1))].ref // \"\"") - if echo "$relevant_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - relevant_in_top3=$((relevant_in_top3 + 1)) - elif echo "$partial_refs" | jq -e "index(\"${ref_at_rank}\")" > /dev/null 2>&1; then - partial_in_top3=$((partial_in_top3 + 1)) - fi - done - local p3 - p3=$(echo "scale=4; (${relevant_in_top3} + ${partial_in_top3} * 0.5) / 3" | bc) - - # Collect metrics - ALL_RRS+=("$rr") - ALL_P1+=("$p1") - ALL_P3+=("$p3") - ALL_LATENCIES+=("$duration_ms") - - # Status indicator - local status="MISS" - if (( $(echo "$p1 >= 1" | bc -l) )); then - status="HIT " - elif (( $(echo "$p1 >= 0.5" | bc -l) )); then - status="PART" - fi - - printf " [%s] %s | RR=%.2f P@1=%.1f P@3=%.2f | %dms | %s\n" \ - "$id" "$status" "$rr" "$p1" "$p3" "$duration_ms" "$query" - - # Record to report - local result_json - result_json=$(jq -n \ - --arg id "$id" \ - --arg query "$query" \ - --arg corpus "$corpus_name" \ - --arg difficulty "$difficulty" \ - --argjson tags "$tags" \ - --arg best_ref "$best_ref" \ - --argjson best_score "$best_score" \ - --argjson matches "$matches" \ - --argjson relevant "$relevant_refs" \ - --argjson rr "$rr" \ - --argjson p1 "$p1" \ - --argjson p3 "$p3" \ - --argjson latency "$duration_ms" \ - '{ - id: $id, query: $query, corpus: $corpus, - difficulty: $difficulty, tags: $tags, - best_ref: $best_ref, best_score: $best_score, - matches: $matches, relevant_refs: $relevant, - rr: $rr, p_at_1: $p1, p_at_3: $p3, - latency_ms: $latency - }') - - # Append to report - local tmp - tmp=$(mktemp) - jq --argjson r "$result_json" '.results += [$r]' "$REPORT_FILE" > "$tmp" - mv "$tmp" "$REPORT_FILE" - done -} - -# Run benchmarks -if [[ -n "${SPECIFIC_CORPUS}" ]]; then - run_corpus "${CORPUS_DIR}/${SPECIFIC_CORPUS}" -else - for corpus in "${CORPUS_DIR}"/*/; do - [[ -d "$corpus" ]] || continue - run_corpus "$corpus" - done -fi - -# Calculate aggregate metrics -echo 
"" -echo "Calculating aggregate metrics..." - -TOTAL=${#ALL_RRS[@]} -if [[ $TOTAL -eq 0 ]]; then - echo "No results to aggregate" - exit 1 -fi - -# MRR -MRR=$(printf '%s\n' "${ALL_RRS[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# P@1 -P1=$(printf '%s\n' "${ALL_P1[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# P@3 -P3=$(printf '%s\n' "${ALL_P3[@]}" | awk '{s+=$1} END {printf "%.4f", s/NR}') - -# Latency percentiles -SORTED_LAT=($(printf '%s\n' "${ALL_LATENCIES[@]}" | sort -n)) -P50_IDX=$(( TOTAL * 50 / 100 )) -P95_IDX=$(( TOTAL * 95 / 100 )) -P99_IDX=$(( TOTAL * 99 / 100 )) -LAT_P50=${SORTED_LAT[$P50_IDX]:-0} -LAT_P95=${SORTED_LAT[$P95_IDX]:-0} -LAT_P99=${SORTED_LAT[$P99_IDX]:-0} -LAT_AVG=$(printf '%s\n' "${ALL_LATENCIES[@]}" | awk '{s+=$1} END {printf "%.0f", s/NR}') - -# Update report with aggregates -tmp=$(mktemp) -jq \ - --argjson total "$TOTAL" \ - --argjson mrr "$MRR" \ - --argjson p1 "$P1" \ - --argjson p3 "$P3" \ - --argjson lat_avg "$LAT_AVG" \ - --argjson lat_p50 "$LAT_P50" \ - --argjson lat_p95 "$LAT_P95" \ - --argjson lat_p99 "$LAT_P99" \ - '.metrics = { - total: $total, - mrr: $mrr, - p_at_1: $p1, - p_at_3: $p3, - latency_avg_ms: $lat_avg, - latency_p50_ms: $lat_p50, - latency_p95_ms: $lat_p95, - latency_p99_ms: $lat_p99 - }' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Add by-difficulty breakdown -tmp=$(mktemp) -jq '.metrics.by_difficulty = ( - .results | group_by(.difficulty) | map({ - key: .[0].difficulty, - value: { - count: length, - mrr: ([.[].rr] | add / length), - p_at_1: ([.[].p_at_1] | add / length) - } - }) | from_entries -)' "$REPORT_FILE" > "$tmp" -mv "$tmp" "$REPORT_FILE" - -# Generate summary -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" - -cat > "${SUMMARY_FILE}" << EOF -# Semantic Matching Benchmark Results - -## Configuration - -| Field | Value | -|-------|-------| -| Timestamp | $(jq -r '.benchmark.timestamp' "$REPORT_FILE") | -| Strategy | ${STRATEGY} | -| Top-K | ${TOP_K} | -| Total Queries | ${TOTAL} | - 
-## Ranking Metrics - -| Metric | Value | Description | -|--------|-------|-------------| -| **MRR** | **${MRR}** | Mean Reciprocal Rank | -| **P@1** | **${P1}** | Precision at rank 1 | -| **P@3** | **${P3}** | Precision at rank 3 | - -## Latency - -| Percentile | Value | -|------------|-------| -| Average | ${LAT_AVG} ms | -| P50 | ${LAT_P50} ms | -| P95 | ${LAT_P95} ms | -| P99 | ${LAT_P99} ms | - -## By Difficulty - -$(jq -r '.metrics.by_difficulty | to_entries | .[] | "| \(.key) | \(.value.count) queries | MRR: \(.value.mrr | . * 100 | floor / 100) | P@1: \(.value.p_at_1 | . * 100 | floor / 100) |"' "$REPORT_FILE") - -## Misses (P@1 = 0) - -| ID | Query | Got | Expected | -|----|-------|-----|----------| -$(jq -r '.results[] | select(.p_at_1 == 0) | "| \(.id) | \(.query) | \(.best_ref) | \(.relevant_refs | join(",")) |"' "$REPORT_FILE") - -EOF - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo "================================================" -echo " CORPUS BENCHMARK RESULTS" -echo "================================================" -echo " Strategy: ${STRATEGY}" -echo " Queries: ${TOTAL}" -echo " MRR: ${MRR}" -echo " P@1: ${P1}" -echo " P@3: ${P3}" -echo " Latency P50: ${LAT_P50} ms" -echo " Latency P95: ${LAT_P95} ms" -echo "================================================" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/benchmark/scripts/run-full-benchmark.sh b/tests/benchmark/scripts/run-full-benchmark.sh deleted file mode 100755 index 89db077..0000000 --- a/tests/benchmark/scripts/run-full-benchmark.sh +++ /dev/null @@ -1,283 +0,0 @@ -#!/bin/bash -# -# Full semantic benchmark: Find + Recovery + Classification -# -# Produces a composite score for overall system health. -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -BENCHMARK_DIR="${SCRIPT_DIR}/.." 
-CORPUS_DIR="${BENCHMARK_DIR}/corpus" -RESULTS_DIR="${BENCHMARK_DIR}/results" - -mkdir -p "${RESULTS_DIR}" - -# Build semantic binary with recovery support -echo "Building semantic..." -(cd "${BENCHMARK_DIR}/../.." && go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -SEMANTIC="${BENCHMARK_DIR}/semantic" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -REPORT_FILE="${RESULTS_DIR}/full_benchmark_${TIMESTAMP}.json" - -has_role_keyword() { - local query="$1" - echo "$query" | grep -Eiq '(^|[^[:alnum:]])(button|input|link|textbox|checkbox|radio|select|option|tab|menu|form|search)([^[:alnum:]]|$)' -} - -enrich_recovery_query() { - local query="$1" - local role="$2" - - if [[ -z "$query" || -z "$role" ]]; then - printf '%s' "$query" - return - fi - if has_role_keyword "$query"; then - printf '%s' "$query" - return - fi - printf '%s %s' "$query" "$role" -} - -# Initialize report -jq -n \ - --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - '{ - timestamp: $ts, - find: { total: 0, mrr: 0, p_at_1: 0, latency_p50: 0 }, - recovery: { total: 0, recovered: 0, rate: 0 }, - classification: { total: 0, correct: 0, accuracy: 0 }, - composite: { score: 0, grade: "" } - }' > "${REPORT_FILE}" - -echo "" -echo "==============================================" -echo " PHASE 1: FIND BENCHMARK" -echo "==============================================" - -# Run corpus benchmark and capture metrics -FIND_OUTPUT=$("${SCRIPT_DIR}/run-corpus-benchmark.sh" 2>&1) -echo "$FIND_OUTPUT" - -# Extract metrics from output -FIND_MRR=$(echo "$FIND_OUTPUT" | grep "MRR:" | tail -1 | awk '{print $2}') -FIND_P1=$(echo "$FIND_OUTPUT" | grep "P@1:" | tail -1 | awk '{print $2}') -FIND_TOTAL=$(echo "$FIND_OUTPUT" | grep "Queries:" | tail -1 | awk '{print $2}') -FIND_LAT=$(echo "$FIND_OUTPUT" | grep "Latency P50:" | tail -1 | awk '{print $3}') - -# Rebuild semantic binary (corpus benchmark deletes it) -(cd "${BENCHMARK_DIR}/../.." 
&& go build -o "${BENCHMARK_DIR}/semantic" ./cmd/semantic) - -echo "" -echo "==============================================" -echo " PHASE 2: RECOVERY BENCHMARK" -echo "==============================================" - -SCENARIOS_FILE="${CORPUS_DIR}/recovery-scenarios/scenarios.json" -RECOVERY_TOTAL=0 -RECOVERY_SUCCESS=0 - -if [[ -f "$SCENARIOS_FILE" ]]; then - SCENARIO_COUNT=$(jq length "$SCENARIOS_FILE") - - for i in $(seq 0 $((SCENARIO_COUNT - 1))); do - ID=$(jq -r ".[$i].id" "$SCENARIOS_FILE") - NAME=$(jq -r ".[$i].name" "$SCENARIOS_FILE") - RAW_QUERY=$(jq -r ".[$i].original_query" "$SCENARIOS_FILE") - ORIGINAL_REF=$(jq -r ".[$i].original_ref // empty" "$SCENARIOS_FILE") - ORIGINAL_ROLE=$(jq -r ".[$i].before[]? | select(.ref == \"$ORIGINAL_REF\") | .role // empty" "$SCENARIOS_FILE") - QUERY=$(enrich_recovery_query "$RAW_QUERY" "$ORIGINAL_ROLE") - EXPECTED=$(jq -r ".[$i].expected_ref // empty" "$SCENARIOS_FILE") - EXPECTED_ALT=$(jq -r ".[$i].expected_alt // [] | join(\",\")" "$SCENARIOS_FILE") - EXPECT_NO_MATCH=$(jq -r ".[$i].expect_no_match // false" "$SCENARIOS_FILE") - - # Write after snapshot to temp file - AFTER_FILE=$(mktemp) - jq ".[$i].after" "$SCENARIOS_FILE" > "$AFTER_FILE" - - # Run semantic find on after snapshot with the same minimum score - # enforced by DefaultRecoveryConfig in the recovery engine. 
- RESULT=$("${SEMANTIC}" find "$QUERY" --snapshot "$AFTER_FILE" --format json --threshold 0.52 2>/dev/null || echo '{"matches":[]}') - BEST_REF=$(echo "$RESULT" | jq -r '.best_ref // ""') - - rm -f "$AFTER_FILE" - - RECOVERY_TOTAL=$((RECOVERY_TOTAL + 1)) - STATUS="FAIL" - - if [[ "$EXPECT_NO_MATCH" == "true" ]]; then - if [[ -z "$BEST_REF" ]] || [[ "$BEST_REF" == "null" ]]; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - fi - elif [[ "$BEST_REF" == "$EXPECTED" ]]; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - elif [[ -n "$EXPECTED_ALT" ]] && echo ",$EXPECTED_ALT," | grep -q ",$BEST_REF,"; then - STATUS="PASS" - RECOVERY_SUCCESS=$((RECOVERY_SUCCESS + 1)) - fi - - printf " [%s] %s | %s | got=%s want=%s\n" "$ID" "$STATUS" "$NAME" "$BEST_REF" "$EXPECTED" - done -fi - -RECOVERY_RATE=0 -if [[ $RECOVERY_TOTAL -gt 0 ]]; then - RECOVERY_RATE=$(echo "scale=4; $RECOVERY_SUCCESS / $RECOVERY_TOTAL" | bc) -fi - -echo "" -echo " Recovery: $RECOVERY_SUCCESS / $RECOVERY_TOTAL = $RECOVERY_RATE" - -echo "" -echo "==============================================" -echo " PHASE 3: CLASSIFICATION BENCHMARK" -echo "==============================================" - -CLASS_FILE="${CORPUS_DIR}/classification/cases.json" -CLASS_TOTAL=0 -CLASS_CORRECT=0 - -if [[ -f "$CLASS_FILE" ]]; then - CLASS_COUNT=$(jq length "$CLASS_FILE") - - for i in $(seq 0 $((CLASS_COUNT - 1))); do - ID=$(jq -r ".[$i].id" "$CLASS_FILE") - ERROR=$(jq -r ".[$i].error" "$CLASS_FILE") - EXPECTED=$(jq -r ".[$i].expected_type" "$CLASS_FILE") - - # Run semantic classify (extract just the type, first word) - RESULT=$("${SEMANTIC}" classify "$ERROR" 2>/dev/null || echo "unknown") - GOT=$(echo "$RESULT" | awk '{print $1}') - - CLASS_TOTAL=$((CLASS_TOTAL + 1)) - STATUS="FAIL" - - if [[ "$GOT" == "$EXPECTED" ]]; then - STATUS="PASS" - CLASS_CORRECT=$((CLASS_CORRECT + 1)) - fi - - printf " [%s] %s | \"%s\" → %s (want %s)\n" "$ID" "$STATUS" "${ERROR:0:40}" "$GOT" "$EXPECTED" - done -fi - 
-CLASS_ACCURACY=0 -if [[ $CLASS_TOTAL -gt 0 ]]; then - CLASS_ACCURACY=$(echo "scale=4; $CLASS_CORRECT / $CLASS_TOTAL" | bc) -fi - -echo "" -echo " Classification: $CLASS_CORRECT / $CLASS_TOTAL = $CLASS_ACCURACY" - -echo "" -echo "==============================================" -echo " COMPOSITE SCORE" -echo "==============================================" - -# Calculate composite score with weights: -# Find P@1: 40% -# Find MRR: 20% -# Recovery Rate: 25% -# Classification: 15% - -COMPOSITE=$(echo "scale=4; \ - ($FIND_P1 * 0.40) + \ - ($FIND_MRR * 0.20) + \ - ($RECOVERY_RATE * 0.25) + \ - ($CLASS_ACCURACY * 0.15)" | bc) - -# Assign grade -GRADE="F" -if (( $(echo "$COMPOSITE >= 0.95" | bc -l) )); then GRADE="A+" -elif (( $(echo "$COMPOSITE >= 0.90" | bc -l) )); then GRADE="A" -elif (( $(echo "$COMPOSITE >= 0.85" | bc -l) )); then GRADE="B+" -elif (( $(echo "$COMPOSITE >= 0.80" | bc -l) )); then GRADE="B" -elif (( $(echo "$COMPOSITE >= 0.75" | bc -l) )); then GRADE="C+" -elif (( $(echo "$COMPOSITE >= 0.70" | bc -l) )); then GRADE="C" -elif (( $(echo "$COMPOSITE >= 0.60" | bc -l) )); then GRADE="D" -fi - -# Update report -TMP=$(mktemp) -jq \ - --argjson find_total "${FIND_TOTAL:-0}" \ - --argjson find_mrr "${FIND_MRR:-0}" \ - --argjson find_p1 "${FIND_P1:-0}" \ - --argjson find_lat "${FIND_LAT:-0}" \ - --argjson rec_total "$RECOVERY_TOTAL" \ - --argjson rec_success "$RECOVERY_SUCCESS" \ - --argjson rec_rate "$RECOVERY_RATE" \ - --argjson class_total "$CLASS_TOTAL" \ - --argjson class_correct "$CLASS_CORRECT" \ - --argjson class_acc "$CLASS_ACCURACY" \ - --argjson composite "$COMPOSITE" \ - --arg grade "$GRADE" \ - '.find = { total: $find_total, mrr: $find_mrr, p_at_1: $find_p1, latency_p50: $find_lat } | - .recovery = { total: $rec_total, recovered: $rec_success, rate: $rec_rate } | - .classification = { total: $class_total, correct: $class_correct, accuracy: $class_acc } | - .composite = { score: $composite, grade: $grade }' \ - "$REPORT_FILE" > "$TMP" -mv "$TMP" 
"$REPORT_FILE" - -# Generate summary -SUMMARY_FILE="${REPORT_FILE%.json}_summary.md" -cat > "$SUMMARY_FILE" << EOF -# Semantic Benchmark Report - -## Composite Score: ${COMPOSITE} (${GRADE}) - -| Component | Weight | Score | Weighted | -|-----------|--------|-------|----------| -| Find P@1 | 40% | ${FIND_P1:-0} | $(echo "scale=3; ${FIND_P1:-0} * 0.40" | bc) | -| Find MRR | 20% | ${FIND_MRR:-0} | $(echo "scale=3; ${FIND_MRR:-0} * 0.20" | bc) | -| Recovery | 25% | ${RECOVERY_RATE} | $(echo "scale=3; ${RECOVERY_RATE} * 0.25" | bc) | -| Classification | 15% | ${CLASS_ACCURACY} | $(echo "scale=3; ${CLASS_ACCURACY} * 0.15" | bc) | - -## Find Performance -- Queries: ${FIND_TOTAL:-0} -- MRR: ${FIND_MRR:-0} -- P@1: ${FIND_P1:-0} -- Latency P50: ${FIND_LAT:-0} ms - -## Recovery Performance -- Scenarios: ${RECOVERY_TOTAL} -- Recovered: ${RECOVERY_SUCCESS} -- Rate: ${RECOVERY_RATE} - -## Classification Performance -- Cases: ${CLASS_TOTAL} -- Correct: ${CLASS_CORRECT} -- Accuracy: ${CLASS_ACCURACY} - -## Grade Scale -| Grade | Score | -|-------|-------| -| A+ | >= 0.95 | -| A | >= 0.90 | -| B+ | >= 0.85 | -| B | >= 0.80 | -| C+ | >= 0.75 | -| C | >= 0.70 | -| D | >= 0.60 | -| F | < 0.60 | -EOF - -# Cleanup -rm -f "${BENCHMARK_DIR}/semantic" - -echo "" -echo " ┌─────────────────────────────────────────┐" -echo " │ COMPOSITE SCORE: ${COMPOSITE} GRADE: ${GRADE} │" -echo " ├─────────────────────────────────────────┤" -echo " │ Find P@1: ${FIND_P1:-0} (40%) │" -echo " │ Find MRR: ${FIND_MRR:-0} (20%) │" -echo " │ Recovery: ${RECOVERY_RATE} (25%) │" -echo " │ Classification: ${CLASS_ACCURACY} (15%) │" -echo " └─────────────────────────────────────────┘" -echo "" -echo "Report: ${REPORT_FILE}" -echo "Summary: ${SUMMARY_FILE}" diff --git a/tests/e2e/assets/snapshots/visual-layout.json b/tests/e2e/assets/snapshots/visual-layout.json new file mode 100644 index 0000000..5ee983e --- /dev/null +++ b/tests/e2e/assets/snapshots/visual-layout.json @@ -0,0 +1,10 @@ +[ + {"ref": "e0", "role": 
"button", "name": "Menu", "interactive": true, "section": "Header", "positional": {"x": 20, "y": 20, "width": 80, "height": 32}}, + {"ref": "e1", "role": "button", "name": "Settings", "interactive": true, "section": "Header", "positional": {"x": 900, "y": 20, "width": 80, "height": 32}}, + {"ref": "e2", "role": "searchbox", "name": "Search", "interactive": true, "section": "Header", "positional": {"x": 400, "y": 20, "width": 200, "height": 32}}, + {"ref": "e3", "role": "link", "name": "Help", "interactive": true, "section": "Sidebar", "positional": {"x": 20, "y": 300, "width": 100, "height": 24}}, + {"ref": "e4", "role": "link", "name": "Contact", "interactive": true, "section": "Sidebar", "positional": {"x": 20, "y": 340, "width": 100, "height": 24}}, + {"ref": "e5", "role": "button", "name": "Submit", "interactive": true, "section": "Main", "positional": {"x": 500, "y": 400, "width": 120, "height": 40}}, + {"ref": "e6", "role": "button", "name": "Cancel", "interactive": true, "section": "Footer", "positional": {"x": 400, "y": 700, "width": 80, "height": 32}}, + {"ref": "e7", "role": "button", "name": "Save", "interactive": true, "section": "Footer", "positional": {"x": 500, "y": 700, "width": 80, "height": 32}} +] diff --git a/tests/e2e/cases/14-find-ordinal.sh b/tests/e2e/cases/14-find-ordinal.sh new file mode 100755 index 0000000..c0f1819 --- /dev/null +++ b/tests/e2e/cases/14-find-ordinal.sh @@ -0,0 +1,22 @@ +#!/bin/bash +CASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${CASE_DIR}/../lib.sh" + +echo " -- Find: Ordinal Queries --" + +MULTI="${ASSETS_DIR}/snapshots/multi-form.json" + +result=$(semantic find "second submit button" --snapshot "$MULTI" --format json) +assert_json_field "$result" ".best_ref" "e7" "ordinal: second submit button → e7" + +result=$(semantic find "last submit button" --snapshot "$MULTI" --format json) +assert_json_field "$result" ".best_ref" "e11" "ordinal: last submit button → e11" + +result=$(semantic find "second 
submit button not in login" --snapshot "$MULTI" --format json) +assert_json_field "$result" ".best_ref" "e11" "ordinal+context: second submit button not in login → e11" + +LOGIN="${ASSETS_DIR}/snapshots/login-page.json" +result=$(semantic find "email address" --snapshot "$LOGIN" --format json) +assert_json_field "$result" ".best_ref" "e1" "guard: literal query still resolves email address → e1" + +summary "find-ordinal" diff --git a/tests/e2e/cases/15-find-visual.sh b/tests/e2e/cases/15-find-visual.sh new file mode 100755 index 0000000..42ff027 --- /dev/null +++ b/tests/e2e/cases/15-find-visual.sh @@ -0,0 +1,21 @@ +#!/bin/bash +CASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${CASE_DIR}/../lib.sh" + +echo " -- Find: Visual Position Hints --" + +VISUAL="${ASSETS_DIR}/snapshots/visual-layout.json" + +result=$(semantic find "button in top right" --snapshot "$VISUAL" --format json) +assert_json_field "$result" ".best_ref" "e1" "visual: button in top right → e1 (Settings)" + +result=$(semantic find "button on the left" --snapshot "$VISUAL" --format json) +assert_json_field "$result" ".best_ref" "e0" "visual: button on the left → e0 (Menu)" + +result=$(semantic find "button at the bottom" --snapshot "$VISUAL" --format json) +assert_json_field "$result" ".best_ref" "e7" "visual: button at the bottom → e7 (Save)" + +result=$(semantic find "link on left side" --snapshot "$VISUAL" --format json) +assert_json_field "$result" ".best_ref" "e3" "visual: link on left side → e3 (Help)" + +summary "find-visual" diff --git a/tests/e2e/cases/16-input-hardening.sh b/tests/e2e/cases/16-input-hardening.sh new file mode 100755 index 0000000..3ebc4c2 --- /dev/null +++ b/tests/e2e/cases/16-input-hardening.sh @@ -0,0 +1,65 @@ +#!/bin/bash +CASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${CASE_DIR}/../lib.sh" + +echo " ── Find: Input Hardening / Edge Cases ──" + +SNAPSHOT="${ASSETS_DIR}/snapshots/login-page.json" + +# Negative threshold (should be clamped 
to 0) +set +e +result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --threshold -0.5 2>&1) +exit_code=$? +set -e +assert_eq "$exit_code" "0" "hardening: negative threshold doesn't crash" + +# Threshold > 1 (should be clamped to 1, returning no matches) +result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --threshold 1.5) +count=$(echo "$result" | jq '.matches | length') +assert_eq "$count" "0" "hardening: threshold > 1 returns no matches" + +# Zero topk (should use default) +set +e +result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --top-k 0 2>&1) +exit_code=$? +set -e +assert_eq "$exit_code" "0" "hardening: zero topk doesn't crash" + +# Negative topk (should use default) +set +e +result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --top-k -5 2>&1) +exit_code=$? +set -e +assert_eq "$exit_code" "0" "hardening: negative topk doesn't crash" + +# Very large topk (should be clamped to element count) +result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --top-k 10000 --threshold 0) +count=$(echo "$result" | jq '.matches | length') +elem_count=$(jq 'length' "$SNAPSHOT") +if [ "$count" -le "$elem_count" ]; then + pass "hardening: large topk clamped to element count" +else + fail "hardening: large topk clamped to element count" "got $count matches, expected <= $elem_count" +fi + +# Custom weights that sum to > 1 +set +e +result=$(semantic find "sign in" --snapshot "$SNAPSHOT" --format json --lexical-weight 2 --embedding-weight 2 2>&1) +exit_code=$? 
+set -e +assert_eq "$exit_code" "0" "hardening: weights > 1 don't crash" + +# Verify scores are still bounded [0,1] with extreme weights +if [ "$exit_code" != "0" ]; then + fail "hardening: scores bounded with extreme weights" "semantic find failed: $result" +elif best_score=$(echo "$result" | jq -er '.best_score' 2>/dev/null); then + if awk "BEGIN {exit !($best_score >= 0 && $best_score <= 1)}"; then + pass "hardening: scores bounded with extreme weights" + else + fail "hardening: scores bounded with extreme weights" "got score $best_score" + fi +else + fail "hardening: scores bounded with extreme weights" "semantic find returned invalid JSON: $result" +fi + +summary "input-hardening" diff --git a/tests/e2e/lib.sh b/tests/e2e/lib.sh index c6d1ad0..9c903a0 100755 --- a/tests/e2e/lib.sh +++ b/tests/e2e/lib.sh @@ -11,6 +11,20 @@ else ASSETS_DIR="${E2E_DIR}/assets" fi +if [ "$E2E_DIR" != "/e2e" ] && [ -z "${SEMANTIC_E2E_BOOTSTRAPPED:-}" ]; then + REPO_ROOT="$(cd "${E2E_DIR}/../.." && pwd)" + if ! command -v go >/dev/null 2>&1; then + echo "ERROR: go is required to run local E2E tests" >&2 + exit 1 + fi + if ! (cd "$REPO_ROOT" && go build -o "${E2E_DIR}/semantic" ./cmd/semantic); then + echo "ERROR: failed to build semantic binary for local E2E tests" >&2 + exit 1 + fi + export PATH="${E2E_DIR}:$PATH" + export SEMANTIC_E2E_BOOTSTRAPPED=1 +fi + PASSED=0 FAILED=0 ERRORS="" diff --git a/tests/e2e/run.sh b/tests/e2e/run.sh index 1af644c..29de4ea 100755 --- a/tests/e2e/run.sh +++ b/tests/e2e/run.sh @@ -7,6 +7,9 @@ echo " semantic E2E tests" echo "═══════════════════════════════════════════════════" echo "" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib.sh" + # Verify binary is available if ! 
command -v semantic &>/dev/null; then echo "ERROR: semantic binary not found" @@ -24,7 +27,6 @@ run_suite() { } # Run all test suites -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" for suite in "${SCRIPT_DIR}"/cases/*.sh; do [ -f "$suite" ] || continue echo ""