Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
1d41696
Merge pull request #22 from pinchtab/chore/testing-and-benchmark
luigi-agosti Apr 12, 2026
0a432dc
Merge pull request #27 from pinchtab/feat/issue-23-recovery-threshold
luigi-agosti Apr 12, 2026
6547a6c
feat: support negative matching in queries
Chetnapadhi Apr 17, 2026
6bff209
fix: address review feedback (stopwords, parser, test cleanup)
Chetnapadhi Apr 17, 2026
c92329c
Merge pull request #28 from pinchtab/feat/negative-matching
luigi-agosti Apr 22, 2026
87e3807
feat: add context-aware negative matching
Apr 22, 2026
95eb3be
Merge pull request #35 from pinchtab/feat/followup-negative-context
luigi-agosti Apr 22, 2026
f885acb
feat: add composable ordinal query support
Apr 22, 2026
a6e6a8e
test: harden ordinal query selection
Apr 23, 2026
ec91f06
fix: preserve document order for ordinal queries
Apr 23, 2026
981e9b1
chore: rename benchmark dev command to bench
Apr 23, 2026
852aaca
fix: stabilize ordinal document ordering
Apr 23, 2026
18cbd15
test: stabilize ordinal input ordering coverage test
Apr 23, 2026
9ca2ea6
Merge pull request #36 from pinchtab/feat/followup-ordinal-queries
luigi-agosti Apr 23, 2026
cccada9
feat: support visual position hints in queries
Apr 23, 2026
16066d4
fix: e2e tests
luigi-agosti Apr 23, 2026
7cacd78
feat: add deterministic ranking and expand benchmark coverage
luigi-agosti Apr 23, 2026
897ae3f
Merge pull request #37 from pinchtab/feat/visual-position-hints-redux
luigi-agosti Apr 23, 2026
e3e963f
chore: small cleanup of bench scripts
luigi-agosti Apr 23, 2026
e8f07db
chore: expand benchmark corpus and add tuning tools
luigi-agosti Apr 23, 2026
fa72533
Merge pull request #38 from pinchtab/chore/benchmark-improvements
luigi-agosti Apr 23, 2026
71c0932
feat: harden matchers with input sanitization and context cancellation
luigi-agosti Apr 23, 2026
9131914
test: build fresh CLI for local e2e hardening checks
luigi-agosti Apr 23, 2026
1a481aa
chore: expand benchmark corpus and add tuning tools
luigi-agosti Apr 24, 2026
6cab0f7
docs: improve SKILL.md for LLM usage
luigi-agosti Apr 24, 2026
c69f054
refactor: use Go CLI instead of bash scripts in dev tool
luigi-agosti Apr 24, 2026
201b9c4
feat: add semantic-bench CLI for benchmark management
luigi-agosti Apr 24, 2026
39729ab
chore: ignore semantic-bench binary
luigi-agosti Apr 24, 2026
510be95
feat: add baseline, calibrate, tune commands to Go CLI
luigi-agosti Apr 24, 2026
33bc06f
chore: remove bash scripts replaced by Go CLI
luigi-agosti Apr 24, 2026
dfb7b02
feat: move runtime baseline check to Go CLI
luigi-agosti Apr 24, 2026
513d281
chore: ignore generated baseline files
luigi-agosti Apr 24, 2026
f7a1c8f
chore: simplify dev tool and update SKILL.md
luigi-agosti Apr 24, 2026
26ddb72
refactor: split benchmark commands.go into separate files
luigi-agosti Apr 24, 2026
03c7a6e
fix: resolve golangci-lint errors in benchmark package
luigi-agosti Apr 24, 2026
c3a85ab
feat: config-driven thresholds, validation, and deterministic output
luigi-agosti Apr 24, 2026
f8c8136
feat: config-driven thresholds with validation and enforcement
luigi-agosti Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Binary (root only, not cmd/semantic/)
# Binary
/semantic
/semantic-bench
tests/benchmark/semantic
tests/e2e/semantic
*.exe

# Test
Expand All @@ -19,4 +22,5 @@ cover.out
.claude
tests/e2e/results/*.txt
tests/benchmark/results/*.json
tests/benchmark/results/*.md
tests/benchmark/results/*.md
tests/benchmark/baselines/*.json
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ Implementations are internal — consumers use the `ElementMatcher` interface an
## Features

- **Synonym expansion** — 54 UI synonym groups ("sign in" ↔ "log in", "cart" ↔ "basket", "preferences" ↔ "settings", etc.)
- **Visual position hints** — Understand layout cues like `top`, `bottom`, `left`, `right`, and `above`/`below` anchors
- **Confidence calibration** — Scores mapped to high (≥ 0.8) / medium (≥ 0.6) / low labels
- **Error classification** — Classify browser errors (CDP, chromedp) as recoverable or not
- **Self-healing recovery** — Re-locate stale elements after DOM changes via callback interfaces
Expand Down Expand Up @@ -184,6 +185,11 @@ semantic find "login" --snapshot page.json --format json # machine-readable
semantic find "login" --snapshot page.json --format table # human-readable
semantic find "login" --snapshot page.json --format refs # just refs

# Visual position hints
semantic find "button in top right corner" --snapshot page.json
semantic find "link below the search box" --snapshot page.json
semantic find "sidebar on the left" --snapshot page.json

# Score a specific element
semantic match "login" e4 --snapshot page.json

Expand All @@ -198,7 +204,7 @@ The library uses only the Go standard library. No external dependencies, no mode

## Design Trade-offs

See [docs/DESIGN.md](docs/DESIGN.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.
See [docs/architecture/design-decisions.md](docs/architecture/design-decisions.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.

## Origin

Expand Down
168 changes: 168 additions & 0 deletions cmd/semantic-bench/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
package main

import (
"fmt"
"os"

"github.com/pinchtab/semantic/internal/benchmark"
)

// usage is the top-level help text. main prints it to stdout for
// -h/--help/help and appends it to the stderr message for an unknown command.
const usage = `semantic-bench - Benchmark runner for semantic matching

Usage:
semantic-bench <command> [flags]

Commands:
check Run benchmark and compare against baseline (default)
run Run benchmark suites
compare Compare two reports
lint Validate dataset
catalog Print dataset inventory
baseline Manage quality baselines (create, update)
calibrate Find optimal thresholds via precision/recall analysis
tune Grid-search lexical/embedding weights
runtime Check Go benchmark performance against baseline

Flags:
-h, --help Show help

Run 'semantic-bench <command> --help' for command-specific help.
`

// main selects the subcommand named by the first command-line argument and
// hands the remaining arguments to its runner.
//
// With no subcommand the check command runs with an empty argument list.
// -h, --help and help print the usage text to stdout. Any other unrecognised
// name prints an error followed by the usage text to stderr and exits with
// status 2.
func main() {
	if len(os.Args) < 2 {
		runCheck(os.Args[1:])
		return
	}

	name, rest := os.Args[1], os.Args[2:]

	// Subcommand name -> runner; every runner receives the arguments that
	// follow the subcommand name.
	runners := map[string]func([]string){
		"check":     runCheck,
		"run":       runRun,
		"compare":   runCompare,
		"lint":      runLint,
		"catalog":   runCatalog,
		"baseline":  runBaseline,
		"calibrate": runCalibrate,
		"tune":      runTune,
		"runtime":   runRuntime,
	}
	if runner, known := runners[name]; known {
		runner(rest)
		return
	}

	switch name {
	case "-h", "--help", "help":
		fmt.Print(usage)
	default:
		fmt.Fprintf(os.Stderr, "unknown command: %s\n\n%s", name, usage)
		os.Exit(2)
	}
}

// runCheck implements the "check" command: it parses args into a check
// configuration, runs the check and prints the result.
//
// Exit status: 2 when the check itself returns an error, 1 when the result
// status is "fail"; otherwise the function returns normally.
func runCheck(args []string) {
	opts := benchmark.ParseCheckFlags(args)

	res, runErr := benchmark.RunCheck(opts)
	if runErr != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", runErr)
		os.Exit(2)
	}

	benchmark.PrintCheckResult(res, opts)
	if res.Status != "fail" {
		return
	}
	os.Exit(1)
}

// runRun implements the "run" command: it parses args into a run
// configuration, executes the benchmark and prints the result.
//
// On error it writes the error to stderr and exits with status 2.
func runRun(args []string) {
	opts := benchmark.ParseRunFlags(args)

	report, runErr := benchmark.RunBenchmark(opts)
	if runErr == nil {
		benchmark.PrintRunResult(report, opts)
		return
	}

	fmt.Fprintf(os.Stderr, "error: %v\n", runErr)
	os.Exit(2)
}

// runCompare implements the "compare" command: it parses args into a compare
// configuration, runs the comparison and prints the result.
//
// Exit status: 2 when the comparison returns an error, 1 when the result
// status is "fail"; otherwise the function returns normally.
func runCompare(args []string) {
	opts := benchmark.ParseCompareFlags(args)

	diff, cmpErr := benchmark.RunCompare(opts)
	if cmpErr != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", cmpErr)
		os.Exit(2)
	}

	benchmark.PrintCompareResult(diff, opts)
	if diff.Status != "fail" {
		return
	}
	os.Exit(1)
}

// runLint implements the "lint" command: it parses args into a lint
// configuration, runs the lint pass and prints the result.
//
// Exit status: 2 when the lint run returns an error, 1 when the result
// reports more than zero errors; otherwise the function returns normally.
func runLint(args []string) {
	opts := benchmark.ParseLintFlags(args)

	findings, lintErr := benchmark.RunLint(opts)
	if lintErr != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", lintErr)
		os.Exit(2)
	}

	benchmark.PrintLintResult(findings, opts)

	if findings.Errors > 0 {
		os.Exit(1)
	}
}

// runCatalog implements the "catalog" command: it parses args into a catalog
// configuration, builds the catalog and prints it.
//
// On error it writes the error to stderr and exits with status 2.
func runCatalog(args []string) {
	opts := benchmark.ParseCatalogFlags(args)

	inventory, catErr := benchmark.RunCatalog(opts)
	if catErr == nil {
		benchmark.PrintCatalogResult(inventory, opts)
		return
	}

	fmt.Fprintf(os.Stderr, "error: %v\n", catErr)
	os.Exit(2)
}

// runBaseline implements the "baseline" command: it parses args into a
// baseline configuration, runs the baseline operation and prints the result.
//
// On error it writes the error to stderr and exits with status 2.
func runBaseline(args []string) {
	opts := benchmark.ParseBaselineFlags(args)

	outcome, baseErr := benchmark.RunBaseline(opts)
	if baseErr == nil {
		benchmark.PrintBaselineResult(outcome, opts)
		return
	}

	fmt.Fprintf(os.Stderr, "error: %v\n", baseErr)
	os.Exit(2)
}

// runCalibrate implements the "calibrate" command: it parses args into a
// calibrate configuration, runs the calibration and prints the result.
//
// On error it writes the error to stderr and exits with status 2.
func runCalibrate(args []string) {
	opts := benchmark.ParseCalibrateFlags(args)

	outcome, calErr := benchmark.RunCalibrate(opts)
	if calErr == nil {
		benchmark.PrintCalibrateResult(outcome, opts)
		return
	}

	fmt.Fprintf(os.Stderr, "error: %v\n", calErr)
	os.Exit(2)
}

// runTune implements the "tune" command: it parses args into a tune
// configuration, runs the tuning pass and prints the result.
//
// On error it writes the error to stderr and exits with status 2.
func runTune(args []string) {
	opts := benchmark.ParseTuneFlags(args)

	outcome, tuneErr := benchmark.RunTune(opts)
	if tuneErr == nil {
		benchmark.PrintTuneResult(outcome, opts)
		return
	}

	fmt.Fprintf(os.Stderr, "error: %v\n", tuneErr)
	os.Exit(2)
}

// runRuntime implements the "runtime" command: it parses args into a runtime
// configuration, runs the runtime check and prints the result.
//
// Exit status: 2 when the check returns an error, 1 when the result status is
// "fail" and the configuration's FailOnRegression is set; otherwise the
// function returns normally.
func runRuntime(args []string) {
	opts := benchmark.ParseRuntimeFlags(args)

	res, rtErr := benchmark.RunRuntime(opts)
	if rtErr != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", rtErr)
		os.Exit(2)
	}

	benchmark.PrintRuntimeResult(res, opts)

	// A failing status only becomes a non-zero exit when the caller asked for it.
	if res.Status != "fail" || !opts.FailOnRegression {
		return
	}
	os.Exit(1)
}
61 changes: 55 additions & 6 deletions cmd/semantic/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,24 @@ Flags (find/match):
--threshold <n> Minimum score (default: 0.3)
--top-k <n> Max results (default: 3)
--strategy <name> lexical, embedding, or combined (default: combined)
--lexical-weight <n> Combined strategy lexical weight override
--embedding-weight <n> Combined strategy embedding weight override
--format <fmt> json, table, or refs (default: table)
`)
}

// snapshotPositional is the optional nested "positional" object of an element in the JSON shape from pinchtab's /snapshot endpoint.
type snapshotPositional struct {
Depth int `json:"depth"`
SiblingIndex int `json:"sibling_index"`
SiblingCount int `json:"sibling_count"`
LabelledBy string `json:"labelled_by"`
Depth int `json:"depth"`
SiblingIndex int `json:"sibling_index"`
SiblingCount int `json:"sibling_count"`
LabelledBy string `json:"labelled_by"`
X float64 `json:"x"`
Y float64 `json:"y"`
Top float64 `json:"top"`
Left float64 `json:"left"`
Width float64 `json:"width"`
Height float64 `json:"height"`
}

type snapshotElement struct {
Expand All @@ -80,6 +88,12 @@ type snapshotElement struct {
SiblingIdx int `json:"sibling_index"`
SiblingCnt int `json:"sibling_count"`
LabelledBy string `json:"labelled_by"`
X float64 `json:"x"`
Y float64 `json:"y"`
Top float64 `json:"top"`
Left float64 `json:"left"`
Width float64 `json:"width"`
Height float64 `json:"height"`
Positional *snapshotPositional `json:"positional"`
}

Expand Down Expand Up @@ -112,6 +126,16 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) {
depth := e.Depth
siblingIdx := e.SiblingIdx
siblingCnt := e.SiblingCnt
x := e.X
y := e.Y
if x == 0 && e.Left != 0 {
x = e.Left
}
if y == 0 && e.Top != 0 {
y = e.Top
}
width := e.Width
height := e.Height
if e.Positional != nil {
if e.Positional.Depth != 0 {
depth = e.Positional.Depth
Expand All @@ -125,6 +149,23 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) {
if e.Positional.LabelledBy != "" {
labelledBy = e.Positional.LabelledBy
}

hasHorizontal := e.Positional.X != 0 || e.Positional.Left != 0 || e.Positional.Width > 0
hasVertical := e.Positional.Y != 0 || e.Positional.Top != 0 || e.Positional.Height > 0
if hasHorizontal {
x = e.Positional.X
if x == 0 && e.Positional.Left != 0 {
x = e.Positional.Left
}
width = e.Positional.Width
}
if hasVertical {
y = e.Positional.Y
if y == 0 && e.Positional.Top != 0 {
y = e.Positional.Top
}
height = e.Positional.Height
}
}

descs[i] = semantic.ElementDescriptor{
Expand All @@ -140,6 +181,10 @@ func loadSnapshot(path string) ([]semantic.ElementDescriptor, error) {
SiblingIndex: siblingIdx,
SiblingCount: siblingCnt,
LabelledBy: labelledBy,
X: x,
Y: y,
Width: width,
Height: height,
},
}
}
Expand All @@ -166,6 +211,8 @@ func runFind(args []string) {
threshold := fs.Float64("threshold", 0.3, "minimum score")
topK := fs.Int("top-k", 3, "max results")
strategy := fs.String("strategy", "combined", "matching strategy")
lexicalWeight := fs.Float64("lexical-weight", 0, "combined strategy lexical weight override")
embeddingWeight := fs.Float64("embedding-weight", 0, "combined strategy embedding weight override")
format := fs.String("format", "table", "output format: json, table, refs")
_ = fs.Parse(args)

Expand All @@ -183,8 +230,10 @@ func runFind(args []string) {

matcher := newMatcher(*strategy)
result, err := matcher.Find(context.Background(), query, elements, semantic.FindOptions{
Threshold: *threshold,
TopK: *topK,
Threshold: *threshold,
TopK: *topK,
LexicalWeight: *lexicalWeight,
EmbeddingWeight: *embeddingWeight,
})
if err != nil {
fmt.Fprintf(os.Stderr, "error: %v\n", err)
Expand Down
16 changes: 14 additions & 2 deletions cmd/semantic/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) {
}

json := `[
{"ref":"e1","role":"button","name":"Submit","interactive":true,"parent":"Login form","section":"Authentication","depth":3,"sibling_index":1,"sibling_count":2,"labelled_by":"Primary Action"},
{"ref":"e2","role":"text","name":"Submit","interactive":false,"parent":"Payment form","section":"Checkout","positional":{"depth":2,"sibling_index":0,"sibling_count":1,"labelled_by":"Secondary Action"}}
{"ref":"e1","role":"button","name":"Submit","interactive":true,"parent":"Login form","section":"Authentication","depth":3,"sibling_index":1,"sibling_count":2,"labelled_by":"Primary Action","x":20,"y":40,"width":120,"height":30},
{"ref":"e2","role":"text","name":"Submit","interactive":false,"parent":"Payment form","section":"Checkout","positional":{"depth":2,"sibling_index":0,"sibling_count":1,"labelled_by":"Secondary Action","left":300,"top":640,"width":200,"height":44}}
]`
if _, err := f.WriteString(json); err != nil {
t.Fatalf("WriteString failed: %v", err)
Expand Down Expand Up @@ -50,6 +50,12 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) {
if descs[0].Positional.LabelledBy != "Primary Action" {
t.Fatalf("expected first descriptor labelled_by=Primary Action, got %q", descs[0].Positional.LabelledBy)
}
if descs[0].Positional.X != 20 || descs[0].Positional.Y != 40 {
t.Fatalf("expected first descriptor x/y=20/40, got %f/%f", descs[0].Positional.X, descs[0].Positional.Y)
}
if descs[0].Positional.Width != 120 || descs[0].Positional.Height != 30 {
t.Fatalf("expected first descriptor width/height=120/30, got %f/%f", descs[0].Positional.Width, descs[0].Positional.Height)
}
if descs[1].Interactive {
t.Fatalf("expected second descriptor interactive=false")
}
Expand All @@ -71,4 +77,10 @@ func TestLoadSnapshot_PropagatesInteractiveFlag(t *testing.T) {
if descs[1].Positional.LabelledBy != "Secondary Action" {
t.Fatalf("expected second descriptor labelled_by=Secondary Action, got %q", descs[1].Positional.LabelledBy)
}
if descs[1].Positional.X != 300 || descs[1].Positional.Y != 640 {
t.Fatalf("expected second descriptor x/y=300/640, got %f/%f", descs[1].Positional.X, descs[1].Positional.Y)
}
if descs[1].Positional.Width != 200 || descs[1].Positional.Height != 44 {
t.Fatalf("expected second descriptor width/height=200/44, got %f/%f", descs[1].Positional.Width, descs[1].Positional.Height)
}
}
Loading
Loading