Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Binary
/semantic
/semantic-bench
tests/benchmark/semantic
tests/e2e/semantic
*.exe
Expand All @@ -21,4 +22,5 @@ cover.out
.claude
tests/e2e/results/*.txt
tests/benchmark/results/*.json
tests/benchmark/results/*.md
tests/benchmark/results/*.md
tests/benchmark/baselines/*.json
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ The library uses only the Go standard library. No external dependencies, no mode

## Design Trade-offs

See [docs/DESIGN.md](docs/DESIGN.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.
See [docs/architecture/design-decisions.md](docs/architecture/design-decisions.md) for detailed discussion of architectural decisions: hashing vs real embeddings, fixed synonym table vs learned, Jaccard vs TF-IDF, and recovery callbacks vs direct integration.

## Origin

Expand Down
168 changes: 168 additions & 0 deletions cmd/semantic-bench/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
package main

import (
"fmt"
"os"

"github.com/pinchtab/semantic/internal/benchmark"
)

// usage is the top-level help text. main prints it to stdout for
// -h/--help/help and appends it to the stderr message for an unknown command.
const usage = `semantic-bench - Benchmark runner for semantic matching

Usage:
semantic-bench <command> [flags]

Commands:
check Run benchmark and compare against baseline (default)
run Run benchmark suites
compare Compare two reports
lint Validate dataset
catalog Print dataset inventory
baseline Manage quality baselines (create, update)
calibrate Find optimal thresholds via precision/recall analysis
tune Grid-search lexical/embedding weights
runtime Check Go benchmark performance against baseline

Flags:
-h, --help Show help

Run 'semantic-bench <command> --help' for command-specific help.
`

// main selects a subcommand from the first command-line argument and hands
// the remaining arguments to it. With no arguments it runs "check". An
// unrecognized command prints an error plus the usage text to stderr and
// exits with status 2.
func main() {
	if len(os.Args) < 2 {
		runCheck(os.Args[1:])
		return
	}

	name, rest := os.Args[1], os.Args[2:]

	// Subcommand name -> handler; every handler takes the arguments that
	// follow the subcommand name.
	handlers := map[string]func([]string){
		"check":     runCheck,
		"run":       runRun,
		"compare":   runCompare,
		"lint":      runLint,
		"catalog":   runCatalog,
		"baseline":  runBaseline,
		"calibrate": runCalibrate,
		"tune":      runTune,
		"runtime":   runRuntime,
	}
	if handler, ok := handlers[name]; ok {
		handler(rest)
		return
	}

	switch name {
	case "-h", "--help", "help":
		fmt.Print(usage)
	default:
		fmt.Fprintf(os.Stderr, "unknown command: %s\n\n%s", name, usage)
		os.Exit(2)
	}
}

// runCheck implements the "check" command: it parses args with
// benchmark.ParseCheckFlags, runs benchmark.RunCheck and prints the result.
// It exits with status 2 when the run returns an error and with status 1
// when the result status is "fail".
func runCheck(args []string) {
	opts := benchmark.ParseCheckFlags(args)

	report, err := benchmark.RunCheck(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintCheckResult(report, opts)
	if report.Status != "fail" {
		return
	}
	os.Exit(1)
}

// runRun implements the "run" command: it parses args with
// benchmark.ParseRunFlags, runs benchmark.RunBenchmark and prints the result.
// It exits with status 2 when the run returns an error.
func runRun(args []string) {
	opts := benchmark.ParseRunFlags(args)

	report, err := benchmark.RunBenchmark(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintRunResult(report, opts)
}

// runCompare implements the "compare" command: it parses args with
// benchmark.ParseCompareFlags, runs benchmark.RunCompare and prints the
// result. It exits with status 2 when the comparison returns an error and
// with status 1 when the result status is "fail".
func runCompare(args []string) {
	opts := benchmark.ParseCompareFlags(args)

	report, err := benchmark.RunCompare(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintCompareResult(report, opts)
	if report.Status != "fail" {
		return
	}
	os.Exit(1)
}

// runLint implements the "lint" command: it parses args with
// benchmark.ParseLintFlags, runs benchmark.RunLint and prints the result.
// It exits with status 2 when the lint run itself returns an error and with
// status 1 when the result reports more than zero errors.
func runLint(args []string) {
	opts := benchmark.ParseLintFlags(args)

	report, err := benchmark.RunLint(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintLintResult(report, opts)

	hasFindings := report.Errors > 0
	if hasFindings {
		os.Exit(1)
	}
}

// runCatalog implements the "catalog" command: it parses args with
// benchmark.ParseCatalogFlags, runs benchmark.RunCatalog and prints the
// result. It exits with status 2 when the run returns an error.
func runCatalog(args []string) {
	opts := benchmark.ParseCatalogFlags(args)

	inventory, err := benchmark.RunCatalog(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintCatalogResult(inventory, opts)
}

// runBaseline implements the "baseline" command: it parses args with
// benchmark.ParseBaselineFlags, runs benchmark.RunBaseline and prints the
// result. It exits with status 2 when the run returns an error.
func runBaseline(args []string) {
	opts := benchmark.ParseBaselineFlags(args)

	outcome, err := benchmark.RunBaseline(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintBaselineResult(outcome, opts)
}

// runCalibrate implements the "calibrate" command: it parses args with
// benchmark.ParseCalibrateFlags, runs benchmark.RunCalibrate and prints the
// result. It exits with status 2 when the run returns an error.
func runCalibrate(args []string) {
	opts := benchmark.ParseCalibrateFlags(args)

	outcome, err := benchmark.RunCalibrate(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintCalibrateResult(outcome, opts)
}

// runTune implements the "tune" command: it parses args with
// benchmark.ParseTuneFlags, runs benchmark.RunTune and prints the result.
// It exits with status 2 when the run returns an error.
func runTune(args []string) {
	opts := benchmark.ParseTuneFlags(args)

	outcome, err := benchmark.RunTune(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintTuneResult(outcome, opts)
}

// runRuntime implements the "runtime" command: it parses args with
// benchmark.ParseRuntimeFlags, runs benchmark.RunRuntime and prints the
// result. It exits with status 2 when the run returns an error. A "fail"
// status leads to exit status 1 only when the parsed configuration has
// FailOnRegression set.
func runRuntime(args []string) {
	opts := benchmark.ParseRuntimeFlags(args)

	report, err := benchmark.RunRuntime(opts)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(2)
	}

	benchmark.PrintRuntimeResult(report, opts)

	if report.Status != "fail" {
		return
	}
	if opts.FailOnRegression {
		os.Exit(1)
	}
}
131 changes: 124 additions & 7 deletions dev
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,27 @@ ERROR=$'\033[38;2;230;57;70m'
NC=$'\033[0m'

# Command catalogue, one entry per command in "name:emoji:description" form.
# Each command name appears exactly once.
commands=(
  "pr:🚀:Pre-PR checks (check + e2e + bench)"
  "doctor:🩺:Setup dev environment"
  "test:🧪:Run unit tests"
  "test verbose:🧪:Run unit tests (verbose)"
  "test race:🧪:Run unit tests with race detector"
  "coverage:📊:Run tests with coverage report"
  "lint:🔍:Run golangci-lint"
  "lint corpus:🔍:Lint benchmark corpus"
  "lint docs:🔍:Check documentation links"
  "fmt:✨:Format code"
  "vet:🔬:Run go vet"
  "check:✅:Run all checks (fmt + vet + lint + test)"
  "build:📦:Build CLI binary"
  "bench:🏋:Run corpus benchmark"
  "bench full:🏋:Run full benchmark suite"
  "baseline:📏:Create quality baseline"
  "baseline check:📏:Check against baseline"
  "baseline update:📏:Update baseline (--accept)"
  "calibrate:🎯:Calibrate threshold recommendations"
  "runtime:⏱️:Check runtime baseline"
  "tune:🎛️:Tune combined weights"
  "e2e:🐳:Run E2E tests (Docker)"
)

Expand All @@ -36,6 +46,36 @@ show_help() {
echo ""
}

# run_pr runs the pre-PR pipeline in order: run_check, the E2E script (only
# when tests/e2e/run.sh exists), the corpus lint, and the corpus benchmark.
# It exits with status 1 as soon as the E2E build, the E2E run, or the
# benchmark fails, so "Ready for PR" is printed only after those steps passed.
run_pr() {
  echo " ${ACCENT}${BOLD}🚀 Pre-PR checks${NC}"
  echo ""

  echo " ${MUTED}1/4 All checks (fmt + vet + lint + test)${NC}"
  run_check

  echo ""
  echo " ${MUTED}2/4 E2E tests${NC}"
  if [[ -f tests/e2e/run.sh ]]; then
    # Build the CLI into /tmp and put /tmp first on PATH for the E2E script
    # (presumably so it resolves `semantic` to this build — confirm).
    if ! go build -o /tmp/semantic ./cmd/semantic; then
      echo " ${ERROR}✗${NC} E2E build failed"
      exit 1
    fi
    if ! PATH="/tmp:$PATH" bash tests/e2e/run.sh; then
      echo " ${ERROR}✗${NC} E2E failed"
      exit 1
    fi
    echo " ${SUCCESS}✓${NC} E2E passed"
  else
    echo " ${MUTED}Skipped (no tests/e2e/run.sh)${NC}"
  fi

  echo ""
  echo " ${MUTED}3/4 Lint corpus${NC}"
  run_lint_corpus

  echo ""
  echo " ${MUTED}4/4 Corpus benchmark${NC}"
  # The benchmark output is discarded, so a failure must be reported here;
  # otherwise it would be invisible.
  if ! run_bench > /dev/null 2>&1; then
    echo " ${ERROR}✗${NC} Benchmark failed (run: ./dev bench)"
    exit 1
  fi
  echo " ${SUCCESS}✓${NC} Benchmark complete"

  echo ""
  echo " ${SUCCESS}${BOLD}🚀 Ready for PR${NC}"
}

run_test() {
echo " ${ACCENT}${BOLD}🧪 Running tests${NC}"
go test ./... -count=1
Expand Down Expand Up @@ -88,9 +128,19 @@ run_check() {
if [ -n "$unformatted" ]; then
echo " ${ERROR}✗${NC} Unformatted files:"
echo "$unformatted"
exit 1
echo ""
printf " Fix formatting now? (Y/n) "
read -r answer
if [ "$answer" != "n" ] && [ "$answer" != "N" ]; then
gofmt -w .
echo " ${SUCCESS}✓${NC} Format (fixed)"
else
echo " ${MUTED}Run: gofmt -w .${NC}"
exit 1
fi
else
echo " ${SUCCESS}✓${NC} Format"
fi
echo " ${SUCCESS}✓${NC} Format"

echo " ${MUTED}2/4 Vet${NC}"
go vet ./...
Expand All @@ -115,8 +165,53 @@ run_build() {
}

# run_bench runs the corpus benchmark through the semantic-bench "check"
# command, forwarding any extra arguments to it. The benchmark is started
# exactly once, so a single header is printed.
run_bench() {
  echo " ${ACCENT}${BOLD}🏋 Running corpus benchmark${NC}"
  go run ./cmd/semantic-bench check "$@"
}

# run_bench_full runs `semantic-bench run -suite=all`, forwarding any extra
# arguments.
run_bench_full() {
  local heading="🏋 Running full benchmark suite"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  go run ./cmd/semantic-bench run -suite=all "$@"
}

# run_lint_corpus runs `semantic-bench lint`, forwarding any extra arguments.
run_lint_corpus() {
  local heading="🔍 Linting benchmark corpus"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  go run ./cmd/semantic-bench lint "$@"
}

# run_lint_docs runs scripts/check-docs-links.sh under bash.
run_lint_docs() {
  local heading="🔍 Checking documentation links"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  bash scripts/check-docs-links.sh
}

# run_baseline runs `semantic-bench baseline create`, forwarding any extra
# arguments.
run_baseline() {
  local heading="📏 Creating quality baseline"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  go run ./cmd/semantic-bench baseline create "$@"
}

# run_baseline_check runs `semantic-bench check`, forwarding any extra
# arguments.
run_baseline_check() {
  local heading="📏 Checking against baseline"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  go run ./cmd/semantic-bench check "$@"
}

# run_baseline_update runs `semantic-bench baseline update --accept`,
# forwarding any extra arguments after the --accept flag.
run_baseline_update() {
  local heading="📏 Updating baseline"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  go run ./cmd/semantic-bench baseline update --accept "$@"
}

# run_calibrate runs `semantic-bench calibrate -verbose`, forwarding any extra
# arguments.
run_calibrate() {
  local heading="🎯 Calibrating thresholds"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  go run ./cmd/semantic-bench calibrate -verbose "$@"
}

# run_runtime runs `semantic-bench runtime`, forwarding any extra arguments.
run_runtime() {
  local heading="⏱️ Checking runtime baseline"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  go run ./cmd/semantic-bench runtime "$@"
}

# run_tune runs `semantic-bench tune -verbose`, forwarding any extra arguments.
run_tune() {
  local heading="🎛️ Tuning combined weights"
  echo " ${ACCENT}${BOLD}${heading}${NC}"
  go run ./cmd/semantic-bench tune -verbose "$@"
}

run_e2e() {
Expand All @@ -129,6 +224,7 @@ run_e2e() {
}

case "${1:-help}" in
pr) run_pr ;;
doctor) exec bash scripts/doctor.sh ;;
test)
case "${2:-}" in
Expand All @@ -138,12 +234,33 @@ case "${1:-help}" in
esac
;;
coverage) run_coverage ;;
lint) run_lint ;;
lint)
case "${2:-}" in
corpus) run_lint_corpus ;;
docs) run_lint_docs ;;
*) run_lint ;;
esac
;;
fmt) run_fmt ;;
vet) run_vet ;;
check) run_check ;;
build) run_build ;;
bench|benchmark) run_bench ;;
bench|benchmark)
case "${2:-}" in
full) run_bench_full ;;
*) shift; run_bench "$@" ;;
esac
;;
baseline)
case "${2:-}" in
check) shift 2; run_baseline_check "$@" ;;
update) shift 2; run_baseline_update "$@" ;;
*) shift; run_baseline "$@" ;;
esac
;;
calibrate) shift; run_calibrate "$@" ;;
runtime) shift; run_runtime "$@" ;;
tune) shift; run_tune "$@" ;;
e2e) run_e2e ;;
help|*) show_help ;;
esac
Loading
Loading