From 94f484cc1fac6c137f4ae52285a8df01147c251d Mon Sep 17 00:00:00 2001 From: Aaro Koinsaari <89689072+koinsaari@users.noreply.github.com> Date: Tue, 9 Jun 2026 16:03:44 +0300 Subject: [PATCH] test(identity): expand matcher regression fixtures Grow the source-agnostic regression set from 8 to 18 entries and document it. Drop confidence-band assertions: specific scores shift on any weight tweak and are noise for a regression suite. - fakeRepo now applies the compat filter so fixtures can include category-incompatible distractors and verify blocking excludes them. - Added coverage for distance boundary (5/35/49/51 m), name variations (business suffix, word reorder, partial overlap), genuine ambiguity, category distractor, and address mismatch. - New testdata/README.md explains the entry format, assertion policy, and how to add a new entry. Closes #73. Co-Authored-By: Claude Opus 4.7 --- internal/identity/matcher_test.go | 35 +++--- internal/identity/testdata/README.md | 41 +++++++ .../identity/testdata/match_fixtures.json | 102 ++++++++++++++++-- 3 files changed, 156 insertions(+), 22 deletions(-) create mode 100644 internal/identity/testdata/README.md diff --git a/internal/identity/matcher_test.go b/internal/identity/matcher_test.go index f70e7c4..1ec5637 100644 --- a/internal/identity/matcher_test.go +++ b/internal/identity/matcher_test.go @@ -17,8 +17,10 @@ import ( "github.com/InWheelOrg/inwheel-api/pkg/models" ) -// fakeRepo returns a fixed candidate slice and records the categories it was -// called with so tests can assert the compat-filter was applied. +// fakeRepo holds an unfiltered candidate slice and applies the compat-filter +// that real CandidateRepo implementations apply at the database. Tests can +// therefore write fixtures that include category-incompatible distractors and +// verify the filter excludes them. 
type fakeRepo struct { candidates []models.Place err error @@ -27,14 +29,25 @@ type fakeRepo struct { func (f *fakeRepo) FindCandidates(_ context.Context, _, _, _ float64, cats []models.Category) ([]models.Place, error) { f.lastCats = cats - return f.candidates, f.err + if f.err != nil { + return nil, f.err + } + allowed := make(map[models.Category]bool, len(cats)) + for _, c := range cats { + allowed[c] = true + } + out := make([]models.Place, 0, len(f.candidates)) + for _, p := range f.candidates { + if allowed[p.Category] { + out = append(out, p) + } + } + return out, nil } type fixtureExpected struct { - Kind string `json:"Kind"` - MatchedPlaceID string `json:"MatchedPlaceID"` - MinConfidence float64 `json:"MinConfidence"` - MaxConfidence float64 `json:"MaxConfidence"` + Kind string `json:"Kind"` + MatchedPlaceID string `json:"MatchedPlaceID"` } type fixture struct { @@ -214,14 +227,6 @@ func TestMatch_Fixtures(t *testing.T) { t.Errorf("MatchedPlaceID = %q, want %q", d.MatchedPlaceID, f.Expected.MatchedPlaceID) pass = false } - if f.Expected.MinConfidence > 0 && d.Confidence < f.Expected.MinConfidence { - t.Errorf("Confidence = %v, want >= %v", d.Confidence, f.Expected.MinConfidence) - pass = false - } - if f.Expected.MaxConfidence > 0 && d.Confidence > f.Expected.MaxConfidence { - t.Errorf("Confidence = %v, want <= %v", d.Confidence, f.Expected.MaxConfidence) - pass = false - } total++ if pass { correct++ diff --git a/internal/identity/testdata/README.md b/internal/identity/testdata/README.md new file mode 100644 index 0000000..b2c11e8 --- /dev/null +++ b/internal/identity/testdata/README.md @@ -0,0 +1,41 @@ +# identity matcher fixtures + +`match_fixtures.json` is the source-agnostic regression set for `identity.Match`. Each entry pairs an incoming `identity.Record` with a list of candidate places and the decision the matcher must produce. `TestMatch_Fixtures` in `../matcher_test.go` runs every entry and asserts the result. 
+ +The goal is to lock in current matcher behaviour and catch accidental regressions when scoring, normalization, blocking, or thresholds are touched. This file is **not** a tuning corpus — it does not measure aggregate precision/recall against realistic data. Per-source realism fixtures live under `internal/sources/<source>/testdata/` and arrive with each new source. + +## Entry format + +```json +{ + "name": "short label that appears as the subtest name", + "record": { "Name": "...", "Lat": 0.0, "Lng": 0.0, "Category": "...", + "Street": "...", "HouseNumber": "..." }, + "candidates": [ + { "id": "p1", "name": "...", "lat": 0.0, "lng": 0.0, "category": "...", + "tags": { "addr:street": "...", "addr:housenumber": "..." } } + ], + "expected": { "Kind": "confident|low_confidence|no_match", "MatchedPlaceID": "p1" } +} +``` + +- `record` fields mirror `identity.Record`. Omitted string fields default to `""`. +- `candidates` are `models.Place` values. The fake repo in the test applies the category compat filter (`identity.Compatible`) before passing them to `Match`, so entries can include incompatible distractors and verify they are excluded. +- `expected.Kind` is required. `expected.MatchedPlaceID` is asserted only when non-empty; omit it for `no_match` entries. +- Confidence values are **not** asserted. Specific scores shift whenever a weight or threshold changes; only the decision band matters for a regression test. + +## Adding a new entry + +1. Pick what behaviour the entry pins down — name normalization, distance falloff, compat filter, threshold band, tiebreak, etc. One concern per entry. +2. Compute the expected outcome by hand from the constants in `score.go` (`RadiusM`, `ConfidentThreshold`, `LowConfidenceThreshold`, the three weights). If the outcome depends on tuning being exactly what it is today, that is a signal the entry will be noisy and may need to be rewritten when thresholds change. +3. 
Keep coordinates in the same neighbourhood as existing entries (around `(46.4628, 6.8417)`). Compute lat offsets as `meters / 111000`; longitude offsets are not used by the current set. +4. Run `go test ./internal/identity/ -run TestMatch_Fixtures -v` and confirm the new entry passes. + +## What is covered today + +- Confident, low-confidence, and no-match outcomes across coordinate, name, and address signals. +- Distance falloff at 5 m, 25 m, 35 m, 49 m, and beyond `RadiusM`. +- Name normalization: diacritics, business-suffix drop, word reorder, partial overlap. +- Category compat: candidates of an incompatible category are filtered out by blocking. +- Tiebreak: stronger name beats slightly-closer distractor; argmax across multiple candidates. +- Address weight: matching street + housenumber boosts the score; mismatched address does not block a confident match driven by name and distance. diff --git a/internal/identity/testdata/match_fixtures.json b/internal/identity/testdata/match_fixtures.json index 87ca717..877934e 100644 --- a/internal/identity/testdata/match_fixtures.json +++ b/internal/identity/testdata/match_fixtures.json @@ -9,7 +9,7 @@ {"id": "p1", "name": "Café Pascal", "lat": 46.4628, "lng": 6.8417, "category": "cafe", "tags": {"addr:street": "Rue du Simplon", "addr:housenumber": "10"}} ], - "expected": {"Kind": "confident", "MatchedPlaceID": "p1", "MinConfidence": 0.95} + "expected": {"Kind": "confident", "MatchedPlaceID": "p1"} }, { "name": "low confidence: 25 m away, name match, no address", @@ -17,7 +17,7 @@ "candidates": [ {"id": "p1", "name": "Pascal", "lat": 46.462575, "lng": 6.8417, "category": "cafe"} ], - "expected": {"Kind": "low_confidence", "MatchedPlaceID": "p1", "MinConfidence": 0.55, "MaxConfidence": 0.80} + "expected": {"Kind": "low_confidence", "MatchedPlaceID": "p1"} }, { "name": "no match: no candidates returned", @@ -40,7 +40,7 @@ {"id": "p1", "name": "Roma", "lat": 46.4628, "lng": 6.8417, "category": "cafe"}, {"id": "p2", "name": 
"Pascal", "lat": 46.4628, "lng": 6.8417, "category": "cafe"} ], - "expected": {"Kind": "confident", "MatchedPlaceID": "p2", "MinConfidence": 0.55} + "expected": {"Kind": "confident", "MatchedPlaceID": "p2"} }, { "name": "address absent: still confident on strong name + distance", @@ -48,7 +48,7 @@ "candidates": [ {"id": "p1", "name": "Pascal", "lat": 46.4628, "lng": 6.8417, "category": "cafe"} ], - "expected": {"Kind": "confident", "MatchedPlaceID": "p1", "MinConfidence": 0.95} + "expected": {"Kind": "confident", "MatchedPlaceID": "p1"} }, { "name": "diacritic normalization: Café matches Cafe", @@ -56,12 +56,100 @@ "candidates": [ {"id": "p1", "name": "Cafe Pascal", "lat": 46.4628, "lng": 6.8417, "category": "cafe"} ], - "expected": {"Kind": "confident", "MatchedPlaceID": "p1", "MinConfidence": 0.95} + "expected": {"Kind": "confident", "MatchedPlaceID": "p1"} }, { - "name": "category incompatible: fake repo returns empty", + "name": "category incompatible: cafe candidate filtered from healthcare record", "record": {"Name": "Pascal Pharmacy", "Lat": 46.4628, "Lng": 6.8417, "Category": "healthcare"}, - "candidates": [], + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.4628, "lng": 6.8417, "category": "cafe"} + ], "expected": {"Kind": "no_match"} + }, + { + "name": "boundary: 5 m offset, identical name, no address → confident", + "record": {"Name": "Pascal", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe"}, + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.462755, "lng": 6.8417, "category": "cafe"} + ], + "expected": {"Kind": "confident", "MatchedPlaceID": "p1"} + }, + { + "name": "boundary: 35 m offset, identical name, no address → low_confidence", + "record": {"Name": "Pascal", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe"}, + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.462485, "lng": 6.8417, "category": "cafe"} + ], + "expected": {"Kind": "low_confidence", "MatchedPlaceID": "p1"} + }, + { + "name": "boundary: 49 m offset, 
identical name, no address → no_match (just inside radius, score below floor)", + "record": {"Name": "Pascal", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe"}, + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.462359, "lng": 6.8417, "category": "cafe"} + ], + "expected": {"Kind": "no_match"} + }, + { + "name": "boundary: 51 m offset, identical name → no_match (distance score clamps to 0)", + "record": {"Name": "Pascal", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe"}, + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.462341, "lng": 6.8417, "category": "cafe"} + ], + "expected": {"Kind": "no_match"} + }, + { + "name": "name variation: business suffix dropped (Pascal Inc → Pascal)", + "record": {"Name": "Pascal Inc", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe"}, + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.4628, "lng": 6.8417, "category": "cafe"} + ], + "expected": {"Kind": "confident", "MatchedPlaceID": "p1"} + }, + { + "name": "name variation: word reorder (Mario Pizza ↔ Pizza Mario)", + "record": {"Name": "Mario Pizza", "Lat": 46.4628, "Lng": 6.8417, "Category": "restaurant"}, + "candidates": [ + {"id": "p1", "name": "Pizza Mario", "lat": 46.4628, "lng": 6.8417, "category": "restaurant"} + ], + "expected": {"Kind": "confident", "MatchedPlaceID": "p1"} + }, + { + "name": "name variation: partial token overlap (Café Pascal Bistro vs Pascal) → low_confidence", + "record": {"Name": "Café Pascal Bistro", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe"}, + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.4628, "lng": 6.8417, "category": "cafe"} + ], + "expected": {"Kind": "low_confidence", "MatchedPlaceID": "p1"} + }, + { + "name": "ambiguity: stronger name signal beats slightly-closer noisier candidate", + "record": {"Name": "Pascal", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe"}, + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.462620, "lng": 6.8417, "category": "cafe"}, + {"id": "p2", 
"name": "Pascal Cafe", "lat": 46.462755, "lng": 6.8417, "category": "cafe"} + ], + "expected": {"Kind": "low_confidence", "MatchedPlaceID": "p1"} + }, + { + "name": "category distractor: pharmacy filtered out, nearby cafe wins", + "record": {"Name": "Pascal", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe"}, + "candidates": [ + {"id": "p1", "name": "Pharmacie Pascal", "lat": 46.4628, "lng": 6.8417, "category": "healthcare"}, + {"id": "p2", "name": "Pascal", "lat": 46.46271, "lng": 6.8417, "category": "cafe"} + ], + "expected": {"Kind": "confident", "MatchedPlaceID": "p2"} + }, + { + "name": "address mismatch: still confident when name + distance dominate", + "record": { + "Name": "Pascal", "Lat": 46.4628, "Lng": 6.8417, "Category": "cafe", + "Street": "Rue du Simplon", "HouseNumber": "10" + }, + "candidates": [ + {"id": "p1", "name": "Pascal", "lat": 46.4628, "lng": 6.8417, "category": "cafe", + "tags": {"addr:street": "Rue de la Gare", "addr:housenumber": "5"}} + ], + "expected": {"Kind": "confident", "MatchedPlaceID": "p1"} } ]