diff --git a/apps/tray-ui/src/components/kb/TrayEntityLink.tsx b/apps/tray-ui/src/components/kb/TrayEntityLink.tsx index e80db05..eb59aab 100644 --- a/apps/tray-ui/src/components/kb/TrayEntityLink.tsx +++ b/apps/tray-ui/src/components/kb/TrayEntityLink.tsx @@ -24,7 +24,7 @@ import type { ReferenceCategory, ReferenceEntry, } from '../../lib/reference'; -import { webKbUrl } from '../../lib/reference'; +import { resolveReferenceEntry, webKbUrl } from '../../lib/reference'; import { TierChip } from './TierChip'; interface Props { @@ -69,8 +69,10 @@ export function TrayEntityLink({ return {label ?? ''}; } - const entry: ReferenceEntry | undefined = catalog?.get( - classKey.toLowerCase(), + const entry: ReferenceEntry | undefined = resolveReferenceEntry( + category, + classKey, + catalog, ); const text = label ?? entry?.display_name ?? classKey; diff --git a/apps/tray-ui/src/lib/reference.test.ts b/apps/tray-ui/src/lib/reference.test.ts new file mode 100644 index 0000000..f3aee0b --- /dev/null +++ b/apps/tray-ui/src/lib/reference.test.ts @@ -0,0 +1,101 @@ +import { describe, expect, it } from 'vitest'; + +import { + findEntityInBundles, + isCosmeticItemPort, + isNonLinkableItemClass, + resolveReferenceEntry, + type AllReferenceBundles, + type CategoryBundle, + type ReferenceCatalog, + type ReferenceCategory, + type ReferenceEntry, +} from './reference'; + +function refEntry( + category: ReferenceCategory, + class_name: string, + display_name: string, + slug: string | null = null, +): ReferenceEntry { + return { category, class_name, display_name, slug, summary: { category } }; +} + +function makeCatalog(entries: ReferenceEntry[]): ReferenceCatalog { + const m = new Map(); + for (const e of entries) m.set(e.class_name.toLowerCase(), e); + return m; +} + +function bundle(entries: ReferenceEntry[]): CategoryBundle { + return { map: new Map(), catalog: makeCatalog(entries), list: entries }; +} + +describe('resolveReferenceEntry — tray mirror', () => { + const vehicles = makeCatalog([ + refEntry('vehicle', 'ARGO_MOLE', 'ARGO MOLE', 'argo-mole'), + ]); + + it('strips _Teach loaner suffix', () => { + expect( + resolveReferenceEntry('vehicle', 'ARGO_MOLE_Teach', vehicles)?.slug, + ).toBe('argo-mole'); + }); + + it('resolves exact + case-insensitive', () => { + expect(resolveReferenceEntry('vehicle', 'argo_mole', vehicles)?.slug).toBe( + 'argo-mole', + ); + }); + + it('filters avatar/structural item noise', () => { + const items = makeCatalog([ + refEntry('item', 'grin_multitool_01', 'Greycat Multi-Tool', 'multitool'), + ]); + expect( + resolveReferenceEntry('item', 'Head_Eyelashes', items), + ).toBeUndefined(); + expect( + resolveReferenceEntry('item', 'grin_multitool_01', items)?.slug, + ).toBe('multitool'); + }); +}); + +describe('isNonLinkableItemClass / isCosmeticItemPort — tray mirror', () => { + it('flags noise classes but not equipment', () => { + expect(isNonLinkableItemClass('Default')).toBe(true); + expect(isNonLinkableItemClass('Shared_Scalp_Unified')).toBe(true); + expect(isNonLinkableItemClass('grin_multitool_01')).toBe(false); + }); + + it('flags cosmetic ports but not equipment ports', () => { + expect(isCosmeticItemPort('Hair_ItemPort')).toBe(true); + expect(isCosmeticItemPort('weapon_attach_hand_right')).toBe(false); + expect(isCosmeticItemPort(null)).toBe(false); + }); +}); + +describe('findEntityInBundles — applies noise + suffix logic', () => { + const bundles: AllReferenceBundles = { + vehicle: bundle([refEntry('vehicle', 'ARGO_MOLE', 'ARGO MOLE', 'argo-mole')]), + weapon: bundle([]), + item: bundle([ + refEntry('item', 'grin_multitool_01', 'Greycat Multi-Tool', 'multitool'), + ]), + location: bundle([]), + }; + + it('finds a loaner variant via suffix strip', () => { + const hit = findEntityInBundles('ARGO_MOLE_Teach', bundles); + expect(hit?.category).toBe('vehicle'); + expect(hit?.entry.slug).toBe('argo-mole'); + }); + + it('does not bind avatar noise even though it probes the item catalog', () => { + expect(findEntityInBundles('Head_Eyelashes', bundles)).toBeNull(); + }); + + it('returns null for a genuinely unknown identifier', () => { + expect(findEntityInBundles('NOPE_Unknown_Thing', bundles)).toBeNull(); + }); +}); diff --git a/apps/tray-ui/src/lib/reference.ts b/apps/tray-ui/src/lib/reference.ts index 2b0891e..6f2f364 100644 --- a/apps/tray-ui/src/lib/reference.ts +++ b/apps/tray-ui/src/lib/reference.ts @@ -328,6 +328,74 @@ export function webKbUrl( return `${base}/kb/${category}`; } +/** + * Variant / loaner suffixes appended to a base class name in some + * event payloads but absent from the wiki catalogue. Mirrors the web + * `apps/web/src/lib/reference-types.ts`. Stripped as a second lookup + * attempt so `ARGO_MOLE_Teach` resolves to `ARGO_MOLE`. Lowercased. + */ +const VARIANT_SUFFIXES: readonly string[] = ['_teach', '_loaner']; + +/** + * Item class identifiers that are character-avatar parts, structural + * placeholders, or engine defaults — never catalogued equipment. Keep + * in sync with the web mirror. Match is case-insensitive. + */ +const NON_LINKABLE_ITEM_PATTERNS: readonly RegExp[] = [ + /^default(_|$)/i, + /^head_/i, + /^body_/i, + /^shared_scalp/i, + /^pu_protos/i, + /^fp_visor$/i, + /^fps_default/i, + /lensdisplay/i, +]; + +/** True when an item class is avatar/structural noise. Pure. */ +export function isNonLinkableItemClass(classKey: string): boolean { + return NON_LINKABLE_ITEM_PATTERNS.some((re) => re.test(classKey)); +} + +const COSMETIC_ITEM_PORTS: readonly RegExp[] = [ + /^(eyes|hair|eyelashes|eyebrow|beard|teeth|head|face)_itemport$/i, + /^body_itemport$/i, + /_scalp/i, +]; + +/** True when an item PORT is avatar customisation / structural. Pure. */ +export function isCosmeticItemPort(port: string | null | undefined): boolean { + if (!port) return false; + return COSMETIC_ITEM_PORTS.some((re) => re.test(port)); +} + +/** + * Resolve a raw class identifier within a single category's catalog, + * applying the item-noise filter and variant-suffix strip. Mirror of + * the web `resolveReferenceEntry`. Pure; returns undefined on miss. + */ +export function resolveReferenceEntry( + category: ReferenceCategory, + classKey: string | null | undefined, + catalog: ReferenceCatalog | undefined, +): ReferenceEntry | undefined { + if (!classKey || !catalog) return undefined; + if (category === 'item' && isNonLinkableItemClass(classKey)) { + return undefined; + } + const key = classKey.toLowerCase(); + const direct = catalog.get(key); + if (direct) return direct; + if (category === 'location') return undefined; + for (const suffix of VARIANT_SUFFIXES) { + if (key.endsWith(suffix) && key.length > suffix.length) { + const stripped = catalog.get(key.slice(0, -suffix.length)); + if (stripped) return stripped; + } + } + return undefined; +} + /** Locate a class identifier across all four catalogues. Used by * the ReactNode prettifier (`prettifySummaryReact`) — the regex * picks tokens out of a server-rendered summary string without @@ -337,6 +405,10 @@ export function webKbUrl( * practice because the wiki sync namespaces by category, but the * iteration order is deterministic if they ever did. * + * Applies the item-noise filter + variant-suffix strip per category + * via `resolveReferenceEntry`, so loaner variants resolve and avatar + * noise doesn't bind. + * * Returns `null` when no catalogue claims the identifier — the * caller falls back to the raw string in that case (same * behaviour as the legacy `prettifySummary`). */ @@ -344,9 +416,12 @@ export function findEntityInBundles( classKey: string, bundles: AllReferenceBundles, ): { category: ReferenceCategory; entry: ReferenceEntry } | null { - const key = classKey.toLowerCase(); for (const category of REFERENCE_CATEGORIES) { - const entry = bundles[category].catalog.get(key); + const entry = resolveReferenceEntry( + category, + classKey, + bundles[category].catalog, + ); if (entry) return { category, entry }; } return null; diff --git a/apps/web/src/components/kb/EntityLink.tsx b/apps/web/src/components/kb/EntityLink.tsx index c03a5f4..36233b7 100644 --- a/apps/web/src/components/kb/EntityLink.tsx +++ b/apps/web/src/components/kb/EntityLink.tsx @@ -26,6 +26,7 @@ import type { ReferenceCatalog, ReferenceCategory, } from '@/lib/reference-types'; +import { resolveReferenceEntry } from '@/lib/reference-types'; import { toFriendlyName } from '@/lib/heuristic-name'; import { EntityHoverCard } from './EntityHoverCard'; import { TierChip } from './TierChip'; @@ -71,7 +72,7 @@ export function EntityLink({ return {label ?? ''}; } - const entry = catalog?.get(classKey.toLowerCase()); + const entry = resolveReferenceEntry(category, classKey, catalog); const text = label ?? entry?.display_name ?? toFriendlyName(classKey); // Tier chip is opt-in via `showTier` and only meaningful for diff --git a/apps/web/src/lib/reference-types.test.ts b/apps/web/src/lib/reference-types.test.ts index 83d26cc..69ad97c 100644 --- a/apps/web/src/lib/reference-types.test.ts +++ b/apps/web/src/lib/reference-types.test.ts @@ -1,11 +1,16 @@ import { describe, expect, it } from 'vitest'; import { + isCosmeticItemPort, + isNonLinkableItemClass, placementLabel, + resolveReferenceEntry, subtypeLabel, tierLabel, type LocationSummary, type Placement, + type ReferenceCatalog, + type ReferenceEntry, } from './reference-types'; describe('tierLabel', () => { @@ -90,3 +95,125 @@ describe('LocationSummary backward compat', () => { } }); }); + +function refEntry( + category: ReferenceEntry['category'], + class_name: string, + display_name: string, + slug: string | null = null, +): ReferenceEntry { + return { category, class_name, display_name, slug, summary: { category } }; +} + +/** Catalog keyed by lowercased class_name, mirroring getCategoryBundle. */ +function makeCatalog(entries: ReferenceEntry[]): ReferenceCatalog { + const m = new Map(); + for (const e of entries) m.set(e.class_name.toLowerCase(), e); + return m; +} + +describe('resolveReferenceEntry — variant-suffix strip (workstream A)', () => { + const vehicles = makeCatalog([ + refEntry('vehicle', 'ARGO_MOLE', 'ARGO MOLE', 'argo-mole'), + refEntry('vehicle', 'DRAK_Vulture', 'Drake Vulture', 'drake-vulture'), + ]); + + it('resolves an exact (and case-insensitive) class name', () => { + expect(resolveReferenceEntry('vehicle', 'ARGO_MOLE', vehicles)?.slug).toBe( + 'argo-mole', + ); + expect(resolveReferenceEntry('vehicle', 'argo_mole', vehicles)?.slug).toBe( + 'argo-mole', + ); + }); + + it('strips the _Teach loaner suffix to the base class', () => { + // The two real misses found in the live tray DB (93 + 13 events). + expect( + resolveReferenceEntry('vehicle', 'ARGO_MOLE_Teach', vehicles)?.slug, + ).toBe('argo-mole'); + expect( + resolveReferenceEntry('vehicle', 'DRAK_Vulture_Teach', vehicles)?.slug, + ).toBe('drake-vulture'); + }); + + it('does not over-strip when there is no catalogued base', () => { + expect( + resolveReferenceEntry('vehicle', 'SOME_Unknown_Teach', vehicles), + ).toBeUndefined(); + }); + + it('returns undefined when no catalog is supplied', () => { + expect( + resolveReferenceEntry('vehicle', 'ARGO_MOLE', undefined), + ).toBeUndefined(); + }); +}); + +describe('isNonLinkableItemClass — item noise filter (workstream D)', () => { + it('flags avatar / structural / default classes', () => { + for (const c of [ + 'Default', + 'Default_LensDisplay_PU', + 'Head_Eyelashes', + 'Head_Teeth', + 'body_01_noMagicPocket', + 'Shared_Scalp_Unified', + 'PU_Protos_Head', + 'FP_Visor', + 'FPS_DefaultRadar_Lens', + ]) { + expect(isNonLinkableItemClass(c), c).toBe(true); + } + }); + + it('does NOT flag genuine equipment', () => { + for (const c of [ + 'grin_multitool_01', + 'klwe_pistol_energy_01_mag', + 'crlf_consumable_healing_01', + 'behr_gren_frag_01', + ]) { + expect(isNonLinkableItemClass(c), c).toBe(false); + } + }); + + it('keeps noise item classes from resolving (renders plain text)', () => { + const items = makeCatalog([ + refEntry('item', 'Head_Eyelashes', 'Eyelashes', 'eyelashes'), + refEntry('item', 'grin_multitool_01', 'Greycat Multi-Tool', 'multitool'), + ]); + expect( + resolveReferenceEntry('item', 'Head_Eyelashes', items), + ).toBeUndefined(); + expect( + resolveReferenceEntry('item', 'grin_multitool_01', items)?.slug, + ).toBe('multitool'); + }); +}); + +describe('isCosmeticItemPort', () => { + it('flags avatar / structural ports, not equipment ports', () => { + for (const p of [ + 'Eyes_ItemPort', + 'Hair_ItemPort', + 'Eyelashes_ItemPort', + 'Body_ItemPort', + ]) { + expect(isCosmeticItemPort(p), p).toBe(true); + } + for (const p of [ + 'weapon_attach_hand_right', + 'magazine_attach', + 'Armor_Helmet', + 'utility_attach_1', + ]) { + expect(isCosmeticItemPort(p), p).toBe(false); + } + }); + + it('handles null / undefined', () => { + expect(isCosmeticItemPort(null)).toBe(false); + expect(isCosmeticItemPort(undefined)).toBe(false); + }); +}); diff --git a/apps/web/src/lib/reference-types.ts b/apps/web/src/lib/reference-types.ts index 1a30507..66b224d 100644 --- a/apps/web/src/lib/reference-types.ts +++ b/apps/web/src/lib/reference-types.ts @@ -346,6 +346,97 @@ export function prettyClass( return map.get(raw.toLowerCase()) ?? toFriendlyName(raw); } +/** + * Variant / loaner suffixes appended to a base class name in some + * event payloads but ABSENT from the wiki catalogue. Stripped as a + * second lookup attempt so e.g. `ARGO_MOLE_Teach` (the tutorial + * loaner) and `DRAK_Vulture_Teach` resolve to the catalogued + * `ARGO_MOLE` / `DRAK_Vulture`. Lowercased; matched as a suffix. + */ +const VARIANT_SUFFIXES: readonly string[] = ['_teach', '_loaner']; + +/** + * Item class identifiers that are character-avatar parts, structural + * placeholders, or engine defaults — never catalogued equipment. The + * `attachment_received` stream is dominated by these (avatar assembly + * on spawn), so without a filter the catalogue is consulted for + * thousands of `Head_Eyelashes` / `Default` / `body_*` "items". Match + * is case-insensitive; a hit means "render as plain text, never a + * link". Conservative — only patterns confirmed against real logs. + */ +const NON_LINKABLE_ITEM_PATTERNS: readonly RegExp[] = [ + /^default(_|$)/i, // "Default", "Default_LensDisplay_PU" + /^head_/i, // Head_Eyelashes, Head_Teeth, Head_Eyedetail + /^body_/i, // body_01_noMagicPocket (corpse / avatar body) + /^shared_scalp/i, // Shared_Scalp_Unified + /^pu_protos/i, // PU_Protos_Head + /^fp_visor$/i, // FP_Visor + /^fps_default/i, // FPS_DefaultRadar_Lens + /lensdisplay/i, // *_LensDisplay_* HUD glass +]; + +/** + * True when an item class is avatar/structural noise rather than a + * catalogued, linkable piece of equipment. Pure. + */ +export function isNonLinkableItemClass(classKey: string): boolean { + return NON_LINKABLE_ITEM_PATTERNS.some((re) => re.test(classKey)); +} + +/** + * Item *ports* that hold avatar customisation or structural sockets + * rather than meaningful equipment (`Eyes_ItemPort`, `Hair_ItemPort`, + * `Body_ItemPort`). Exposed so event-rendering surfaces can suppress + * `attachment_received` noise by port. Pure. + */ +const COSMETIC_ITEM_PORTS: readonly RegExp[] = [ + /^(eyes|hair|eyelashes|eyebrow|beard|teeth|head|face)_itemport$/i, + /^body_itemport$/i, + /_scalp/i, +]; + +export function isCosmeticItemPort(port: string | null | undefined): boolean { + if (!port) return false; + return COSMETIC_ITEM_PORTS.some((re) => re.test(port)); +} + +/** + * Resolve a raw class identifier to a catalog entry, applying two + * fallbacks beyond the exact case-insensitive key: + * 1. **Item noise filter** — avatar / structural item classes never + * resolve (so they render as plain text, never a misleading + * link). Only applies to `category === 'item'`. + * 2. **Variant-suffix strip** — `_Teach` / `_loaner` loaner variants + * fall back to their base class (`ARGO_MOLE_Teach` → `ARGO_MOLE`). + * Applies to vehicle / weapon / item. + * + * Locations are looked up by exact key only here — their richer + * multi-index / classifier resolution lives in the location catalog + * path, not in this generic catalog. Pure; returns undefined on miss. + */ +export function resolveReferenceEntry( + category: ReferenceCategory, + classKey: string | null | undefined, + catalog: ReferenceCatalog | undefined, +): ReferenceEntry | undefined { + if (!classKey || !catalog) return undefined; + if (category === 'item' && isNonLinkableItemClass(classKey)) { + return undefined; + } + const key = classKey.toLowerCase(); + const direct = catalog.get(key); + if (direct) return direct; + // Locations don't carry loaner-style variant suffixes. + if (category === 'location') return undefined; + for (const suffix of VARIANT_SUFFIXES) { + if (key.endsWith(suffix) && key.length > suffix.length) { + const stripped = catalog.get(key.slice(0, -suffix.length)); + if (stripped) return stripped; + } + } + return undefined; +} + // -- Location catalog types (Wave 1: catalog-driven hierarchy) --------- /** Trimmed shape of a wiki location entry — only the fields we use diff --git a/crates/starstats-core/src/location_catalog.rs b/crates/starstats-core/src/location_catalog.rs index 04846ed..12156a2 100644 --- a/crates/starstats-core/src/location_catalog.rs +++ b/crates/starstats-core/src/location_catalog.rs @@ -94,8 +94,61 @@ pub struct LocationCatalog { by_engine_tag: HashMap>, by_slug: HashMap>, by_normalized_name: HashMap>, + /// Content tokens of every entry's display name, aligned by index + /// with `entries`. Used by [`LocationCatalog::fuzzy_match`] to score + /// token overlap. Built once; never mutated after construction. + entry_tokens: Vec>, + /// Document frequency of each content token across all display + /// names. Drives the inverse-document-frequency weighting — a rare + /// word (`"kaltag"`, df 1) dominates a common one (`"outpost"`, + /// df ~200) so a match resting on filler words scores near zero. + name_token_df: HashMap, + /// Inverted index `token → entry indices`, restricted to tokens + /// rare enough to be worth gathering candidates on (df ≤ + /// [`MAX_INDEX_DF`]). Common filler tokens are deliberately absent — + /// they still contribute to *scoring* (via `name_token_df`) but + /// never *seed* a candidate set. + name_token_index: HashMap>, } +/// A token must appear in at most this many display names to seed a +/// fuzzy-match candidate set. Above this it's filler (`"outpost"`, +/// `"station"`, `"research"`) and gathering on it would scan hundreds +/// of rows for no precision gain. +const MAX_INDEX_DF: u32 = 40; + +/// The *anchor* requirement: to accept a fuzzy match, the matched entry +/// must share a non-digit content token this rare with the query. Set +/// deliberately low (≤ 4) so the anchor is a near-unique *place* name +/// (`"kaltag"`, `"goldenrod"`, df 1), never a corporate operator +/// shared across a dozen sibling outposts (`"rayari"`, df 6). This is +/// the guard that rejects `RayariHydro_McGarth` (no catalogued +/// `mcgarth`) instead of letting it land on a random Rayari outpost. +/// Trade-off: digit-discriminated families whose only non-digit token +/// is a shared operator (Shubin `SAL-2`/`SAL-5`) fall through to the +/// system heuristic rather than risk a wrong sibling. +const FUZZY_ANCHOR_DF: u32 = 4; + +/// Operator / utility words that appear in engine *affiliation* +/// segments (`RayariHydro_…`) and happen to also be rare words in some +/// unrelated wiki name. Excluded from anchor eligibility so a +/// coincidental affiliation-word overlap can't carry a match — e.g. +/// `hydro` must never bind `RayariHydro_McGarth` to `Terra Mills +/// HydroFarm`. They still contribute to *scoring* once a real anchor +/// exists; they just can't be the anchor themselves. +const AFFILIATION_NOISE: &[&str] = &[ + "hydro", + "dynamics", + "corp", + "corporation", + "industries", + "industrial", + "manufacturing", + "security", + "logistics", + "aerospace", +]; + impl LocationCatalog { /// Build the catalogue + all three indices in a single pass. /// Collisions on any index are resolved last-write-wins; the @@ -127,12 +180,145 @@ impl LocationCatalog { .by_normalized_name .insert(normalize_name(&arc.display_name), arc.clone()); + // Content tokens of the display name feed the fuzzy + // matcher. Dedup per-entry so a name like "Pyro2 M Trdp 01" + // counts each token once toward document frequency. + let mut toks = content_tokens(&arc.display_name); + toks.sort(); + toks.dedup(); + for tok in &toks { + *catalog.name_token_df.entry(tok.clone()).or_insert(0) += 1; + } + catalog.entry_tokens.push(toks); catalog.entries.push(arc); } + // Second pass: build the inverted index now that every token's + // document frequency is known, skipping filler tokens. + for (idx, toks) in catalog.entry_tokens.iter().enumerate() { + for tok in toks { + if catalog.name_token_df.get(tok).copied().unwrap_or(0) <= MAX_INDEX_DF { + catalog + .name_token_index + .entry(tok.clone()) + .or_default() + .push(idx); + } + } + } + catalog } + /// Fuzzy fallback used by the classifier when no exact engine-tag, + /// slug, or normalized-name key matched. Scores catalog entries by + /// inverse-document-frequency-weighted token overlap against the + /// query tokens (already split on `_` and `` by the + /// classifier), and returns the single best entry — or `None` when + /// no candidate clears the precision bar. + /// + /// Two guards keep precision high: + /// * **Distinctive-token requirement** — the winner must share a + /// non-digit token with df ≤ [`FUZZY_ANCHOR_DF`]. A match resting only + /// on filler (`"research"`, `"outpost"`) is rejected. + /// * **System consistency** — when the caller knows the system + /// (parsed from the engine string) and a candidate declares a + /// *different* system, that candidate is discarded. A + /// `Stanton…` engine string can never resolve to a Pyro row. + /// + /// Deterministic: ties break by score, then shared-token count, + /// then slug — never by `HashMap` iteration order. + pub fn fuzzy_match( + &self, + query_tokens: &[String], + system_hint: Option<&str>, + ) -> Option<&LocationCatalogEntry> { + // Expand the query into its content-token set. + let mut query: Vec = Vec::new(); + for t in query_tokens { + for tok in content_tokens(t) { + if !query.contains(&tok) { + query.push(tok); + } + } + } + if query.is_empty() { + return None; + } + + // Gather candidate entries: any entry sharing a non-filler + // token with the query. + let mut candidates: Vec = Vec::new(); + for tok in &query { + if let Some(idxs) = self.name_token_index.get(tok) { + candidates.extend_from_slice(idxs); + } + } + candidates.sort_unstable(); + candidates.dedup(); + + // Score each candidate over the FULL shared-token set (including + // common tokens, weighted near-zero), and apply both guards. + struct Scored { + idx: usize, + score: f32, + shared: u32, + } + let mut best: Option = None; + for &i in &candidates { + let entry = &self.entries[i]; + if let (Some(hint), Some(sys)) = (system_hint, entry.system.as_deref()) { + if !hint.eq_ignore_ascii_case(sys) { + continue; + } + } + let mut score = 0.0f32; + let mut shared = 0u32; + let mut has_anchor = false; + for tok in &self.entry_tokens[i] { + if !query.contains(tok) { + continue; + } + let df = self.name_token_df.get(tok).copied().unwrap_or(1).max(1); + // Every shared token counts toward the ranking score + // (so a shared digit still breaks SAL-2 from SAL-5)… + score += 1.0 / df as f32; + shared += 1; + // …but only a rare, non-digit, non-affiliation token + // qualifies as the *anchor* that licenses the match. + if df <= FUZZY_ANCHOR_DF + && tok.len() >= 3 + && !tok.chars().all(|c| c.is_ascii_digit()) + && !AFFILIATION_NOISE.contains(&tok.as_str()) + { + has_anchor = true; + } + } + if !has_anchor { + continue; + } + let better = match &best { + None => true, + Some(b) => { + score > b.score + || (score == b.score && shared > b.shared) + || (score == b.score + && shared == b.shared + && entry.slug < self.entries[b.idx].slug) + } + }; + if better { + best = Some(Scored { + idx: i, + score, + shared, + }); + } + } + + best.map(|b| self.entries[b.idx].as_ref()) + } + /// Number of entries in the catalogue (post-dedup of empty rows). pub fn len(&self) -> usize { self.entries.len() @@ -194,6 +380,73 @@ fn normalize_name(name: &str) -> String { out } +/// System names are dropped from content tokens — the classifier +/// tracks the system separately, and including it would let any two +/// same-system locations share a (useless) token. +const SYSTEM_TOKENS: &[&str] = &["stanton", "pyro", "nyx", "castra", "terra", "sol"]; + +/// Split a name or engine identifier into lowercase content tokens for +/// fuzzy matching. Boundaries: non-alphanumerics, camelCase humps, and +/// letter↔digit transitions. Drops system names and sub-3-char +/// non-numeric noise (single letters like a stray `b` from `1b`). +/// +/// * `"RayariHydro_Deltana"` → `["rayari", "hydro", "deltana"]` +/// * `"Shubin Mining SAL-2"` → `["shubin", "mining", "sal", "2"]` +/// * `"Stanton4a_Shubin_SM0_13"` → `["4a"→…, "shubin", "sm", "0", "13"]` +pub fn content_tokens(s: &str) -> Vec { + #[derive(PartialEq, Clone, Copy)] + enum Kind { + Upper, + Lower, + Digit, + Other, + } + fn kind(c: char) -> Kind { + if c.is_ascii_digit() { + Kind::Digit + } else if c.is_ascii_uppercase() { + Kind::Upper + } else if c.is_ascii_lowercase() { + Kind::Lower + } else { + Kind::Other + } + } + + let mut tokens: Vec = Vec::new(); + let mut cur = String::new(); + let mut prev: Option = None; + for c in s.chars() { + let k = kind(c); + let boundary = matches!( + (prev, k), + (_, Kind::Other) + | (Some(Kind::Other), _) + | (Some(Kind::Lower), Kind::Upper) + | (Some(Kind::Digit), Kind::Upper) + | (Some(Kind::Lower), Kind::Digit) + | (Some(Kind::Upper), Kind::Digit) + | (Some(Kind::Digit), Kind::Lower) + ); + if boundary && !cur.is_empty() { + tokens.push(std::mem::take(&mut cur)); + } + if k != Kind::Other { + cur.push(c.to_ascii_lowercase()); + } + prev = Some(k); + } + if !cur.is_empty() { + tokens.push(cur); + } + + tokens.retain(|t| { + let all_digit = t.chars().all(|c| c.is_ascii_digit()); + (t.len() >= 3 || all_digit) && !SYSTEM_TOKENS.contains(&t.as_str()) + }); + tokens +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/starstats-core/src/location_classifier.rs b/crates/starstats-core/src/location_classifier.rs index e5142d4..937f048 100644 --- a/crates/starstats-core/src/location_classifier.rs +++ b/crates/starstats-core/src/location_classifier.rs @@ -20,20 +20,29 @@ //! Resolution order (first hit wins): //! //! 1. **Synthetic** — engine patterns the wiki doesn't model -//! (jump points, comm arrays, crash sites, caves, bunkers). -//! These map to a synthetic `AnonymousPoi` tier with a -//! derived subtype. -//! 2. **Catalog** — `LocationCatalog::lookup_by_token` against -//! every token in the stripped raw string. The strongest +//! (jump points, comm arrays, crash sites, caves, bunkers), plus +//! *noise* patterns (procedural mining/cluster nodes, dynamic +//! mission/nav markers) which get an honest generic label and a +//! suppressible subtype instead of being title-cased into fake +//! proper-noun places. All map to a synthetic `AnonymousPoi` tier +//! with a derived subtype. +//! 2. **Catalog (exact)** — `LocationCatalog::lookup_by_token` +//! against every token in the stripped raw string. The strongest //! binding because it pulls real wiki taxonomy. -//! 3. **System fallback** — engine string contains a known +//! 3. **Catalog (fuzzy)** — `LocationCatalog::fuzzy_match`: idf- +//! weighted distinctive-token overlap, guarded by a rarity floor +//! and system consistency. Recovers real wiki rows the engine +//! names differently (`Stanton4a_RayariHydro_Kaltag` → +//! `Rayari Kaltag Research Outpost`). Runs before the heuristic +//! so a real row beats a bare-system guess. +//! 4. **System fallback** — engine string contains a known //! system token (`Stanton`/`Pyro`/`Nyx`/…) but no catalogue //! hit. Tier left as `AnonymousPoi`; system populated. -//! 4. **Body short-code fallback** — engine emits Lagrange +//! 5. **Body short-code fallback** — engine emits Lagrange //! prefixes like `HUR_L1` or affiliation short codes like //! `HDMS_*` / `Shubin_*`. Mapped to the parent system + a //! synthetic body name. -//! 5. **Last-resort title-case** — none of the above matched. +//! 6. **Last-resort title-case** — none of the above matched. //! Display name is the title-cased raw; tier `AnonymousPoi`. use std::collections::HashMap; @@ -90,8 +99,13 @@ pub struct LocationClassification { #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum ClassificationSource { - /// Matched against `LocationCatalog`. + /// Matched against `LocationCatalog` by an exact key (engine tag, + /// slug, or normalized name). Catalog, + /// Matched `LocationCatalog` via the distinctive-token fuzzy + /// fallback. A real wiki binding, but lower-confidence than an + /// exact key hit — kept distinct for coverage-quality telemetry. + Fuzzy, /// Matched a `SYNTHETIC_MATCHER` (engine-only pattern with no wiki entry). Synthetic, /// Matched a system / body short-code dictionary. @@ -119,24 +133,72 @@ pub fn classify(raw: &str, catalog: &LocationCatalog) -> LocationClassification // `Stanton1b`-style engine tags), then each segment. let joined = parts.join(""); if let Some(hit) = catalog.lookup_by_token(&joined) { - return from_catalog(hit, raw); + return from_catalog(hit, raw, ClassificationSource::Catalog); + } + // Exact per-token, but skip bare *system* tokens on this pass. The + // system itself is a catalogued location (slug `stanton`), and the + // engine emits the system token FIRST (`OOC_Stanton_2b_Daymar`), + // so matching it here would shadow the specific body/place that + // follows — collapsing every planet/moon to its system name. + // Deferred to a second pass below so a *bare* system identifier + // still resolves. + for token in &parts { + if KNOWN_SYSTEMS.contains_key(token.to_ascii_lowercase().as_str()) { + continue; + } + if let Some(hit) = catalog.lookup_by_token(token) { + return from_catalog(hit, raw, ClassificationSource::Catalog); + } + } + + // 3. Distinctive-token fuzzy match. Runs BEFORE the system + // heuristic: a real wiki row (`Rayari Kaltag Research Outpost`) + // beats a bare-system guess (`Stanton`). System hint, parsed + // from the same parts, prevents cross-system false positives. + let hint = system_hint(&parts); + if let Some(hit) = catalog.fuzzy_match(&parts, hint) { + return from_catalog(hit, raw, ClassificationSource::Fuzzy); } + + // 4. Deferred system-token exact match — a bare system identifier + // (just `Stanton`) still resolves to the system row with its + // full taxonomy now that specific tokens have had priority. for token in &parts { if let Some(hit) = catalog.lookup_by_token(token) { - return from_catalog(hit, raw); + return from_catalog(hit, raw, ClassificationSource::Catalog); } } - // 3-4. System / body short-code heuristics. + // 5. System / body short-code heuristics. if let Some(c) = system_or_body_heuristic(&parts, raw) { return c; } - // 5. Last-resort title-case. + // 6. Last-resort title-case. fallback(&title_case_segments(&parts), raw) } -fn from_catalog(hit: &LocationCatalogEntry, raw: &str) -> LocationClassification { +/// Best-effort system parse from already-stripped parts, reusing the +/// same dictionaries the heuristic tier uses. Feeds the fuzzy matcher's +/// system-consistency guard. +fn system_hint(parts: &[String]) -> Option<&'static str> { + for p in parts { + let key = p.to_ascii_lowercase(); + if let Some(meta) = KNOWN_BODY_SHORT_CODES.get(key.as_str()) { + return Some(meta.system); + } + if let Some(sys) = KNOWN_SYSTEMS.get(key.as_str()) { + return Some(sys); + } + } + None +} + +fn from_catalog( + hit: &LocationCatalogEntry, + raw: &str, + source: ClassificationSource, +) -> LocationClassification { LocationClassification { display_name: hit.display_name.clone(), slug: Some(hit.slug.clone()), @@ -153,7 +215,7 @@ fn from_catalog(hit: &LocationCatalogEntry, raw: &str) -> LocationClassification operator: hit.taxonomy.operator.clone(), faction: hit.taxonomy.faction.clone(), raw: raw.to_string(), - source: ClassificationSource::Catalog, + source, } } @@ -277,6 +339,14 @@ static SYNTHETIC_MATCHERS: &[SyntheticMatcher] = &[ match_rest_stop_engine, match_rest_stop_generic, match_orbital_marker, + // Noise patterns — engine-only dynamic / procedural identifiers + // that have no catalogued wiki page. Classified with an honest + // generic label + a suppressible subtype so they stop being + // title-cased into fake proper-noun "places" (e.g. + // `ab_mine_stanton2_med_010` → "Asteroid mining node", not + // "Ab Mine Stanton 2 Med 010"). + match_dynamic_marker, + match_procedural_node, ]; /// Jump-point detection. The shape gating the matcher is simply the @@ -645,6 +715,51 @@ fn match_orbital_marker(parts: &[String], raw: &str) -> Option Option { + let lower = raw.to_ascii_lowercase(); + let (display, subtype) = + if lower.contains("navpoint_dynamic") || lower.contains("dynamic_navpoint") { + ("Dynamic nav point", "nav_marker") + } else if lower.contains("mission_qt") || lower.contains("quantum_beacon") { + ("Mission marker", "mission_marker") + } else { + return None; + }; + Some(synthetic(display.to_string(), subtype, None, raw)) +} + +/// Procedural / instanced resource sites: asteroid mining and gas +/// collection nodes, asteroid clusters (`*.socpak` object containers), +/// and static race tracks. Real places players visit, but not +/// catalogued wiki entities — so we give an honest category label and +/// attach the system when the engine string carries one. +fn match_procedural_node(parts: &[String], raw: &str) -> Option { + let lower = raw.to_ascii_lowercase(); + let (display, subtype) = if lower.contains("ab_mine") { + ("Asteroid mining node", "mining_node") + } else if lower.contains("ab_collector") { + ("Gas collection node", "gas_node") + } else if lower.contains("_cluster_") || lower.ends_with(".socpak") { + ("Asteroid cluster", "asteroid_cluster") + } else if lower.contains("racing_static") { + ("Race track", "race_track") + } else { + return None; + }; + Some(synthetic( + display.to_string(), + subtype, + system_hint(parts).map(str::to_string), + raw, + )) +} + fn synthetic( display: String, subtype: &str, @@ -1254,4 +1369,294 @@ mod tests { assert_eq!(c.source, ClassificationSource::Catalog); assert_eq!(c.display_name, "Aberdeen"); } + + #[test] + fn specific_body_wins_over_bare_system_token() { + // Regression guard for the shadowing fix: with BOTH the Stanton + // system row and the Daymar moon in the catalogue, the engine + // string `OOC_Stanton_2b_Daymar` must resolve to "Daymar", not + // be shadowed by the leading `Stanton` token. Pre-fix this + // collapsed 91% of real location events to their system name. + let daymar = LocationCatalogEntry { + slug: "daymar".into(), + display_name: "Daymar".into(), + class_name: "daymar".into(), + engine_tag: None, + system: Some("Stanton".into()), + parent_body: Some("Crusader".into()), + classification: Some("Moon".into()), + taxonomy: LocationTaxonomy { + tier: Some(LocationTier::AstronomicalObject), + subtype: Some("moon".into()), + ..LocationTaxonomy::default() + }, + }; + let stanton_system = LocationCatalogEntry { + slug: "stanton".into(), + display_name: "Stanton".into(), + class_name: "stanton".into(), + engine_tag: None, + system: Some("Stanton".into()), + parent_body: None, + classification: Some("System".into()), + taxonomy: LocationTaxonomy::default(), + }; + let cat = catalog_with(vec![stanton_system, daymar]); + let c = classify("OOC_Stanton_2b_Daymar", &cat); + assert_eq!(c.display_name, "Daymar"); + assert_eq!(c.source, ClassificationSource::Catalog); + assert_eq!(c.subtype.as_deref(), Some("moon")); + } + + #[test] + fn bare_system_identifier_still_resolves_to_system_row() { + // The flip side of the shadowing fix: a *bare* system string + // (no specific body token) must still hit the system row via + // the deferred second pass — keeping its slug + System tier. + let stanton_system = LocationCatalogEntry { + slug: "stanton".into(), + display_name: "Stanton".into(), + class_name: "stanton".into(), + engine_tag: None, + system: Some("Stanton".into()), + parent_body: None, + classification: Some("System".into()), + taxonomy: LocationTaxonomy { + tier: Some(LocationTier::System), + ..LocationTaxonomy::default() + }, + }; + let cat = catalog_with(vec![stanton_system]); + let c = classify("OOC_Stanton", &cat); + assert_eq!(c.display_name, "Stanton"); + assert_eq!(c.source, ClassificationSource::Catalog); + assert_eq!(c.tier, LocationTier::System); + } + + // ---- distinctive-token fuzzy matcher --------------------------- + // + // Every engine identifier below is verbatim from a real LIVE tray + // DB (2026-05-31); the wiki names are the real + // api.star-citizen.wiki display names they should resolve to. + // These are the "real but unmatched by exact keys" locations that + // motivated the fuzzy tier. + + fn outpost(slug: &str, name: &str, system: &str) -> LocationCatalogEntry { + LocationCatalogEntry { + slug: slug.into(), + display_name: name.into(), + class_name: name.replace(' ', ""), + engine_tag: None, + system: Some(system.into()), + parent_body: None, + classification: Some("Outpost".into()), + taxonomy: LocationTaxonomy { + tier: Some(LocationTier::Landmark), + subtype: Some("outpost".into()), + ..LocationTaxonomy::default() + }, + } + } + + /// A realistic-ish catalog: the real recoverable rows plus enough + /// filler "* Research Outpost" rows that `outpost` / `research` / + /// `mining` climb above `FUZZY_ANCHOR_DF`, so a filler-only overlap is + /// correctly rejected (mirrors the real ~1955-row catalogue). + fn fuzzy_catalog() -> LocationCatalog { + let mut entries = vec![ + outpost( + "rayari-kaltag-research-outpost", + "Rayari Kaltag Research Outpost", + "Stanton", + ), + outpost( + "rayari-deltana-research-outpost", + "Rayari Deltana Research Outpost", + "Stanton", + ), + outpost( + "rayari-cantwell-research-outpost", + "Rayari Cantwell Research Outpost", + "Stanton", + ), + outpost( + "rayari-anvik-research-outpost", + "Rayari Anvik Research Outpost", + "Stanton", + ), + // Fifth Rayari outpost → `rayari` df 5 (> FUZZY_ANCHOR_DF), + // so the operator name alone can never anchor a match. + outpost( + "rayari-hickes-research-outpost", + "Rayari Hickes Research Outpost", + "Stanton", + ), + // Coincidental rare `hydro` token — the trap that must NOT + // catch `RayariHydro_*` engine strings. + outpost("terra-mills-hydrofarm", "Terra Mills HydroFarm", "Stanton"), + outpost( + "shubin-mining-facility-sal-2", + "Shubin Mining Facility SAL-2", + "Stanton", + ), + outpost( + "shubin-mining-facility-sal-5", + "Shubin Mining Facility SAL-5", + "Stanton", + ), + outpost( + "sakura-sun-goldenrod-workcenter", + "Sakura Sun Goldenrod Workcenter", + "Stanton", + ), + outpost("benson-mining-outpost", "Benson Mining Outpost", "Stanton"), + outpost( + "deakins-research-outpost", + "Deakins Research Outpost", + "Stanton", + ), + ]; + // Filler padding — inflate df of generic words (`research`, + // `outpost`) so a filler-only overlap can't clear the anchor. + for i in 0..12 { + entries.push(outpost( + &format!("filler-{i}-research-outpost"), + &format!("Filler{i} Research Outpost"), + "Stanton", + )); + } + catalog_with(entries) + } + + #[test] + fn fuzzy_recovers_rayari_kaltag() { + let cat = fuzzy_catalog(); + let c = classify("Stanton4a_RayariHydro_Kaltag", &cat); + assert_eq!(c.source, ClassificationSource::Fuzzy); + assert_eq!(c.slug.as_deref(), Some("rayari-kaltag-research-outpost")); + assert_eq!(c.system.as_deref(), Some("Stanton")); + } + + #[test] + fn fuzzy_recovers_sakura_sun_goldenrod() { + let cat = fuzzy_catalog(); + let c = classify("Stanton4_DistributionCentre_SakuraSun_Goldenrod", &cat); + assert_eq!(c.source, ClassificationSource::Fuzzy); + assert_eq!(c.slug.as_deref(), Some("sakura-sun-goldenrod-workcenter")); + } + + #[test] + fn fuzzy_disambiguates_shubin_sal2_from_sal5() { + // Both rows share `shubin`+`mining`+`facility`+`sal`; only the + // trailing digit separates them. The idf score must tip toward + // the row that also shares the `2`. (Here `shubin` df is 2, + // within FUZZY_ANCHOR_DF, so it anchors; in the full + // production catalogue `shubin` is more common and this family + // falls back to the system heuristic — a deliberate + // precision-over-recall trade for digit-only discriminators.) + let cat = fuzzy_catalog(); + let c2 = classify("Stanton3a_Shubin_SAL2", &cat); + assert_eq!(c2.slug.as_deref(), Some("shubin-mining-facility-sal-2")); + let c5 = classify("Stanton3a_Shubin_SAL5", &cat); + assert_eq!(c5.slug.as_deref(), Some("shubin-mining-facility-sal-5")); + } + + #[test] + fn fuzzy_rejects_filler_only_overlap() { + // Engine string shares only `research`/`outpost` (both far + // above FUZZY_ANCHOR_DF). No distinctive anchor → no fuzzy hit; + // falls through to the system heuristic instead of fabricating + // a wrong wiki link. + let cat = fuzzy_catalog(); + let c = classify("Stanton2a_Unmapped_Research_Outpost", &cat); + assert_ne!(c.source, ClassificationSource::Fuzzy); + assert_eq!(c.system.as_deref(), Some("Stanton")); + } + + #[test] + fn fuzzy_rejects_uncatalogued_place_with_only_operator_overlap() { + // `RayariHydro_McGarth`: there is no `McGarth` row. The engine + // string overlaps the catalogue only on the operator `rayari` + // (df 5, above the anchor bar) and the affiliation word `hydro` + // (denylisted). Neither may anchor → no match, so it must NOT + // bind to a random Rayari sibling or to Terra Mills HydroFarm. + let cat = fuzzy_catalog(); + let c = classify("Stanton4b_RayariHydro_McGarth", &cat); + assert_ne!( + c.source, + ClassificationSource::Fuzzy, + "unexpected fuzzy bind to {}", + c.display_name + ); + } + + #[test] + fn fuzzy_respects_system_consistency_guard() { + // The only `Kaltag` row is in Pyro; a Stanton engine string + // must NOT cross-match it. + let cat = catalog_with(vec![outpost( + "rayari-kaltag-research-outpost", + "Rayari Kaltag Research Outpost", + "Pyro", + )]); + let c = classify("Stanton4a_RayariHydro_Kaltag", &cat); + assert_ne!(c.source, ClassificationSource::Fuzzy); + } + + #[test] + fn fuzzy_does_not_fire_when_exact_key_matches() { + // Exact engine-tag/slug must always win over fuzzy. + let cat = catalog_with(vec![aberdeen_entry()]); + let c = classify("OOC_Stanton_1b_Aberdeen", &cat); + assert_eq!(c.source, ClassificationSource::Catalog); + } + + // ---- noise classification -------------------------------------- + + #[test] + fn noise_asteroid_mining_node() { + let c = classify("ab_mine_stanton2_med_010", &empty_catalog()); + assert_eq!(c.source, ClassificationSource::Synthetic); + assert_eq!(c.subtype.as_deref(), Some("mining_node")); + assert_eq!(c.system.as_deref(), Some("Stanton")); + } + + #[test] + fn noise_gas_collection_node() { + let c = classify("ab_collector_gas_Stanton1", &empty_catalog()); + assert_eq!(c.subtype.as_deref(), Some("gas_node")); + } + + #[test] + fn noise_asteroid_cluster_socpak_beats_fuzzy() { + // `shubin_cluster_..._.socpak` is a procedural asteroid field, + // NOT the Shubin facility. Noise classification (a synthetic + // matcher) runs before fuzzy, so even with the facility in the + // catalog it must classify as a cluster. + let cat = fuzzy_catalog(); + let c = classify( + "shubin_cluster_001_frost_{13DA184B-8620-4DAE-9450-5CE6F2ADA1A5}.socpak", + &cat, + ); + assert_eq!(c.source, ClassificationSource::Synthetic); + assert_eq!(c.subtype.as_deref(), Some("asteroid_cluster")); + } + + #[test] + fn noise_mission_marker() { + let c = classify("MISSION_QT_Quantum_Beacon_286174403838", &empty_catalog()); + assert_eq!(c.subtype.as_deref(), Some("mission_marker")); + } + + #[test] + fn noise_dynamic_nav_point() { + let c = classify("NavPoint_Dynamic_285165357631", &empty_catalog()); + assert_eq!(c.subtype.as_deref(), Some("nav_marker")); + } + + #[test] + fn noise_race_track() { + let c = classify("racing_static_st2c_ghexasteroid", &empty_catalog()); + assert_eq!(c.subtype.as_deref(), Some("race_track")); + } }