From 1f8ad91479e4ed70d03f7572c7380366ef6876d0 Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 00:11:59 +0100 Subject: [PATCH 1/9] feat: planning the hmrc docs change --- docs/hmrc-csv-format-change.md | 192 +++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 docs/hmrc-csv-format-change.md diff --git a/docs/hmrc-csv-format-change.md b/docs/hmrc-csv-format-change.md new file mode 100644 index 0000000..cb4f7a5 --- /dev/null +++ b/docs/hmrc-csv-format-change.md @@ -0,0 +1,192 @@ +# Plan: Adapt HMRC ingestion to the new gov.uk CSV format (+ CH-sourced location) + +## Context + +The gov.uk "Worker and Temporary Worker" sponsor CSV changed format on 2026-06-09, which +hard-failed the ingestion GitHub Action (schema-validation guard, working as designed — +exit 1 before touching the DB). The feed: + +- **Dropped** `Town/City`, `County` (HMRC can't guarantee location correctness — it's + Companies House's domain, which is *why* they removed it). +- **Renamed** `Type & Rating` → `TierRating` and `Route` → `Migrant Classification` + (verified: the value strings are **byte-for-byte identical** — same 9 / 17 distinct + values, trailing spaces and all). +- **Added** `Sponsor Licence Number` (a stable per-sponsor ID) and `Sponsor Status` + (currently single-valued: "Licensed and Fully Active"). + +**Decisions locked with the user:** + +1. Ingest stays **true to HMRC** — clean atomic swap, **no carry-forward** of old + town/county or slugs. Row `hash` (the `/company/$id/$slug` URL id) therefore changes. +2. **Adopt** the Sponsor Licence Number (and Status) into the table. +3. Confirmed Companies House API exposes **no** sponsor licence number — so the licence is + a within-dataset stable key, *not* a CH join key. CH joins still go via org name. +4. Listing-page location now comes from Companies House via a **query-time `LEFT JOIN`** + (not denormalized). 
Verified live: ~86% of rows (121,265 / 141,264) resolve to a CH + locality; `address_line_2` fallback adds only ~850. The join is ~free — location is + pure display (never in `WHERE`/`ORDER BY`), so it's PK probes on the ≤50 returned rows, + dominated by the existing trigram scan. + +**Outcome:** ingestion green again on the new feed; listing/detail location sourced from +CH; licence number captured; the HMRC table reflects exactly what HMRC publishes. + +--- + +## Changes by area + +### 1. Ingestion script — `apps/web/scripts/ingest-hmrc-csv.ts` + +- **`EXPECTED_COLUMNS`** (lines 7-13) → `['Sponsor Licence Number', 'Organisation Name', + 'TierRating', 'Migrant Classification', 'Sponsor Status']`. Keep the guard — it now also + catches the *next* format change. +- **Column mapping at parse** (dedup loop, lines 142-163): keep internal DB names so the + matching pipeline is untouched — `TierRating` → `type_rating`, `Migrant Classification` + → `route`. Read `Sponsor Licence Number` → `sponsor_licence_number`, `Sponsor Status` → + `sponsor_status`. Drop all `Town/City` / `County` reads. +- **New hash basis** — `computeHash(licence, typeRating, route)` (replaces lines 113-126): + `base64url(sha256(licence | type_rating | route))[:11]`. Rationale: + - Stable: licence is durable, so **future ingests no longer churn URLs — this is the + last hash migration** (the old town/county inputs were the churny part). + - Disambiguates the 903 org-names that map to >1 licence. + - Excludes org name → company renames don't change the URL (`nameSlug` is cosmetic; the + loader resolves by hash). + - Keep the `UNIQUE(hash)` constraint as a collision guard; dedup on the same key. +- **`nameSlug`** unchanged: `slugify(orgName) || hash` (line 149). +- **Staging DDL** (lines 88-99): drop `town_city`, `county`; add + `sponsor_licence_number varchar(20)`, `sponsor_status varchar(64)`. Update the + `INSERT`/placeholder builder (lines 169-199) and `CleanedRow` type (129-137) to match. 
+- **Indexes** (lines 203-209): drop `stg_idx_hmrc_town_city`; keep org_name / name_slug / + route / org_name_trgm; add `stg_idx_hmrc_licence` on `sponsor_licence_number`. +- **Atomic-swap renames** (lines 215-224): drop the `town_city` index rename; add the + licence index rename. +- Checksum guard + `hmrc_ingestion_meta` insert unchanged. + +### 2. DB schema + migration — `packages/db/src/schema.ts` + +- `hmrcSkilledWorkers` (lines 17-39): remove `townCity`, `county`; add + `sponsorLicenceNumber varchar(20)`, `sponsorStatus varchar(64)`. Remove + `idx_hmrc_town_city`; add `idx_hmrc_licence`. +- Generate a Drizzle migration (`bun run` the drizzle-kit generate flow from root). +- **Lockstep invariant:** `schema.ts` ≡ the ingest script's `CREATE TABLE` DDL ≡ the + migration result must all agree. The ingest atomic-swap rebuilds the table from its own + DDL, but the migration must run *first* so the live table has the new columns before any + deployed query references them (avoids a SELECT on a not-yet-existing column). + +### 3. Search query — `apps/web/src/api/hmrc.ts` + +- `searchHmrc` (18-63): add + `.leftJoin(hmrcCompanyMapping, eq(hmrcCompanyMapping.organisationName, hmrcSkilledWorkers.organisationName))` + then `.leftJoin(companiesHouseProfiles, eq(companiesHouseProfiles.companyNumber, hmrcCompanyMapping.companyNumber))`. + Replace the `townCity`/`county` selects with + `location: sql\`COALESCE(${companiesHouseProfiles.locality}, ${companiesHouseProfiles.addressLine2})\``. + Score/order/limit unchanged (they reference only `hmrcSkilledWorkers`, so ranking and + `LIMIT` pushdown are unaffected). +- `getHmrcBySlugId` (69-90): drop `townCity`/`county` (detail page sources location from + the CH profile it already loads). Optionally also select `sponsorStatus` for a future + status badge. +- `HmrcRow` type (157) updates automatically; drop town/county, add `location`. +- `getHmrcBySlug` fallback (130-143) unchanged. + +### 4. 
UI + +- **`HmrcCard.tsx:88-90`** — replace `[row.townCity, row.county]…join(', ')` with + `titleCase(row.location ?? '')` (single token, e.g. "London"). +- **`HmrcResults.tsx:41-46`** (pretext field 2 `getText`) — change to + `(row) => titleCase(row.location ?? '')`. Keep `font:'14px Geist'`, `lineHeight:20`, + `fixedHeight:62`. ⚠️ **`CLAUDE.md` invariant** — `useCardMetrics` must stay in sync with + card CSS; the measured *text* changes (now a single CH token, shorter → less wrapping) + but the font/height config is unchanged, so `fixedHeight` stays 62. +- **`company.$id.$slug.tsx`** — the loader already fetches the CH `profile` + (lines 79-81). Source location from it: `displayLocation` (244) and head() (127-128) → + `formatLocation(profile?.registered_office_address?.locality, …?.region)`. Detail page + keeps its richer two-part display (town + region), now CH-sourced. Remove `townCity`/ + `county` from the loaderData `sponsor` type (90-96) and all usages. +- **`McpTools.tsx:80,215`** — update location formatting to use the new `location` field + from the search row. +- **`utils.ts` `formatLocation` (87-102)** — keep as-is (generic two-arg joiner), now fed + CH locality/region on the detail page. + +### 5. Matching pipeline — compile-safe, accept tiebreak loss + +`route` is preserved (renamed at ingest), so the route-type hard gate is unaffected. Only +the town/county *DB reads* break: + +- **`apps/web/src/lib/phase5/sql.ts` `makeLookupSponsor` (122-147)** — SELECT drops + `town_city, county`, keeps `route`; return `{ townCity: null, county: null, route }`. +- **`apps/web/src/api/companiesHouse.ts` on-demand resolver (222-244)** — remove the + `hmrcRow` SELECT of `townCity`/`county`; pass `{ townCity: null, county: null }` to + `resolveOneSponsor`. 
+- Leave the resolver/scorer **types and logic intact** (`pipeline.ts pickByLocality`, + `score-candidate.ts`, `compare-candidates.ts`, `resolve-sponsor.ts`, `sweep.ts`) — they + now operate on null locality, so the geographic tiebreak is inert. Existing phase5 tests + pass mock localities directly, so they still pass. +- **Documented degradation (accepted):** without HMRC town, ambiguous same-name orgs lose + the `pickByLocality` tiebreak → more `human_review` / a growing review queue. Acceptable + per the premise (HMRC location was unreliable); the 88% already-mapped orgs keep their + `company_number`, so steady-state location coverage holds. Optional later: replace the + geo tiebreak with a non-geo signal (company status / name-similarity margin) — out of + scope here. + +### 6. Sitemap + SEO + +- `apps/web/scripts/generate-sitemap.ts` — no change (selects `hash`, `nameSlug`, both + retained). After the swap all hashes change; the workflow's `data-changed`-gated + `sitemap:generate` + PR steps regenerate them. +- **One-time URL churn:** every detail URL changes once. The existing loader fallback + (`company.$id.$slug.tsx:59-77`) absorbs it — 301 for single-row orgs (~90%, unique + `name_slug`), 302→search for multi-route orgs. Because the new hash is licence-based and + stable, this is a one-time event, not recurring. +- **Optional SEO safeguard (recommend):** change the multi-match branch (69-75) from + 302→search to **301→the canonical (first) row**, so old multi-route URLs land on a real + page and retain link equity instead of bouncing to search. + +### 7. Workflow / dev scripts + +- `.github/workflows/hmrc-ingestion.yaml` — no change; it succeeds once the script handles + the new columns. (The `actions/checkout@v4` Node-20 deprecation warning is unrelated + housekeeping.) 
+- **Follow-up (non-blocking, local-only):** `generate-hmrc-seed-sql.ts` parses the old CSV + columns and will break on the new format / old fixture; update it or refresh + `apps/web/data/2026-03-31-Worker.csv`. Re-check `seed-companies-house.ts` for town/county + reads. + +--- + +## Verification + +1. **Lint/type/test** (from repo root): `bun lint:fix && bun lint`, the monorepo + type-check (HmrcRow ripple → HmrcCard/HmrcResults/McpTools/detail page), `bun test` + (phase5 tests should stay green). +2. **Ingest dry-run on a branch/staging DB** against the live new CSV with `--force`: + confirm schema validation passes, roughly 140,876 unique rows (that figure was counted on `(org, type, route)`; dedup now keys on `(licence, type, route)`, so expect a slightly different total), no town/county columns, + `sponsor_licence_number`/`sponsor_status` populated, hashes minted, UNIQUE holds. +3. **Listing**: run search locally, confirm cards show CH locality (e.g. "Checkout LTD" → + "London"), ~86% populated / ~14% blank; confirm the join doesn't regress search latency + (EXPLAIN: trigram scan dominates, joins are PK nested-loops on the returned window). +4. **Detail**: confirm location now renders from the CH profile. +5. **Sitemap**: `bun run sitemap:generate` → new hashes in output. +6. **Redirects**: hit an old `/company/OLDHASH/slug` → 301 (single-row) or 302/301 + (multi-route, depending on the optional safeguard). +7. **Deploy order**: migration (adds columns) → deploy code → trigger ingest (`--force`, + repopulates licence/status + new hashes) → sitemap regen PR. + +--- + +## Open items to confirm before implementing + +- **Hash basis = `licence|type_rating|route`** (recommended; makes this the last URL + migration). Alternative would be `org|type|route`, but that re-churns on renames and + collides on the 903 multi-licence names. +- **Optional 301→canonical multi-match safeguard** (§6) — include or skip? +- **Matching tiebreak degradation** accepted as-is (§5) — confirm OK to defer a + replacement signal. 
+ +--- + +## Source data reference (2026-06-09 feed) + +- New columns: `Sponsor Licence Number, Organisation Name, TierRating, Migrant Classification, Sponsor Status` +- Old columns: `Organisation Name, Town/City, County, Type & Rating, Route` +- New feed: 141,806 rows → ~140,876 unique `(org, type, route)` +- Live DB coverage at planning time: 141,264 rows / 126,420 orgs; 111,320 orgs mapped to a + company number; 121,265 rows (~86%) resolve to a CH locality (or `address_line_2`). From 44e9b728a755eeccef1093aa819fbe09fcfb9819 Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 09:51:26 +0100 Subject: [PATCH 2/9] feat: (fable) transform the schema with the updated hmrc changes --- apps/web/scripts/drain-review-queue.ts | 20 +- apps/web/scripts/ingest-hmrc-csv.ts | 51 +- apps/web/scripts/seed-companies-house.ts | 10 +- apps/web/src/api/companiesHouse.ts | 22 +- apps/web/src/api/hmrc.ts | 36 +- apps/web/src/components/HmrcCard.tsx | 2 +- apps/web/src/components/HmrcResults.tsx | 3 +- apps/web/src/components/McpTools.tsx | 18 +- apps/web/src/lib/phase5/sql.ts | 14 +- apps/web/src/routes/company.$id.$slug.tsx | 47 +- .../migrations/0025_add-sponsor-licence.sql | 3 + .../db/migrations/0026_drop-town-county.sql | 3 + .../db/migrations/meta/0025_snapshot.json | 946 ++++++++++++++++++ .../db/migrations/meta/0026_snapshot.json | 919 +++++++++++++++++ packages/db/migrations/meta/_journal.json | 14 + packages/db/src/schema.ts | 6 +- 16 files changed, 2015 insertions(+), 99 deletions(-) create mode 100644 packages/db/migrations/0025_add-sponsor-licence.sql create mode 100644 packages/db/migrations/0026_drop-town-county.sql create mode 100644 packages/db/migrations/meta/0025_snapshot.json create mode 100644 packages/db/migrations/meta/0026_snapshot.json diff --git a/apps/web/scripts/drain-review-queue.ts b/apps/web/scripts/drain-review-queue.ts index ffc63d5..22472e1 100644 --- a/apps/web/scripts/drain-review-queue.ts +++ 
b/apps/web/scripts/drain-review-queue.ts @@ -143,7 +143,7 @@ type ProfileRow = { previous_company_names: string[] | null; }; -type SponsorRow = { town_city: string | null; route: string }; +type SponsorRow = { route: string }; type StrategyOutcome = | { action: 'swap'; reason: string; s_e?: number; s_p?: number } @@ -180,26 +180,26 @@ async function loadProfiles( return new Map(rows.map((r) => [r.company_number, r])); } -/** Picks the most common (town_city, route) tuple per organisation_name. - * HMRC publishes one row per worker, so an org with mixed routes/locations - * picks the dominant pairing — same heuristic the inline scorer will use. */ +/** Picks the most common route per organisation_name. HMRC publishes one + * row per worker, so an org with mixed routes picks the dominant one — + * same heuristic the inline scorer will use. (The 2026-06 feed dropped + * town/county, so the locality tiebreak is inert.) */ async function loadSponsors( orgNames: string[], ): Promise> { if (orgNames.length === 0) return new Map(); const rows = (await sql` SELECT DISTINCT ON (organisation_name) - organisation_name, town_city, route + organisation_name, route FROM ( - SELECT organisation_name, town_city, route, COUNT(*) AS n + SELECT organisation_name, route, COUNT(*) AS n FROM hmrc_skilled_workers WHERE organisation_name = ANY(${orgNames}) - GROUP BY organisation_name, town_city, route + GROUP BY organisation_name, route ) ranked - ORDER BY organisation_name, n DESC, route, town_city NULLS LAST + ORDER BY organisation_name, n DESC, route `) as { organisation_name: string; - town_city: string | null; route: string; }[]; return new Map(rows.map((r) => [r.organisation_name, r])); @@ -285,7 +285,7 @@ function profileRowToFullProfile(row: ProfileRow): CHFullProfile { } function sponsorRowToScorerSponsor(row: SponsorRow): ScorerSponsor { - return { route: row.route, townCity: row.town_city }; + return { route: row.route, townCity: null }; } // 
───────────────────────────────────────────────────────────────────────────── diff --git a/apps/web/scripts/ingest-hmrc-csv.ts b/apps/web/scripts/ingest-hmrc-csv.ts index 4cbd947..d1d9669 100644 --- a/apps/web/scripts/ingest-hmrc-csv.ts +++ b/apps/web/scripts/ingest-hmrc-csv.ts @@ -5,11 +5,11 @@ import { slugify } from '../src/utils'; import { setGitHubOutput } from './ci-utils'; const EXPECTED_COLUMNS = [ + 'Sponsor Licence Number', 'Organisation Name', - 'Town/City', - 'County', - 'Type & Rating', - 'Route', + 'TierRating', + 'Migrant Classification', + 'Sponsor Status', ] as const; const BATCH_SIZE = 500; @@ -91,8 +91,8 @@ await sql` "hash" varchar(11) NOT NULL UNIQUE, "organisation_name" varchar(255) NOT NULL, "name_slug" varchar(255) NOT NULL, - "town_city" varchar(100), - "county" varchar(100), + "sponsor_licence_number" varchar(20), + "sponsor_status" varchar(64), "type_rating" varchar(100) NOT NULL, "route" varchar(100) NOT NULL ) @@ -110,16 +110,15 @@ function clean(val: string | undefined): string | null { return trimmed; } +/** Mint the stable URL id from the licence-based row identity. Licence is a + * durable per-sponsor key, so hashes survive company renames and future + * ingests — org name is deliberately excluded. */ function computeHash( - orgName: string, - townCity: string | null, - county: string | null, + licence: string, typeRating: string, route: string, ): string { - const input = [orgName, townCity ?? '', county ?? 
'', typeRating, route].join( - '|', - ); + const input = [licence, typeRating, route].join('|'); const bytes = new Bun.CryptoHasher('sha256').update(input).digest(); // Take first 8 bytes (64 bits), encode as base64url, trim to 11 chars return Buffer.from(bytes.slice(0, 8)).toString('base64url').slice(0, 11); @@ -130,8 +129,8 @@ type CleanedRow = { hash: string; orgName: string; nameSlug: string; - townCity: string | null; - county: string | null; + licence: string; + status: string | null; typeRating: string; route: string; }; @@ -140,12 +139,12 @@ const seen = new Set(); const dedupedRows: CleanedRow[] = []; for (const r of records) { + const licence = r['Sponsor Licence Number'].trim(); const orgName = r['Organisation Name'].trim(); - const townCity = clean(r['Town/City']); - const county = clean(r.County); - const typeRating = r['Type & Rating'].trim(); - const route = r.Route.trim(); - const hash = computeHash(orgName, townCity, county, typeRating, route); + const typeRating = r.TierRating.trim(); + const route = r['Migrant Classification'].trim(); + const status = clean(r['Sponsor Status']); + const hash = computeHash(licence, typeRating, route); const nameSlug = slugify(orgName) || hash; if (!seen.has(hash)) { @@ -154,8 +153,8 @@ for (const r of records) { hash, orgName, nameSlug, - townCity, - county, + licence, + status, typeRating, route, }); @@ -181,15 +180,15 @@ for (let i = 0; i < dedupedRows.length; i += BATCH_SIZE) { r.hash, r.orgName, r.nameSlug, - r.townCity, - r.county, + r.licence, + r.status, r.typeRating, r.route, ); } await sql.query( - `INSERT INTO "hmrc_skilled_workers_staging" ("hash", "organisation_name", "name_slug", "town_city", "county", "type_rating", "route") VALUES ${placeholders.join(', ')}`, + `INSERT INTO "hmrc_skilled_workers_staging" ("hash", "organisation_name", "name_slug", "sponsor_licence_number", "sponsor_status", "type_rating", "route") VALUES ${placeholders.join(', ')}`, values, ); @@ -203,7 +202,7 @@ 
console.log('Building indexes on staging table...'); await Promise.all([ sql`CREATE INDEX "stg_idx_hmrc_org_name" ON "hmrc_skilled_workers_staging" USING btree ("organisation_name")`, sql`CREATE INDEX "stg_idx_hmrc_name_slug" ON "hmrc_skilled_workers_staging" USING btree ("name_slug")`, - sql`CREATE INDEX "stg_idx_hmrc_town_city" ON "hmrc_skilled_workers_staging" USING btree ("town_city")`, + sql`CREATE INDEX "stg_idx_hmrc_licence" ON "hmrc_skilled_workers_staging" USING btree ("sponsor_licence_number")`, sql`CREATE INDEX "stg_idx_hmrc_route" ON "hmrc_skilled_workers_staging" USING btree ("route")`, sql`CREATE INDEX "stg_idx_hmrc_org_name_trgm" ON "hmrc_skilled_workers_staging" USING gin ("organisation_name" gin_trgm_ops)`, ]); @@ -217,7 +216,7 @@ await sql.transaction([ sql`ALTER TABLE "hmrc_skilled_workers_staging" RENAME TO "hmrc_skilled_workers"`, sql`ALTER INDEX "stg_idx_hmrc_org_name" RENAME TO "idx_hmrc_org_name"`, sql`ALTER INDEX "stg_idx_hmrc_name_slug" RENAME TO "idx_hmrc_name_slug"`, - sql`ALTER INDEX "stg_idx_hmrc_town_city" RENAME TO "idx_hmrc_town_city"`, + sql`ALTER INDEX "stg_idx_hmrc_licence" RENAME TO "idx_hmrc_licence"`, sql`ALTER INDEX "stg_idx_hmrc_route" RENAME TO "idx_hmrc_route"`, sql`ALTER INDEX "stg_idx_hmrc_org_name_trgm" RENAME TO "idx_hmrc_org_name_trgm"`, sql`ALTER INDEX "hmrc_skilled_workers_staging_hash_key" RENAME TO "hmrc_skilled_workers_hash_unique"`, diff --git a/apps/web/scripts/seed-companies-house.ts b/apps/web/scripts/seed-companies-house.ts index 5604b2d..8baf4fc 100644 --- a/apps/web/scripts/seed-companies-house.ts +++ b/apps/web/scripts/seed-companies-house.ts @@ -60,15 +60,13 @@ async function fetchApi(path: string): Promise { return res.json(); } -// Get only org names that aren't already cached, plus a representative -// town_city/county per org for the locality tiebreaker in the verification -// pipeline. selectDistinctOn(orgName) collapses multi-row sponsors (one per +// Get only org names that aren't already cached. 
The 2026-06 HMRC feed +// dropped town/county, so the resolver's locality tiebreak runs inert. +// selectDistinctOn(orgName) collapses multi-row sponsors (one per // route/rating) to a single representative row. const uncached = await db .selectDistinctOn([hmrcSkilledWorkers.organisationName], { organisationName: hmrcSkilledWorkers.organisationName, - townCity: hmrcSkilledWorkers.townCity, - county: hmrcSkilledWorkers.county, }) .from(hmrcSkilledWorkers) .leftJoin( @@ -137,7 +135,7 @@ for (const row of uncached) { // point users at the wrong CH entity. See docs/hmrc-ch-mapping-fix.md. const result = await resolveOneSponsor( orgName, - { townCity: row.townCity, county: row.county }, + { townCity: null, county: null }, throttledFetchApi, ); diff --git a/apps/web/src/api/companiesHouse.ts b/apps/web/src/api/companiesHouse.ts index 76b06a0..c6a8307 100644 --- a/apps/web/src/api/companiesHouse.ts +++ b/apps/web/src/api/companiesHouse.ts @@ -1,9 +1,4 @@ -import { - companiesHouseProfiles, - hmrcCompanyMapping, - hmrcSkilledWorkers, - sicCodes, -} from '@ss/db'; +import { companiesHouseProfiles, hmrcCompanyMapping, sicCodes } from '@ss/db'; import { queryOptions } from '@tanstack/react-query'; import { createServerFn } from '@tanstack/react-start'; import { setResponseHeader } from '@tanstack/react-start/server'; @@ -219,24 +214,13 @@ const getCompanyProfile = createServerFn() // top-hit logic that was silently mapping new sponsors to wrong CH // entities. See docs/hmrc-ch-mapping-fix.md "Phase 3 — on-demand // resolver hardening". - const [hmrcRow] = await db - .select({ - townCity: hmrcSkilledWorkers.townCity, - county: hmrcSkilledWorkers.county, - }) - .from(hmrcSkilledWorkers) - .where(eq(hmrcSkilledWorkers.organisationName, companyName)) - .limit(1); - console.log( `[Profile] no mapping, resolving via CH for: "${companyName}"`, ); + // HMRC no longer publishes town/county, so the locality tiebreak is inert. 
const result = await resolveOneSponsor( companyName, - { - townCity: hmrcRow?.townCity ?? null, - county: hmrcRow?.county ?? null, - }, + { townCity: null, county: null }, async (path) => { const r = await fetchFromApi(path); return r.ok ? r.data : null; diff --git a/apps/web/src/api/hmrc.ts b/apps/web/src/api/hmrc.ts index 31c9a61..9ea6052 100644 --- a/apps/web/src/api/hmrc.ts +++ b/apps/web/src/api/hmrc.ts @@ -1,7 +1,11 @@ -import { hmrcSkilledWorkers } from '@ss/db'; +import { + companiesHouseProfiles, + hmrcCompanyMapping, + hmrcSkilledWorkers, +} from '@ss/db'; import { queryOptions } from '@tanstack/react-query'; import { createServerFn } from '@tanstack/react-start'; -import { desc, eq, sql } from 'drizzle-orm'; +import { asc, desc, eq, sql } from 'drizzle-orm'; import { db } from '../db.server'; import { LONG_EDGE_CACHE, setRpcCacheControl } from './cache-headers'; @@ -32,18 +36,37 @@ export const searchHmrc = createServerFn() THEN 1.0 + word_similarity(${query}, ${hmrcSkilledWorkers.organisationName}) ELSE word_similarity(${query}, ${hmrcSkilledWorkers.organisationName}) END`; + // Listing location is CH-sourced (HMRC dropped town/county from the feed). + // Pure display joins: PK probes on the returned window only — never in + // WHERE/ORDER BY, so ranking and LIMIT pushdown are unaffected. 
const rows = await db .select({ slugId: hmrcSkilledWorkers.hash, organisationName: hmrcSkilledWorkers.organisationName, nameSlug: hmrcSkilledWorkers.nameSlug, - townCity: hmrcSkilledWorkers.townCity, - county: hmrcSkilledWorkers.county, + sponsorLicenceNumber: hmrcSkilledWorkers.sponsorLicenceNumber, + location: sql< + string | null + >`COALESCE(${companiesHouseProfiles.locality}, ${companiesHouseProfiles.addressLine2})`, typeRating: hmrcSkilledWorkers.typeRating, route: hmrcSkilledWorkers.route, score: scoreExpr, }) .from(hmrcSkilledWorkers) + .leftJoin( + hmrcCompanyMapping, + eq( + hmrcCompanyMapping.organisationName, + hmrcSkilledWorkers.organisationName, + ), + ) + .leftJoin( + companiesHouseProfiles, + eq( + companiesHouseProfiles.companyNumber, + hmrcCompanyMapping.companyNumber, + ), + ) .where( sql`( ${hmrcSkilledWorkers.organisationName} ~* ${wordBoundaryPattern} @@ -73,8 +96,7 @@ const getHmrcBySlugId = createServerFn() .select({ slugId: hmrcSkilledWorkers.hash, organisationName: hmrcSkilledWorkers.organisationName, - townCity: hmrcSkilledWorkers.townCity, - county: hmrcSkilledWorkers.county, + sponsorLicenceNumber: hmrcSkilledWorkers.sponsorLicenceNumber, typeRating: hmrcSkilledWorkers.typeRating, route: hmrcSkilledWorkers.route, }) @@ -124,6 +146,7 @@ export const sponsorCountQueryOptions = queryOptions({ * the given slug. Fallback for stale `/company/$id/$slug` URLs: when the hash * lookup 404s, the loader checks whether the name still maps to a current row * and 301s to its new hash. Capped at 2 since callers only branch on 0 / 1 / many. + * Ordered by hash so the multi-match 301 always picks the same canonical row. * Not wrapped in queryOptions — only the loader calls it, and the redirect * moves the user off this page so there's no second reader for the result. 
*/ @@ -138,6 +161,7 @@ export const getHmrcBySlug = createServerFn() }) .from(hmrcSkilledWorkers) .where(eq(hmrcSkilledWorkers.nameSlug, slug)) + .orderBy(asc(hmrcSkilledWorkers.hash)) .limit(2); return rows; }); diff --git a/apps/web/src/components/HmrcCard.tsx b/apps/web/src/components/HmrcCard.tsx index 6f3dc6f..35a33e8 100644 --- a/apps/web/src/components/HmrcCard.tsx +++ b/apps/web/src/components/HmrcCard.tsx @@ -86,7 +86,7 @@ export default function HmrcCard({

- {[row.townCity, row.county].filter(Boolean).map(titleCase).join(', ')} + {titleCase(row.location)}

{titleCase(row.route)} diff --git a/apps/web/src/components/HmrcResults.tsx b/apps/web/src/components/HmrcResults.tsx index 9fb5ed4..8259438 100644 --- a/apps/web/src/components/HmrcResults.tsx +++ b/apps/web/src/components/HmrcResults.tsx @@ -39,8 +39,7 @@ export default function HmrcResults({ search }: { search: string }) { letterSpacing: -0.4, // heading-card utility }, { - getText: (row) => - [row.townCity, row.county].filter(Boolean).map(titleCase).join(', '), + getText: (row) => titleCase(row.location), font: '14px Geist', // text-sm lineHeight: 20, }, diff --git a/apps/web/src/components/McpTools.tsx b/apps/web/src/components/McpTools.tsx index b537d04..965fd0b 100644 --- a/apps/web/src/components/McpTools.tsx +++ b/apps/web/src/components/McpTools.tsx @@ -24,7 +24,7 @@ export function McpTools() { ctx.registerTool({ name: 'search_uk_visa_sponsors', description: - 'Search for UK companies licensed to sponsor skilled worker visas. Returns company name, location, visa route, and sponsor rating.', + 'Search for UK companies licensed to sponsor skilled worker visas. 
Returns company name, location, visa route, sponsor rating, and sponsor licence number.', inputSchema: { type: 'object', properties: { @@ -77,12 +77,10 @@ export function McpTools() { const formatted = result.rows.map((row) => ({ name: titleCase(row.organisationName), - location: [row.townCity, row.county] - .filter(Boolean) - .map(titleCase) - .join(', '), + location: titleCase(row.location), visaRoute: titleCase(row.route), rating: titleCase(row.typeRating), + sponsorLicenceNumber: row.sponsorLicenceNumber, })); return { @@ -120,7 +118,7 @@ export function McpTools() { ctx.registerTool({ name: 'get_uk_visa_sponsor_details', description: - 'Get detailed information about a specific UK visa sponsor by company name, combining HMRC sponsorship data (location, visa routes, sponsor ratings) with Companies House registration data (company number, status, incorporation date, registered address, industry/SIC descriptions). Use the exact name returned by search_uk_visa_sponsors for best results.', + 'Get detailed information about a specific UK visa sponsor by company name, combining HMRC sponsorship data (location, visa routes, sponsor ratings, sponsor licence numbers) with Companies House registration data (company number, status, incorporation date, registered address, industry/SIC descriptions). Use the exact name returned by search_uk_visa_sponsors for best results.', inputSchema: { type: 'object', properties: { @@ -207,15 +205,13 @@ export function McpTools() { .map((row) => ({ visaRoute: titleCase(row.route), rating: titleCase(row.typeRating), + // Per-row, not top-level: same-name orgs can hold multiple licences + sponsorLicenceNumber: row.sponsorLicenceNumber, })); const details = { name: titleCase(top.organisationName), - location: - [top.townCity, top.county] - .filter(Boolean) - .map(titleCase) - .join(', ') || null, + location: titleCase(top.location) || null, sponsorship, companiesHouse: profile ? 
{ diff --git a/apps/web/src/lib/phase5/sql.ts b/apps/web/src/lib/phase5/sql.ts index f70cabe..79ecd19 100644 --- a/apps/web/src/lib/phase5/sql.ts +++ b/apps/web/src/lib/phase5/sql.ts @@ -116,9 +116,9 @@ function toExistingMapping(row: RawMappingRow): ExistingMapping { } /** Build a `lookupSponsor` matching `SweepDeps['lookupSponsor']`. Pulls - * `town_city` / `county` / `route` from `hmrc_skilled_workers` — the - * locality fields feed the resolver's tiebreak, the route feeds the inline - * scorer's route-type hard gate. */ + * `route` from `hmrc_skilled_workers` for the inline scorer's route-type + * hard gate. Locality is always null since the 2026-06 HMRC feed dropped + * town/county — the resolver's geographic tiebreak is inert. */ export function makeLookupSponsor(sql: Sql): SweepDeps['lookupSponsor'] { return async (organisationName) => { // `ORDER BY id ASC` for deterministic row selection — `hmrc_skilled_workers` @@ -127,20 +127,18 @@ export function makeLookupSponsor(sql: Sql): SweepDeps['lookupSponsor'] { // resolver's tiebreak depends on Postgres's storage order, which can // shift between runs. (CodeRabbit PR #85, comment 2.) const rows = (await sql` - SELECT town_city, county, route + SELECT route FROM hmrc_skilled_workers WHERE organisation_name = ${organisationName} ORDER BY id ASC LIMIT 1 `) as { - town_city: string | null; - county: string | null; route: string | null; }[]; const first = rows[0]; return { - townCity: first?.town_city ?? null, - county: first?.county ?? null, + townCity: null, + county: null, route: first?.route ?? 
null, } satisfies SweepSponsor; }; diff --git a/apps/web/src/routes/company.$id.$slug.tsx b/apps/web/src/routes/company.$id.$slug.tsx index 77a4fc1..93dc4bf 100644 --- a/apps/web/src/routes/company.$id.$slug.tsx +++ b/apps/web/src/routes/company.$id.$slug.tsx @@ -67,10 +67,13 @@ export const Route = createFileRoute('/company/$id/$slug')({ }); } if (matches.length > 1) { + // 301 to the canonical (hash-ordered first) row so old multi-route + // URLs land on a real page and keep link equity. throw redirect({ - to: '/', - search: { search: matches[0].organisationName }, - statusCode: 302, + to: '/company/$id/$slug', + params: { id: matches[0].slugId, slug: params.slug }, + search: (prev) => ({ search: prev.search ?? '' }), + statusCode: 301, }); } throw notFound(); @@ -89,8 +92,6 @@ export const Route = createFileRoute('/company/$id/$slug')({ | { sponsor: { organisationName: string; - townCity?: string | null; - county?: string | null; typeRating: string; route: string; }; @@ -124,7 +125,10 @@ export const Route = createFileRoute('/company/$id/$slug')({ ? titleCase(loaderData.sponsor.organisationName) : ''; const location = loaderData - ? formatLocation(loaderData.sponsor.townCity, loaderData.sponsor.county) + ? formatLocation( + loaderData.profile?.registered_office_address?.locality, + loaderData.profile?.registered_office_address?.region, + ) : ''; const industry = loaderData?.profile?.sicDescriptions ?.map((sic) => sic.description) @@ -241,7 +245,10 @@ function CompanyDetail() { const alsoRegisteredAs = normalizeName(sponsor.organisationName) !== currentKey ? 
hmrcName : null; const displayRoute = titleCase(sponsor.route); - const displayLocation = formatLocation(sponsor.townCity, sponsor.county); + const displayLocation = formatLocation( + profile?.registered_office_address?.locality, + profile?.registered_office_address?.region, + ); const industry = profile?.sicDescriptions ?.map((s) => s.description) .join(', '); @@ -332,6 +339,19 @@ function CompanyDetail() { {titleCase(sponsor.typeRating)}

+ {/* No CH profile → the second card never renders; surface the licence here instead */} + {!profile && sponsor.sponsorLicenceNumber && ( +
+
+ Sponsor Licence No. +
+
+ + {sponsor.sponsorLicenceNumber} + +
+
+ )} @@ -384,6 +404,19 @@ function CompanyDetail() { )} + {sponsor.sponsorLicenceNumber && ( +
+
+ Sponsor Licence No. +
+
+ + {sponsor.sponsorLicenceNumber} + +
+
+ )} + {formatAddress(profile.registered_office_address) && (
diff --git a/packages/db/migrations/0025_add-sponsor-licence.sql b/packages/db/migrations/0025_add-sponsor-licence.sql new file mode 100644 index 0000000..2d0558c --- /dev/null +++ b/packages/db/migrations/0025_add-sponsor-licence.sql @@ -0,0 +1,3 @@ +ALTER TABLE "hmrc_skilled_workers" ADD COLUMN IF NOT EXISTS "sponsor_licence_number" varchar(20);--> statement-breakpoint +ALTER TABLE "hmrc_skilled_workers" ADD COLUMN IF NOT EXISTS "sponsor_status" varchar(64);--> statement-breakpoint +CREATE INDEX IF NOT EXISTS "idx_hmrc_licence" ON "hmrc_skilled_workers" USING btree ("sponsor_licence_number"); \ No newline at end of file diff --git a/packages/db/migrations/0026_drop-town-county.sql b/packages/db/migrations/0026_drop-town-county.sql new file mode 100644 index 0000000..9b58977 --- /dev/null +++ b/packages/db/migrations/0026_drop-town-county.sql @@ -0,0 +1,3 @@ +DROP INDEX IF EXISTS "idx_hmrc_town_city";--> statement-breakpoint +ALTER TABLE "hmrc_skilled_workers" DROP COLUMN IF EXISTS "town_city";--> statement-breakpoint +ALTER TABLE "hmrc_skilled_workers" DROP COLUMN IF EXISTS "county"; \ No newline at end of file diff --git a/packages/db/migrations/meta/0025_snapshot.json b/packages/db/migrations/meta/0025_snapshot.json new file mode 100644 index 0000000..f67a688 --- /dev/null +++ b/packages/db/migrations/meta/0025_snapshot.json @@ -0,0 +1,946 @@ +{ + "id": "f72258ad-cc70-4aa0-9556-2ef319a1aca2", + "prevId": "986c92c9-3ee8-4ca9-bd17-323cc05a3d99", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.ch_stream_state": { + "name": "ch_stream_state", + "schema": "", + "columns": { + "key": { + "name": "key", + "type": "varchar(50)", + "primaryKey": true, + "notNull": true + }, + "last_timepoint": { + "name": "last_timepoint", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + 
"foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profile_cache": { + "name": "companies_house_profile_cache", + "schema": "", + "columns": { + "key": { + "name": "key", + "type": "varchar(50)", + "primaryKey": true, + "notNull": true + }, + "last_trail_id": { + "name": "last_trail_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profile_trails": { + "name": "companies_house_profile_trails", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": true + }, + "column_name": { + "name": "column_name", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "old_value": { + "name": "old_value", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "new_value": { + "name": "new_value", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_ch_trail_company_number": { + "name": "idx_ch_trail_company_number", + "columns": [ + { + "expression": "company_number", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_trail_created_at": { + "name": "idx_ch_trail_created_at", + "columns": [ + { + "expression": "created_at", + 
"isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profiles": { + "name": "companies_house_profiles", + "schema": "", + "columns": { + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": true, + "notNull": true + }, + "company_name": { + "name": "company_name", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "company_status": { + "name": "company_status", + "type": "varchar(50)", + "primaryKey": false, + "notNull": false + }, + "company_type": { + "name": "company_type", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "date_of_creation": { + "name": "date_of_creation", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "address_line_1": { + "name": "address_line_1", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "address_line_2": { + "name": "address_line_2", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "locality": { + "name": "locality", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "region": { + "name": "region", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "postal_code": { + "name": "postal_code", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "country": { + "name": "country", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "sic_codes": { + "name": "sic_codes", + "type": "text[]", + "primaryKey": false, + "notNull": false, + "default": "'{}'::text[]" + }, + "accounts_next_made_up_to": { + "name": "accounts_next_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + 
"accounts_last_made_up_to": { + "name": "accounts_last_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "accounts_overdue": { + "name": "accounts_overdue", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "jurisdiction": { + "name": "jurisdiction", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "has_been_liquidated": { + "name": "has_been_liquidated", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "has_insolvency_history": { + "name": "has_insolvency_history", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "has_charges": { + "name": "has_charges", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "previous_company_names": { + "name": "previous_company_names", + "type": "text[]", + "primaryKey": false, + "notNull": false, + "default": "'{}'::text[]" + }, + "confirmation_statement_last_made_up_to": { + "name": "confirmation_statement_last_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_ch_company_name": { + "name": "idx_ch_company_name", + "columns": [ + { + "expression": "company_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_company_status": { + "name": "idx_ch_company_status", + "columns": [ + { + "expression": "company_status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_company_type": { + "name": "idx_ch_company_type", + "columns": [ + { + "expression": "company_type", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": 
"btree", + "with": {} + }, + "idx_ch_sic_codes": { + "name": "idx_ch_sic_codes", + "columns": [ + { + "expression": "sic_codes", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_ch_jurisdiction": { + "name": "idx_ch_jurisdiction", + "columns": [ + { + "expression": "jurisdiction", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_previous_names": { + "name": "idx_ch_previous_names", + "columns": [ + { + "expression": "previous_company_names", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping": { + "name": "hmrc_company_mapping", + "schema": "", + "columns": { + "organisation_name": { + "name": "organisation_name", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "is_public_body": { + "name": "is_public_body", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "match_method": { + "name": "match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "match_score": { + "name": "match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "query_used": { + "name": "query_used", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "verified_at": { + "name": "verified_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "idx_mapping_method_verified": { + "name": 
"idx_mapping_method_verified", + "columns": [ + { + "expression": "match_method", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "verified_at", + "isExpression": false, + "asc": true, + "nulls": "first" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping_audit": { + "name": "hmrc_company_mapping_audit", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "organisation_name": { + "name": "organisation_name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "old_company_number": { + "name": "old_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "new_company_number": { + "name": "new_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "old_match_method": { + "name": "old_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "new_match_method": { + "name": "new_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "changed_at": { + "name": "changed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "changed_by": { + "name": "changed_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping_review_queue": { + "name": "hmrc_company_mapping_review_queue", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "organisation_name": { + "name": 
"organisation_name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "reason": { + "name": "reason", + "type": "varchar(40)", + "primaryKey": false, + "notNull": true + }, + "existing_company_number": { + "name": "existing_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "existing_match_method": { + "name": "existing_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "existing_match_score": { + "name": "existing_match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "proposed_company_number": { + "name": "proposed_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "proposed_match_method": { + "name": "proposed_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "proposed_match_score": { + "name": "proposed_match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "proposed_query_used": { + "name": "proposed_query_used", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "ch_search_results_top5": { + "name": "ch_search_results_top5", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "detected_by": { + "name": "detected_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "detected_at": { + "name": "detected_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "resolved_at": { + "name": "resolved_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "resolved_by": { + "name": "resolved_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "resolution": { + "name": "resolution", + "type": "varchar(40)", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "idx_review_queue_unresolved": { + "name": "idx_review_queue_unresolved", + "columns": [ + { + "expression": 
"detected_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "\"hmrc_company_mapping_review_queue\".\"resolved_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_review_queue_org": { + "name": "idx_review_queue_org", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "ux_review_queue_unresolved_org_reason": { + "name": "ux_review_queue_unresolved_org_reason", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "reason", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "where": "\"hmrc_company_mapping_review_queue\".\"resolved_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_ingestion_meta": { + "name": "hmrc_ingestion_meta", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "csv_url": { + "name": "csv_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "checksum": { + "name": "checksum", + "type": "varchar(64)", + "primaryKey": false, + "notNull": true + }, + "record_count": { + "name": "record_count", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "ingested_at": { + "name": "ingested_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_skilled_workers": { + 
"name": "hmrc_skilled_workers", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "hash": { + "name": "hash", + "type": "varchar(11)", + "primaryKey": false, + "notNull": true + }, + "organisation_name": { + "name": "organisation_name", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "name_slug": { + "name": "name_slug", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "town_city": { + "name": "town_city", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "county": { + "name": "county", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "sponsor_licence_number": { + "name": "sponsor_licence_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "sponsor_status": { + "name": "sponsor_status", + "type": "varchar(64)", + "primaryKey": false, + "notNull": false + }, + "type_rating": { + "name": "type_rating", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "route": { + "name": "route", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "idx_hmrc_org_name": { + "name": "idx_hmrc_org_name", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_name_slug": { + "name": "idx_hmrc_name_slug", + "columns": [ + { + "expression": "name_slug", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_town_city": { + "name": "idx_hmrc_town_city", + "columns": [ + { + "expression": "town_city", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + 
"idx_hmrc_licence": { + "name": "idx_hmrc_licence", + "columns": [ + { + "expression": "sponsor_licence_number", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_route": { + "name": "idx_hmrc_route", + "columns": [ + { + "expression": "route", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_org_name_trgm": { + "name": "idx_hmrc_org_name_trgm", + "columns": [ + { + "expression": "\"organisation_name\" gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "hmrc_skilled_workers_hash_unique": { + "name": "hmrc_skilled_workers_hash_unique", + "nullsNotDistinct": false, + "columns": ["hash"] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.sic_codes": { + "name": "sic_codes", + "schema": "", + "columns": { + "code": { + "name": "code", + "type": "varchar(10)", + "primaryKey": true, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/packages/db/migrations/meta/0026_snapshot.json b/packages/db/migrations/meta/0026_snapshot.json new file mode 100644 index 0000000..9fff993 --- /dev/null +++ b/packages/db/migrations/meta/0026_snapshot.json @@ -0,0 +1,919 @@ +{ + "id": 
"45b567be-162a-44fc-adf8-d57fcdc67921", + "prevId": "f72258ad-cc70-4aa0-9556-2ef319a1aca2", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.ch_stream_state": { + "name": "ch_stream_state", + "schema": "", + "columns": { + "key": { + "name": "key", + "type": "varchar(50)", + "primaryKey": true, + "notNull": true + }, + "last_timepoint": { + "name": "last_timepoint", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profile_cache": { + "name": "companies_house_profile_cache", + "schema": "", + "columns": { + "key": { + "name": "key", + "type": "varchar(50)", + "primaryKey": true, + "notNull": true + }, + "last_trail_id": { + "name": "last_trail_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profile_trails": { + "name": "companies_house_profile_trails", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": true + }, + "column_name": { + "name": "column_name", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "old_value": { + "name": "old_value", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "new_value": { + "name": "new_value", + 
"type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_ch_trail_company_number": { + "name": "idx_ch_trail_company_number", + "columns": [ + { + "expression": "company_number", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_trail_created_at": { + "name": "idx_ch_trail_created_at", + "columns": [ + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profiles": { + "name": "companies_house_profiles", + "schema": "", + "columns": { + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": true, + "notNull": true + }, + "company_name": { + "name": "company_name", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "company_status": { + "name": "company_status", + "type": "varchar(50)", + "primaryKey": false, + "notNull": false + }, + "company_type": { + "name": "company_type", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "date_of_creation": { + "name": "date_of_creation", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "address_line_1": { + "name": "address_line_1", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "address_line_2": { + "name": "address_line_2", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "locality": { + "name": "locality", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "region": { + 
"name": "region", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "postal_code": { + "name": "postal_code", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "country": { + "name": "country", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "sic_codes": { + "name": "sic_codes", + "type": "text[]", + "primaryKey": false, + "notNull": false, + "default": "'{}'::text[]" + }, + "accounts_next_made_up_to": { + "name": "accounts_next_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "accounts_last_made_up_to": { + "name": "accounts_last_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "accounts_overdue": { + "name": "accounts_overdue", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "jurisdiction": { + "name": "jurisdiction", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "has_been_liquidated": { + "name": "has_been_liquidated", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "has_insolvency_history": { + "name": "has_insolvency_history", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "has_charges": { + "name": "has_charges", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "previous_company_names": { + "name": "previous_company_names", + "type": "text[]", + "primaryKey": false, + "notNull": false, + "default": "'{}'::text[]" + }, + "confirmation_statement_last_made_up_to": { + "name": "confirmation_statement_last_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_ch_company_name": { + "name": "idx_ch_company_name", + "columns": [ + { + "expression": "company_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + 
"isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_company_status": { + "name": "idx_ch_company_status", + "columns": [ + { + "expression": "company_status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_company_type": { + "name": "idx_ch_company_type", + "columns": [ + { + "expression": "company_type", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_sic_codes": { + "name": "idx_ch_sic_codes", + "columns": [ + { + "expression": "sic_codes", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_ch_jurisdiction": { + "name": "idx_ch_jurisdiction", + "columns": [ + { + "expression": "jurisdiction", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_previous_names": { + "name": "idx_ch_previous_names", + "columns": [ + { + "expression": "previous_company_names", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping": { + "name": "hmrc_company_mapping", + "schema": "", + "columns": { + "organisation_name": { + "name": "organisation_name", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "is_public_body": { + "name": "is_public_body", + "type": "boolean", + "primaryKey": false, + 
"notNull": true, + "default": false + }, + "match_method": { + "name": "match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "match_score": { + "name": "match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "query_used": { + "name": "query_used", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "verified_at": { + "name": "verified_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "idx_mapping_method_verified": { + "name": "idx_mapping_method_verified", + "columns": [ + { + "expression": "match_method", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "verified_at", + "isExpression": false, + "asc": true, + "nulls": "first" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping_audit": { + "name": "hmrc_company_mapping_audit", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "organisation_name": { + "name": "organisation_name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "old_company_number": { + "name": "old_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "new_company_number": { + "name": "new_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "old_match_method": { + "name": "old_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "new_match_method": { + "name": "new_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "changed_at": { + "name": "changed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + 
"default": "now()" + }, + "changed_by": { + "name": "changed_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping_review_queue": { + "name": "hmrc_company_mapping_review_queue", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "organisation_name": { + "name": "organisation_name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "reason": { + "name": "reason", + "type": "varchar(40)", + "primaryKey": false, + "notNull": true + }, + "existing_company_number": { + "name": "existing_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "existing_match_method": { + "name": "existing_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "existing_match_score": { + "name": "existing_match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "proposed_company_number": { + "name": "proposed_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "proposed_match_method": { + "name": "proposed_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "proposed_match_score": { + "name": "proposed_match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "proposed_query_used": { + "name": "proposed_query_used", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "ch_search_results_top5": { + "name": "ch_search_results_top5", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "detected_by": { + "name": "detected_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "detected_at": { + "name": "detected_at", + 
"type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "resolved_at": { + "name": "resolved_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "resolved_by": { + "name": "resolved_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "resolution": { + "name": "resolution", + "type": "varchar(40)", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "idx_review_queue_unresolved": { + "name": "idx_review_queue_unresolved", + "columns": [ + { + "expression": "detected_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "\"hmrc_company_mapping_review_queue\".\"resolved_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_review_queue_org": { + "name": "idx_review_queue_org", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "ux_review_queue_unresolved_org_reason": { + "name": "ux_review_queue_unresolved_org_reason", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "reason", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "where": "\"hmrc_company_mapping_review_queue\".\"resolved_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_ingestion_meta": { + "name": "hmrc_ingestion_meta", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "csv_url": { + "name": "csv_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + 
"checksum": { + "name": "checksum", + "type": "varchar(64)", + "primaryKey": false, + "notNull": true + }, + "record_count": { + "name": "record_count", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "ingested_at": { + "name": "ingested_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_skilled_workers": { + "name": "hmrc_skilled_workers", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "hash": { + "name": "hash", + "type": "varchar(11)", + "primaryKey": false, + "notNull": true + }, + "organisation_name": { + "name": "organisation_name", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "name_slug": { + "name": "name_slug", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "sponsor_licence_number": { + "name": "sponsor_licence_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "sponsor_status": { + "name": "sponsor_status", + "type": "varchar(64)", + "primaryKey": false, + "notNull": false + }, + "type_rating": { + "name": "type_rating", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "route": { + "name": "route", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "idx_hmrc_org_name": { + "name": "idx_hmrc_org_name", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_name_slug": { + "name": "idx_hmrc_name_slug", + "columns": [ + { + "expression": "name_slug", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + 
"isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_licence": { + "name": "idx_hmrc_licence", + "columns": [ + { + "expression": "sponsor_licence_number", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_route": { + "name": "idx_hmrc_route", + "columns": [ + { + "expression": "route", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_org_name_trgm": { + "name": "idx_hmrc_org_name_trgm", + "columns": [ + { + "expression": "\"organisation_name\" gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "hmrc_skilled_workers_hash_unique": { + "name": "hmrc_skilled_workers_hash_unique", + "nullsNotDistinct": false, + "columns": ["hash"] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.sic_codes": { + "name": "sic_codes", + "schema": "", + "columns": { + "code": { + "name": "code", + "type": "varchar(10)", + "primaryKey": true, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/packages/db/migrations/meta/_journal.json b/packages/db/migrations/meta/_journal.json index 8920d00..fd4d94c 100644 --- a/packages/db/migrations/meta/_journal.json +++ 
b/packages/db/migrations/meta/_journal.json @@ -176,6 +176,20 @@ "when": 1777709462971, "tag": "0024_sharp_next_avengers", "breakpoints": true + }, + { + "idx": 25, + "version": "7", + "when": 1781078983389, + "tag": "0025_add-sponsor-licence", + "breakpoints": true + }, + { + "idx": 26, + "version": "7", + "when": 1781079003844, + "tag": "0026_drop-town-county", + "breakpoints": true } ] } diff --git a/packages/db/src/schema.ts b/packages/db/src/schema.ts index f20f648..a8ed893 100644 --- a/packages/db/src/schema.ts +++ b/packages/db/src/schema.ts @@ -21,15 +21,15 @@ export const hmrcSkilledWorkers = pgTable( hash: varchar('hash', { length: 11 }).notNull().unique(), organisationName: varchar('organisation_name', { length: 255 }).notNull(), nameSlug: varchar('name_slug', { length: 255 }).notNull(), - townCity: varchar('town_city', { length: 100 }), - county: varchar('county', { length: 100 }), + sponsorLicenceNumber: varchar('sponsor_licence_number', { length: 20 }), + sponsorStatus: varchar('sponsor_status', { length: 64 }), typeRating: varchar('type_rating', { length: 100 }).notNull(), route: varchar('route', { length: 100 }).notNull(), }, (table) => [ index('idx_hmrc_org_name').on(table.organisationName), index('idx_hmrc_name_slug').on(table.nameSlug), - index('idx_hmrc_town_city').on(table.townCity), + index('idx_hmrc_licence').on(table.sponsorLicenceNumber), index('idx_hmrc_route').on(table.route), index('idx_hmrc_org_name_trgm').using( 'gin', From 3a79848cdb62037808cb157e99fe1eb71cd28664 Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 11:23:46 +0100 Subject: [PATCH 3/9] feat: adding region client side formatting --- apps/web/src/api/hmrc.ts | 3 ++- apps/web/src/components/HmrcCard.tsx | 4 ++-- apps/web/src/components/HmrcResults.tsx | 4 ++-- apps/web/src/components/McpTools.tsx | 6 +++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/apps/web/src/api/hmrc.ts b/apps/web/src/api/hmrc.ts index 9ea6052..44d91c3 100644 --- 
a/apps/web/src/api/hmrc.ts +++ b/apps/web/src/api/hmrc.ts @@ -45,9 +45,10 @@ export const searchHmrc = createServerFn() organisationName: hmrcSkilledWorkers.organisationName, nameSlug: hmrcSkilledWorkers.nameSlug, sponsorLicenceNumber: hmrcSkilledWorkers.sponsorLicenceNumber, - location: sql< + locality: sql< string | null >`COALESCE(${companiesHouseProfiles.locality}, ${companiesHouseProfiles.addressLine2})`, + region: companiesHouseProfiles.region, typeRating: hmrcSkilledWorkers.typeRating, route: hmrcSkilledWorkers.route, score: scoreExpr, diff --git a/apps/web/src/components/HmrcCard.tsx b/apps/web/src/components/HmrcCard.tsx index 35a33e8..6369f8c 100644 --- a/apps/web/src/components/HmrcCard.tsx +++ b/apps/web/src/components/HmrcCard.tsx @@ -1,7 +1,7 @@ import { Link } from '@tanstack/react-router'; import type { HmrcRow } from '../api/hmrc'; -import { titleCase } from '../utils'; +import { formatLocation, titleCase } from '../utils'; import RatingIcon from './RatingIcon'; import UnionJackLens from './UnionJackLens'; @@ -86,7 +86,7 @@ export default function HmrcCard({

- {titleCase(row.location)} + {formatLocation(row.locality, row.region)}

{titleCase(row.route)} diff --git a/apps/web/src/components/HmrcResults.tsx b/apps/web/src/components/HmrcResults.tsx index 8259438..1eec8c1 100644 --- a/apps/web/src/components/HmrcResults.tsx +++ b/apps/web/src/components/HmrcResults.tsx @@ -6,7 +6,7 @@ import { useVirtualTextLayout } from 'virtual-text-layout'; import { useHmrcSearch } from '../hooks/useHmrcSearch'; import { useResultsKeyboardNav } from '../hooks/useResultsKeyboardNav'; -import { titleCase } from '../utils'; +import { formatLocation, titleCase } from '../utils'; import HmrcCard from './HmrcCard'; import SkeletonCards from './SkeletonCards'; @@ -39,7 +39,7 @@ export default function HmrcResults({ search }: { search: string }) { letterSpacing: -0.4, // heading-card utility }, { - getText: (row) => titleCase(row.location), + getText: (row) => formatLocation(row.locality, row.region), font: '14px Geist', // text-sm lineHeight: 20, }, diff --git a/apps/web/src/components/McpTools.tsx b/apps/web/src/components/McpTools.tsx index 965fd0b..4d7682f 100644 --- a/apps/web/src/components/McpTools.tsx +++ b/apps/web/src/components/McpTools.tsx @@ -4,7 +4,7 @@ import '@mcp-b/global'; import { companyProfileQueryOptions } from '../api/companiesHouse'; import { searchHmrcQueryOptions } from '../api/hmrc'; -import { titleCase } from '../utils'; +import { formatLocation, titleCase } from '../utils'; /** * Registers browser-side MCP tools with `navigator.modelContext` (via @@ -77,7 +77,7 @@ export function McpTools() { const formatted = result.rows.map((row) => ({ name: titleCase(row.organisationName), - location: titleCase(row.location), + location: formatLocation(row.locality, row.region), visaRoute: titleCase(row.route), rating: titleCase(row.typeRating), sponsorLicenceNumber: row.sponsorLicenceNumber, @@ -211,7 +211,7 @@ export function McpTools() { const details = { name: titleCase(top.organisationName), - location: titleCase(top.location) || null, + location: formatLocation(top.locality, top.region) || null, 
sponsorship, companiesHouse: profile ? { From c3d1558da3adc45c6f9d7a432b5ba696fe2131b3 Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 14:37:28 +0100 Subject: [PATCH 4/9] fix: the bug around same tier ordering across a paging boundary --- apps/web/src/api/hmrc.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/web/src/api/hmrc.ts b/apps/web/src/api/hmrc.ts index 44d91c3..35ae37f 100644 --- a/apps/web/src/api/hmrc.ts +++ b/apps/web/src/api/hmrc.ts @@ -75,7 +75,13 @@ export const searchHmrc = createServerFn() OR similarity(${query}, ${hmrcSkilledWorkers.organisationName}) > 0.5 )`, ) - .orderBy(desc(scoreExpr), sql`${hmrcSkilledWorkers.organisationName} ASC`) + .orderBy( + desc(scoreExpr), + sql`${hmrcSkilledWorkers.organisationName} ASC`, + // Unique tiebreak: multi-row orgs tie on score AND name, and unstable + // tie order across page fetches duplicates/drops rows at OFFSET boundaries + asc(hmrcSkilledWorkers.hash), + ) .limit(PAGE_SIZE + 1) .offset(offset); From de52499362392fe7daade20c39f6012da793ca1c Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 15:26:07 +0100 Subject: [PATCH 5/9] feat: dedup listing by sponsor licence number --- apps/web/scripts/generate-sitemap.ts | 17 +++- apps/web/src/api/hmrc.ts | 106 +++++++++++++++------- apps/web/src/components/McpTools.tsx | 8 +- apps/web/src/routes/company.$id.$slug.tsx | 28 ++++-- 4 files changed, 115 insertions(+), 44 deletions(-) diff --git a/apps/web/scripts/generate-sitemap.ts b/apps/web/scripts/generate-sitemap.ts index b96f4e3..2dde220 100644 --- a/apps/web/scripts/generate-sitemap.ts +++ b/apps/web/scripts/generate-sitemap.ts @@ -6,7 +6,7 @@ import { hmrcSkilledWorkers, } from '@ss/db'; import { Glob } from 'bun'; -import { eq } from 'drizzle-orm'; +import { eq, sql } from 'drizzle-orm'; import { db } from '../src/db.server'; @@ -30,9 +30,13 @@ async function generate() { } // Single pass over all rows; LEFT JOIN keeps HMRC entries 
without a CH match. + // One URL per (org, rating, route) group: multi-licence siblings 301 to the + // canonical min(hash) page, so only that hash belongs in the sitemap. + // updatedAt is constant per org (mapping PK is organisation_name), so adding + // it to GROUP BY never splits a group — it just keeps drizzle's Date mapping. const allRows = await db .select({ - hash: hmrcSkilledWorkers.hash, + hash: sql`min(${hmrcSkilledWorkers.hash})`, nameSlug: hmrcSkilledWorkers.nameSlug, updatedAt: companiesHouseProfiles.updatedAt, }) @@ -51,7 +55,14 @@ async function generate() { hmrcCompanyMapping.companyNumber, ), ) - .orderBy(hmrcSkilledWorkers.hash); + .groupBy( + hmrcSkilledWorkers.organisationName, + hmrcSkilledWorkers.nameSlug, + hmrcSkilledWorkers.typeRating, + hmrcSkilledWorkers.route, + companiesHouseProfiles.updatedAt, + ) + .orderBy(sql`min(${hmrcSkilledWorkers.hash})`); const entries = new Map( allRows.map((row) => [ diff --git a/apps/web/src/api/hmrc.ts b/apps/web/src/api/hmrc.ts index 35ae37f..234bae6 100644 --- a/apps/web/src/api/hmrc.ts +++ b/apps/web/src/api/hmrc.ts @@ -36,38 +36,27 @@ export const searchHmrc = createServerFn() THEN 1.0 + word_similarity(${query}, ${hmrcSkilledWorkers.organisationName}) ELSE word_similarity(${query}, ${hmrcSkilledWorkers.organisationName}) END`; - // Listing location is CH-sourced (HMRC dropped town/county from the feed). - // Pure display joins: PK probes on the returned window only — never in - // WHERE/ORDER BY, so ranking and LIMIT pushdown are unaffected. - const rows = await db + // One row per (org, rating, route): the same org can hold several licences + // with otherwise identical feed data (888 groups in the 2026-06 feed), and + // the cards show nothing that distinguishes them. min(hash) is the + // canonical slugId — the detail loader 301s the siblings to it. 
+ // Grouping happens in the subquery, BEFORE the CH joins, so the joins stay + // PK probes on the returned window only and ranking/LIMIT are unaffected. + const grouped = db .select({ - slugId: hmrcSkilledWorkers.hash, + slugId: sql`min(${hmrcSkilledWorkers.hash})`.as('slug_id'), organisationName: hmrcSkilledWorkers.organisationName, nameSlug: hmrcSkilledWorkers.nameSlug, - sponsorLicenceNumber: hmrcSkilledWorkers.sponsorLicenceNumber, - locality: sql< - string | null - >`COALESCE(${companiesHouseProfiles.locality}, ${companiesHouseProfiles.addressLine2})`, - region: companiesHouseProfiles.region, + sponsorLicenceNumbers: sql< + string[] + >`coalesce(array_agg(distinct ${hmrcSkilledWorkers.sponsorLicenceNumber}) filter (where ${hmrcSkilledWorkers.sponsorLicenceNumber} is not null), '{}')`.as( + 'sponsor_licence_numbers', + ), typeRating: hmrcSkilledWorkers.typeRating, route: hmrcSkilledWorkers.route, - score: scoreExpr, + score: scoreExpr.as('score'), }) .from(hmrcSkilledWorkers) - .leftJoin( - hmrcCompanyMapping, - eq( - hmrcCompanyMapping.organisationName, - hmrcSkilledWorkers.organisationName, - ), - ) - .leftJoin( - companiesHouseProfiles, - eq( - companiesHouseProfiles.companyNumber, - hmrcCompanyMapping.companyNumber, - ), - ) .where( sql`( ${hmrcSkilledWorkers.organisationName} ~* ${wordBoundaryPattern} @@ -75,15 +64,56 @@ export const searchHmrc = createServerFn() OR similarity(${query}, ${hmrcSkilledWorkers.organisationName}) > 0.5 )`, ) + .groupBy( + hmrcSkilledWorkers.organisationName, + hmrcSkilledWorkers.nameSlug, + hmrcSkilledWorkers.typeRating, + hmrcSkilledWorkers.route, + ) .orderBy( desc(scoreExpr), sql`${hmrcSkilledWorkers.organisationName} ASC`, - // Unique tiebreak: multi-row orgs tie on score AND name, and unstable - // tie order across page fetches duplicates/drops rows at OFFSET boundaries - asc(hmrcSkilledWorkers.hash), + // Unique tiebreak: groups tie on score AND name, and unstable tie + // order across page fetches duplicates/drops 
rows at OFFSET boundaries + sql`min(${hmrcSkilledWorkers.hash}) ASC`, ) .limit(PAGE_SIZE + 1) - .offset(offset); + .offset(offset) + .as('g'); + + // Listing location is CH-sourced (HMRC dropped town/county from the feed). + const rows = await db + .select({ + slugId: grouped.slugId, + organisationName: grouped.organisationName, + nameSlug: grouped.nameSlug, + sponsorLicenceNumbers: grouped.sponsorLicenceNumbers, + locality: sql< + string | null + >`COALESCE(${companiesHouseProfiles.locality}, ${companiesHouseProfiles.addressLine2})`, + region: companiesHouseProfiles.region, + typeRating: grouped.typeRating, + route: grouped.route, + score: grouped.score, + }) + .from(grouped) + .leftJoin( + hmrcCompanyMapping, + eq(hmrcCompanyMapping.organisationName, grouped.organisationName), + ) + .leftJoin( + companiesHouseProfiles, + eq( + companiesHouseProfiles.companyNumber, + hmrcCompanyMapping.companyNumber, + ), + ) + // Joins don't guarantee order preservation; re-sort the ≤51-row window + .orderBy( + desc(grouped.score), + asc(grouped.organisationName), + asc(grouped.slugId), + ); const hasMore = rows.length > PAGE_SIZE; return { @@ -94,16 +124,30 @@ export const searchHmrc = createServerFn() /** * Server fn returning a single `hmrc_skilled_workers` row keyed by its stable - * `hash` slug id. Returns `null` when no matching row exists. + * `hash` slug id. Returns `null` when no matching row exists. Also returns the + * group canonical: multi-licence orgs have one row per licence with identical + * (org, rating, route) — search lists only min(hash), and the loader 301s the + * sibling hashes to `canonicalSlugId`. `sponsorLicenceNumbers` carries every + * licence in the group so the canonical page shows all of them. 
*/ const getHmrcBySlugId = createServerFn() .inputValidator((input: unknown) => input as { slugId: string }) .handler(async ({ data: { slugId } }) => { + const groupFilter = sql` + h2.organisation_name = ${hmrcSkilledWorkers.organisationName} + AND h2.type_rating = ${hmrcSkilledWorkers.typeRating} + AND h2.route = ${hmrcSkilledWorkers.route}`; const [row] = await db .select({ slugId: hmrcSkilledWorkers.hash, + canonicalSlugId: sql`( + SELECT min(h2.hash) FROM hmrc_skilled_workers h2 WHERE ${groupFilter} + )`, organisationName: hmrcSkilledWorkers.organisationName, - sponsorLicenceNumber: hmrcSkilledWorkers.sponsorLicenceNumber, + sponsorLicenceNumbers: sql`( + SELECT coalesce(array_agg(distinct h2.sponsor_licence_number) filter (where h2.sponsor_licence_number is not null), '{}') + FROM hmrc_skilled_workers h2 WHERE ${groupFilter} + )`, typeRating: hmrcSkilledWorkers.typeRating, route: hmrcSkilledWorkers.route, }) diff --git a/apps/web/src/components/McpTools.tsx b/apps/web/src/components/McpTools.tsx index 4d7682f..e24969f 100644 --- a/apps/web/src/components/McpTools.tsx +++ b/apps/web/src/components/McpTools.tsx @@ -24,7 +24,7 @@ export function McpTools() { ctx.registerTool({ name: 'search_uk_visa_sponsors', description: - 'Search for UK companies licensed to sponsor skilled worker visas. Returns company name, location, visa route, sponsor rating, and sponsor licence number.', + 'Search for UK companies licensed to sponsor skilled worker visas. 
Returns company name, location, visa route, sponsor rating, and sponsor licence numbers.', inputSchema: { type: 'object', properties: { @@ -80,7 +80,7 @@ export function McpTools() { location: formatLocation(row.locality, row.region), visaRoute: titleCase(row.route), rating: titleCase(row.typeRating), - sponsorLicenceNumber: row.sponsorLicenceNumber, + sponsorLicenceNumbers: row.sponsorLicenceNumbers, })); return { @@ -205,8 +205,8 @@ export function McpTools() { .map((row) => ({ visaRoute: titleCase(row.route), rating: titleCase(row.typeRating), - // Per-row, not top-level: same-name orgs can hold multiple licences - sponsorLicenceNumber: row.sponsorLicenceNumber, + // Per-row, not top-level: licences vary by (rating, route) group + sponsorLicenceNumbers: row.sponsorLicenceNumbers, })); const details = { diff --git a/apps/web/src/routes/company.$id.$slug.tsx b/apps/web/src/routes/company.$id.$slug.tsx index 93dc4bf..5d1a501 100644 --- a/apps/web/src/routes/company.$id.$slug.tsx +++ b/apps/web/src/routes/company.$id.$slug.tsx @@ -56,6 +56,18 @@ export const Route = createFileRoute('/company/$id/$slug')({ hmrcBySlugIdQueryOptions(params.id), ); + if (sponsor && sponsor.canonicalSlugId !== params.id) { + // Sibling licence row of a multi-licence org — same name/rating/route, + // near-identical page. Search lists only the canonical (min-hash) row; + // 301 the rest onto it so duplicates don't accumulate in the index. + throw redirect({ + to: '/company/$id/$slug', + params: { id: sponsor.canonicalSlugId, slug: params.slug }, + search: (prev) => ({ search: prev.search ?? '' }), + statusCode: 301, + }); + } + if (!sponsor) { const matches = await getHmrcBySlug({ data: { slug: params.slug } }); if (matches.length === 1) { @@ -340,14 +352,15 @@ function CompanyDetail() {

{/* No CH profile → the second card never renders; surface the licence here instead */} - {!profile && sponsor.sponsorLicenceNumber && ( + {!profile && sponsor.sponsorLicenceNumbers.length > 0 && (
- Sponsor Licence No. + Sponsor Licence{' '} + {sponsor.sponsorLicenceNumbers.length > 1 ? 'Nos.' : 'No.'}
- {sponsor.sponsorLicenceNumber} + {sponsor.sponsorLicenceNumbers.join(', ')}
@@ -404,14 +417,17 @@ function CompanyDetail() { )} - {sponsor.sponsorLicenceNumber && ( + {sponsor.sponsorLicenceNumbers.length > 0 && (
- Sponsor Licence No. + Sponsor Licence{' '} + {sponsor.sponsorLicenceNumbers.length > 1 + ? 'Nos.' + : 'No.'}
- {sponsor.sponsorLicenceNumber} + {sponsor.sponsorLicenceNumbers.join(', ')}
From ebbcd55e2884b6c33640350ec280eb766dc35f79 Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 16:29:01 +0100 Subject: [PATCH 6/9] fix: stability over caching and old url redirects (301) --- apps/web/src/api/cache-headers.ts | 18 ++++ apps/web/src/api/hmrc.ts | 26 +++-- apps/web/src/routes/company.$id.$slug.tsx | 124 +++++++++++++++------- 3 files changed, 122 insertions(+), 46 deletions(-) diff --git a/apps/web/src/api/cache-headers.ts b/apps/web/src/api/cache-headers.ts index 9d9bc86..b25b503 100644 --- a/apps/web/src/api/cache-headers.ts +++ b/apps/web/src/api/cache-headers.ts @@ -9,6 +9,13 @@ import { getRequestUrl, setResponseHeader } from '@tanstack/start-server-core'; export const LONG_EDGE_CACHE = 's-maxage=2592000, stale-while-revalidate=604800'; +/** + * Short edge TTL for negative lookups (row not found). A missing hash can + * come back to life (licence reinstated by a later ingest), so a long-cached + * null would strand the URL — 5 minutes absorbs crawler storms without that. + */ +export const SHORT_EDGE_CACHE = 's-maxage=300, stale-while-revalidate=60'; + /** * Attach a `Cache-Control` header to the current response only when the * request is a server-fn RPC invocation (`/_serverFn/…`). Prevents the @@ -23,3 +30,14 @@ export const setRpcCacheControl = createIsomorphicFn() } }) .client(() => {}); + +/** + * Attach a `Cache-Control` header to the current SSR document response. + * Complement of `setRpcCacheControl` for route loaders that need to override + * a routeRule default on specific outcomes (e.g. short-cache a 404 document). 
+ */ +export const setSsrCacheControl = createIsomorphicFn() + .server((value: string) => { + setResponseHeader('Cache-Control', value); + }) + .client(() => {}); diff --git a/apps/web/src/api/hmrc.ts b/apps/web/src/api/hmrc.ts index 234bae6..2d91334 100644 --- a/apps/web/src/api/hmrc.ts +++ b/apps/web/src/api/hmrc.ts @@ -8,7 +8,11 @@ import { createServerFn } from '@tanstack/react-start'; import { asc, desc, eq, sql } from 'drizzle-orm'; import { db } from '../db.server'; -import { LONG_EDGE_CACHE, setRpcCacheControl } from './cache-headers'; +import { + LONG_EDGE_CACHE, + SHORT_EDGE_CACHE, + setRpcCacheControl, +} from './cache-headers'; const PAGE_SIZE = 50; @@ -144,6 +148,9 @@ const getHmrcBySlugId = createServerFn() SELECT min(h2.hash) FROM hmrc_skilled_workers h2 WHERE ${groupFilter} )`, organisationName: hmrcSkilledWorkers.organisationName, + // The loader 301s slug mismatches onto this (renames leave stale-slug + // URLs serving 200 with a self-referential canonical otherwise) + nameSlug: hmrcSkilledWorkers.nameSlug, sponsorLicenceNumbers: sql`( SELECT coalesce(array_agg(distinct h2.sponsor_licence_number) filter (where h2.sponsor_licence_number is not null), '{}') FROM hmrc_skilled_workers h2 WHERE ${groupFilter} @@ -155,9 +162,11 @@ const getHmrcBySlugId = createServerFn() .where(eq(hmrcSkilledWorkers.hash, slugId)) .limit(1); - // slugId is a content hash of the row — (slugId → data) is immutable, so - // cache aggressively without tag-based invalidation - setRpcCacheControl(LONG_EDGE_CACHE); + // Found rows cache long: the hash is licence-based, so data behind it only + // changes via ingest, and the post-ingest sitemap deploy purges the edge. + // Nulls cache short — a licence can be reinstated under the same hash, and + // a 30-day-cached null would 301-loop the revived URL against itself. + setRpcCacheControl(row ? LONG_EDGE_CACHE : SHORT_EDGE_CACHE); return row ?? 
null; }); @@ -195,8 +204,11 @@ export const sponsorCountQueryOptions = queryOptions({ /** * Server fn returning `hmrc_skilled_workers` rows whose `name_slug` matches * the given slug. Fallback for stale `/company/$id/$slug` URLs: when the hash - * lookup 404s, the loader checks whether the name still maps to a current row - * and 301s to its new hash. Capped at 2 since callers only branch on 0 / 1 / many. + * lookup 404s, the loader 301s to the slug's first row — and also scans the + * matches for the requested hash itself, which detects a stale cached null + * (licence reinstated under the same hash). Capped at 10: a slug maps to at + * most a handful of (rating, route) groups, and the containment check needs + * to see them all, not just the first. * Ordered by hash so the multi-match 301 always picks the same canonical row. * Not wrapped in queryOptions — only the loader calls it, and the redirect * moves the user off this page so there's no second reader for the result. @@ -213,7 +225,7 @@ export const getHmrcBySlug = createServerFn() .from(hmrcSkilledWorkers) .where(eq(hmrcSkilledWorkers.nameSlug, slug)) .orderBy(asc(hmrcSkilledWorkers.hash)) - .limit(2); + .limit(10); return rows; }); diff --git a/apps/web/src/routes/company.$id.$slug.tsx b/apps/web/src/routes/company.$id.$slug.tsx index 5d1a501..afde375 100644 --- a/apps/web/src/routes/company.$id.$slug.tsx +++ b/apps/web/src/routes/company.$id.$slug.tsx @@ -9,6 +9,10 @@ import { import { ExternalLink, MapPin } from 'lucide-react'; import { useEffect, useRef, useState } from 'react'; +import { + SHORT_EDGE_CACHE, + setSsrCacheControl, +} from '../api/cache-headers'; import { companyProfileQueryOptions } from '../api/companiesHouse'; import { flagStateQueryOptions } from '../api/flags'; import { getHmrcBySlug, hmrcBySlugIdQueryOptions } from '../api/hmrc'; @@ -34,6 +38,24 @@ import { buildCompanyJsonLd, ratingPhrase } from '../utils/jsonld'; // Grammatical "A, B and C" joiner for the former-names sentence in 
the summary. const listFormatter = new Intl.ListFormat('en-GB', { type: 'conjunction' }); +/** + * Display location for a CH registered-office address. Mirrors searchHmrc's + * COALESCE(locality, address_line_2) + region so the detail page agrees with + * the listing card on whether a sponsor has a location. + */ +function registeredLocation( + address?: { + address_line_2?: string; + locality?: string; + region?: string; + } | null, +) { + return formatLocation( + address?.locality ?? address?.address_line_2, + address?.region, + ); +} + // Canonical key for company-name equality (case, punctuation, LTD/LIMITED). function normalizeName(name: string): string { return name @@ -52,35 +74,28 @@ export const Route = createFileRoute('/company/$id/$slug')({ middlewares: [stripSearchParams({ search: '' })], }, loader: async ({ params, context: { queryClient } }) => { - const sponsor = await queryClient.ensureQueryData( + let sponsor = await queryClient.ensureQueryData( hmrcBySlugIdQueryOptions(params.id), ); - if (sponsor && sponsor.canonicalSlugId !== params.id) { - // Sibling licence row of a multi-licence org — same name/rating/route, - // near-identical page. Search lists only the canonical (min-hash) row; - // 301 the rest onto it so duplicates don't accumulate in the index. - throw redirect({ - to: '/company/$id/$slug', - params: { id: sponsor.canonicalSlugId, slug: params.slug }, - search: (prev) => ({ search: prev.search ?? '' }), - statusCode: 301, - }); - } - if (!sponsor) { const matches = await getHmrcBySlug({ data: { slug: params.slug } }); - if (matches.length === 1) { - throw redirect({ - to: '/company/$id/$slug', - params: { id: matches[0].slugId, slug: params.slug }, - search: (prev) => ({ search: prev.search ?? '' }), - statusCode: 301, + if (matches.some((m) => m.slugId === params.id)) { + // The (uncached) slug lookup sees this very hash, so the cached null + // is stale — licence reinstated under the same hash by a later + // ingest. 
Drop the entry and refetch: invalidateQueries never + // refetches an observer-less query, and ensureQueryData would just + // return the cached null again. + queryClient.removeQueries({ + queryKey: hmrcBySlugIdQueryOptions(params.id).queryKey, }); - } - if (matches.length > 1) { - // 301 to the canonical (hash-ordered first) row so old multi-route - // URLs land on a real page and keep link equity. + sponsor = await queryClient.ensureQueryData( + hmrcBySlugIdQueryOptions(params.id), + ); + } else if (matches.length > 0) { + // 301 to the slug's canonical (hash-ordered first) row so stale + // URLs land on a real page and keep link equity. Safe on client + // navs too: getHmrcBySlug reads the DB uncached. throw redirect({ to: '/company/$id/$slug', params: { id: matches[0].slugId, slug: params.slug }, @@ -88,7 +103,41 @@ export const Route = createFileRoute('/company/$id/$slug')({ statusCode: 301, }); } - throw notFound(); + if (!sponsor) { + // Best effort: keep the 404 document short-lived at the edge (a + // reinstated licence can revive the URL). The static /company/** + // routeRule header may still win at the edge — verify on deploy; + // the post-ingest deploy purge bounds the damage either way. + setSsrCacheControl(SHORT_EDGE_CACHE); + throw notFound(); + } + } + + // Canonicalize on SSR only. Server loaders read the DB in-process, so the + // redirect decision is always fresh; client navs read RQ/edge caches whose + // canonicalSlugId/nameSlug can be stale (rename, removed sibling) — acting + // on those loops redirects or bounces correct URLs onto stale slugs. + // Crawlers only ever see SSR, so the SEO-relevant 301s are unaffected; + // client navs simply render under the URL they were given. + // Truthiness guards: a cached pre-deploy row may predate these fields, and + // `undefined !== params.id` would 301 to /company/undefined/undefined. 
+ if ( + import.meta.env.SSR && + ((sponsor.canonicalSlugId && sponsor.canonicalSlugId !== params.id) || + (sponsor.nameSlug && sponsor.nameSlug !== params.slug)) + ) { + // One canonical URL per page: sibling licence hashes 301 onto the + // group's min-hash row, and stale slugs (post-rename) onto the current + // slug — otherwise near-duplicate 200s accumulate in the index. + throw redirect({ + to: '/company/$id/$slug', + params: { + id: sponsor.canonicalSlugId || params.id, + slug: sponsor.nameSlug || params.slug, + }, + search: (prev) => ({ search: prev.search ?? '' }), + statusCode: 301, + }); } const profile = await queryClient.ensureQueryData( @@ -137,10 +186,7 @@ export const Route = createFileRoute('/company/$id/$slug')({ ? titleCase(loaderData.sponsor.organisationName) : ''; const location = loaderData - ? formatLocation( - loaderData.profile?.registered_office_address?.locality, - loaderData.profile?.registered_office_address?.region, - ) + ? registeredLocation(loaderData.profile?.registered_office_address) : ''; const industry = loaderData?.profile?.sicDescriptions ?.map((sic) => sic.description) @@ -244,6 +290,9 @@ function CompanyDetail() { return () => observer.disconnect(); }, []); + // Router match-cache can replay loaderData from an older bundle (SWR render + // on revisit); tolerate the field's absence instead of crashing on .length + const licenceNumbers = sponsor.sponsorLicenceNumbers ?? []; const hmrcName = titleCase(sponsor.organisationName); // Lead with the Companies House current name; HMRC may hold a stale former name. const displayName = profile?.company_name @@ -257,9 +306,8 @@ function CompanyDetail() { const alsoRegisteredAs = normalizeName(sponsor.organisationName) !== currentKey ? 
hmrcName : null; const displayRoute = titleCase(sponsor.route); - const displayLocation = formatLocation( - profile?.registered_office_address?.locality, - profile?.registered_office_address?.region, + const displayLocation = registeredLocation( + profile?.registered_office_address, ); const industry = profile?.sicDescriptions ?.map((s) => s.description) @@ -352,15 +400,15 @@ function CompanyDetail() { {/* No CH profile → the second card never renders; surface the licence here instead */} - {!profile && sponsor.sponsorLicenceNumbers.length > 0 && ( + {!profile && licenceNumbers.length > 0 && (
Sponsor Licence{' '} - {sponsor.sponsorLicenceNumbers.length > 1 ? 'Nos.' : 'No.'} + {licenceNumbers.length > 1 ? 'Nos.' : 'No.'}
- {sponsor.sponsorLicenceNumbers.join(', ')} + {licenceNumbers.join(', ')}
@@ -417,17 +465,15 @@ function CompanyDetail() { )} - {sponsor.sponsorLicenceNumbers.length > 0 && ( + {licenceNumbers.length > 0 && (
Sponsor Licence{' '} - {sponsor.sponsorLicenceNumbers.length > 1 - ? 'Nos.' - : 'No.'} + {licenceNumbers.length > 1 ? 'Nos.' : 'No.'}
- {sponsor.sponsorLicenceNumbers.join(', ')} + {licenceNumbers.join(', ')}
From 2fcf08b89bdb6018c22d3e22e74580185a75284d Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 17:03:51 +0100 Subject: [PATCH 7/9] fix: harden ingest-hmrc-csv + linting --- apps/web/scripts/ingest-hmrc-csv.ts | 30 ++++++++++++++++++++++- apps/web/src/routes/company.$id.$slug.tsx | 8 ++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/apps/web/scripts/ingest-hmrc-csv.ts b/apps/web/scripts/ingest-hmrc-csv.ts index d1d9669..c2c9da4 100644 --- a/apps/web/scripts/ingest-hmrc-csv.ts +++ b/apps/web/scripts/ingest-hmrc-csv.ts @@ -137,13 +137,32 @@ type CleanedRow = { const seen = new Set(); const dedupedRows: CleanedRow[] = []; +// Licence is the hash backbone: blank values collide distinct orgs into one +// hash (silently dropped by dedup), and >20 chars aborts the batched INSERT +// mid-ingest with no row context. Fail fast naming the rows instead. +const invalidRows: string[] = []; -for (const r of records) { +for (const [i, r] of records.entries()) { + const rowNum = i + 2; // 1-based, after the header row const licence = r['Sponsor Licence Number'].trim(); const orgName = r['Organisation Name'].trim(); const typeRating = r.TierRating.trim(); const route = r['Migrant Classification'].trim(); const status = clean(r['Sponsor Status']); + + if (!licence || licence.length > 20) { + invalidRows.push( + `row ${rowNum} ("${orgName || '?'}"): bad Sponsor Licence Number ${JSON.stringify(licence)}`, + ); + continue; + } + if (status && status.length > 64) { + invalidRows.push( + `row ${rowNum} ("${orgName}"): Sponsor Status exceeds 64 chars (${status.length})`, + ); + continue; + } + const hash = computeHash(licence, typeRating, route); const nameSlug = slugify(orgName) || hash; @@ -161,6 +180,15 @@ for (const r of records) { } } +if (invalidRows.length > 0) { + console.error(`Row validation failed for ${invalidRows.length} row(s):`); + for (const line of invalidRows.slice(0, 10)) console.error(` ${line}`); + if (invalidRows.length > 10) { 
+ console.error(` …and ${invalidRows.length - 10} more`); + } + process.exit(1); +} + console.log( `Deduplicated: ${records.length} → ${dedupedRows.length} unique records`, ); diff --git a/apps/web/src/routes/company.$id.$slug.tsx b/apps/web/src/routes/company.$id.$slug.tsx index afde375..116c3f4 100644 --- a/apps/web/src/routes/company.$id.$slug.tsx +++ b/apps/web/src/routes/company.$id.$slug.tsx @@ -9,10 +9,7 @@ import { import { ExternalLink, MapPin } from 'lucide-react'; import { useEffect, useRef, useState } from 'react'; -import { - SHORT_EDGE_CACHE, - setSsrCacheControl, -} from '../api/cache-headers'; +import { SHORT_EDGE_CACHE, setSsrCacheControl } from '../api/cache-headers'; import { companyProfileQueryOptions } from '../api/companiesHouse'; import { flagStateQueryOptions } from '../api/flags'; import { getHmrcBySlug, hmrcBySlugIdQueryOptions } from '../api/hmrc'; @@ -403,8 +400,7 @@ function CompanyDetail() { {!profile && licenceNumbers.length > 0 && (
- Sponsor Licence{' '} - {licenceNumbers.length > 1 ? 'Nos.' : 'No.'} + Sponsor Licence {licenceNumbers.length > 1 ? 'Nos.' : 'No.'}
From fe85e6a80d708f59d16353340503d017fd8a1b03 Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 17:14:26 +0100 Subject: [PATCH 8/9] feat: widen the sponsor license column to 64 making scope for future increase --- apps/web/scripts/ingest-hmrc-csv.ts | 12 +- .../migrations/0027_widen-sponsor-licence.sql | 1 + .../db/migrations/meta/0027_snapshot.json | 919 ++++++++++++++++++ packages/db/migrations/meta/_journal.json | 7 + packages/db/src/schema.ts | 2 +- 5 files changed, 936 insertions(+), 5 deletions(-) create mode 100644 packages/db/migrations/0027_widen-sponsor-licence.sql create mode 100644 packages/db/migrations/meta/0027_snapshot.json diff --git a/apps/web/scripts/ingest-hmrc-csv.ts b/apps/web/scripts/ingest-hmrc-csv.ts index c2c9da4..f6b2e22 100644 --- a/apps/web/scripts/ingest-hmrc-csv.ts +++ b/apps/web/scripts/ingest-hmrc-csv.ts @@ -13,6 +13,8 @@ const EXPECTED_COLUMNS = [ ] as const; const BATCH_SIZE = 500; +// Must agree with sponsor_licence_number varchar(64) in packages/db/src/schema.ts +const LICENCE_MAX_LEN = 64; const force = process.argv.includes('--force'); const url = process.argv.filter((a) => !a.startsWith('--'))[2]; @@ -85,18 +87,20 @@ console.log(`Validated schema: ${records.length} records found`); // Step 4: Create staging table console.log('Creating staging table...'); await sql`DROP TABLE IF EXISTS "hmrc_skilled_workers_staging"`; -await sql` +// sql.query (not the tagged template): DDL can't take $n params, and the +// licence width interpolates from LICENCE_MAX_LEN so DDL and guard can't drift +await sql.query(` CREATE TABLE "hmrc_skilled_workers_staging" ( "id" serial PRIMARY KEY NOT NULL, "hash" varchar(11) NOT NULL UNIQUE, "organisation_name" varchar(255) NOT NULL, "name_slug" varchar(255) NOT NULL, - "sponsor_licence_number" varchar(20), + "sponsor_licence_number" varchar(${LICENCE_MAX_LEN}), "sponsor_status" varchar(64), "type_rating" varchar(100) NOT NULL, "route" varchar(100) NOT NULL ) -`; +`); // Step 5: Bulk 
insert into staging table console.log( @@ -150,7 +154,7 @@ for (const [i, r] of records.entries()) { const route = r['Migrant Classification'].trim(); const status = clean(r['Sponsor Status']); - if (!licence || licence.length > 20) { + if (!licence || licence.length > LICENCE_MAX_LEN) { invalidRows.push( `row ${rowNum} ("${orgName || '?'}"): bad Sponsor Licence Number ${JSON.stringify(licence)}`, ); diff --git a/packages/db/migrations/0027_widen-sponsor-licence.sql b/packages/db/migrations/0027_widen-sponsor-licence.sql new file mode 100644 index 0000000..9d56afc --- /dev/null +++ b/packages/db/migrations/0027_widen-sponsor-licence.sql @@ -0,0 +1 @@ +ALTER TABLE "hmrc_skilled_workers" ALTER COLUMN "sponsor_licence_number" SET DATA TYPE varchar(64); \ No newline at end of file diff --git a/packages/db/migrations/meta/0027_snapshot.json b/packages/db/migrations/meta/0027_snapshot.json new file mode 100644 index 0000000..94d6e03 --- /dev/null +++ b/packages/db/migrations/meta/0027_snapshot.json @@ -0,0 +1,919 @@ +{ + "id": "f1952481-19f9-4bec-b127-6153e8b8d459", + "prevId": "45b567be-162a-44fc-adf8-d57fcdc67921", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.ch_stream_state": { + "name": "ch_stream_state", + "schema": "", + "columns": { + "key": { + "name": "key", + "type": "varchar(50)", + "primaryKey": true, + "notNull": true + }, + "last_timepoint": { + "name": "last_timepoint", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profile_cache": { + "name": "companies_house_profile_cache", + "schema": "", + "columns": { + "key": { + "name": "key", + "type": "varchar(50)", + "primaryKey": true, + 
"notNull": true + }, + "last_trail_id": { + "name": "last_trail_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profile_trails": { + "name": "companies_house_profile_trails", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": true + }, + "column_name": { + "name": "column_name", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "old_value": { + "name": "old_value", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "new_value": { + "name": "new_value", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_ch_trail_company_number": { + "name": "idx_ch_trail_company_number", + "columns": [ + { + "expression": "company_number", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_trail_created_at": { + "name": "idx_ch_trail_created_at", + "columns": [ + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.companies_house_profiles": { + 
"name": "companies_house_profiles", + "schema": "", + "columns": { + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": true, + "notNull": true + }, + "company_name": { + "name": "company_name", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "company_status": { + "name": "company_status", + "type": "varchar(50)", + "primaryKey": false, + "notNull": false + }, + "company_type": { + "name": "company_type", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "date_of_creation": { + "name": "date_of_creation", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "address_line_1": { + "name": "address_line_1", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "address_line_2": { + "name": "address_line_2", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "locality": { + "name": "locality", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "region": { + "name": "region", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "postal_code": { + "name": "postal_code", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "country": { + "name": "country", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "sic_codes": { + "name": "sic_codes", + "type": "text[]", + "primaryKey": false, + "notNull": false, + "default": "'{}'::text[]" + }, + "accounts_next_made_up_to": { + "name": "accounts_next_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "accounts_last_made_up_to": { + "name": "accounts_last_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "accounts_overdue": { + "name": "accounts_overdue", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "jurisdiction": { + "name": "jurisdiction", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + 
}, + "has_been_liquidated": { + "name": "has_been_liquidated", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "has_insolvency_history": { + "name": "has_insolvency_history", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "has_charges": { + "name": "has_charges", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "previous_company_names": { + "name": "previous_company_names", + "type": "text[]", + "primaryKey": false, + "notNull": false, + "default": "'{}'::text[]" + }, + "confirmation_statement_last_made_up_to": { + "name": "confirmation_statement_last_made_up_to", + "type": "date", + "primaryKey": false, + "notNull": false + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_ch_company_name": { + "name": "idx_ch_company_name", + "columns": [ + { + "expression": "company_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_company_status": { + "name": "idx_ch_company_status", + "columns": [ + { + "expression": "company_status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_company_type": { + "name": "idx_ch_company_type", + "columns": [ + { + "expression": "company_type", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_sic_codes": { + "name": "idx_ch_sic_codes", + "columns": [ + { + "expression": "sic_codes", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_ch_jurisdiction": { + "name": "idx_ch_jurisdiction", + "columns": [ + { + "expression": 
"jurisdiction", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_ch_previous_names": { + "name": "idx_ch_previous_names", + "columns": [ + { + "expression": "previous_company_names", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping": { + "name": "hmrc_company_mapping", + "schema": "", + "columns": { + "organisation_name": { + "name": "organisation_name", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "company_number": { + "name": "company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "is_public_body": { + "name": "is_public_body", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "match_method": { + "name": "match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "match_score": { + "name": "match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "query_used": { + "name": "query_used", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "verified_at": { + "name": "verified_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "idx_mapping_method_verified": { + "name": "idx_mapping_method_verified", + "columns": [ + { + "expression": "match_method", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "verified_at", + "isExpression": false, + "asc": true, + "nulls": "first" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + 
"uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping_audit": { + "name": "hmrc_company_mapping_audit", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "organisation_name": { + "name": "organisation_name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "old_company_number": { + "name": "old_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "new_company_number": { + "name": "new_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "old_match_method": { + "name": "old_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "new_match_method": { + "name": "new_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "changed_at": { + "name": "changed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "changed_by": { + "name": "changed_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_company_mapping_review_queue": { + "name": "hmrc_company_mapping_review_queue", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "organisation_name": { + "name": "organisation_name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "reason": { + "name": "reason", + "type": "varchar(40)", + "primaryKey": false, + "notNull": true + }, + "existing_company_number": { + "name": "existing_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "existing_match_method": { + "name": "existing_match_method", 
+ "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "existing_match_score": { + "name": "existing_match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "proposed_company_number": { + "name": "proposed_company_number", + "type": "varchar(20)", + "primaryKey": false, + "notNull": false + }, + "proposed_match_method": { + "name": "proposed_match_method", + "type": "varchar(32)", + "primaryKey": false, + "notNull": false + }, + "proposed_match_score": { + "name": "proposed_match_score", + "type": "numeric(4, 3)", + "primaryKey": false, + "notNull": false + }, + "proposed_query_used": { + "name": "proposed_query_used", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "ch_search_results_top5": { + "name": "ch_search_results_top5", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "detected_by": { + "name": "detected_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "detected_at": { + "name": "detected_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "resolved_at": { + "name": "resolved_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "resolved_by": { + "name": "resolved_by", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "resolution": { + "name": "resolution", + "type": "varchar(40)", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "idx_review_queue_unresolved": { + "name": "idx_review_queue_unresolved", + "columns": [ + { + "expression": "detected_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "where": "\"hmrc_company_mapping_review_queue\".\"resolved_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_review_queue_org": { + "name": "idx_review_queue_org", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": 
true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "ux_review_queue_unresolved_org_reason": { + "name": "ux_review_queue_unresolved_org_reason", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "reason", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "where": "\"hmrc_company_mapping_review_queue\".\"resolved_at\" IS NULL", + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_ingestion_meta": { + "name": "hmrc_ingestion_meta", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "csv_url": { + "name": "csv_url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "checksum": { + "name": "checksum", + "type": "varchar(64)", + "primaryKey": false, + "notNull": true + }, + "record_count": { + "name": "record_count", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "ingested_at": { + "name": "ingested_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.hmrc_skilled_workers": { + "name": "hmrc_skilled_workers", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "hash": { + "name": "hash", + "type": "varchar(11)", + "primaryKey": false, + "notNull": true + }, + "organisation_name": { + "name": "organisation_name", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "name_slug": { 
+ "name": "name_slug", + "type": "varchar(255)", + "primaryKey": false, + "notNull": true + }, + "sponsor_licence_number": { + "name": "sponsor_licence_number", + "type": "varchar(64)", + "primaryKey": false, + "notNull": false + }, + "sponsor_status": { + "name": "sponsor_status", + "type": "varchar(64)", + "primaryKey": false, + "notNull": false + }, + "type_rating": { + "name": "type_rating", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + }, + "route": { + "name": "route", + "type": "varchar(100)", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "idx_hmrc_org_name": { + "name": "idx_hmrc_org_name", + "columns": [ + { + "expression": "organisation_name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_name_slug": { + "name": "idx_hmrc_name_slug", + "columns": [ + { + "expression": "name_slug", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_licence": { + "name": "idx_hmrc_licence", + "columns": [ + { + "expression": "sponsor_licence_number", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_route": { + "name": "idx_hmrc_route", + "columns": [ + { + "expression": "route", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_hmrc_org_name_trgm": { + "name": "idx_hmrc_org_name_trgm", + "columns": [ + { + "expression": "\"organisation_name\" gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + 
"hmrc_skilled_workers_hash_unique": { + "name": "hmrc_skilled_workers_hash_unique", + "nullsNotDistinct": false, + "columns": ["hash"] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.sic_codes": { + "name": "sic_codes", + "schema": "", + "columns": { + "code": { + "name": "code", + "type": "varchar(10)", + "primaryKey": true, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/packages/db/migrations/meta/_journal.json b/packages/db/migrations/meta/_journal.json index fd4d94c..47e22d4 100644 --- a/packages/db/migrations/meta/_journal.json +++ b/packages/db/migrations/meta/_journal.json @@ -190,6 +190,13 @@ "when": 1781079003844, "tag": "0026_drop-town-county", "breakpoints": true + }, + { + "idx": 27, + "version": "7", + "when": 1781107776369, + "tag": "0027_widen-sponsor-licence", + "breakpoints": true } ] } diff --git a/packages/db/src/schema.ts b/packages/db/src/schema.ts index a8ed893..85cab7b 100644 --- a/packages/db/src/schema.ts +++ b/packages/db/src/schema.ts @@ -21,7 +21,7 @@ export const hmrcSkilledWorkers = pgTable( hash: varchar('hash', { length: 11 }).notNull().unique(), organisationName: varchar('organisation_name', { length: 255 }).notNull(), nameSlug: varchar('name_slug', { length: 255 }).notNull(), - sponsorLicenceNumber: varchar('sponsor_licence_number', { length: 20 }), + sponsorLicenceNumber: varchar('sponsor_licence_number', { length: 64 }), sponsorStatus: varchar('sponsor_status', { length: 64 }), typeRating: varchar('type_rating', { length: 100 }).notNull(), route: 
varchar('route', { length: 100 }).notNull(), From d6681ae4fb86654094ef994a331769248d984682 Mon Sep 17 00:00:00 2001 From: Nikil Kuruvilla Date: Wed, 10 Jun 2026 17:51:30 +0100 Subject: [PATCH 9/9] fix: feedback --- apps/web/scripts/ingest-hmrc-csv.ts | 18 +++++++++++++----- apps/web/src/api/hmrc.ts | 14 +++++++------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/apps/web/scripts/ingest-hmrc-csv.ts b/apps/web/scripts/ingest-hmrc-csv.ts index f6b2e22..0b7c6d9 100644 --- a/apps/web/scripts/ingest-hmrc-csv.ts +++ b/apps/web/scripts/ingest-hmrc-csv.ts @@ -139,7 +139,7 @@ type CleanedRow = { route: string; }; -const seen = new Set(); +const seen = new Map(); const dedupedRows: CleanedRow[] = []; // Licence is the hash backbone: blank values collide distinct orgs into one // hash (silently dropped by dedup), and >20 chars aborts the batched INSERT @@ -170,9 +170,9 @@ for (const [i, r] of records.entries()) { const hash = computeHash(licence, typeRating, route); const nameSlug = slugify(orgName) || hash; - if (!seen.has(hash)) { - seen.add(hash); - dedupedRows.push({ + const previous = seen.get(hash); + if (!previous) { + const row: CleanedRow = { hash, orgName, nameSlug, @@ -180,7 +180,15 @@ for (const [i, r] of records.entries()) { status, typeRating, route, - }); + }; + seen.set(hash, row); + dedupedRows.push(row); + } else if (previous.orgName !== orgName || previous.status !== status) { + // Same licence|rating|route with a different identity: keeping either row + // picks an arbitrary name (and therefore CH mapping). Upstream anomaly. 
+ invalidRows.push( + `row ${rowNum} ("${orgName}"): conflicts with earlier "${previous.orgName}" sharing licence|rating|route (${hash})`, + ); } } diff --git a/apps/web/src/api/hmrc.ts b/apps/web/src/api/hmrc.ts index 2d91334..bb9ef53 100644 --- a/apps/web/src/api/hmrc.ts +++ b/apps/web/src/api/hmrc.ts @@ -53,7 +53,7 @@ export const searchHmrc = createServerFn() nameSlug: hmrcSkilledWorkers.nameSlug, sponsorLicenceNumbers: sql< string[] - >`coalesce(array_agg(distinct ${hmrcSkilledWorkers.sponsorLicenceNumber}) filter (where ${hmrcSkilledWorkers.sponsorLicenceNumber} is not null), '{}')`.as( + >`coalesce(array_agg(distinct ${hmrcSkilledWorkers.sponsorLicenceNumber} order by ${hmrcSkilledWorkers.sponsorLicenceNumber}) filter (where ${hmrcSkilledWorkers.sponsorLicenceNumber} is not null), '{}')`.as( 'sponsor_licence_numbers', ), typeRating: hmrcSkilledWorkers.typeRating, @@ -152,7 +152,7 @@ const getHmrcBySlugId = createServerFn() // URLs serving 200 with a self-referential canonical otherwise) nameSlug: hmrcSkilledWorkers.nameSlug, sponsorLicenceNumbers: sql`( - SELECT coalesce(array_agg(distinct h2.sponsor_licence_number) filter (where h2.sponsor_licence_number is not null), '{}') + SELECT coalesce(array_agg(distinct h2.sponsor_licence_number order by h2.sponsor_licence_number) filter (where h2.sponsor_licence_number is not null), '{}') FROM hmrc_skilled_workers h2 WHERE ${groupFilter} )`, typeRating: hmrcSkilledWorkers.typeRating, @@ -206,9 +206,10 @@ export const sponsorCountQueryOptions = queryOptions({ * the given slug. Fallback for stale `/company/$id/$slug` URLs: when the hash * lookup 404s, the loader 301s to the slug's first row — and also scans the * matches for the requested hash itself, which detects a stale cached null - * (licence reinstated under the same hash). Capped at 10: a slug maps to at - * most a handful of (rating, route) groups, and the containment check needs - * to see them all, not just the first. 
+ * (licence reinstated under the same hash). Uncapped: rows are per LICENCE + * (not per rating/route group) and namesake slugs pool orgs, so any cap could + * hide the requested hash from the containment scan; rows per slug are + * naturally tiny (max 8 across 126k slugs). * Ordered by hash so the multi-match 301 always picks the same canonical row. * Not wrapped in queryOptions — only the loader calls it, and the redirect * moves the user off this page so there's no second reader for the result. @@ -224,8 +225,7 @@ export const getHmrcBySlug = createServerFn() }) .from(hmrcSkilledWorkers) .where(eq(hmrcSkilledWorkers.nameSlug, slug)) - .orderBy(asc(hmrcSkilledWorkers.hash)) - .limit(10); + .orderBy(asc(hmrcSkilledWorkers.hash)); return rows; });