Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions apps/web/scripts/drain-review-queue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ type ProfileRow = {
previous_company_names: string[] | null;
};

type SponsorRow = { town_city: string | null; route: string };
type SponsorRow = { route: string };

type StrategyOutcome =
| { action: 'swap'; reason: string; s_e?: number; s_p?: number }
Expand Down Expand Up @@ -180,26 +180,26 @@ async function loadProfiles(
return new Map(rows.map((r) => [r.company_number, r]));
}

/** Picks the most common (town_city, route) tuple per organisation_name.
* HMRC publishes one row per worker, so an org with mixed routes/locations
* picks the dominant pairing — same heuristic the inline scorer will use. */
/** Picks the most common route per organisation_name. HMRC publishes one
* row per worker, so an org with mixed routes picks the dominant one —
* same heuristic the inline scorer will use. (The 2026-06 feed dropped
* town/county, so the locality tiebreak is inert.) */
async function loadSponsors(
orgNames: string[],
): Promise<Map<string, SponsorRow>> {
if (orgNames.length === 0) return new Map();
const rows = (await sql`
SELECT DISTINCT ON (organisation_name)
organisation_name, town_city, route
organisation_name, route
FROM (
SELECT organisation_name, town_city, route, COUNT(*) AS n
SELECT organisation_name, route, COUNT(*) AS n
FROM hmrc_skilled_workers
WHERE organisation_name = ANY(${orgNames})
GROUP BY organisation_name, town_city, route
GROUP BY organisation_name, route
) ranked
ORDER BY organisation_name, n DESC, route, town_city NULLS LAST
ORDER BY organisation_name, n DESC, route
`) as {
organisation_name: string;
town_city: string | null;
route: string;
}[];
return new Map(rows.map((r) => [r.organisation_name, r]));
Expand Down Expand Up @@ -285,7 +285,7 @@ function profileRowToFullProfile(row: ProfileRow): CHFullProfile {
}

function sponsorRowToScorerSponsor(row: SponsorRow): ScorerSponsor {
return { route: row.route, townCity: row.town_city };
return { route: row.route, townCity: null };
}

// ─────────────────────────────────────────────────────────────────────────────
Expand Down
17 changes: 14 additions & 3 deletions apps/web/scripts/generate-sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import {
hmrcSkilledWorkers,
} from '@ss/db';
import { Glob } from 'bun';
import { eq } from 'drizzle-orm';
import { eq, sql } from 'drizzle-orm';

import { db } from '../src/db.server';

Expand All @@ -30,9 +30,13 @@ async function generate() {
}

// Single pass over all rows; LEFT JOIN keeps HMRC entries without a CH match.
// One URL per (org, rating, route) group: multi-licence siblings 301 to the
// canonical min(hash) page, so only that hash belongs in the sitemap.
// updatedAt is constant per org (mapping PK is organisation_name), so adding
// it to GROUP BY never splits a group — it just keeps drizzle's Date mapping.
const allRows = await db
.select({
hash: hmrcSkilledWorkers.hash,
hash: sql<string>`min(${hmrcSkilledWorkers.hash})`,
nameSlug: hmrcSkilledWorkers.nameSlug,
updatedAt: companiesHouseProfiles.updatedAt,
})
Expand All @@ -51,7 +55,14 @@ async function generate() {
hmrcCompanyMapping.companyNumber,
),
)
.orderBy(hmrcSkilledWorkers.hash);
.groupBy(
hmrcSkilledWorkers.organisationName,
hmrcSkilledWorkers.nameSlug,
hmrcSkilledWorkers.typeRating,
hmrcSkilledWorkers.route,
companiesHouseProfiles.updatedAt,
)
.orderBy(sql`min(${hmrcSkilledWorkers.hash})`);

const entries = new Map(
allRows.map((row) => [
Expand Down
107 changes: 73 additions & 34 deletions apps/web/scripts/ingest-hmrc-csv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@ import { slugify } from '../src/utils';
import { setGitHubOutput } from './ci-utils';

const EXPECTED_COLUMNS = [
'Sponsor Licence Number',
'Organisation Name',
'Town/City',
'County',
'Type & Rating',
'Route',
'TierRating',
'Migrant Classification',
'Sponsor Status',
] as const;

const BATCH_SIZE = 500;
// Must agree with sponsor_licence_number varchar(64) in packages/db/src/schema.ts
const LICENCE_MAX_LEN = 64;

const force = process.argv.includes('--force');
const url = process.argv.filter((a) => !a.startsWith('--'))[2];
Expand Down Expand Up @@ -85,18 +87,20 @@ console.log(`Validated schema: ${records.length} records found`);
// Step 4: Create staging table
console.log('Creating staging table...');
await sql`DROP TABLE IF EXISTS "hmrc_skilled_workers_staging"`;
await sql`
// sql.query (not the tagged template): DDL can't take $n params, and the
// licence width interpolates from LICENCE_MAX_LEN so DDL and guard can't drift
await sql.query(`
CREATE TABLE "hmrc_skilled_workers_staging" (
"id" serial PRIMARY KEY NOT NULL,
"hash" varchar(11) NOT NULL UNIQUE,
"organisation_name" varchar(255) NOT NULL,
"name_slug" varchar(255) NOT NULL,
"town_city" varchar(100),
"county" varchar(100),
"sponsor_licence_number" varchar(${LICENCE_MAX_LEN}),
"sponsor_status" varchar(64),
"type_rating" varchar(100) NOT NULL,
"route" varchar(100) NOT NULL
)
`;
`);

// Step 5: Bulk insert into staging table
console.log(
Expand All @@ -110,16 +114,15 @@ function clean(val: string | undefined): string | null {
return trimmed;
}

/** Mint the stable URL id from the licence-based row identity. Licence is a
* durable per-sponsor key, so hashes survive company renames and future
* ingests — org name is deliberately excluded. */
function computeHash(
orgName: string,
townCity: string | null,
county: string | null,
licence: string,
typeRating: string,
route: string,
): string {
const input = [orgName, townCity ?? '', county ?? '', typeRating, route].join(
'|',
);
const input = [licence, typeRating, route].join('|');
Comment thread
coderabbitai[bot] marked this conversation as resolved.
const bytes = new Bun.CryptoHasher('sha256').update(input).digest();
// Take first 8 bytes (64 bits), encode as base64url, trim to 11 chars
return Buffer.from(bytes.slice(0, 8)).toString('base64url').slice(0, 11);
Expand All @@ -130,36 +133,72 @@ type CleanedRow = {
hash: string;
orgName: string;
nameSlug: string;
townCity: string | null;
county: string | null;
licence: string;
status: string | null;
typeRating: string;
route: string;
};

const seen = new Set<string>();
const seen = new Map<string, CleanedRow>();
const dedupedRows: CleanedRow[] = [];
// Licence is the hash backbone: blank values collide distinct orgs into one
// hash (silently dropped by dedup), and values longer than LICENCE_MAX_LEN
// (the staging column width) abort the batched INSERT mid-ingest with no row
// context. Fail fast naming the rows instead.
const invalidRows: string[] = [];

for (const r of records) {
for (const [i, r] of records.entries()) {
const rowNum = i + 2; // 1-based, after the header row
const licence = r['Sponsor Licence Number'].trim();
const orgName = r['Organisation Name'].trim();
const townCity = clean(r['Town/City']);
const county = clean(r.County);
const typeRating = r['Type & Rating'].trim();
const route = r.Route.trim();
const hash = computeHash(orgName, townCity, county, typeRating, route);
const typeRating = r.TierRating.trim();
const route = r['Migrant Classification'].trim();
const status = clean(r['Sponsor Status']);

if (!licence || licence.length > LICENCE_MAX_LEN) {
invalidRows.push(
`row ${rowNum} ("${orgName || '?'}"): bad Sponsor Licence Number ${JSON.stringify(licence)}`,
);
continue;
}
if (status && status.length > 64) {
invalidRows.push(
`row ${rowNum} ("${orgName}"): Sponsor Status exceeds 64 chars (${status.length})`,
);
continue;
}

const hash = computeHash(licence, typeRating, route);
const nameSlug = slugify(orgName) || hash;

if (!seen.has(hash)) {
seen.add(hash);
dedupedRows.push({
const previous = seen.get(hash);
if (!previous) {
const row: CleanedRow = {
hash,
orgName,
nameSlug,
townCity,
county,
licence,
status,
typeRating,
route,
});
};
seen.set(hash, row);
dedupedRows.push(row);
} else if (previous.orgName !== orgName || previous.status !== status) {
// Same licence|rating|route with a different identity: keeping either row
// picks an arbitrary name (and therefore CH mapping). Upstream anomaly.
invalidRows.push(
`row ${rowNum} ("${orgName}"): conflicts with earlier "${previous.orgName}" sharing licence|rating|route (${hash})`,
);
}
}

if (invalidRows.length > 0) {
console.error(`Row validation failed for ${invalidRows.length} row(s):`);
for (const line of invalidRows.slice(0, 10)) console.error(` ${line}`);
if (invalidRows.length > 10) {
console.error(` …and ${invalidRows.length - 10} more`);
}
process.exit(1);
}

console.log(
Expand All @@ -181,15 +220,15 @@ for (let i = 0; i < dedupedRows.length; i += BATCH_SIZE) {
r.hash,
r.orgName,
r.nameSlug,
r.townCity,
r.county,
r.licence,
r.status,
r.typeRating,
r.route,
);
}

await sql.query(
`INSERT INTO "hmrc_skilled_workers_staging" ("hash", "organisation_name", "name_slug", "town_city", "county", "type_rating", "route") VALUES ${placeholders.join(', ')}`,
`INSERT INTO "hmrc_skilled_workers_staging" ("hash", "organisation_name", "name_slug", "sponsor_licence_number", "sponsor_status", "type_rating", "route") VALUES ${placeholders.join(', ')}`,
values,
);

Expand All @@ -203,7 +242,7 @@ console.log('Building indexes on staging table...');
await Promise.all([
sql`CREATE INDEX "stg_idx_hmrc_org_name" ON "hmrc_skilled_workers_staging" USING btree ("organisation_name")`,
sql`CREATE INDEX "stg_idx_hmrc_name_slug" ON "hmrc_skilled_workers_staging" USING btree ("name_slug")`,
sql`CREATE INDEX "stg_idx_hmrc_town_city" ON "hmrc_skilled_workers_staging" USING btree ("town_city")`,
sql`CREATE INDEX "stg_idx_hmrc_licence" ON "hmrc_skilled_workers_staging" USING btree ("sponsor_licence_number")`,
sql`CREATE INDEX "stg_idx_hmrc_route" ON "hmrc_skilled_workers_staging" USING btree ("route")`,
sql`CREATE INDEX "stg_idx_hmrc_org_name_trgm" ON "hmrc_skilled_workers_staging" USING gin ("organisation_name" gin_trgm_ops)`,
]);
Expand All @@ -217,7 +256,7 @@ await sql.transaction([
sql`ALTER TABLE "hmrc_skilled_workers_staging" RENAME TO "hmrc_skilled_workers"`,
sql`ALTER INDEX "stg_idx_hmrc_org_name" RENAME TO "idx_hmrc_org_name"`,
sql`ALTER INDEX "stg_idx_hmrc_name_slug" RENAME TO "idx_hmrc_name_slug"`,
sql`ALTER INDEX "stg_idx_hmrc_town_city" RENAME TO "idx_hmrc_town_city"`,
sql`ALTER INDEX "stg_idx_hmrc_licence" RENAME TO "idx_hmrc_licence"`,
sql`ALTER INDEX "stg_idx_hmrc_route" RENAME TO "idx_hmrc_route"`,
sql`ALTER INDEX "stg_idx_hmrc_org_name_trgm" RENAME TO "idx_hmrc_org_name_trgm"`,
sql`ALTER INDEX "hmrc_skilled_workers_staging_hash_key" RENAME TO "hmrc_skilled_workers_hash_unique"`,
Expand Down
10 changes: 4 additions & 6 deletions apps/web/scripts/seed-companies-house.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,13 @@ async function fetchApi(path: string): Promise<unknown | null> {
return res.json();
}

// Get only org names that aren't already cached, plus a representative
// town_city/county per org for the locality tiebreaker in the verification
// pipeline. selectDistinctOn(orgName) collapses multi-row sponsors (one per
// Get only org names that aren't already cached. The 2026-06 HMRC feed
// dropped town/county, so the resolver's locality tiebreak runs inert.
// selectDistinctOn(orgName) collapses multi-row sponsors (one per
// route/rating) to a single representative row.
const uncached = await db
.selectDistinctOn([hmrcSkilledWorkers.organisationName], {
organisationName: hmrcSkilledWorkers.organisationName,
townCity: hmrcSkilledWorkers.townCity,
county: hmrcSkilledWorkers.county,
})
.from(hmrcSkilledWorkers)
.leftJoin(
Expand Down Expand Up @@ -137,7 +135,7 @@ for (const row of uncached) {
// point users at the wrong CH entity. See docs/hmrc-ch-mapping-fix.md.
const result = await resolveOneSponsor(
orgName,
{ townCity: row.townCity, county: row.county },
{ townCity: null, county: null },
throttledFetchApi,
);

Expand Down
18 changes: 18 additions & 0 deletions apps/web/src/api/cache-headers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ import { getRequestUrl, setResponseHeader } from '@tanstack/start-server-core';
export const LONG_EDGE_CACHE =
's-maxage=2592000, stale-while-revalidate=604800';

/**
* Short edge TTL for negative lookups (row not found). A missing hash can
* come back to life (licence reinstated by a later ingest), so a long-cached
* null would strand the URL — 5 minutes absorbs crawler storms without that.
*/
export const SHORT_EDGE_CACHE = 's-maxage=300, stale-while-revalidate=60';

/**
* Attach a `Cache-Control` header to the current response only when the
* request is a server-fn RPC invocation (`/_serverFn/…`). Prevents the
Expand All @@ -23,3 +30,14 @@ export const setRpcCacheControl = createIsomorphicFn()
}
})
.client(() => {});

/**
 * Attach a `Cache-Control` header to the current SSR document response.
 * Complement of `setRpcCacheControl` for route loaders that need to override
 * a routeRule default on specific outcomes (e.g. short-cache a 404 document).
 *
 * The server implementation sets the header unconditionally — it does not
 * inspect the request URL — so call it only on the outcomes that should
 * override the default. The client implementation is a no-op.
 *
 * @param value - Full `Cache-Control` header value to send.
 */
export const setSsrCacheControl = createIsomorphicFn()
.server((value: string) => {
setResponseHeader('Cache-Control', value);
})
.client(() => {});
22 changes: 3 additions & 19 deletions apps/web/src/api/companiesHouse.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
import {
companiesHouseProfiles,
hmrcCompanyMapping,
hmrcSkilledWorkers,
sicCodes,
} from '@ss/db';
import { companiesHouseProfiles, hmrcCompanyMapping, sicCodes } from '@ss/db';
import { queryOptions } from '@tanstack/react-query';
import { createServerFn } from '@tanstack/react-start';
import { setResponseHeader } from '@tanstack/react-start/server';
Expand Down Expand Up @@ -219,24 +214,13 @@ const getCompanyProfile = createServerFn()
// top-hit logic that was silently mapping new sponsors to wrong CH
// entities. See docs/hmrc-ch-mapping-fix.md "Phase 3 — on-demand
// resolver hardening".
const [hmrcRow] = await db
.select({
townCity: hmrcSkilledWorkers.townCity,
county: hmrcSkilledWorkers.county,
})
.from(hmrcSkilledWorkers)
.where(eq(hmrcSkilledWorkers.organisationName, companyName))
.limit(1);

console.log(
`[Profile] no mapping, resolving via CH for: "${companyName}"`,
);
// HMRC no longer publishes town/county, so the locality tiebreak is inert.
const result = await resolveOneSponsor(
companyName,
{
townCity: hmrcRow?.townCity ?? null,
county: hmrcRow?.county ?? null,
},
{ townCity: null, county: null },
async (path) => {
const r = await fetchFromApi(path);
return r.ok ? r.data : null;
Expand Down
Loading
Loading