|
| 1 | +/** |
| 2 | + * Generates src/language-tags.gen.ts from the OpenType language system tag |
| 3 | + * registry and the IANA BCP 47 subtag registry. |
| 4 | + * |
| 5 | + * The approach is based on HarfBuzz's gen-tag-table.py: |
| 6 | + * https://github.com/harfbuzz/harfbuzz/blob/main/src/gen-tag-table.py |
| 7 | + * We reuse the same strategy of parsing the OT registry HTML, resolving |
| 8 | + * ISO 639-3 codes to 2-letter BCP 47 subtags, inheriting mappings from |
| 9 | + * macrolanguages, ranking by code count to pick the most specific tag, |
| 10 | + * and applying manual overrides for cases where the automated ranking |
| 11 | + * disagrees with HarfBuzz (Norwegian, Chinese, Quechua, Malayalam). |
| 12 | + * Our script is a simplified TypeScript rewrite that only produces the |
| 13 | + * BCP 47 → OpenType direction and only maps primary language subtags. |
| 14 | + * |
| 15 | + * Usage: node scripts/generate-language-tags.ts |
| 16 | + * |
| 17 | + * Input: |
| 18 | + * vendor.local/languagetags.html — OpenType language system tag registry |
| 19 | + * downloaded from https://learn.microsoft.com/en-us/typography/opentype/spec/languagetags |
| 20 | + * vendor.local/language-subtag-registry.txt — IANA BCP 47 subtag registry |
| 21 | + * downloaded from https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |
| 22 | + * vendor.local/iso-639-3.tab — ISO 639-3 code table (for 3-letter to 2-letter mapping) |
| 23 | + * downloaded from https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab |
| 24 | + * Output: |
| 25 | + * src/language-tags.gen.ts — BCP 47 to OpenType language tag mapping. |
| 26 | + */ |
| 27 | +/* eslint-disable no-console */ |
| 28 | +import { readFileSync, writeFileSync } from 'node:fs'; |
| 29 | +import { dirname, resolve } from 'node:path'; |
| 30 | +import { fileURLToPath } from 'node:url'; |
| 31 | + |
// All inputs and the output are located relative to the project root, which is
// the parent of the directory containing this script.
const scriptDir = dirname(fileURLToPath(import.meta.url));
const projectRoot = resolve(scriptDir, '..');
const otRegistryPath = resolve(projectRoot, 'vendor.local/languagetags.html');
const bcp47RegistryPath = resolve(projectRoot, 'vendor.local/language-subtag-registry.txt');
const iso639Path = resolve(projectRoot, 'vendor.local/iso-639-3.tab');
const outputPath = resolve(projectRoot, 'src/language-tags.gen.ts');

// Hand-maintained BCP 47 -> OpenType pairs that replace whatever the automated
// ranking produced, so that the result matches HarfBuzz's adjustments.
const manualOverrides: Record<string, string> = {
  // Norwegian: the registry HTML maps NOR only to 'nob'; HarfBuzz explicitly adds 'no' -> 'NOR'.
  no: 'NOR',
  // Chinese: without an override 'zh' maps to all Chinese variants; HarfBuzz
  // uses ZHS (Simplified) as the default for bare 'zh'.
  zh: 'ZHS',
  // Quechua: HarfBuzz removes QUZ from the default and re-adds qu -> QUZ.
  qu: 'QUZ',
  // Malayalam: HarfBuzz raises the rank of MLR, which makes MAL (Traditional) preferred.
  ml: 'MAL',
};

// Step 1: load the three vendored registries as UTF-8 text.
const readText = (path: string): string => readFileSync(path, 'utf-8');
const otHtml = readText(otRegistryPath);
const bcp47Content = readText(bcp47RegistryPath);
const iso639Content = readText(iso639Path);

// Step 2: parse each registry and combine them into the final mapping.
const otEntries = parseOtRegistry(otHtml);
const iso3to1 = parseIso639(iso639Content);
const macrolanguages = parseMacrolanguages(bcp47Content);
const mapping = buildMapping(otEntries, iso3to1, macrolanguages);

// Step 3: render the TypeScript module, write it out, and report what was done.
const output = generate(mapping);
writeFileSync(outputPath, output);

console.log(`Generated ${outputPath}`);
console.log(` ${mapping.size} language tag mappings`);
| 65 | + |
| 66 | +// --------------------------------------------------------------------------- |
| 67 | +// Parse OpenType language system tag registry (HTML) |
| 68 | +// --------------------------------------------------------------------------- |
| 69 | + |
/** One usable row extracted from the OpenType language system tag registry. */
type OtEntry = {
  /** The OpenType language system tag of the row. */
  tag: string;
  /** The ISO 639 codes listed for that tag (may be empty). */
  isoCodes: string[];
};
| 71 | + |
/**
 * Extract the language system tag rows from the OpenType registry HTML.
 *
 * Every `<tr>` that consists of two or three `<td>` cells is considered. The
 * second cell is parsed as the OpenType tag and the optional third cell as the
 * list of ISO codes. Rows whose tag cell does not yield a tag are skipped.
 *
 * @param html Full HTML text of the registry page.
 * @returns The parsed rows, in document order.
 * @throws Error when fewer than 100 rows were recognised, which is taken as a
 *   sign that the page layout has changed.
 */
function parseOtRegistry(html: string): OtEntry[] {
  // A table row made of 2-3 <td> cells; each cell's inner HTML is captured.
  const rowPattern =
    /<tr[^>]*>\s*<td[^>]*>([\s\S]*?)<\/td>\s*<td[^>]*>([\s\S]*?)<\/td>\s*(?:<td[^>]*>([\s\S]*?)<\/td>\s*)?<\/tr>/gi;

  const entries: OtEntry[] = [];
  for (const row of html.matchAll(rowPattern)) {
    // row[1] (the first cell) is not needed; row[3] is undefined for 2-cell rows.
    const tag = parseOtTag(row[2]);
    if (!tag) continue;
    entries.push({ tag, isoCodes: parseIsoCodes(row[3]) });
  }

  // Sanity check so that a changed page layout fails loudly instead of
  // producing a nearly empty mapping.
  if (entries.length < 100) {
    throw new Error(
      `Expected at least 100 OpenType entries, got ${entries.length}. The HTML structure may have changed.`,
    );
  }

  return entries;
}
| 101 | + |
/**
 * Turn the raw HTML of a tag cell into an OpenType language system tag.
 *
 * @param raw Inner HTML of the tag cell.
 * @returns The tag in upper case, or `undefined` when the cell is marked
 *   "(deprecated)" or does not contain exactly 3-4 ASCII letters once markup,
 *   surrounding single quotes and padding whitespace are removed.
 */
function parseOtTag(raw: string): string | undefined {
  const visibleText = stripHtml(raw).trim();

  // Deprecated tags are ignored entirely.
  if (/\(deprecated\)/i.test(visibleText)) return undefined;

  // The registry writes tags in single quotes with space padding, e.g. 'AFK '.
  const unquoted = visibleText.replace(/^'|'$/g, '').trim();

  return /^[A-Z]{3,4}$/i.test(unquoted) ? unquoted.toUpperCase() : undefined;
}
| 111 | + |
/**
 * Extract the ISO 639 codes from the raw HTML of a codes cell.
 *
 * Only the part of the cell before the first `<br>` is used (what follows is
 * commentary). The text is split on commas and whitespace, lower-cased, and
 * reduced to the pieces that consist of exactly 2-3 ASCII letters.
 *
 * @param raw Inner HTML of the codes cell, or `undefined` when the row has none.
 * @returns The codes in their original order; empty when there are none.
 */
function parseIsoCodes(raw: string | undefined): string[] {
  if (!raw) return [];

  const [beforeBreak] = raw.split(/<br\s*\/?>/i);
  const text = stripHtml(beforeBreak).trim();
  if (text.length === 0) return [];

  const codes: string[] = [];
  for (const piece of text.split(/[,\s]+/)) {
    const candidate = piece.trim().toLowerCase();
    if (/^[a-z]{2,3}$/.test(candidate)) {
      codes.push(candidate);
    }
  }
  return codes;
}
| 124 | + |
/**
 * Reduce an HTML fragment to plain text.
 *
 * Tags are removed first. Character references are then decoded in a single
 * pass, so text produced by decoding is never decoded a second time (for
 * example `&amp;lt;` becomes the literal text `&lt;`, not `<`).
 *
 * - `&amp;`, `&lt;` and `&gt;` decode to `&`, `<` and `>`; `&nbsp;` decodes to
 *   a plain space.
 * - Numeric references (`&#NNN;` and `&#xHHH;`) decode to the referenced code
 *   point. Values above U+10FFFF are dropped, because `String.fromCodePoint`
 *   would throw a RangeError for them.
 * - Every other named reference is removed.
 *
 * @param s HTML fragment.
 * @returns The fragment's text content with the references handled as above.
 */
function stripHtml(s: string): string {
  // A Map (not an object literal) so that names such as "constructor" cannot
  // resolve to inherited properties.
  const namedReferences = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['nbsp', ' '],
  ]);
  const fromCodePoint = (codePoint: number): string =>
    codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : '';

  return s
    .replace(/<[^>]+>/g, '')
    .replace(
      /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi,
      (_whole: string, hex?: string, dec?: string, name?: string): string => {
        if (hex !== undefined) return fromCodePoint(parseInt(hex, 16));
        if (dec !== undefined) return fromCodePoint(parseInt(dec, 10));
        return namedReferences.get(name ?? '') ?? '';
      },
    );
}
| 136 | + |
| 137 | +// --------------------------------------------------------------------------- |
| 138 | +// Parse ISO 639-3 table (3-letter Id -> 2-letter Part1) |
| 139 | +// --------------------------------------------------------------------------- |
| 140 | + |
/**
 * Build a lookup from the first column of the ISO 639-3 table (the 3-letter
 * Id) to its fourth column (the 2-letter Part1 code).
 *
 * The first line is treated as the header and skipped. Rows in which either
 * of the two columns is missing or empty are ignored.
 *
 * @param content Tab-separated table text.
 * @returns Map from Id to Part1 for every row that has both.
 */
function parseIso639(content: string): Map<string, string> {
  const [, ...rows] = content.split('\n');
  const part1ById = new Map<string, string>();

  rows.forEach((row) => {
    const columns = row.split('\t');
    const threeLetter = columns[0];
    const twoLetter = columns[3];
    if (!threeLetter || !twoLetter) return;
    part1ById.set(threeLetter, twoLetter);
  });

  return part1ById;
}
| 153 | + |
| 154 | +// --------------------------------------------------------------------------- |
| 155 | +// Parse IANA BCP 47 subtag registry (macrolanguage relationships only) |
| 156 | +// --------------------------------------------------------------------------- |
| 157 | + |
/**
 * Collect the macrolanguage relationships from the IANA language subtag
 * registry text.
 *
 * The text is split into records on `%%`. Within a record, `Name: value`
 * lines define fields and lines that begin with a space continue the previous
 * field. Both LF and CRLF line endings are accepted; with a bare `\n` split a
 * trailing `\r` would stop the field pattern from matching and every record
 * would be silently ignored.
 *
 * Only records with `Type: language` that carry both a `Subtag` and a
 * `Macrolanguage` field contribute to the result.
 *
 * @param content Full text of the registry file.
 * @returns Map from macrolanguage subtag to the set of its member subtags.
 */
function parseMacrolanguages(content: string): Map<string, Set<string>> {
  const macrolanguages = new Map<string, Set<string>>();

  for (const record of content.split('%%')) {
    const fields = new Map<string, string>();
    let currentKey = '';

    for (const line of record.split(/\r?\n/)) {
      if (line.startsWith(' ')) {
        // Folded continuation of the most recent field (ignored if there is none).
        if (currentKey) {
          fields.set(currentKey, (fields.get(currentKey) ?? '') + ' ' + line.trim());
        }
        continue;
      }
      const field = /^([A-Za-z-]+):\s*(.*)$/.exec(line);
      if (field) {
        currentKey = field[1];
        fields.set(currentKey, field[2].trim());
      }
    }

    const type = fields.get('Type');
    const subtag = fields.get('Subtag');
    const macro = fields.get('Macrolanguage');
    if (type !== 'language' || !subtag || !macro) continue;

    let members = macrolanguages.get(macro);
    if (members === undefined) {
      members = new Set<string>();
      macrolanguages.set(macro, members);
    }
    members.add(subtag);
  }

  return macrolanguages;
}
| 192 | + |
| 193 | +// --------------------------------------------------------------------------- |
| 194 | +// Build BCP 47 (2-letter) -> OpenType tag mapping |
| 195 | +// --------------------------------------------------------------------------- |
| 196 | + |
/**
 * Combine the parsed registries into the final BCP 47 (2-letter) -> OpenType
 * language system tag mapping.
 *
 * 1. Each OpenType entry is attached to every 2-letter code its ISO codes
 *    resolve to, and receives a rank of twice its number of ISO codes.
 * 2. A macrolanguage whose 2-letter code has no tags yet inherits the tags of
 *    its members.
 * 3. For each 2-letter code the tag with the lowest rank wins; ties are broken
 *    with `localeCompare`.
 * 4. The entries of `manualOverrides` are written last and replace anything
 *    computed before.
 *
 * @param otEntries Parsed OpenType registry rows.
 * @param iso3to1 Lookup from 3-letter ISO 639 code to 2-letter code.
 * @param macrolanguages Lookup from macrolanguage subtag to member subtags.
 * @returns Map from 2-letter BCP 47 subtag to the chosen OpenType tag.
 */
function buildMapping(
  otEntries: OtEntry[],
  iso3to1: Map<string, string>,
  macrolanguages: Map<string, Set<string>>,
): Map<string, string> {
  // Resolve a code through the ISO table; a code that is not in the table is
  // accepted as-is when it is already two characters long.
  const toSubtag = (code: string): string | undefined =>
    iso3to1.get(code) ?? (code.length === 2 ? code : undefined);

  // OT tag -> rank. Lower rank = more specific = preferred.
  const rankByTag = new Map<string, number>();
  // 2-letter code -> candidate OT tags.
  const tagsBySubtag = new Map<string, Set<string>>();

  for (const { tag, isoCodes } of otEntries) {
    const subtags = new Set<string>();
    for (const iso of isoCodes) {
      const subtag = toSubtag(iso);
      if (subtag && subtag.length === 2) {
        subtags.add(subtag);
      }
    }
    // Entries without any 2-letter code take no part in the mapping or ranking.
    if (subtags.size === 0) continue;

    rankByTag.set(tag, 2 * isoCodes.length);

    subtags.forEach((subtag) => {
      let candidates = tagsBySubtag.get(subtag);
      if (candidates === undefined) {
        candidates = new Set<string>();
        tagsBySubtag.set(subtag, candidates);
      }
      candidates.add(tag);
    });
  }

  // Macrolanguage inheritance, only for macrolanguages without a mapping of their own.
  for (const [macro, members] of macrolanguages) {
    const macroSubtag = toSubtag(macro);
    if (macroSubtag === undefined || macroSubtag.length !== 2) continue;
    if (tagsBySubtag.has(macroSubtag)) continue;

    const inherited = new Set<string>();
    for (const member of members) {
      const memberSubtag = toSubtag(member);
      const memberTags = memberSubtag ? tagsBySubtag.get(memberSubtag) : undefined;
      memberTags?.forEach((tag) => inherited.add(tag));
    }
    if (inherited.size > 0) {
      tagsBySubtag.set(macroSubtag, inherited);
    }
  }

  // Lowest rank first; equal ranks fall back to name order.
  const byRankThenName = (a: string, b: string): number => {
    const rankA = rankByTag.get(a) ?? 0;
    const rankB = rankByTag.get(b) ?? 0;
    return rankA !== rankB ? rankA - rankB : a.localeCompare(b);
  };

  const result = new Map<string, string>();
  tagsBySubtag.forEach((tags, subtag) => {
    if (subtag.length !== 2) return;
    result.set(subtag, [...tags].sort(byRankThenName)[0]);
  });

  // Manual overrides always win.
  Object.entries(manualOverrides).forEach(([subtag, otTag]) => {
    result.set(subtag, otTag);
  });

  return result;
}
| 272 | + |
| 273 | +// --------------------------------------------------------------------------- |
| 274 | +// Generate output |
| 275 | +// --------------------------------------------------------------------------- |
| 276 | + |
/**
 * Render the mapping as the source text of the generated TypeScript module.
 *
 * The module contains a `langSysTagMap` object literal (entries sorted by
 * BCP 47 subtag using `localeCompare`) and an exported
 * `languageToOpenTypeTag(language)` lookup function.
 *
 * The emitted lookup function checks that the primary subtag is an OWN
 * property of the map before indexing it. Without that check, inputs such as
 * "constructor" or "toString" would return members inherited from
 * `Object.prototype` instead of `undefined`.
 *
 * @param mapping BCP 47 primary subtag -> OpenType language system tag.
 * @returns The complete file content, ending with a newline.
 */
function generate(mapping: Map<string, string>): string {
  const entries = [...mapping.entries()].sort(([a], [b]) => a.localeCompare(b));

  const header: string[] = [
    '/**',
    ' * Mapping from BCP 47 primary language subtags to OpenType language system tags.',
    ' *',
    ' * Generated from:',
    ' * - OpenType language system tag registry (Microsoft)',
    ' * - IANA BCP 47 subtag registry',
    ' *',
    ' * Source: https://learn.microsoft.com/en-us/typography/opentype/spec/languagetags',
    ' *',
    ' * Do not edit manually. Regenerate with: node scripts/generate-language-tags.ts',
    ' */',
    '',
  ];

  const table: string[] = [
    'const langSysTagMap = {',
    ...entries.map(([bcp47, otTag]) => ` ${bcp47}: '${otTag}',`),
    '} as const;',
    '',
    'type LangKey = keyof typeof langSysTagMap;',
    '',
  ];

  const lookup: string[] = [
    '/**',
    ' * Map a BCP 47 language tag to an OpenType language system tag.',
    ' * Only the primary language subtag (the part before the first hyphen)',
    ' * is used for the lookup.',
    ' * Returns `undefined` for unmapped languages.',
    ' */',
    'export function languageToOpenTypeTag(language: string): string | undefined {',
    " const primary = language.split('-')[0].toLowerCase();",
    // Own-property guard: keeps inherited Object.prototype members out of the result.
    ' if (!Object.prototype.hasOwnProperty.call(langSysTagMap, primary)) return undefined;',
    ' return langSysTagMap[primary as LangKey];',
    '}',
    '',
  ];

  return [...header, ...table, ...lookup].join('\n');
}
0 commit comments