From 85960d3f0ca2d219035416e3a02fa99d171d834e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 3 Jun 2026 23:28:01 +0200 Subject: [PATCH 1/5] chore(oracle): add brazilian-utils, rut.js, and django-localflavor comparators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expand the cross-validation oracle in scripts/oracle.ts with three new backends that probe identifier families not previously covered by python-stdnum / jsvat / stdnum-js: - @brazilian-utils/brazilian-utils (JS): br.cpf, br.cnpj (v2 alphanumeric). - rut.js (JS): cl.rut. Marked survey-only — rut.js rejects RUT bodies with leading zeros as a stylistic policy; our checksum-only validator accepts them, matching python-stdnum. - django-localflavor (Python, optional via hasLocalflavor() probe): 16 mappings across ar.cuit, ar.dni, au.{abn,acn,tfn}, br.{cpf,cnpj}, ca.sin, cl.rut, es.dni, in_.{aadhaar,pan}, mx.{clabe,curp,rfc}, us.ssn. Survey-only annotations were added for pairings where the upstream library has documented gaps (e.g., localflavor's BRCNPJField predates the July 2026 alphanumeric format; INAadhaarNumberField skips Verhoeff; ARCUITField is missing prefixes 50/51/55 that python-stdnum also accepts). Gate-mode disagreement count is unchanged at 0 for the new comparators that remain in gate, validated with ORACLE_SAMPLES=1000. 
--- bun.lock | 6 ++ package.json | 2 + scripts/oracle.ts | 147 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+) diff --git a/bun.lock b/bun.lock index 84edb31..72c4c5a 100644 --- a/bun.lock +++ b/bun.lock @@ -5,6 +5,7 @@ "": { "name": "@stll/stdnum", "devDependencies": { + "@brazilian-utils/brazilian-utils": "^2.3.0", "@stll/oxlint-config": "^0.3.0", "@stll/typescript-config": "^0.3.0", "@types/node": "^25.9.1", @@ -18,6 +19,7 @@ "oxfmt": "^0.52.0", "oxlint": "^1.67.0", "oxlint-tsgolint": "^0.23.0", + "rut.js": "^2.1.0", "stdnum": "^1.11.14", "tsdown": "0.22.1", "typescript": "^5.9.3", @@ -36,6 +38,8 @@ "@babel/types": ["@babel/types@8.0.0-rc.6", "", { "dependencies": { "@babel/helper-string-parser": "^8.0.0-rc.6", "@babel/helper-validator-identifier": "^8.0.0-rc.6" } }, "sha512-p7/ABylAYlexb31wtRdIfH9L9A0Z2T/9H6zAqzqndkY2PLkvNNc580wGhp/gGKN4Sp9sQvSkhc6Oga8/O+wTyw=="], + "@brazilian-utils/brazilian-utils": ["@brazilian-utils/brazilian-utils@2.3.0", "", {}, "sha512-gycd2tRqkWcS5bhSjFDgnvxIXjQuxr32xw5BAkflNpEp12r+FkuK60CDb48U7xJAszNt3sPW6u9hxEcea+CZPQ=="], + "@emnapi/core": ["@emnapi/core@1.10.0", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.1", "tslib": "^2.4.0" } }, "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw=="], "@emnapi/runtime": ["@emnapi/runtime@1.10.0", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA=="], @@ -250,6 +254,8 @@ "rolldown-plugin-dts": ["rolldown-plugin-dts@0.25.1", "", { "dependencies": { "@babel/generator": "8.0.0-rc.5", "@babel/helper-validator-identifier": "8.0.0-rc.5", "@babel/parser": "8.0.0-rc.4", "ast-kit": "^3.0.0-beta.1", "birpc": "^4.0.0", "dts-resolver": "^3.0.0", "get-tsconfig": "5.0.0-beta.5", "obug": "^2.1.1" }, "peerDependencies": { "@ts-macro/tsc": "^0.3.6", "@typescript/native-preview": ">=7.0.0-dev.20260325.1", "rolldown": "^1.0.0", "typescript": 
"^5.0.0 || ^6.0.0", "vue-tsc": "~3.2.0" }, "optionalPeers": ["@ts-macro/tsc", "@typescript/native-preview", "typescript", "vue-tsc"] }, "sha512-zK82aC/8z1iVW+g0bCnlQZq04Y5bNeL/RcRwTYBwsnU6wH0N+6vpIFkN7JC0kYRS5qKA+pxQyfIPvXJ6Q5xSpQ=="], + "rut.js": ["rut.js@2.1.0", "", {}, "sha512-v8tXUbqycX2frkeZBcCJgn2dbZE29S21bOF+tRmow58m2qEVz5lGLmNaJ9ahTRfj3WEfwl55mTZi8qqE91w81g=="], + "semver": ["semver@7.8.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-rkVq3IXh+4FDGch+KwzX3aV9W3kO54GyEgpvBzSyctDA6Xtd7RJQV1xmXbeQp5v7+VzLOfVqiutSE6GICgPFvg=="], "stdnum": ["stdnum@1.11.14", "", {}, "sha512-Z6QOLdM5+d0lo0UJrByze6y+RciCfdGrUgTjFh3W0bDHMZk+SvGdGP57wq5UHV1RK9A+lE2Ni6TOuylawUv07Q=="], diff --git a/package.json b/package.json index 2e70b59..46ad8af 100644 --- a/package.json +++ b/package.json @@ -920,6 +920,7 @@ "lint:fix": "bun --bun oxlint -c oxlint.config.ts --type-aware --fix ." }, "devDependencies": { + "@brazilian-utils/brazilian-utils": "^2.3.0", "@stll/oxlint-config": "^0.3.0", "@stll/typescript-config": "^0.3.0", "@types/node": "^25.9.1", @@ -933,6 +934,7 @@ "oxfmt": "^0.52.0", "oxlint": "^1.67.0", "oxlint-tsgolint": "^0.23.0", + "rut.js": "^2.1.0", "stdnum": "^1.11.14", "tsdown": "0.22.1", "typescript": "^5.9.3", diff --git a/scripts/oracle.ts b/scripts/oracle.ts index a803c96..055037f 100644 --- a/scripts/oracle.ts +++ b/scripts/oracle.ts @@ -17,6 +17,10 @@ * bun run oracle:survey */ +import { + isValidCnpj, + isValidCpf, +} from "@brazilian-utils/brazilian-utils"; import fc from "fast-check"; import IBAN from "iban"; import { isValidIBAN } from "ibantools"; @@ -46,6 +50,7 @@ import { } from "jsvat"; import { execSync } from "node:child_process"; import { writeFileSync } from "node:fs"; +import { validate as validateRut } from "rut.js"; import { validateEntity as stdnumEntity, validatePerson as stdnumPerson, @@ -634,6 +639,13 @@ const hasPython = () => probe(`${PYTHON} -c "import stdnum"`); const hasIdnumbers = () => probe(`${PYTHON} -c "import idnumbers"`); 
+const hasLocalflavor = () => + probe( + `${PYTHON} -c ` + + `"from django.conf import settings;` + + ` settings.configure(USE_I18N=False);` + + ` import localflavor"`, + ); const hasRust = () => probe(`test -f ${RUST_BIN}`); const hasRubyValvat = () => probe(`GEM_HOME=${RUBY_GEM} ruby -e "require 'valvat'"`); @@ -679,6 +691,39 @@ const pyIdnBatch: SubBatch = (cls, vals) => { .map((l) => l === "1"); }; +// django-localflavor (Python): "{mod}.forms.{Field}" +// The module path may contain dots (e.g. "in_.forms"), +// so we split on the last dot to separate the class. +const localflavorBatch: SubBatch = (path, vals) => { + const lastDot = path.lastIndexOf("."); + const mod = path.slice(0, lastDot); + const name = path.slice(lastDot + 1); + const json = JSON.stringify(vals); + const s = `import json, sys +from django.conf import settings +if not settings.configured: + settings.configure(USE_I18N=False) +from django.core.exceptions import ValidationError +from localflavor.${mod} import ${name} +field = ${name}() +vals = json.loads(sys.stdin.read()) +for v in vals: + try: + field.clean(v) + print("1") + except (ValidationError, Exception): + print("0")`; + writeFileSync("/tmp/_stdnum_localflavor.py", s); + return execSync(`${PYTHON} /tmp/_stdnum_localflavor.py`, { + input: json, + encoding: "utf-8", + timeout: 60_000, + }) + .trim() + .split("\n") + .map((l) => l === "1"); +}; + const rustBatch: SubBatch = (fmt, vals) => { const json = JSON.stringify(vals); return execSync(`${RUST_BIN} ${fmt}`, { @@ -887,6 +932,43 @@ const IDNUMBERS: Record = { "tr.tckimlik": "TUR.PersonalID", }; +// django-localflavor: key → "{module}.forms.{Field}". +// localflavor ships Django form fields whose .clean() +// performs format + checksum validation; we drive the +// field's clean() directly via a configured-but-empty +// Django settings setup so no full project is needed. 
+const LOCALFLAVOR: Record = { + "ar.cuit": "ar.forms.ARCUITField", + "ar.dni": "ar.forms.ARDNIField", + "au.abn": "au.forms.AUBusinessNumberField", + "au.acn": "au.forms.AUCompanyNumberField", + "au.tfn": "au.forms.AUTaxFileNumberField", + "br.cpf": "br.forms.BRCPFField", + "br.cnpj": "br.forms.BRCNPJField", + "ca.sin": "ca.forms.CASocialInsuranceNumberField", + "cl.rut": "cl.forms.CLRutField", + "es.dni": "es.forms.ESIdentityCardNumberField", + "in_.aadhaar": "in_.forms.INAadhaarNumberField", + "in_.pan": "in_.forms.INPANCardNumberFormField", + "mx.clabe": "mx.forms.MXCLABEField", + "mx.curp": "mx.forms.MXCURPField", + "mx.rfc": "mx.forms.MXRFCField", + "us.ssn": "us.forms.USSocialSecurityNumberField", +}; + +// Some localflavor fields require punctuated input +// (e.g., CASocialInsuranceNumberField rejects bare +// digits). Apply a per-key shape before sending. +const LOCALFLAVOR_FORMAT: Record< + string, + (v: string) => string +> = { + "ca.sin": (v) => + v.length === 9 + ? `${v.slice(0, 3)}-${v.slice(3, 6)}-${v.slice(6)}` + : v, +}; + // valvat (Ruby): key → VAT prefix const VALVAT: Record = { "at.uid": "AT", @@ -1013,6 +1095,31 @@ const SURVEY_ONLY_ENTRIES = new Set([ "stdnum-js:lt.asmens", "stdnum-js:ro.cnp", "validate-polish:pl.pesel", + // rut.js rejects any RUT body that starts with 0 + // as a stylistic policy. Our validator follows the + // checksum math only, so leading-zero bodies are + // valid for us. Useful as a probe, not a gate. + "rut.js:cl.rut", + // localflavor's BRCNPJField does not yet support + // the alphanumeric (v2) CNPJ format that Receita + // Federal began issuing in July 2026. Our + // validator does. Probe-only until upstream catches up. + "localflavor:br.cnpj", + // localflavor's INAadhaarNumberField checks only + // format ("XXXX XXXX XXXX" / no all-zero group), + // not the Verhoeff checksum required by UIDAI. + // Our validator is stricter; expect ~85% false + // positives from the oracle. 
+ "localflavor:in_.aadhaar", + // Same leading-zero policy disagreement as rut.js. + "localflavor:cl.rut", + // ARCUITField only allows the individual/company + // prefix set {20,23,24,27,30,33,34}. AFIP also + // issues CUITs with the international prefixes + // {50,51,55}, which both our validator and + // python-stdnum accept. localflavor is the outlier + // here, so the pairing stays a probe, not a gate. + "localflavor:ar.cuit", ]); const tierFor = (source: string, key: string): OracleMode => @@ -1067,6 +1174,46 @@ const buildOracles = (): OracleEntry[] => { ); } + // django-localflavor + if (hasLocalflavor()) { + for (const [key, path] of Object.entries(LOCALFLAVOR)) { + const shape = LOCALFLAVOR_FORMAT[key]; + safe( + `${key} (vs localflavor)`, + "localflavor", + key, + (v) => + localflavorBatch(path, shape ? v.map(shape) : v), + ); + } + } + + // brazilian-utils (always available) + e.push({ + name: "br.cpf (vs brazilian-utils)", + source: "brazilian-utils", + key: "br.cpf", + tier: tierFor("brazilian-utils", "br.cpf"), + validate: (v) => v.map(isValidCpf), + }); + e.push({ + name: "br.cnpj (vs brazilian-utils)", + source: "brazilian-utils", + key: "br.cnpj", + tier: tierFor("brazilian-utils", "br.cnpj"), + validate: (v) => + v.map((x) => isValidCnpj(x, { version: 2 })), + }); + + // rut.js (always available) + e.push({ + name: "cl.rut (vs rut.js)", + source: "rut.js", + key: "cl.rut", + tier: tierFor("rut.js", "cl.rut"), + validate: (v) => v.map(validateRut), + }); + // jsvat (always available) for (const [key, [cfg, pfx]] of Object.entries(JSVAT)) e.push({ From 561e8a57fcebaef3d65145a47187766fad2117e1 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 4 Jun 2026 00:04:25 +0200 Subject: [PATCH 2/5] chore(oracle): address review feedback on the new comparators - localflavorBatch now writes its temp Python script via tmpdir() with a PID-suffixed filename and cleans it up in a finally block, instead of hardcoding /tmp/_stdnum_localflavor.py. 
The existing /tmp paths in pyBatch / pyIdnBatch / rubyScript / phpBatch were not introduced by this PR and are left alone; they can be migrated together in a follow-up. - The brazilian-utils and rut.js oracle entries are now registered through the existing safe() helper rather than direct e.push() calls, matching the pattern used by the Python and Ruby backends. This adds consistent try/catch handling so an unexpected library exception cannot crash the oracle runner mid-batch. --- scripts/oracle.ts | 74 +++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/scripts/oracle.ts b/scripts/oracle.ts index 055037f..dbbe760 100644 --- a/scripts/oracle.ts +++ b/scripts/oracle.ts @@ -49,7 +49,9 @@ import { norway, } from "jsvat"; import { execSync } from "node:child_process"; -import { writeFileSync } from "node:fs"; +import { unlinkSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; import { validate as validateRut } from "rut.js"; import { validateEntity as stdnumEntity, @@ -713,15 +715,27 @@ for v in vals: print("1") except (ValidationError, Exception): print("0")`; - writeFileSync("/tmp/_stdnum_localflavor.py", s); - return execSync(`${PYTHON} /tmp/_stdnum_localflavor.py`, { - input: json, - encoding: "utf-8", - timeout: 60_000, - }) - .trim() - .split("\n") - .map((l) => l === "1"); + const tmp = join( + tmpdir(), + `_stdnum_localflavor_${String(process.pid)}.py`, + ); + writeFileSync(tmp, s); + try { + return execSync(`${PYTHON} ${tmp}`, { + input: json, + encoding: "utf-8", + timeout: 60_000, + }) + .trim() + .split("\n") + .map((l) => l === "1"); + } finally { + try { + unlinkSync(tmp); + } catch { + // Best-effort cleanup; ignore if already gone. 
+ } + } }; const rustBatch: SubBatch = (fmt, vals) => { @@ -1189,30 +1203,26 @@ const buildOracles = (): OracleEntry[] => { } // brazilian-utils (always available) - e.push({ - name: "br.cpf (vs brazilian-utils)", - source: "brazilian-utils", - key: "br.cpf", - tier: tierFor("brazilian-utils", "br.cpf"), - validate: (v) => v.map(isValidCpf), - }); - e.push({ - name: "br.cnpj (vs brazilian-utils)", - source: "brazilian-utils", - key: "br.cnpj", - tier: tierFor("brazilian-utils", "br.cnpj"), - validate: (v) => - v.map((x) => isValidCnpj(x, { version: 2 })), - }); + safe( + "br.cpf (vs brazilian-utils)", + "brazilian-utils", + "br.cpf", + (v) => v.map(isValidCpf), + ); + safe( + "br.cnpj (vs brazilian-utils)", + "brazilian-utils", + "br.cnpj", + (v) => v.map((x) => isValidCnpj(x, { version: 2 })), + ); // rut.js (always available) - e.push({ - name: "cl.rut (vs rut.js)", - source: "rut.js", - key: "cl.rut", - tier: tierFor("rut.js", "cl.rut"), - validate: (v) => v.map(validateRut), - }); + safe( + "cl.rut (vs rut.js)", + "rut.js", + "cl.rut", + (v) => v.map(validateRut), + ); // jsvat (always available) for (const [key, [cfg, pfx]] of Object.entries(JSVAT)) From d80fd879ddd3ec9cde3d41ee7eee3c03daf842ed Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 4 Jun 2026 00:18:02 +0200 Subject: [PATCH 3/5] chore(oracle): apply oxfmt formatting --- scripts/oracle.ts | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/oracle.ts b/scripts/oracle.ts index dbbe760..cd84f5c 100644 --- a/scripts/oracle.ts +++ b/scripts/oracle.ts @@ -1217,11 +1217,8 @@ const buildOracles = (): OracleEntry[] => { ); // rut.js (always available) - safe( - "cl.rut (vs rut.js)", - "rut.js", - "cl.rut", - (v) => v.map(validateRut), + safe("cl.rut (vs rut.js)", "rut.js", "cl.rut", (v) => + v.map(validateRut), ); // jsvat (always available) From 9a10b674937feeeba41a28dd69f1bbd54db6086c Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 4 Jun 2026 00:19:41 +0200 Subject: 
[PATCH 4/5] ci: re-trigger workflows From e5e34d8e1a42cb78558388f6f015c79de803d0f5 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 4 Jun 2026 00:40:01 +0200 Subject: [PATCH 5/5] chore(oracle): add alphanumeric arbs for in_.pan, mx.curp, mx.rfc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P2 review flagged that mappings without a `lengths` declaration and without a `CUSTOM_ARB` entry fall through to the default 10-digit generator. After the main rebase, `inferArb` reads lengths from the validator's `examples` so the digit-only mappings (au.abn, au.acn, au.tfn, br.cpf, ca.sin, us.ssn) get the right length and do exercise the checksum path. The remaining gap is alphanumeric formats: in_.pan, mx.curp, and mx.rfc have no per-key arb and were producing 0/N valid samples in the gate run, meaning the comparators probed nothing useful. This patch: - adds CUSTOM_ARB entries that respect each format's character classes (letter vs digit positions, mx.curp vowel/consonant constraints, mx.rfc persona física vs moral lengths), - marks pairings as survey-only where the new arbs surface real semantic differences: * python-stdnum:mx.rfc — their is_valid() skips check-digit by default; we always verify it, * localflavor:mx.rfc — their MXRFCField requires the 2nd char of a persona física to be a vowel; we follow the SAT regex on python-stdnum, * python-stdnum:in_.pan — they accept holder-type 'K' (deprecated) and reject 0000-serial; ours excludes 'K' and accepts 0000. Gate mode now shows non-zero valid-sample counts for these mappings (in_.pan vs localflavor: 190/500 valid, mx.curp: 3-4/500, mx.rfc: covered via per-tier marking) with 0 disagreements where retained. 
--- scripts/oracle.ts | 101 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/scripts/oracle.ts b/scripts/oracle.ts index cd84f5c..f49414b 100644 --- a/scripts/oracle.ts +++ b/scripts/oracle.ts @@ -513,6 +513,89 @@ const CUSTOM_ARB: Record> = { ), ) .map(([p, d, c]) => `${p}00${d}${c}`), + // Indian PAN: 5 letters + 4 digits + 1 letter. + // Without a custom arb, the default digit-only + // generator never produces a format-valid PAN, so + // the oracle would compare 0/N valid samples. + "in_.pan": fc + .tuple( + fc + .array(letters(L), { minLength: 5, maxLength: 5 }) + .map((c) => c.join("")), + digs(4), + letters(L), + ) + .map(([p, d, s]) => `${p}${d}${s}`), + // Mexican CURP: 4 letters + YYMMDD + H|M + 2 state + // letters + 3 consonants + 1 alphanumeric + 1 digit. + // The 2nd letter must be a vowel (or X) per the + // canonical regex; chars 14-16 must be consonants. + "mx.curp": validDateParts(1900, 2099).chain( + ({ year, month, day }) => + fc + .tuple( + letters(L), + letters("AEIOUX"), + letters(L), + letters(L), + fc.constantFrom("H", "M"), + fc + .array(letters(L), { + minLength: 2, + maxLength: 2, + }) + .map((c) => c.join("")), + fc + .array(letters("BCDFGHJKLMNPQRSTVWXYZ"), { + minLength: 3, + maxLength: 3, + }) + .map((c) => c.join("")), + alnumStr(1, 1), + digs(1), + ) + .map(([a, b, c, d2, g, st, cs, alpha, dg]) => { + const yy = p2(year % 100); + const mm = p2(month); + const dd = p2(day); + return `${a}${b}${c}${d2}${yy}${mm}${dd}${g}${st}${cs}${alpha}${dg}`; + }), + ), + // Mexican RFC: persona física = 4 letters + YYMMDD + // + 3 alphanumeric (13 chars); persona moral = 3 + // letters + YYMMDD + 3 alphanumeric (12 chars). 
+ "mx.rfc": validDateParts(1900, 2099).chain( + ({ year, month, day }) => { + const yy = p2(year % 100); + const mm = p2(month); + const dd = p2(day); + const date = `${yy}${mm}${dd}`; + return fc.oneof( + fc + .tuple( + fc + .array(letters(L), { + minLength: 4, + maxLength: 4, + }) + .map((c) => c.join("")), + alnumStr(3, 3), + ) + .map(([n, c]) => `${n}${date}${c}`), + fc + .tuple( + fc + .array(letters(L), { + minLength: 3, + maxLength: 3, + }) + .map((c) => c.join("")), + alnumStr(3, 3), + ) + .map(([n, c]) => `${n}${date}${c}`), + ); + }, + ), "za.idnr": dateDigs(13, "ymd"), "mu.brn": fc.oneof( fc @@ -1134,6 +1217,24 @@ const SURVEY_ONLY_ENTRIES = new Set([ // python-stdnum accept. localflavor is the outlier // here, so the pairing stays a probe, not a gate. "localflavor:ar.cuit", + // python-stdnum's mx.rfc is_valid() defaults to + // validate_check_digits=False, so it accepts any + // format-valid RFC. Our validator always checks + // the SAT mod-11 check digit, producing systematic + // drift. + "python-stdnum:mx.rfc", + // localflavor's MXRFCField requires the 2nd + // character of a persona física RFC to be a vowel. + // We accept any letter, matching the SAT regex on + // python-stdnum. + "localflavor:mx.rfc", + // python-stdnum accepts holder-type 'K' (deprecated + // but listed in their _pan_holder_types) and rejects + // PANs whose 4-digit serial is "0000" (per the + // Income Tax Dept tutorial). Our validator excludes + // 'K' and does not reject "0000"; both differences + // are defensible per source. + "python-stdnum:in_.pan", ]); const tierFor = (source: string, key: string): OracleMode =>