From dd1677d06bcea43366048f73776abfbcbe3fa835 Mon Sep 17 00:00:00 2001 From: alltheseas Date: Wed, 1 Apr 2026 14:22:43 -0500 Subject: [PATCH 1/4] feat: add hex_alternation and base64_2pad native ops (closes #37) Add two new PatternCheck ops for MIP-00/MIP-05 schema patterns that previously fell back to regex: - hex_alternation: multi-length hex string (e.g. 64|96|128 chars) - base64_2pad: strict base64 with mandatory == padding Both ops are implemented in all 11 non-TS emitters with matching helper functions. Fuzz-equivalence tests verify correctness against the actual regex patterns. Co-Authored-By: Claude Opus 4.6 --- src/classify-pattern.ts | 21 ++++++++++++++ src/emit-c.ts | 29 +++++++++++++++++++ src/emit-cpp.ts | 26 +++++++++++++++++ src/emit-csharp.ts | 30 ++++++++++++++++++++ src/emit-dart.ts | 29 ++++++++++++++++++- src/emit-go.ts | 33 ++++++++++++++++++++++ src/emit-java.ts | 28 +++++++++++++++++++ src/emit-kotlin.ts | 27 ++++++++++++++++++ src/emit-php.ts | 27 ++++++++++++++++++ src/emit-python.ts | 28 +++++++++++++++++++ src/emit-ruby.ts | 26 +++++++++++++++++ src/emit-rust.ts | 28 +++++++++++++++++++ src/emit-swift.ts | 28 +++++++++++++++++++ tests/classify-pattern.test.ts | 25 +++++++++++++++++ tests/fuzz-equivalence.test.ts | 51 ++++++++++++++++++++++++++++++++++ 15 files changed, 435 insertions(+), 1 deletion(-) diff --git a/src/classify-pattern.ts b/src/classify-pattern.ts index 504dd98..678764e 100644 --- a/src/classify-pattern.ts +++ b/src/classify-pattern.ts @@ -66,6 +66,8 @@ export type PatternCheck = | { op: 'space_separated_tokens' } | { op: 'starts_with_charset'; charset: string } | { op: 'base64' } + | { op: 'hex_alternation'; lengths: number[]; case: 'lower' | 'mixed' } + | { op: 'base64_2pad' } | { op: 'nostr_uri' } | { op: 'nip04_encrypted' } | { op: 'nip05_identifier' } @@ -102,6 +104,18 @@ export function classifyRegex(pattern: string): PatternCheck { } } + // Multi-length hex alternation: ^(?:[a-f0-9]{64}|[a-f0-9]{96}|[a-f0-9]{128})$ + { + const m = pattern.match( + /^\^\(\?:(\[(?:a-f0-9|a-fA-F0-9|0-9a-f|0-9a-fA-F)\]\{\d+\}(?:\|\[(?:a-f0-9|a-fA-F0-9|0-9a-f|0-9a-fA-F)\]\{\d+\})+)\)\$$/ + ); + if (m) { + const isLower = !m[1].includes('A-F'); + const lengths = [...m[1].matchAll(/\{(\d+)\}/g)].map(mm => parseInt(mm[1], 10)); + return { op: 'hex_alternation', lengths, case: isLower ? 'lower' : 'mixed' }; + } + } + // Range-length hex: ^[a-f0-9]{7,40}$ { const m = pattern.match(/^\^\[(?:a-f0-9|a-fA-F0-9|0-9a-f|0-9a-fA-F)\]\{(\d+),(\d+)\}\$$/); @@ -409,6 +423,11 @@ export function classifyRegex(pattern: string): PatternCheck { return { op: 'base64' }; } + // Base64 strict 2-pad: ^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==)$ + if (pattern === '^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==)$') { + return { op: 'base64_2pad' }; + } + // Nostr URI: ^nostr:((npub|note)1[02-9ac-hj-np-z]{58}|(nprofile|nevent|naddr)1[02-9ac-hj-np-z]+)$ if (pattern === '^nostr:((npub|note)1[02-9ac-hj-np-z]{58}|(nprofile|nevent|naddr)1[02-9ac-hj-np-z]+)$') { return { op: 'nostr_uri' }; @@ -639,6 +658,8 @@ export function isNativeCheck(check: PatternCheck): boolean { case 'space_separated_tokens': case 'starts_with_charset': case 'base64': + case 'hex_alternation': + case 'base64_2pad': case 'nostr_uri': case 'nip04_encrypted': case 'nip05_identifier': diff --git a/src/emit-c.ts b/src/emit-c.ts index c3b860f..07f93ec 100644 --- a/src/emit-c.ts +++ b/src/emit-c.ts @@ -307,6 +307,19 @@ function renderPatternCheckC(check: PatternCheck, varExpr: string): { expr: stri helpers.add('schemata_check_base64'); return { expr: `schemata_check_base64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `schemata_check_hex${len}` : `schemata_check_hex${len}_mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('schemata_check_base64_2pad'); + helpers.add('schemata_check_base64'); // for schemata_is_b64_char + return { expr: `schemata_check_base64_2pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('schemata_check_nostr_uri'); return { expr: `schemata_check_nostr_uri(${varExpr})`, helpers }; @@ -1496,6 +1509,22 @@ function emitHelperFunctions(helpers: Set): string { lines.push(''); } + if (helpers.has('schemata_check_base64_2pad')) { + lines.push('/* strict base64 with mandatory 2-char padding */'); + lines.push('static int schemata_check_base64_2pad(const char *s) {'); + lines.push(' if (!s) return 0;'); + lines.push(' size_t len = strlen(s);'); + lines.push(' if (len < 4 || len % 4 != 0) return 0;'); + lines.push(" if (s[len - 1] != '=' || s[len - 2] != '=') return 0;"); + lines.push(' size_t i;'); + lines.push(' for (i = 0; i < len - 2; i++) {'); + lines.push(' if (!schemata_is_b64_char(s[i])) return 0;'); + lines.push(' }'); + lines.push(' return 1;'); + lines.push('}'); + lines.push(''); + } + if (helpers.has('schemata_check_nostr_uri')) { lines.push('static int schemata_is_bech32_data_char(char c) {'); lines.push(" return (c >= '0' && c <= '9' && c != '1') || (c >= 'a' && c <= 'z' && c != 'b' && c != 'i' && c != 'o');"); diff --git a/src/emit-cpp.ts b/src/emit-cpp.ts index de7b3dd..fc2542f 100644 --- a/src/emit-cpp.ts +++ b/src/emit-cpp.ts @@ -202,6 +202,19 @@ function renderPatternCheckCpp(check: PatternCheck, varExpr: string): { expr: st helpers.add('check_base64'); return { expr: `check_base64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `check_hex${len}` : `check_hex${len}_mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('check_base64_2pad'); + helpers.add('check_base64'); // for is_b64_char + return { expr: `check_base64_2pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('check_nostr_uri'); return { expr: `check_nostr_uri(${varExpr})`, helpers }; @@ -1136,6 +1149,19 @@ function emitCppHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('check_base64_2pad')) { + lines.push('/* strict base64 with mandatory 2-char padding */'); + lines.push('inline bool check_base64_2pad(const std::string& s) {'); + lines.push(' if (s.size() < 4 || s.size() % 4 != 0) return false;'); + lines.push(" if (s[s.size() - 1] != '=' || s[s.size() - 2] != '=') return false;"); + lines.push(' for (size_t i = 0; i < s.size() - 2; i++) {'); + lines.push(' if (!is_b64_char(s[i])) return false;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push('}'); + lines.push(''); + } + if (helpers.has('check_nostr_uri')) { lines.push('inline bool is_bech32_data_char(char c) {'); lines.push(" return (c >= '0' && c <= '9' && c != '1') || (c >= 'a' && c <= 'z' && c != 'b' && c != 'i' && c != 'o');"); diff --git a/src/emit-csharp.ts b/src/emit-csharp.ts index ad450e1..8f02cea 100644 --- a/src/emit-csharp.ts +++ b/src/emit-csharp.ts @@ -199,6 +199,19 @@ function renderPatternCheckCSharp(check: PatternCheck, varExpr: string): { expr: helpers.add('IsB64Char'); return { expr: `CheckBase64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `CheckHex${len}` : `CheckHex${len}Mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('CheckBase642Pad'); + helpers.add('IsB64Char'); + return { expr: `CheckBase642Pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('CheckNostrUri'); helpers.add('IsBech32Char'); @@ -1145,6 +1158,23 @@ function emitCSharpHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('CheckBase642Pad')) { + lines.push(' /* strict base64 with mandatory 2-char padding */'); + lines.push(' private static bool CheckBase642Pad(string s)'); + lines.push(' {'); + lines.push(' if (s == null) return false;'); + lines.push(' int len = s.Length;'); + lines.push(' if (len < 4 || len % 4 != 0) return false;'); + lines.push(" if (s[len - 1] != '=' || s[len - 2] != '=') return false;"); + lines.push(' for (int i = 0; i < len - 2; i++)'); + lines.push(' {'); + lines.push(' if (!IsB64Char(s[i])) return false;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push(' }'); + lines.push(''); + } + if (helpers.has('CheckNostrUri')) { lines.push(' /* ^nostr:((npub|note)1[bech32]{58}|(nprofile|nevent|naddr)1[bech32]+)$ */'); lines.push(' private static bool CheckNostrUri(string s)'); diff --git a/src/emit-dart.ts b/src/emit-dart.ts index eeda2e4..c61aeca 100644 --- a/src/emit-dart.ts +++ b/src/emit-dart.ts @@ -195,6 +195,19 @@ function renderPatternCheckDart(check: PatternCheck, varExpr: string): { expr: s helpers.add('_checkBase64'); return { expr: `_checkBase64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `_checkHex${len}` : `_checkHex${len}Mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('_checkBase642Pad'); + helpers.add('_isB64Char'); + return { expr: `_checkBase642Pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('_checkNostrUri'); helpers.add('_isBech32Char'); @@ -1102,7 +1115,7 @@ function emitDartHelpers(helpers: Set): string { lines.push(''); } - if (helpers.has('_isB64Char') || helpers.has('_checkBase64') || helpers.has('_checkNip04Encrypted')) { + if (helpers.has('_isB64Char') || helpers.has('_checkBase64') || helpers.has('_checkNip04Encrypted') || helpers.has('_checkBase642Pad')) { lines.push('bool _isB64Char(int c) {'); lines.push(' return (c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57) || c == 43 || c == 47;'); lines.push('}'); @@ -1134,6 +1147,20 @@ function emitDartHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('_checkBase642Pad')) { + lines.push('// strict base64 with mandatory 2-char padding'); + lines.push('bool _checkBase642Pad(String s) {'); + lines.push(' final l = s.length;'); + lines.push(' if (l < 4 || l % 4 != 0) return false;'); + lines.push(" if (s.codeUnitAt(l - 1) != 61 || s.codeUnitAt(l - 2) != 61) return false; // '='"); + lines.push(' for (var i = 0; i < l - 2; i++) {'); + lines.push(' if (!_isB64Char(s.codeUnitAt(i))) return false;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push('}'); + lines.push(''); + } + if (helpers.has('_checkNostrUri')) { lines.push('bool _checkNostrUri(String s) {'); lines.push(" if (!s.startsWith('nostr:')) return false;"); diff --git a/src/emit-go.ts b/src/emit-go.ts index 68c199c..afd8eb6 100644 --- a/src/emit-go.ts +++ b/src/emit-go.ts @@ -212,6 +212,19 @@ function renderPatternCheckGo(check: PatternCheck, varExpr: string): { expr: str helpers.add('checkBase64'); return { expr: `checkBase64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `checkHex${len}` : `checkHex${len}Mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('checkBase642Pad'); + helpers.add('checkBase64'); // for isB64Char + return { expr: `checkBase642Pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('checkNostrUri'); helpers.add('strings'); @@ -1654,6 +1667,26 @@ function emitGoHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('checkBase642Pad')) { + lines.push('// strict base64 with mandatory 2-char padding'); + lines.push('func checkBase642Pad(s string) bool {'); + lines.push('\tl := len(s)'); + lines.push('\tif l < 4 || l%4 != 0 {'); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push("\tif s[l-1] != '=' || s[l-2] != '=' {"); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push('\tfor i := 0; i < l-2; i++ {'); + lines.push('\t\tif !isB64Char(s[i]) {'); + lines.push('\t\t\treturn false'); + lines.push('\t\t}'); + lines.push('\t}'); + lines.push('\treturn true'); + lines.push('}'); + lines.push(''); + } + // checkNostrUri: ^nostr:((npub|note)1[bech32]{58}|(nprofile|nevent|naddr)1[bech32]+)$ if (helpers.has('checkNostrUri')) { lines.push('func isNostrBech32DataChar(b byte) bool {'); diff --git a/src/emit-java.ts b/src/emit-java.ts index d23cb12..18c87c1 100644 --- a/src/emit-java.ts +++ b/src/emit-java.ts @@ -202,6 +202,19 @@ function renderPatternCheckJava(check: PatternCheck, varExpr: string): { expr: s helpers.add('checkBase64'); return { expr: `checkBase64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `checkHex${len}` : `checkHex${len}Mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('checkBase642Pad'); + helpers.add('checkBase64'); // for isB64Char + return { expr: `checkBase642Pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('checkNostrUri'); helpers.add('checkBech32'); // triggers isBech32Char @@ -1166,6 +1179,21 @@ function emitJavaHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('checkBase642Pad')) { + lines.push(' /* strict base64 with mandatory 2-char padding */'); + lines.push(' private static boolean checkBase642Pad(String s) {'); + lines.push(' if (s == null) return false;'); + lines.push(' int len = s.length();'); + lines.push(' if (len < 4 || len % 4 != 0) return false;'); + lines.push(" if (s.charAt(len - 1) != '=' || s.charAt(len - 2) != '=') return false;"); + lines.push(' for (int i = 0; i < len - 2; i++) {'); + lines.push(' if (!isB64Char(s.charAt(i))) return false;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push(' }'); + lines.push(''); + } + if (helpers.has('checkNostrUri')) { lines.push(' /* ^nostr:((npub|note)1[bech32]{58}|(nprofile|nevent|naddr)1[bech32]+)$ */'); lines.push(' private static boolean checkNostrUri(String s) {'); diff --git a/src/emit-kotlin.ts b/src/emit-kotlin.ts index 8d3ac4a..6ffaf17 100644 --- a/src/emit-kotlin.ts +++ b/src/emit-kotlin.ts @@ -197,6 +197,19 @@ function renderPatternCheckKotlin(check: PatternCheck, varExpr: string): { expr: helpers.add('checkBase64'); return { expr: `checkBase64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `checkHex${len}` : `checkHex${len}Mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('checkBase642Pad'); + helpers.add('checkBase64'); // for isB64Char + return { expr: `checkBase642Pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('checkNostrUri'); helpers.add('checkBech32'); // triggers isBech32Char @@ -1055,6 +1068,20 @@ function emitKotlinHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('checkBase642Pad')) { + lines.push('/* strict base64 with mandatory 2-char padding */'); + lines.push('private fun checkBase642Pad(s: String): Boolean {'); + lines.push(' val len = s.length'); + lines.push(' if (len < 4 || len % 4 != 0) return false'); + lines.push(" if (s[len - 1] != '=' || s[len - 2] != '=') return false"); + lines.push(' for (i in 0 until len - 2) {'); + lines.push(' if (!isB64Char(s[i])) return false'); + lines.push(' }'); + lines.push(' return true'); + lines.push('}'); + lines.push(''); + } + if (helpers.has('checkNostrUri')) { lines.push('/* ^nostr:((npub|note)1[bech32]{58}|(nprofile|nevent|naddr)1[bech32]+)$ */'); lines.push('private fun checkNostrUri(s: String): Boolean {'); diff --git a/src/emit-php.ts b/src/emit-php.ts index 5d66ab7..80cbe09 100644 --- a/src/emit-php.ts +++ b/src/emit-php.ts @@ -202,6 +202,19 @@ function renderPatternCheckPhp(check: PatternCheck, varExpr: string): { expr: st helpers.add('schemata_check_base64'); return { expr: `schemata_check_base64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `schemata_check_hex${len}` : `schemata_check_hex${len}_mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('schemata_check_base64_2pad'); + helpers.add('schemata_check_base64'); // for schemata_is_b64_char + return { expr: `schemata_check_base64_2pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('schemata_check_nostr_uri'); return { expr: `schemata_check_nostr_uri(${varExpr})`, helpers }; @@ -1250,6 +1263,20 @@ function emitPhpHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('schemata_check_base64_2pad')) { + lines.push('/* strict base64 with mandatory 2-char padding */'); + lines.push('function schemata_check_base64_2pad(string $s): bool {'); + lines.push(' $n = strlen($s);'); + lines.push(' if ($n < 4 || $n % 4 !== 0) { return false; }'); + lines.push(" if ($s[$n - 1] !== '=' || $s[$n - 2] !== '=') { return false; }"); + lines.push(' for ($i = 0; $i < $n - 2; $i++) {'); + lines.push(' if (!schemata_is_b64_char($s[$i])) { return false; }'); + lines.push(' }'); + lines.push(' return true;'); + lines.push('}'); + lines.push(''); + } + if (helpers.has('schemata_check_nostr_uri')) { lines.push('function schemata_is_bech32_data_char(string $c): bool {'); lines.push(" return ($c >= '0' && $c <= '9' && $c !== '1') || ($c >= 'a' && $c <= 'z' && $c !== 'b' && $c !== 'i' && $c !== 'o');"); diff --git a/src/emit-python.ts b/src/emit-python.ts index 9943fc4..cb2b730 100644 --- a/src/emit-python.ts +++ b/src/emit-python.ts @@ -198,6 +198,19 @@ function renderPatternCheckPython(check: PatternCheck, varExpr: string): { expr: helpers.add('_check_base64'); return { expr: `_check_base64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `_check_hex${len}` : `_check_hex${len}_mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' or ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('_check_base64_2pad'); + helpers.add('_check_base64'); // for _is_b64_char + return { expr: `_check_base64_2pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('_check_nostr_uri'); return { expr: `_check_nostr_uri(${varExpr})`, helpers }; @@ -1218,6 +1231,21 @@ function emitPythonHelpers(helpers: Set): string { lines.push(' return True'); } + if (helpers.has('_check_base64_2pad')) { + if (lines.length > 0) lines.push(''); + lines.push(''); + lines.push('def _check_base64_2pad(s: str) -> bool:'); + lines.push(' n = len(s)'); + lines.push(' if n < 4 or n % 4 != 0:'); + lines.push(' return False'); + lines.push(" if s[n - 1] != '=' or s[n - 2] != '=':"); + lines.push(' return False'); + lines.push(' for i in range(n - 2):'); + lines.push(' if not _is_b64_char(s[i]):'); + lines.push(' return False'); + lines.push(' return True'); + } + if (helpers.has('_check_nostr_uri')) { if (lines.length > 0) lines.push(''); lines.push(''); diff --git a/src/emit-ruby.ts b/src/emit-ruby.ts index 1393a73..2d3ed65 100644 --- a/src/emit-ruby.ts +++ b/src/emit-ruby.ts @@ -202,6 +202,19 @@ function renderPatternCheckRuby(check: PatternCheck, varExpr: string): { expr: s helpers.add('check_base64'); return { expr: `check_base64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `check_hex${len}` : `check_hex${len}_mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('check_base64_2pad'); + helpers.add('check_base64'); // for is_b64_char + return { expr: `check_base64_2pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('check_nostr_uri'); return { expr: `check_nostr_uri(${varExpr})`, helpers }; @@ -1124,6 +1137,19 @@ function emitRubyHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('check_base64_2pad')) { + lines.push(' # strict base64 with mandatory 2-char padding'); + lines.push(' def self.check_base64_2pad(s)'); + lines.push(' return false unless s.is_a?(String)'); + lines.push(' n = s.length'); + lines.push(' return false if n < 4 || n % 4 != 0'); + lines.push(" return false unless s[n - 1] == '=' && s[n - 2] == '='"); + lines.push(' (0...n - 2).each { |i| return false unless is_b64_char(s[i]) }'); + lines.push(' true'); + lines.push(' end'); + lines.push(''); + } + if (helpers.has('check_nostr_uri')) { lines.push(' BECH32_DATA_CHARS = Set.new("023456789acdefghjklmnpqrstuvwxyz".chars).freeze'); lines.push(''); diff --git a/src/emit-rust.ts b/src/emit-rust.ts index f6c3054..1d015c3 100644 --- a/src/emit-rust.ts +++ b/src/emit-rust.ts @@ -256,6 +256,19 @@ function renderPatternCheckRust(check: PatternCheck, varExpr: string): { expr: s helpers.add('check_base64'); return { expr: `check_base64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `check_hex${len}` : `check_hex${len}_mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('check_base64_2pad'); + helpers.add('check_base64'); // for is_b64_char + return { expr: `check_base64_2pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('check_nostr_uri'); helpers.add('check_bech32'); // for is_bech32_char @@ -1160,6 +1173,21 @@ function emitRustHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('check_base64_2pad')) { + lines.push('/// strict base64 with mandatory 2-char padding'); + lines.push('fn check_base64_2pad(s: &str) -> bool {'); + lines.push(' let b = s.as_bytes();'); + lines.push(' let len = b.len();'); + lines.push(' if len < 4 || len % 4 != 0 { return false; }'); + lines.push(" if b[len - 1] != b'=' || b[len - 2] != b'=' { return false; }"); + lines.push(' for i in 0..len - 2 {'); + lines.push(' if !is_b64_char(b[i]) { return false; }'); + lines.push(' }'); + lines.push(' true'); + lines.push('}'); + lines.push(''); + } + // nostr:((npub|note)1[bech32]{58}|(nprofile|nevent|naddr)1[bech32]+) if (helpers.has('check_nostr_uri')) { lines.push('fn check_nostr_uri(s: &str) -> bool {'); diff --git a/src/emit-swift.ts b/src/emit-swift.ts index 932687b..43416b2 100644 --- a/src/emit-swift.ts +++ b/src/emit-swift.ts @@ -201,6 +201,19 @@ function renderPatternCheckSwift(check: PatternCheck, varExpr: string): { expr: helpers.add('checkBase64'); return { expr: `checkBase64(${varExpr})`, helpers }; } + case 'hex_alternation': { + const fns = check.lengths.map(len => { + const fn = check.case === 'lower' ? `checkHex${len}` : `checkHex${len}Mixed`; + helpers.add(fn); + return `${fn}(${varExpr})`; + }); + return { expr: `(${fns.join(' || ')})`, helpers }; + } + case 'base64_2pad': { + helpers.add('checkBase642Pad'); + helpers.add('checkBase64'); // for isB64Char + return { expr: `checkBase642Pad(${varExpr})`, helpers }; + } case 'nostr_uri': { helpers.add('checkNostrUri'); helpers.add('checkBech32'); // triggers isBech32Char @@ -1183,6 +1196,21 @@ function emitSwiftHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('checkBase642Pad')) { + lines.push('// strict base64 with mandatory 2-char padding'); + lines.push('private func checkBase642Pad(_ s: String) -> Bool {'); + lines.push(' let u = Array(s.utf8)'); + lines.push(' let len = u.count'); + lines.push(' if len < 4 || len % 4 != 0 { return false }'); + lines.push(' if u[len - 1] != 0x3D || u[len - 2] != 0x3D { return false }'); + lines.push(' for i in 0.. Bool {'); lines.push(' (b >= 0x30 && b <= 0x39 && b != 0x31) || (b >= 0x61 && b <= 0x7A && b != 0x62 && b != 0x69 && b != 0x6F)'); diff --git a/tests/classify-pattern.test.ts b/tests/classify-pattern.test.ts index 977f3fe..610f9d3 100644 --- a/tests/classify-pattern.test.ts +++ b/tests/classify-pattern.test.ts @@ -511,6 +511,28 @@ describe('classifyRegex', () => { assert.ok(isNativeCheck(r)); }); + // --- hex_alternation --- + + it('classifies multi-length hex alternation as hex_alternation', () => { + const r = classifyRegex('^(?:[a-f0-9]{64}|[a-f0-9]{96}|[a-f0-9]{128})$'); + assert.deepStrictEqual(r, { op: 'hex_alternation', lengths: [64, 96, 128], case: 'lower' }); + assert.ok(isNativeCheck(r)); + }); + + it('classifies mixed-case hex alternation', () => { + const r = classifyRegex('^(?:[a-fA-F0-9]{32}|[a-fA-F0-9]{64})$'); + assert.deepStrictEqual(r, { op: 'hex_alternation', lengths: [32, 64], case: 'mixed' }); + assert.ok(isNativeCheck(r)); + }); + + // --- base64_2pad --- + + it('classifies strict base64 2-pad as base64_2pad', () => { + const r = classifyRegex('^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==)$'); + assert.deepStrictEqual(r, { op: 'base64_2pad' }); + assert.ok(isNativeCheck(r)); + }); + // --- nostr_uri --- it('classifies nostr URI pattern as nostr_uri', () => { @@ -724,6 +746,9 @@ describe('classifyRegex coverage of schemata patterns', () => { '^refs/(heads|tags)/[^\\s]+$', '^https?://\\S+$', '^dim [0-9]{1,5}x[0-9]{1,5}$', + // New patterns (PR #37: hex_alternation + base64_2pad) + '^(?:[a-f0-9]{64}|[a-f0-9]{96}|[a-f0-9]{128})$', + '^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==)$', ]; it('processes all schemata patterns without throwing', () => { diff --git a/tests/fuzz-equivalence.test.ts b/tests/fuzz-equivalence.test.ts index 6b27224..1a7da8b 100644 --- a/tests/fuzz-equivalence.test.ts +++ b/tests/fuzz-equivalence.test.ts @@ -772,6 +772,30 @@ function buildNativeChecker(check: PatternCheck, originalPattern?: string): ((s: }; } + case 'hex_alternation': { + const validChars = check.case === 'lower' ? HEX_LOWER : HEX_MIXED; + const lengthSet = new Set(check.lengths); + return (s) => { + if (!lengthSet.has(s.length)) return false; + for (let i = 0; i < s.length; i++) { + if (!validChars.includes(s[i])) return false; + } + return true; + }; + } + + case 'base64_2pad': { + const isB64 = (c: string) => (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c === '+' || c === '/'; + return (s) => { + if (s.length < 4 || s.length % 4 !== 0) return false; + if (s[s.length - 1] !== '=' || s[s.length - 2] !== '=') return false; + for (let i = 0; i < s.length - 2; i++) { + if (!isB64(s[i])) return false; + } + return true; + }; + } + case 'nostr_uri': { const BC = '023456789acdefghjklmnpqrstuvwxyz'; return (s) => { @@ -1724,6 +1748,29 @@ function generateInputsForCheck(rng: Rng, check: PatternCheck): string[] { return [...base, '0abc', 'bXYZ', '', 'ABC', 'a123']; case 'base64': return [...base, '', 'AAAA', 'SGVsbG8=', 'SGVsbA==', 'A', 'ABC', '====', 'AA==AAAA']; + case 'hex_alternation': { + const hexAltInputs: string[] = []; + const hexAltChars = check.case === 'lower' ? HEX_LOWER : HEX_MIXED; + for (const len of check.lengths) { + hexAltInputs.push(rng.randomString(len, hexAltChars)); // valid + hexAltInputs.push(rng.randomString(len + 1, hexAltChars)); // off by 1 + hexAltInputs.push(rng.randomString(len - 1, hexAltChars)); // off by 1 + } + hexAltInputs.push('a'.repeat(64), 'a'.repeat(96), 'a'.repeat(128)); + hexAltInputs.push('a'.repeat(65), 'a'.repeat(95), 'a'.repeat(127)); + if (check.case === 'lower') hexAltInputs.push('A'.repeat(64)); // uppercase rejected + hexAltInputs.push('g'.repeat(64)); // invalid hex char + hexAltInputs.push(''); + return [...base, ...hexAltInputs]; + } + case 'base64_2pad': + return [...base, + 'AAAA', 'AA==', 'AAAAAA==', 'AAAAAAAA', // valid: 4-byte groups ending == + '', 'SGVsbG8=', 'A===', '====', // invalid + 'AA==AAAA', 'AA=\n', 'AA==\n', // adversarial + 'AAAAAAAAAAAA', 'AAAAAAAAAA==', // 12 chars valid, 12 chars with pad + 'AB==', '+/==', 'ab==', '00==', // valid 4-byte + ]; case 'nostr_uri': return [...base, 'nostr:npub1' + '0'.repeat(58), 'nostr:note1' + '0'.repeat(58), 'nostr:nprofile1' + '0'.repeat(10), 'nostr:', '', 'npub1' + '0'.repeat(58)]; case 'nip04_encrypted': @@ -1833,6 +1880,10 @@ const ALL_PATTERNS: string[] = [ // Additional from schemata dist '^(02|03)[a-f0-9]{64}$', '^https?://\\S+$', + // Multi-length hex alternation (MIP-00 i-tag) + '^(?:[a-f0-9]{64}|[a-f0-9]{96}|[a-f0-9]{128})$', + // Strict base64 2-pad (MIP-05 token tag) + '^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==)$', ]; // Deduplicate From 095d0ef5ec772f87ae351a892c952cde84d0ef03 Mon Sep 17 00:00:00 2001 From: alltheseas Date: Wed, 1 Apr 2026 22:40:03 -0500 Subject: [PATCH 2/4] feat: add identifier, space_separated_charset, uri_scheme native ops (#40) Add 3 new PatternCheck ops to eliminate the remaining 7 regex fallbacks introduced in schemata v0.3.2: - identifier: ^[prefix]?[firstCharset][restCharset]*$ (5 patterns) - space_separated_charset: ^[charset]+( [charset]+)*$ (1 pattern) - uri_scheme: ^[A-Za-z][A-Za-z0-9+.-]*:// (1 pattern) All 12 language emitters updated with renderPatternCheck cases and helper implementations. Classifier coverage: 90/90 patterns native, zero regex fallbacks. Co-Authored-By: Claude Opus 4.6 --- src/classify-pattern.ts | 48 ++++++++ src/emit-c.ts | 63 +++++++++++ src/emit-cpp.ts | 60 ++++++++++ src/emit-csharp.ts | 60 ++++++++++ src/emit-dart.ts | 61 ++++++++++ src/emit-go.ts | 90 +++++++++++++++ src/emit-java.ts | 60 ++++++++++ src/emit-kotlin.ts | 61 ++++++++++ src/emit-php.ts | 63 +++++++++++ src/emit-python.ts | 72 ++++++++++++ src/emit-ruby.ts | 66 +++++++++++ src/emit-rust.ts | 66 +++++++++++ src/emit-swift.ts | 64 +++++++++++ tests/classify-pattern.test.ts | 76 +++++++++++++ tests/fuzz-equivalence.test.ts | 198 +++++++++++++++++++++++++++++++++ 15 files changed, 1108 insertions(+) diff --git a/src/classify-pattern.ts b/src/classify-pattern.ts index 678764e..85e1dc2 100644 --- a/src/classify-pattern.ts +++ b/src/classify-pattern.ts @@ -68,6 +68,9 @@ export type PatternCheck = | { op: 'base64' } | { op: 'hex_alternation'; lengths: number[]; case: 'lower' | 'mixed' } | { op: 'base64_2pad' } + | { op: 'identifier'; optionalPrefix?: string; firstCharset: string; restCharset: string } + | { op: 'space_separated_charset'; charset: string } + | { op: 'uri_scheme' } | { op: 'nostr_uri' } | { op: 'nip04_encrypted' } | { op: 'nip05_identifier' } @@ -458,6 +461,48 @@ export function classifyRegex(pattern: string): PatternCheck { return { op: 'prefix_delim_rest', charset: expandCharset('a-zA-Z0-9') + '_-', delimiter: ': ' }; } + // Identifier: ^[optionalPrefix]?[firstCharset][restCharset]*$ + // Covers: ^[a-z][a-z0-9]*$, ^[A-Z][a-zA-Z0-9]*$, ^[a-z][a-z0-9-]*$, + // ^!?[a-z][a-z0-9]*$, ^!?[0-9]+$ + { + // Match: ^?[firstCharset][restCharset]*$ or ^?[charset]+$ + const m = pattern.match(/^\^(!?)\??\[([A-Za-z0-9-]+)\](\[([A-Za-z0-9-]+)\]\*|\+)\$$/); + if (m) { + const prefixChar = m[1]; // '' or '!' + const hasOptionalPrefix = prefixChar !== '' && pattern.startsWith('^' + prefixChar + '?'); + const firstCharset = expandCharset(m[2]); + + if (m[3] === '+') { + // ^[charset]+$ or ^!?[charset]+$ + if (hasOptionalPrefix) { + // ^!?[0-9]+$ — identifier with optional prefix, same first and rest charset + return { op: 'identifier', optionalPrefix: prefixChar, firstCharset, restCharset: firstCharset }; + } + // ^[charset]+$ — equivalent to chars_in, already handled above; skip + } else { + // ^[firstCharset][restCharset]*$ or ^!?[firstCharset][restCharset]*$ + const restCharset = expandCharset(m[4]); + if (hasOptionalPrefix) { + return { op: 'identifier', optionalPrefix: prefixChar, firstCharset, restCharset }; + } + return { op: 'identifier', firstCharset, restCharset }; + } + } + } + + // Space-separated charset: ^[charset]+( [charset]+)*$ + { + const m = pattern.match(/^\^\[([A-Za-z0-9_-]+)\]\+\( \[([A-Za-z0-9_-]+)\]\+\)\*\$$/); + if (m && m[1] === m[2]) { + return { op: 'space_separated_charset', charset: expandCharset(m[1]) }; + } + } + + // URI scheme: ^[A-Za-z][A-Za-z0-9+.-]*:// + if (pattern === '^[A-Za-z][A-Za-z0-9+.-]*://') { + return { op: 'uri_scheme' }; + } + // Fallback: preserve original regex return { op: 'regex', pattern }; } @@ -665,6 +710,9 @@ export function isNativeCheck(check: PatternCheck): boolean { case 'nip05_identifier': case 'mime_type_strict': case 'prefix_delim_rest': + case 'identifier': + case 'space_separated_charset': + case 'uri_scheme': return true; case 'compound': return check.checks.every(isNativeCheck); diff --git a/src/emit-c.ts b/src/emit-c.ts index 07f93ec..6421f27 100644 --- a/src/emit-c.ts +++ b/src/emit-c.ts @@ -343,6 +343,18 @@ function renderPatternCheckC(check: PatternCheck, varExpr: string): { expr: stri helpers.add('schemata_check_prefix_delim_rest'); return { expr: `schemata_check_prefix_delim_rest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('schemata_check_identifier'); + return { expr: `schemata_check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : ', 0'})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('schemata_check_space_separated_charset'); + return { expr: `schemata_check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('schemata_check_uri_scheme'); + return { expr: `schemata_check_uri_scheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1667,5 +1679,56 @@ function emitHelperFunctions(helpers: Set): string { lines.push(''); } + if (helpers.has('schemata_check_identifier')) { + if (lines.length > 0) lines.push(''); + lines.push('static int schemata_check_identifier(const char *s, const char *first_charset, const char *rest_charset, char prefix) {'); + lines.push(' size_t len = strlen(s);'); + lines.push(' size_t i = 0;'); + lines.push(' if (prefix && i < len && s[i] == prefix) i++;'); + lines.push(' if (i >= len) return 0;'); + lines.push(' if (!strchr(first_charset, s[i])) return 0;'); + lines.push(' i++;'); + lines.push(' for (; i < len; i++) {'); + lines.push(' if (!strchr(rest_charset, s[i])) return 0;'); + lines.push(' }'); + lines.push(' return 1;'); + lines.push('}'); + } + + if (helpers.has('schemata_check_space_separated_charset')) { + if (lines.length > 0) lines.push(''); + lines.push('static int schemata_check_space_separated_charset(const char *s, const char *charset) {'); + lines.push(' size_t len = strlen(s);'); + lines.push(' if (len == 0) return 0;'); + lines.push(' size_t i = 0;'); + lines.push(' if (!strchr(charset, s[i])) return 0;'); + lines.push(' while (i < len && strchr(charset, s[i])) i++;'); + lines.push(" while (i < len && s[i] == ' ') {"); + lines.push(' i++;'); + lines.push(' if (i >= len || !strchr(charset, s[i])) return 0;'); + lines.push(' while (i < len && strchr(charset, s[i])) i++;'); + lines.push(' }'); + lines.push(' return i == len;'); + lines.push('}'); + } + + if (helpers.has('schemata_check_uri_scheme')) { + if (lines.length > 0) lines.push(''); + lines.push('static int schemata_check_uri_scheme(const char *s) {'); + lines.push(' size_t len = strlen(s);'); + lines.push(' if (len < 4) return 0;'); + lines.push(' char c = s[0];'); + lines.push(" if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) return 0;"); + lines.push(' size_t i = 1;'); + lines.push(' while (i < len) {'); + lines.push(' c = s[i];'); + lines.push(" if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-') i++;"); + lines.push(' else break;'); + lines.push(' }'); + lines.push(" if (i + 3 > len) return 0;"); + lines.push(" return s[i] == ':' && s[i+1] == '/' && s[i+2] == '/';"); + lines.push('}'); + } + return lines.join('\n'); } diff --git a/src/emit-cpp.ts b/src/emit-cpp.ts index fc2542f..ccf54fd 100644 --- a/src/emit-cpp.ts +++ b/src/emit-cpp.ts @@ -238,6 +238,18 @@ function renderPatternCheckCpp(check: PatternCheck, varExpr: string): { expr: st helpers.add('check_prefix_delim_rest'); return { expr: `check_prefix_delim_rest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('check_identifier'); + return { expr: `check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : ', 0'})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('check_space_separated_charset'); + return { expr: `check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('check_uri_scheme'); + return { expr: `check_uri_scheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1325,5 +1337,53 @@ function emitCppHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('check_identifier')) { + if (lines.length > 0) lines.push(''); + lines.push('static bool check_identifier(const std::string& s, const std::string& first_charset, const std::string& rest_charset, char prefix = 0) {'); + lines.push(' size_t i = 0;'); + lines.push(' if (prefix && i < s.size() && s[i] == prefix) i++;'); + lines.push(' if (i >= s.size()) return false;'); + lines.push(' if (first_charset.find(s[i]) == std::string::npos) return false;'); + lines.push(' i++;'); + lines.push(' for (; i < s.size(); i++) {'); + lines.push(' if (rest_charset.find(s[i]) == std::string::npos) return false;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push('}'); + } + + if (helpers.has('check_space_separated_charset')) { + if (lines.length > 0) lines.push(''); + lines.push('static bool check_space_separated_charset(const std::string& s, const std::string& charset) {'); + lines.push(' if (s.empty()) return false;'); + lines.push(' size_t i = 0;'); + lines.push(' if (charset.find(s[i]) == std::string::npos) return false;'); + lines.push(' while (i < s.size() && charset.find(s[i]) != std::string::npos) i++;'); + lines.push(" while (i < s.size() && s[i] == ' ') {"); + lines.push(' i++;'); + lines.push(' if (i >= s.size() || charset.find(s[i]) == std::string::npos) return false;'); + lines.push(' while (i < s.size() && charset.find(s[i]) != std::string::npos) i++;'); + lines.push(' }'); + lines.push(' return i == s.size();'); + lines.push('}'); + } + + if (helpers.has('check_uri_scheme')) { + if (lines.length > 0) lines.push(''); + lines.push('static bool check_uri_scheme(const std::string& s) {'); + lines.push(' if (s.size() < 4) return false;'); + lines.push(' char c = s[0];'); + lines.push(" if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) return false;"); + lines.push(' size_t i = 1;'); + lines.push(' while (i < s.size()) {'); + lines.push(' c = s[i];'); + lines.push(" if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-') i++;"); + lines.push(' else break;'); + lines.push(' }'); + lines.push(' if (i + 3 > s.size()) return false;'); + lines.push(" return s[i] == ':' && s[i+1] == '/' && s[i+2] == '/';"); + lines.push('}'); + } + return lines.join('\n'); } diff --git a/src/emit-csharp.ts b/src/emit-csharp.ts index 8f02cea..9e945cf 100644 --- a/src/emit-csharp.ts +++ b/src/emit-csharp.ts @@ -234,6 +234,18 @@ function renderPatternCheckCSharp(check: PatternCheck, varExpr: string): { expr: helpers.add('CheckPrefixDelimRest'); return { expr: `CheckPrefixDelimRest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('CheckIdentifier'); + return { expr: `CheckIdentifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : ', (char)0'})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('CheckSpaceSeparatedCharset'); + return { expr: `CheckSpaceSeparatedCharset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('CheckUriScheme'); + return { expr: `CheckUriScheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1304,5 +1316,53 @@ function emitCSharpHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('CheckIdentifier')) { + if (lines.length > 0) lines.push(''); + lines.push(' private static bool CheckIdentifier(string s, string firstCharset, string restCharset, char prefix = (char)0) {'); + lines.push(' int i = 0;'); + lines.push(' if (prefix != 0 && i < s.Length && s[i] == prefix) i++;'); + lines.push(' if (i >= s.Length) return false;'); + lines.push(' if (firstCharset.IndexOf(s[i]) < 0) return false;'); + lines.push(' i++;'); + lines.push(' for (; i < s.Length; i++) {'); + lines.push(' if (restCharset.IndexOf(s[i]) < 0) return false;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push(' }'); + } + + if (helpers.has('CheckSpaceSeparatedCharset')) { + if (lines.length > 0) lines.push(''); + lines.push(' private static bool CheckSpaceSeparatedCharset(string s, string charset) {'); + lines.push(' if (s.Length == 0) return false;'); + lines.push(' int i = 0;'); + lines.push(' if (charset.IndexOf(s[i]) < 0) return false;'); + lines.push(' while (i < s.Length && charset.IndexOf(s[i]) >= 0) i++;'); + lines.push(" while (i < s.Length && s[i] == ' ') {"); + lines.push(' i++;'); + lines.push(' if (i >= s.Length || charset.IndexOf(s[i]) < 0) return false;'); + lines.push(' while (i < s.Length && charset.IndexOf(s[i]) >= 0) i++;'); + lines.push(' }'); + lines.push(' return i == s.Length;'); + lines.push(' }'); + } + + if (helpers.has('CheckUriScheme')) { + if (lines.length > 0) lines.push(''); + lines.push(' private static bool CheckUriScheme(string s) {'); + lines.push(' if (s.Length < 4) return false;'); + lines.push(' char c = s[0];'); + lines.push(" if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) return false;"); + lines.push(' int i = 1;'); + lines.push(' while (i < s.Length) {'); + lines.push(' c = s[i];'); + lines.push(" if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-') i++;"); + lines.push(' else break;'); + lines.push(' }'); + lines.push(' if (i + 3 > s.Length) return false;'); + lines.push(" return s[i] == ':' && s[i+1] == '/' && s[i+2] == '/';"); + lines.push(' }'); + } + return lines.join('\n'); } diff --git a/src/emit-dart.ts b/src/emit-dart.ts index c61aeca..ed0dc75 100644 --- a/src/emit-dart.ts +++ b/src/emit-dart.ts @@ -230,6 +230,18 @@ function renderPatternCheckDart(check: PatternCheck, varExpr: string): { expr: s helpers.add('_checkPrefixDelimRest'); return { expr: `_checkPrefixDelimRest(${varExpr}, ${dartString(check.charset)}, ${dartString(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('_checkIdentifier'); + return { expr: `_checkIdentifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : ''})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('_checkSpaceSeparatedCharset'); + return { expr: `_checkSpaceSeparatedCharset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('_checkUriScheme'); + return { expr: `_checkUriScheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1297,5 +1309,54 @@ function emitDartHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('_checkIdentifier')) { + if (lines.length > 0) lines.push(''); + lines.push("bool _checkIdentifier(String s, String firstCharset, String restCharset, [String prefix = '']) {"); + lines.push(' var i = 0;'); + lines.push(" if (prefix.isNotEmpty && i < s.length && s[i] == prefix) i++;"); + lines.push(' if (i >= s.length) return false;'); + lines.push(' if (!firstCharset.contains(s[i])) return false;'); + lines.push(' i++;'); + lines.push(' while (i < s.length) {'); + lines.push(' if (!restCharset.contains(s[i])) return false;'); + lines.push(' i++;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push('}'); + } + + if (helpers.has('_checkSpaceSeparatedCharset')) { + if (lines.length > 0) lines.push(''); + lines.push('bool _checkSpaceSeparatedCharset(String s, String charset) {'); + lines.push(' if (s.isEmpty) return false;'); + lines.push(' var i = 0;'); + lines.push(' if (!charset.contains(s[i])) return false;'); + lines.push(' while (i < s.length && charset.contains(s[i])) i++;'); + lines.push(" while (i < s.length && s[i] == ' ') {"); + lines.push(' i++;'); + lines.push(' if (i >= s.length || !charset.contains(s[i])) return false;'); + lines.push(' while (i < s.length && charset.contains(s[i])) i++;'); + lines.push(' }'); + lines.push(' return i == s.length;'); + lines.push('}'); + } + + if (helpers.has('_checkUriScheme')) { + if (lines.length > 0) lines.push(''); + lines.push('bool _checkUriScheme(String s) {'); + lines.push(' if (s.length < 4) return false;'); + lines.push(' var c = s.codeUnitAt(0);'); + lines.push(' if (!((c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A))) return false;'); + lines.push(' var i = 1;'); + lines.push(' while (i < s.length) {'); + lines.push(' c = s.codeUnitAt(i);'); + lines.push(' if ((c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A) || (c >= 0x30 && c <= 0x39) || c == 0x2B || c == 0x2E || c == 0x2D) { i++; }'); + lines.push(' else { break; }'); + lines.push(' }'); + lines.push(' if (i + 3 > s.length) return false;'); + lines.push(" return s[i] == ':' && s[i+1] == '/' && s[i+2] == '/';"); + lines.push('}'); + } + return lines.join('\n'); } diff --git a/src/emit-go.ts b/src/emit-go.ts index afd8eb6..b5f53fe 100644 --- a/src/emit-go.ts +++ b/src/emit-go.ts @@ -250,6 +250,20 @@ function renderPatternCheckGo(check: PatternCheck, varExpr: string): { expr: str helpers.add('strings'); return { expr: `checkPrefixDelimRest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('checkIdentifier'); + helpers.add('strings'); + return { expr: `checkIdentifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : ', 0'})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('checkSpaceSeparatedCharset'); + helpers.add('strings'); + return { expr: `checkSpaceSeparatedCharset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('checkUriScheme'); + return { expr: `checkUriScheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1901,5 +1915,81 @@ function emitGoHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('checkIdentifier')) { + if (lines.length > 0) lines.push(''); + lines.push('func checkIdentifier(s string, firstCharset string, restCharset string, prefix rune) bool {'); + lines.push('\ti := 0'); + lines.push('\tif prefix != 0 && i < len(s) && rune(s[i]) == prefix {'); + lines.push('\t\ti++'); + lines.push('\t}'); + lines.push('\tif i >= len(s) {'); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push('\tif !strings.ContainsRune(firstCharset, rune(s[i])) {'); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push('\ti++'); + lines.push('\tfor i < len(s) {'); + lines.push('\t\tif !strings.ContainsRune(restCharset, rune(s[i])) {'); + lines.push('\t\t\treturn false'); + lines.push('\t\t}'); + lines.push('\t\ti++'); + lines.push('\t}'); + lines.push('\treturn true'); + lines.push('}'); + } + + if (helpers.has('checkSpaceSeparatedCharset')) { + if (lines.length > 0) lines.push(''); + lines.push('func checkSpaceSeparatedCharset(s string, charset string) bool {'); + lines.push('\tif len(s) == 0 {'); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push('\ti := 0'); + lines.push('\tif !strings.ContainsRune(charset, rune(s[i])) {'); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push('\tfor i < len(s) && strings.ContainsRune(charset, rune(s[i])) {'); + lines.push('\t\ti++'); + lines.push('\t}'); + lines.push("\tfor i < len(s) && s[i] == ' ' {"); + lines.push('\t\ti++'); + lines.push('\t\tif i >= len(s) || !strings.ContainsRune(charset, rune(s[i])) {'); + lines.push('\t\t\treturn false'); + lines.push('\t\t}'); + lines.push('\t\tfor i < len(s) && strings.ContainsRune(charset, rune(s[i])) {'); + lines.push('\t\t\ti++'); + lines.push('\t\t}'); + lines.push('\t}'); + lines.push('\treturn i == len(s)'); + lines.push('}'); + } + + if (helpers.has('checkUriScheme')) { + if (lines.length > 0) lines.push(''); + lines.push('func checkUriScheme(s string) bool {'); + lines.push('\tif len(s) < 4 {'); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push('\tc := s[0]'); + lines.push("\tif !((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {"); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push('\ti := 1'); + lines.push('\tfor i < len(s) {'); + lines.push('\t\tc = s[i]'); + lines.push("\t\tif (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-' {"); + lines.push('\t\t\ti++'); + lines.push('\t\t} else {'); + lines.push('\t\t\tbreak'); + lines.push('\t\t}'); + lines.push('\t}'); + lines.push('\tif i+3 > len(s) {'); + lines.push('\t\treturn false'); + lines.push('\t}'); + lines.push("\treturn s[i] == ':' && s[i+1] == '/' && s[i+2] == '/'"); + lines.push('}'); + } + return lines.join('\n'); } diff --git a/src/emit-java.ts b/src/emit-java.ts index 18c87c1..41a4871 100644 --- a/src/emit-java.ts +++ b/src/emit-java.ts @@ -237,6 +237,18 @@ function renderPatternCheckJava(check: PatternCheck, varExpr: string): { expr: s helpers.add('checkPrefixDelimRest'); return { expr: `checkPrefixDelimRest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('checkIdentifier'); + return { expr: `checkIdentifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : `, (char) 0`})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('checkSpaceSeparatedCharset'); + return { expr: `checkSpaceSeparatedCharset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('checkUriScheme'); + return { expr: `checkUriScheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1323,6 +1335,54 @@ function emitJavaHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('checkIdentifier')) { + if (lines.length > 0) lines.push(''); + lines.push(' private static boolean checkIdentifier(String s, String firstCharset, String restCharset, char prefix) {'); + lines.push(' int i = 0;'); + lines.push(' if (prefix != 0 && i < s.length() && s.charAt(i) == prefix) i++;'); + lines.push(' if (i >= s.length()) return false;'); + lines.push(' if (firstCharset.indexOf(s.charAt(i)) < 0) return false;'); + lines.push(' i++;'); + lines.push(' for (; i < s.length(); i++) {'); + lines.push(' if (restCharset.indexOf(s.charAt(i)) < 0) return false;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push(' }'); + } + + if (helpers.has('checkSpaceSeparatedCharset')) { + if (lines.length > 0) lines.push(''); + lines.push(' private static boolean checkSpaceSeparatedCharset(String s, String charset) {'); + lines.push(' if (s.isEmpty()) return false;'); + lines.push(' int i = 0;'); + lines.push(' if (charset.indexOf(s.charAt(i)) < 0) return false;'); + lines.push(' while (i < s.length() && charset.indexOf(s.charAt(i)) >= 0) i++;'); + lines.push(" while (i < s.length() && s.charAt(i) == ' ') {"); + lines.push(' i++;'); + lines.push(' if (i >= s.length() || charset.indexOf(s.charAt(i)) < 0) return false;'); + lines.push(' while (i < s.length() && charset.indexOf(s.charAt(i)) >= 0) i++;'); + lines.push(' }'); + lines.push(' return i == s.length();'); + lines.push(' }'); + } + + if (helpers.has('checkUriScheme')) { + if (lines.length > 0) lines.push(''); + lines.push(' private static boolean checkUriScheme(String s) {'); + lines.push(' if (s.length() < 4) return false;'); + lines.push(' char c = s.charAt(0);'); + lines.push(" if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) return false;"); + lines.push(' int i = 1;'); + lines.push(' while (i < s.length()) {'); + lines.push(' c = s.charAt(i);'); + lines.push(" if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-') i++;"); + lines.push(' else break;'); + lines.push(' }'); + lines.push(' if (i + 3 > s.length()) return false;'); + lines.push(" return s.charAt(i) == ':' && s.charAt(i+1) == '/' && s.charAt(i+2) == '/';"); + lines.push(' }'); + } + if (helpers.has('checkPackageId')) { lines.push(' private static boolean checkPackageId(String s) {'); lines.push(' if (s == null || s.isEmpty()) return false;'); diff --git a/src/emit-kotlin.ts b/src/emit-kotlin.ts index 6ffaf17..5da2e4b 100644 --- a/src/emit-kotlin.ts +++ b/src/emit-kotlin.ts @@ -232,6 +232,18 @@ function renderPatternCheckKotlin(check: PatternCheck, varExpr: string): { expr: helpers.add('checkPrefixDelimRest'); return { expr: `checkPrefixDelimRest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('checkIdentifier'); + return { expr: `checkIdentifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : `, '\\u0000'`})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('checkSpaceSeparatedCharset'); + return { expr: `checkSpaceSeparatedCharset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('checkUriScheme'); + return { expr: `checkUriScheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1229,5 +1241,54 @@ function emitKotlinHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('checkIdentifier')) { + if (lines.length > 0) lines.push(''); + lines.push('private fun checkIdentifier(s: String, firstCharset: String, restCharset: String, prefix: Char): Boolean {'); + lines.push(" var i = 0"); + lines.push(" if (prefix != '\\u0000' && i < s.length && s[i] == prefix) i++"); + lines.push(' if (i >= s.length) return false'); + lines.push(' if (firstCharset.indexOf(s[i]) < 0) return false'); + lines.push(' i++'); + lines.push(' while (i < s.length) {'); + lines.push(' if (restCharset.indexOf(s[i]) < 0) return false'); + lines.push(' i++'); + lines.push(' }'); + lines.push(' return true'); + lines.push('}'); + } + + if (helpers.has('checkSpaceSeparatedCharset')) { + if (lines.length > 0) lines.push(''); + lines.push('private fun checkSpaceSeparatedCharset(s: String, charset: String): Boolean {'); + lines.push(' if (s.isEmpty()) return false'); + lines.push(' var i = 0'); + lines.push(' if (charset.indexOf(s[i]) < 0) return false'); + lines.push(' while (i < s.length && charset.indexOf(s[i]) >= 0) i++'); + lines.push(" while (i < s.length && s[i] == ' ') {"); + lines.push(' i++'); + lines.push(' if (i >= s.length || charset.indexOf(s[i]) < 0) return false'); + lines.push(' while (i < s.length && charset.indexOf(s[i]) >= 0) i++'); + lines.push(' }'); + lines.push(' return i == s.length'); + lines.push('}'); + } + + if (helpers.has('checkUriScheme')) { + if (lines.length > 0) lines.push(''); + lines.push('private fun checkUriScheme(s: String): Boolean {'); + lines.push(' if (s.length < 4) return false'); + lines.push(' var c = s[0]'); + lines.push(" if (!((c in 'A'..'Z') || (c in 'a'..'z'))) return false"); + lines.push(' var i = 1'); + lines.push(' while (i < s.length) {'); + lines.push(' c = s[i]'); + lines.push(" if ((c in 'A'..'Z') || (c in 'a'..'z') || (c in '0'..'9') || c == '+' || c == '.' || c == '-') i++"); + lines.push(' else break'); + lines.push(' }'); + lines.push(' if (i + 3 > s.length) return false'); + lines.push(" return s[i] == ':' && s[i+1] == '/' && s[i+2] == '/'"); + lines.push('}'); + } + return lines.join('\n'); } diff --git a/src/emit-php.ts b/src/emit-php.ts index 80cbe09..bd96e32 100644 --- a/src/emit-php.ts +++ b/src/emit-php.ts @@ -236,6 +236,18 @@ function renderPatternCheckPhp(check: PatternCheck, varExpr: string): { expr: st helpers.add('schemata_check_prefix_delim_rest'); return { expr: `schemata_check_prefix_delim_rest(${varExpr}, ${phpString(check.charset)}, ${phpString(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('check_identifier'); + return { expr: `check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, ${JSON.stringify(check.optionalPrefix)}` : ''})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('check_space_separated_charset'); + return { expr: `check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('check_uri_scheme'); + return { expr: `check_uri_scheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1419,5 +1431,56 @@ function emitPhpHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('check_identifier')) { + if (lines.length > 0) lines.push(''); + lines.push("function check_identifier(string $s, string $first_charset, string $rest_charset, string $prefix = ''): bool {"); + lines.push(' $i = 0;'); + lines.push(' $len = strlen($s);'); + lines.push(" if ($prefix !== '' && $i < $len && $s[$i] === $prefix) $i++;"); + lines.push(' if ($i >= $len) return false;'); + lines.push(' if (strpos($first_charset, $s[$i]) === false) return false;'); + lines.push(' $i++;'); + lines.push(' for (; $i < $len; $i++) {'); + lines.push(' if (strpos($rest_charset, $s[$i]) === false) return false;'); + lines.push(' }'); + lines.push(' return true;'); + lines.push('}'); + } + + if (helpers.has('check_space_separated_charset')) { + if (lines.length > 0) lines.push(''); + lines.push('function check_space_separated_charset(string $s, string $charset): bool {'); + lines.push(' $len = strlen($s);'); + lines.push(' if ($len === 0) return false;'); + lines.push(' $i = 0;'); + lines.push(' if (strpos($charset, $s[$i]) === false) return false;'); + lines.push(' while ($i < $len && strpos($charset, $s[$i]) !== false) $i++;'); + lines.push(" while ($i < $len && $s[$i] === ' ') {"); + lines.push(' $i++;'); + lines.push(' if ($i >= $len || strpos($charset, $s[$i]) === false) return false;'); + lines.push(' while ($i < $len && strpos($charset, $s[$i]) !== false) $i++;'); + lines.push(' }'); + lines.push(' return $i === $len;'); + lines.push('}'); + } + + if (helpers.has('check_uri_scheme')) { + if (lines.length > 0) lines.push(''); + lines.push('function check_uri_scheme(string $s): bool {'); + lines.push(' $len = strlen($s);'); + lines.push(' if ($len < 4) return false;'); + lines.push(' $c = $s[0];'); + lines.push(" if (!(($c >= 'A' && $c <= 'Z') || ($c >= 'a' && $c <= 'z'))) return false;"); + lines.push(' $i = 1;'); + lines.push(' while ($i < $len) {'); + lines.push(' $c = $s[$i];'); + lines.push(" if (($c >= 'A' && $c <= 'Z') || ($c >= 'a' && $c <= 'z') || ($c >= '0' && $c <= '9') || $c === '+' || $c === '.' || $c === '-') $i++;"); + lines.push(' else break;'); + lines.push(' }'); + lines.push(' if ($i + 3 > $len) return false;'); + lines.push(" return $s[$i] === ':' && $s[$i+1] === '/' && $s[$i+2] === '/';"); + lines.push('}'); + } + return lines.join('\n'); } diff --git a/src/emit-python.ts b/src/emit-python.ts index cb2b730..f8d1e2e 100644 --- a/src/emit-python.ts +++ b/src/emit-python.ts @@ -232,6 +232,18 @@ function renderPatternCheckPython(check: PatternCheck, varExpr: string): { expr: helpers.add('_check_prefix_delim_rest'); return { expr: `_check_prefix_delim_rest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('_check_identifier'); + return { expr: `_check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, ${JSON.stringify(check.optionalPrefix)}` : ''})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('_check_space_separated_charset'); + return { expr: `_check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('_check_uri_scheme'); + return { expr: `_check_uri_scheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1409,5 +1421,65 @@ function emitPythonHelpers(helpers: Set): string { lines.push(' return True'); } + if (helpers.has('_check_identifier')) { + if (lines.length > 0) lines.push(''); + lines.push(''); + lines.push('def _check_identifier(s: str, first_charset: str, rest_charset: str, prefix: str = "") -> bool:'); + lines.push(' i = 0'); + lines.push(' if prefix and i < len(s) and s[i] == prefix:'); + lines.push(' i += 1'); + lines.push(' if i >= len(s):'); + lines.push(' return False'); + lines.push(' if s[i] not in first_charset:'); + lines.push(' return False'); + lines.push(' i += 1'); + lines.push(' while i < len(s):'); + lines.push(' if s[i] not in rest_charset:'); + lines.push(' return False'); + lines.push(' i += 1'); + lines.push(' return True'); + } + + if (helpers.has('_check_space_separated_charset')) { + if (lines.length > 0) lines.push(''); + lines.push(''); + lines.push('def _check_space_separated_charset(s: str, charset: str) -> bool:'); + lines.push(' if not s:'); + lines.push(' return False'); + lines.push(' i = 0'); + lines.push(' if s[i] not in charset:'); + lines.push(' return False'); + lines.push(' while i < len(s) and s[i] in charset:'); + lines.push(' i += 1'); + lines.push(" while i < len(s) and s[i] == ' ':"); + lines.push(' i += 1'); + lines.push(' if i >= len(s) or s[i] not in charset:'); + lines.push(' return False'); + lines.push(' while i < len(s) and s[i] in charset:'); + lines.push(' i += 1'); + lines.push(' return i == len(s)'); + } + + if (helpers.has('_check_uri_scheme')) { + if (lines.length > 0) lines.push(''); + lines.push(''); + lines.push('def _check_uri_scheme(s: str) -> bool:'); + lines.push(' if len(s) < 4:'); + lines.push(' return False'); + lines.push(' c = s[0]'); + lines.push(" if not (('A' <= c <= 'Z') or ('a' <= c <= 'z')):"); + lines.push(' return False'); + lines.push(' i = 1'); + lines.push(' while i < len(s):'); + lines.push(' c = s[i]'); + lines.push(" if ('A' <= c <= 'Z') or ('a' <= c <= 'z') or ('0' <= c <= '9') or c in '+.-':"); + lines.push(' i += 1'); + lines.push(' else:'); + lines.push(' break'); + lines.push(' if i + 3 > len(s):'); + lines.push(' return False'); + lines.push(" return s[i] == ':' and s[i+1] == '/' and s[i+2] == '/'"); + } + return lines.join('\n'); } diff --git a/src/emit-ruby.ts b/src/emit-ruby.ts index 2d3ed65..bed5c95 100644 --- a/src/emit-ruby.ts +++ b/src/emit-ruby.ts @@ -238,6 +238,18 @@ function renderPatternCheckRuby(check: PatternCheck, varExpr: string): { expr: s helpers.add('check_prefix_delim_rest'); return { expr: `check_prefix_delim_rest(${varExpr}, ${rubyString(check.charset)}, ${rubyString(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('check_identifier'); + return { expr: `check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, ${JSON.stringify(check.optionalPrefix)}` : ''})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('check_space_separated_charset'); + return { expr: `check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('check_uri_scheme'); + return { expr: `check_uri_scheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1298,5 +1310,59 @@ function emitRubyHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('check_identifier')) { + if (lines.length > 0) lines.push(''); + lines.push("def check_identifier(s, first_charset, rest_charset, prefix = '')"); + lines.push(' i = 0'); + lines.push(" if prefix != '' && i < s.length && s[i] == prefix"); + lines.push(' i += 1'); + lines.push(' end'); + lines.push(' return false if i >= s.length'); + lines.push(' return false unless first_charset.include?(s[i])'); + lines.push(' i += 1'); + lines.push(' while i < s.length'); + lines.push(' return false unless rest_charset.include?(s[i])'); + lines.push(' i += 1'); + lines.push(' end'); + lines.push(' true'); + lines.push('end'); + } + + if (helpers.has('check_space_separated_charset')) { + if (lines.length > 0) lines.push(''); + lines.push('def check_space_separated_charset(s, charset)'); + lines.push(' return false if s.empty?'); + lines.push(' i = 0'); + lines.push(' return false unless charset.include?(s[i])'); + lines.push(' i += 1 while i < s.length && charset.include?(s[i])'); + lines.push(" while i < s.length && s[i] == ' '"); + lines.push(' i += 1'); + lines.push(' return false if i >= s.length || !charset.include?(s[i])'); + lines.push(' i += 1 while i < s.length && charset.include?(s[i])'); + lines.push(' end'); + lines.push(' i == s.length'); + lines.push('end'); + } + + if (helpers.has('check_uri_scheme')) { + if (lines.length > 0) lines.push(''); + lines.push('def check_uri_scheme(s)'); + lines.push(' return false if s.length < 4'); + lines.push(' c = s[0]'); + lines.push(" return false unless (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')"); + lines.push(' i = 1'); + lines.push(' while i < s.length'); + lines.push(' c = s[i]'); + lines.push(" if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-'"); + lines.push(' i += 1'); + lines.push(' else'); + lines.push(' break'); + lines.push(' end'); + lines.push(' end'); + lines.push(' return false if i + 3 > s.length'); + lines.push(" s[i] == ':' && s[i+1] == '/' && s[i+2] == '/'"); + lines.push('end'); + } + return lines.join('\n'); } diff --git a/src/emit-rust.ts b/src/emit-rust.ts index 1d015c3..49f0a0d 100644 --- a/src/emit-rust.ts +++ b/src/emit-rust.ts @@ -293,6 +293,18 @@ function renderPatternCheckRust(check: PatternCheck, varExpr: string): { expr: s helpers.add('check_prefix_delim_rest'); return { expr: `check_prefix_delim_rest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('check_identifier'); + return { expr: `check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, Some('${check.optionalPrefix}')` : ', None'})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('check_space_separated_charset'); + return { expr: `check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('check_uri_scheme'); + return { expr: `check_uri_scheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1328,5 +1340,59 @@ function emitRustHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('check_identifier')) { + if (lines.length > 0) lines.push(''); + lines.push('fn check_identifier(s: &str, first_charset: &str, rest_charset: &str, prefix: Option) -> bool {'); + lines.push(' let bytes = s.as_bytes();'); + lines.push(' let mut i = 0;'); + lines.push(' if let Some(p) = prefix {'); + lines.push(' if i < bytes.len() && bytes[i] == p as u8 { i += 1; }'); + lines.push(' }'); + lines.push(' if i >= bytes.len() { return false; }'); + lines.push(' if !first_charset.contains(bytes[i] as char) { return false; }'); + lines.push(' i += 1;'); + lines.push(' while i < bytes.len() {'); + lines.push(' if !rest_charset.contains(bytes[i] as char) { return false; }'); + lines.push(' i += 1;'); + lines.push(' }'); + lines.push(' true'); + lines.push('}'); + } + + if (helpers.has('check_space_separated_charset')) { + if (lines.length > 0) lines.push(''); + lines.push('fn check_space_separated_charset(s: &str, charset: &str) -> bool {'); + lines.push(' let bytes = s.as_bytes();'); + lines.push(' if bytes.is_empty() { return false; }'); + lines.push(' let mut i = 0;'); + lines.push(' if !charset.contains(bytes[i] as char) { return false; }'); + lines.push(' while i < bytes.len() && charset.contains(bytes[i] as char) { i += 1; }'); + lines.push(" while i < bytes.len() && bytes[i] == b' ' {"); + lines.push(' i += 1;'); + lines.push(' if i >= bytes.len() || !charset.contains(bytes[i] as char) { return false; }'); + lines.push(' while i < bytes.len() && charset.contains(bytes[i] as char) { i += 1; }'); + lines.push(' }'); + lines.push(' i == bytes.len()'); + lines.push('}'); + } + + if (helpers.has('check_uri_scheme')) { + if (lines.length > 0) lines.push(''); + lines.push('fn check_uri_scheme(s: &str) -> bool {'); + lines.push(' let bytes = s.as_bytes();'); + lines.push(' if bytes.len() < 4 { return false; }'); + lines.push(' let c = bytes[0];'); + lines.push(' if !((c >= b\'A\' && c <= b\'Z\') || (c >= b\'a\' && c <= b\'z\')) { return false; }'); + lines.push(' let mut i = 1;'); + lines.push(' while i < bytes.len() {'); + lines.push(' let c = bytes[i];'); + lines.push(' if (c >= b\'A\' && c <= b\'Z\') || (c >= b\'a\' && c <= b\'z\') || (c >= b\'0\' && c <= b\'9\') || c == b\'+\' || c == b\'.\' || c == b\'-\' { i += 1; }'); + lines.push(' else { break; }'); + lines.push(' }'); + lines.push(' if i + 3 > bytes.len() { return false; }'); + lines.push(' bytes[i] == b\':\' && bytes[i+1] == b\'/\' && bytes[i+2] == b\'/\''); + lines.push('}'); + } + return lines.join('\n'); } diff --git a/src/emit-swift.ts b/src/emit-swift.ts index 43416b2..ecf9120 100644 --- a/src/emit-swift.ts +++ b/src/emit-swift.ts @@ -238,6 +238,18 @@ function renderPatternCheckSwift(check: PatternCheck, varExpr: string): { expr: helpers.add('checkPrefixDelimRest'); return { expr: `checkPrefixDelimRest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers }; } + case 'identifier': { + helpers.add('checkIdentifier'); + return { expr: `checkIdentifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, Character("${check.optionalPrefix}")` : ', nil'})`, helpers }; + } + case 'space_separated_charset': { + helpers.add('checkSpaceSeparatedCharset'); + return { expr: `checkSpaceSeparatedCharset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + } + case 'uri_scheme': { + helpers.add('checkUriScheme'); + return { expr: `checkUriScheme(${varExpr})`, helpers }; + } case 'compound': { const allHelpers = new Set(); const parts: string[] = []; @@ -1377,5 +1389,57 @@ function emitSwiftHelpers(helpers: Set): string { lines.push(''); } + if (helpers.has('checkIdentifier')) { + if (lines.length > 0) lines.push(''); + lines.push('private func checkIdentifier(_ s: String, _ firstCharset: String, _ restCharset: String, _ prefix: Character?) -> Bool {'); + lines.push(' let chars = Array(s)'); + lines.push(' var i = 0'); + lines.push(' if let p = prefix, i < chars.count && chars[i] == p { i += 1 }'); + lines.push(' if i >= chars.count { return false }'); + lines.push(' if !firstCharset.contains(chars[i]) { return false }'); + lines.push(' i += 1'); + lines.push(' while i < chars.count {'); + lines.push(' if !restCharset.contains(chars[i]) { return false }'); + lines.push(' i += 1'); + lines.push(' }'); + lines.push(' return true'); + lines.push('}'); + } + + if (helpers.has('checkSpaceSeparatedCharset')) { + if (lines.length > 0) lines.push(''); + lines.push('private func checkSpaceSeparatedCharset(_ s: String, _ charset: String) -> Bool {'); + lines.push(' let chars = Array(s)'); + lines.push(' if chars.isEmpty { return false }'); + lines.push(' var i = 0'); + lines.push(' if !charset.contains(chars[i]) { return false }'); + lines.push(' while i < chars.count && charset.contains(chars[i]) { i += 1 }'); + lines.push(' while i < chars.count && chars[i] == " " {'); + lines.push(' i += 1'); + lines.push(' if i >= chars.count || !charset.contains(chars[i]) { return false }'); + lines.push(' while i < chars.count && charset.contains(chars[i]) { i += 1 }'); + lines.push(' }'); + lines.push(' return i == chars.count'); + lines.push('}'); + } + + if (helpers.has('checkUriScheme')) { + if (lines.length > 0) lines.push(''); + lines.push('private func checkUriScheme(_ s: String) -> Bool {'); + lines.push(' let u = Array(s.utf8)'); + lines.push(' if u.count < 4 { return false }'); + lines.push(' let c = u[0]'); + lines.push(' if !((c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A)) { return false }'); + lines.push(' var i = 1'); + lines.push(' while i < u.count {'); + lines.push(' let c = u[i]'); + lines.push(' if (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A) || (c >= 0x30 && c <= 0x39) || c == 0x2B || c == 0x2E || c == 0x2D { i += 1 }'); + lines.push(' else { break }'); + lines.push(' }'); + lines.push(' if i + 3 > u.count { return false }'); + lines.push(' return u[i] == 0x3A && u[i+1] == 0x2F && u[i+2] == 0x2F'); + lines.push('}'); + } + return lines.join('\n'); } diff --git a/tests/classify-pattern.test.ts b/tests/classify-pattern.test.ts index 610f9d3..2d33655 100644 --- a/tests/classify-pattern.test.ts +++ b/tests/classify-pattern.test.ts @@ -579,6 +579,82 @@ describe('classifyRegex', () => { assert.ok(isNativeCheck(r)); }); + // --- identifier --- + + it('classifies ^[a-z][a-z0-9]*$ as identifier', () => { + const r = classifyRegex('^[a-z][a-z0-9]*$'); + assert.strictEqual(r.op, 'identifier'); + assert.ok(r.op === 'identifier'); + assert.strictEqual(r.optionalPrefix, undefined); + assert.ok(r.firstCharset.includes('a')); + assert.ok(r.firstCharset.includes('z')); + assert.ok(!r.firstCharset.includes('0')); + assert.ok(r.restCharset.includes('a')); + assert.ok(r.restCharset.includes('0')); + assert.ok(isNativeCheck(r)); + }); + + it('classifies ^[A-Z][a-zA-Z0-9]*$ as identifier', () => { + const r = classifyRegex('^[A-Z][a-zA-Z0-9]*$'); + assert.strictEqual(r.op, 'identifier'); + assert.ok(r.op === 'identifier'); + assert.strictEqual(r.optionalPrefix, undefined); + assert.ok(r.firstCharset.includes('A')); + assert.ok(r.firstCharset.includes('Z')); + assert.ok(!r.firstCharset.includes('a')); + assert.ok(r.restCharset.includes('a')); + assert.ok(r.restCharset.includes('A')); + assert.ok(r.restCharset.includes('0')); + }); + + it('classifies ^[a-z][a-z0-9-]*$ as identifier', () => { + const r = classifyRegex('^[a-z][a-z0-9-]*$'); + assert.strictEqual(r.op, 'identifier'); + assert.ok(r.op === 'identifier'); + assert.strictEqual(r.optionalPrefix, undefined); + assert.ok(r.restCharset.includes('-')); + }); + + it('classifies ^!?[a-z][a-z0-9]*$ as identifier with prefix', () => { + const r = classifyRegex('^!?[a-z][a-z0-9]*$'); + assert.strictEqual(r.op, 'identifier'); + assert.ok(r.op === 'identifier'); + assert.strictEqual(r.optionalPrefix, '!'); + assert.ok(r.firstCharset.includes('a')); + assert.ok(r.restCharset.includes('0')); + }); + + it('classifies ^!?[0-9]+$ as identifier with prefix', () => { + const r = classifyRegex('^!?[0-9]+$'); + assert.strictEqual(r.op, 'identifier'); + assert.ok(r.op === 'identifier'); + assert.strictEqual(r.optionalPrefix, '!'); + assert.ok(r.firstCharset.includes('0')); + assert.ok(r.firstCharset.includes('9')); + assert.strictEqual(r.firstCharset, r.restCharset); + }); + + // --- space_separated_charset --- + + it('classifies ^[a-z_]+( [a-z_]+)*$ as space_separated_charset', () => { + const r = classifyRegex('^[a-z_]+( [a-z_]+)*$'); + assert.strictEqual(r.op, 'space_separated_charset'); + assert.ok(r.op === 'space_separated_charset'); + assert.ok(r.charset.includes('a')); + assert.ok(r.charset.includes('z')); + assert.ok(r.charset.includes('_')); + assert.ok(!r.charset.includes(' ')); + assert.ok(isNativeCheck(r)); + }); + + // --- uri_scheme --- + + it('classifies ^[A-Za-z][A-Za-z0-9+.-]*:// as uri_scheme', () => { + const r = classifyRegex('^[A-Za-z][A-Za-z0-9+.-]*://'); + assert.deepStrictEqual(r, { op: 'uri_scheme' }); + assert.ok(isNativeCheck(r)); + }); + // --- Compound: compressed pubkey --- it('classifies ^(02|03)[a-f0-9]{64}$ as compound', () => { diff --git a/tests/fuzz-equivalence.test.ts b/tests/fuzz-equivalence.test.ts index 1a7da8b..3de30b9 100644 --- a/tests/fuzz-equivalence.test.ts +++ b/tests/fuzz-equivalence.test.ts @@ -899,6 +899,62 @@ function buildNativeChecker(check: PatternCheck, originalPattern?: string): ((s: }; } + case 'identifier': { + const fc = check.firstCharset; + const rc = check.restCharset; + const prefix = check.optionalPrefix; + return (s) => { + let i = 0; + if (prefix && i < s.length && s[i] === prefix) i++; + if (i >= s.length) return false; + if (!fc.includes(s[i])) return false; + i++; + while (i < s.length) { + if (!rc.includes(s[i])) return false; + i++; + } + return true; + }; + } + + case 'space_separated_charset': { + const cs = check.charset; + return (s) => { + if (s.length === 0) return false; + let i = 0; + // First token: at least one char from charset + if (!cs.includes(s[i])) return false; + while (i < s.length && cs.includes(s[i])) i++; + // Subsequent tokens: space + at least one char from charset + while (i < s.length && s[i] === ' ') { + i++; + if (i >= s.length || !cs.includes(s[i])) return false; + while (i < s.length && cs.includes(s[i])) i++; + } + return i === s.length; + }; + } + + case 'uri_scheme': { + // ^[A-Za-z][A-Za-z0-9+.-]*:// (no end anchor) + return (s) => { + if (s.length < 4) return false; // at least "x://" + // First char: alpha + const c0 = s[0]; + if (!((c0 >= 'A' && c0 <= 'Z') || (c0 >= 'a' && c0 <= 'z'))) return false; + let i = 1; + while (i < s.length) { + const c = s[i]; + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c === '+' || c === '.' || c === '-') { + i++; + } else break; + } + // Must find :// at current position + if (i + 3 > s.length) return false; + return s[i] === ':' && s[i + 1] === '/' && s[i + 2] === '/'; + }; + } + case 'compound': { const checkers = check.checks.map(c => buildNativeChecker(c, originalPattern)); if (checkers.some(c => c === null)) return null; @@ -1674,6 +1730,132 @@ function generateHexRangeInputs(rng: Rng, min: number, max: number, caseType: 'l // Specific input generators dispatched by op type // --------------------------------------------------------------------------- +function generateIdentifierInputs(rng: Rng, check: Extract): string[] { + const inputs: string[] = []; + const { firstCharset, restCharset, optionalPrefix } = check; + + // Valid: just first char + inputs.push(firstCharset[0]); + inputs.push(firstCharset[firstCharset.length - 1]); + + // Valid: first char + rest + for (let i = 0; i < 15; i++) { + const restLen = rng.nextInt(0, 10); + let s = firstCharset[rng.nextInt(0, firstCharset.length - 1)]; + s += rng.randomString(restLen, restCharset); + if (optionalPrefix && rng.next() > 0.5) { + s = optionalPrefix + s; + } + inputs.push(s); + } + + // With optional prefix + if (optionalPrefix) { + inputs.push(optionalPrefix + firstCharset[0]); + inputs.push(optionalPrefix + firstCharset[0] + rng.randomString(5, restCharset)); + inputs.push(optionalPrefix); // prefix only — no first char + } + + // Invalid: empty + inputs.push(''); + + // Invalid: start with rest-only char (if different from first) + if (restCharset.length > 0) { + for (const c of restCharset) { + if (!firstCharset.includes(c)) { + inputs.push(c); // starts with non-first char + inputs.push(c + rng.randomString(3, restCharset)); + break; + } + } + } + + // Invalid: uppercase when only lowercase allowed, etc. + inputs.push('ABC', 'abc', '123', 'a-b', '!abc', 'hello world', '\n', '\r'); + inputs.push('a\nb', 'a\rb', 'a\u2028b', 'a\u2029b'); + + // Random + for (let i = 0; i < 20; i++) { + inputs.push(rng.randomString(rng.nextInt(0, 10), ASCII_PRINTABLE)); + } + + return inputs; +} + +function generateSpaceSeparatedCharsetInputs(rng: Rng, charset: string): string[] { + const inputs: string[] = []; + + // Valid: single token + inputs.push(charset[0]); + inputs.push(rng.randomString(5, charset)); + + // Valid: multiple tokens + for (let i = 0; i < 10; i++) { + const tokenCount = rng.nextInt(1, 4); + const tokens: string[] = []; + for (let j = 0; j < tokenCount; j++) { + tokens.push(rng.randomString(rng.nextInt(1, 6), charset)); + } + inputs.push(tokens.join(' ')); + } + + // Invalid: empty + inputs.push(''); + + // Invalid: double space + inputs.push(charset[0] + ' ' + charset[0]); + + // Invalid: leading/trailing space + inputs.push(' ' + charset[0]); + inputs.push(charset[0] + ' '); + + // Invalid: char not in charset + inputs.push('ABC', '123', '!!!', ' ', 'hello\tworld'); + + // Invalid: line terminators + inputs.push(charset[0] + '\n' + charset[0]); + inputs.push(charset[0] + '\r' + charset[0]); + + // Random + for (let i = 0; i < 20; i++) { + inputs.push(rng.randomString(rng.nextInt(0, 15), charset + ' ' + ASCII_PRINTABLE)); + } + + return inputs; +} + +function generateUriSchemeInputs(rng: Rng): string[] { + const inputs: string[] = []; + const SCHEME_REST = ASCII_ALPHA + DIGITS + '+.-'; + + // Valid + inputs.push('http://', 'https://', 'ftp://', 'a://', 'Z://', 'custom+scheme://'); + inputs.push('a+b.c-d://'); + inputs.push('http://example.com'); + inputs.push('x://'); + + // Invalid + inputs.push('', '://', '1http://', '+http://', '.http://', '-http://'); + inputs.push('http:', 'http:/', 'http', 'http//'); + inputs.push('HTTP://'); // uppercase should match — [A-Za-z] covers it + + // Random valid schemes + for (let i = 0; i < 15; i++) { + const first = ASCII_ALPHA[rng.nextInt(0, ASCII_ALPHA.length - 1)]; + const restLen = rng.nextInt(0, 10); + const rest = rng.randomString(restLen, SCHEME_REST); + const hasSep = rng.next() > 0.3; + inputs.push(first + rest + (hasSep ? '://' : '')); + } + + // Random garbage + for (let i = 0; i < 10; i++) { + inputs.push(rng.randomString(rng.nextInt(0, 15), ASCII_PRINTABLE)); + } + + return inputs; +} + function generateInputsForCheck(rng: Rng, check: PatternCheck): string[] { const base = generateBaseInputs(rng); @@ -1782,6 +1964,12 @@ function generateInputsForCheck(rng: Rng, check: PatternCheck): string[] { case 'prefix_delim_rest': return [...base, '123:hello', '0:x', '', '123', '123:', 'abc:def', '123:\n', '123:\r', '123:\u2028', '123:\u2029', '123:\nfoo', '123:a\nfoo']; + case 'identifier': + return [...base, ...generateIdentifierInputs(rng, check)]; + case 'space_separated_charset': + return [...base, ...generateSpaceSeparatedCharsetInputs(rng, check.charset)]; + case 'uri_scheme': + return [...base, ...generateUriSchemeInputs(rng)]; case 'compound': // For compound, generate inputs for the first sub-check if (check.checks.length > 0) { @@ -1884,6 +2072,16 @@ const ALL_PATTERNS: string[] = [ '^(?:[a-f0-9]{64}|[a-f0-9]{96}|[a-f0-9]{128})$', // Strict base64 2-pad (MIP-05 token tag) '^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==)$', + // Identifier patterns + '^[a-z][a-z0-9]*$', + '^[A-Z][a-zA-Z0-9]*$', + '^[a-z][a-z0-9-]*$', + '^!?[a-z][a-z0-9]*$', + '^!?[0-9]+$', + // Space-separated charset + '^[a-z_]+( [a-z_]+)*$', + // URI scheme + '^[A-Za-z][A-Za-z0-9+.-]*://', ]; // Deduplicate From 1068e39881d8f3552df9ce337555f1034a10be4e Mon Sep 17 00:00:00 2001 From: alltheseas Date: Wed, 1 Apr 2026 22:50:19 -0500 Subject: [PATCH 3/4] fix: Ruby helpers use module singleton methods, hex_alternation validates per-branch case P1: Ruby check_identifier, check_space_separated_charset, check_uri_scheme were emitted as instance methods (`def check_*`) but called from module singleton context. Changed to `def self.check_*` with correct 2/4-space indentation matching all existing helpers. P2: hex_alternation classifier assumed all alternation branches shared the same case policy. Now checks each branch individually and falls through to regex if policies differ, preventing silent case-widening. Co-Authored-By: Claude Opus 4.6 --- src/classify-pattern.ts | 12 +++++-- src/emit-ruby.ts | 75 +++++++++++++++++++++-------------------- 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/src/classify-pattern.ts b/src/classify-pattern.ts index 85e1dc2..22fd18d 100644 --- a/src/classify-pattern.ts +++ b/src/classify-pattern.ts @@ -113,9 +113,15 @@ export function classifyRegex(pattern: string): PatternCheck { /^\^\(\?:(\[(?:a-f0-9|a-fA-F0-9|0-9a-f|0-9a-fA-F)\]\{\d+\}(?:\|\[(?:a-f0-9|a-fA-F0-9|0-9a-f|0-9a-fA-F)\]\{\d+\})+)\)\$$/ ); if (m) { - const isLower = !m[1].includes('A-F'); - const lengths = [...m[1].matchAll(/\{(\d+)\}/g)].map(mm => parseInt(mm[1], 10)); - return { op: 'hex_alternation', lengths, case: isLower ? 'lower' : 'mixed' }; + // Verify all branches share the same case policy — reject mixed policies + const branches = m[1].split('|'); + const branchCases = branches.map(b => b.includes('A-F') ? 'mixed' : 'lower'); + const allSame = branchCases.every(c => c === branchCases[0]); + if (allSame) { + const lengths = [...m[1].matchAll(/\{(\d+)\}/g)].map(mm => parseInt(mm[1], 10)); + return { op: 'hex_alternation', lengths, case: branchCases[0] }; + } + // Mixed case policies across branches — fall through to regex } } diff --git a/src/emit-ruby.ts b/src/emit-ruby.ts index bed5c95..565e90c 100644 --- a/src/emit-ruby.ts +++ b/src/emit-ruby.ts @@ -1312,56 +1312,59 @@ function emitRubyHelpers(helpers: Set): string { if (helpers.has('check_identifier')) { if (lines.length > 0) lines.push(''); - lines.push("def check_identifier(s, first_charset, rest_charset, prefix = '')"); - lines.push(' i = 0'); - lines.push(" if prefix != '' && i < s.length && s[i] == prefix"); - lines.push(' i += 1'); - lines.push(' end'); - lines.push(' return false if i >= s.length'); - lines.push(' return false unless first_charset.include?(s[i])'); - lines.push(' i += 1'); - lines.push(' while i < s.length'); - lines.push(' return false unless rest_charset.include?(s[i])'); + lines.push(" def self.check_identifier(s, first_charset, rest_charset, prefix = '')"); + lines.push(' i = 0'); + lines.push(" if prefix != '' && i < s.length && s[i] == prefix"); + lines.push(' i += 1'); + lines.push(' end'); + lines.push(' return false if i >= s.length'); + lines.push(' return false unless first_charset.include?(s[i])'); lines.push(' i += 1'); + lines.push(' while i < s.length'); + lines.push(' return false unless rest_charset.include?(s[i])'); + lines.push(' i += 1'); + lines.push(' end'); + lines.push(' true'); lines.push(' end'); - lines.push(' true'); - lines.push('end'); + lines.push(''); } if (helpers.has('check_space_separated_charset')) { if (lines.length > 0) lines.push(''); - lines.push('def check_space_separated_charset(s, charset)'); - lines.push(' return false if s.empty?'); - lines.push(' i = 0'); - lines.push(' return false unless charset.include?(s[i])'); - lines.push(' i += 1 while i < s.length && charset.include?(s[i])'); - lines.push(" while i < s.length && s[i] == ' '"); - lines.push(' i += 1'); - lines.push(' return false if i >= s.length || !charset.include?(s[i])'); + lines.push(' def self.check_space_separated_charset(s, charset)'); + lines.push(' return false if s.empty?'); + lines.push(' i = 0'); + lines.push(' return false unless charset.include?(s[i])'); lines.push(' i += 1 while i < s.length && charset.include?(s[i])'); + lines.push(" while i < s.length && s[i] == ' '"); + lines.push(' i += 1'); + lines.push(' return false if i >= s.length || !charset.include?(s[i])'); + lines.push(' i += 1 while i < s.length && charset.include?(s[i])'); + lines.push(' end'); + lines.push(' i == s.length'); lines.push(' end'); - lines.push(' i == s.length'); - lines.push('end'); + lines.push(''); } if (helpers.has('check_uri_scheme')) { if (lines.length > 0) lines.push(''); - lines.push('def check_uri_scheme(s)'); - lines.push(' return false if s.length < 4'); - lines.push(' c = s[0]'); - lines.push(" return false unless (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')"); - lines.push(' i = 1'); - lines.push(' while i < s.length'); - lines.push(' c = s[i]'); - lines.push(" if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-'"); - lines.push(' i += 1'); - lines.push(' else'); - lines.push(' break'); + lines.push(' def self.check_uri_scheme(s)'); + lines.push(' return false if s.length < 4'); + lines.push(' c = s[0]'); + lines.push(" return false unless (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')"); + lines.push(' i = 1'); + lines.push(' while i < s.length'); + lines.push(' c = s[i]'); + lines.push(" if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-'"); + lines.push(' i += 1'); + lines.push(' else'); + lines.push(' break'); + lines.push(' end'); lines.push(' end'); + lines.push(' return false if i + 3 > s.length'); + lines.push(" s[i] == ':' && s[i+1] == '/' && s[i+2] == '/'"); lines.push(' end'); - lines.push(' return false if i + 3 > s.length'); - lines.push(" s[i] == ':' && s[i+1] == '/' && s[i+2] == '/'"); - lines.push('end'); + lines.push(''); } return lines.join('\n'); From a2384ec7b9430d5116b083341c5c695881bb9d3b Mon Sep 17 00:00:00 2001 From: alltheseas Date: Thu, 2 Apr 2026 00:03:40 -0500 Subject: [PATCH 4/4] fix: hex_alternation naming mismatch, null guards, PHP naming convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - C++/Python/Ruby/Rust: hex_alternation used check_hex{len} but helpers declared check_hex_{len} — add missing underscore before length - C#/Java: add null guards to CheckIdentifier, CheckSpaceSeparatedCharset, CheckUriScheme matching all other helpers in those emitters - PHP: rename check_identifier/check_space_separated_charset/check_uri_scheme to schemata_check_* matching file convention, use phpString() instead of JSON.stringify() for string escaping - Dart: use dartString() instead of JSON.stringify() for charset literals Co-Authored-By: Claude Opus 4.6 --- src/emit-cpp.ts | 2 +- src/emit-csharp.ts | 5 +++-- src/emit-dart.ts | 4 ++-- src/emit-java.ts | 5 +++-- src/emit-php.ts | 24 ++++++++++++------------ src/emit-python.ts | 2 +- src/emit-ruby.ts | 2 +- src/emit-rust.ts | 2 +- 8 files changed, 24 insertions(+), 22 deletions(-) diff --git a/src/emit-cpp.ts b/src/emit-cpp.ts index ccf54fd..7c8b56e 100644 --- a/src/emit-cpp.ts +++ b/src/emit-cpp.ts @@ -204,7 +204,7 @@ function renderPatternCheckCpp(check: PatternCheck, varExpr: string): { expr: st } case 'hex_alternation': { const fns = check.lengths.map(len => { - const fn = check.case === 'lower' ? `check_hex${len}` : `check_hex${len}_mixed`; + const fn = check.case === 'lower' ? `check_hex_${len}` : `check_hex_${len}_mixed`; helpers.add(fn); return `${fn}(${varExpr})`; }); diff --git a/src/emit-csharp.ts b/src/emit-csharp.ts index 9e945cf..d1ad34d 100644 --- a/src/emit-csharp.ts +++ b/src/emit-csharp.ts @@ -1319,6 +1319,7 @@ function emitCSharpHelpers(helpers: Set): string { if (helpers.has('CheckIdentifier')) { if (lines.length > 0) lines.push(''); lines.push(' private static bool CheckIdentifier(string s, string firstCharset, string restCharset, char prefix = (char)0) {'); + lines.push(' if (s == null) return false;'); lines.push(' int i = 0;'); lines.push(' if (prefix != 0 && i < s.Length && s[i] == prefix) i++;'); lines.push(' if (i >= s.Length) return false;'); @@ -1334,7 +1335,7 @@ function emitCSharpHelpers(helpers: Set): string { if (helpers.has('CheckSpaceSeparatedCharset')) { if (lines.length > 0) lines.push(''); lines.push(' private static bool CheckSpaceSeparatedCharset(string s, string charset) {'); - lines.push(' if (s.Length == 0) return false;'); + lines.push(' if (s == null || s.Length == 0) return false;'); lines.push(' int i = 0;'); lines.push(' if (charset.IndexOf(s[i]) < 0) return false;'); lines.push(' while (i < s.Length && charset.IndexOf(s[i]) >= 0) i++;'); @@ -1350,7 +1351,7 @@ function emitCSharpHelpers(helpers: Set): string { if (helpers.has('CheckUriScheme')) { if (lines.length > 0) lines.push(''); lines.push(' private static bool CheckUriScheme(string s) {'); - lines.push(' if (s.Length < 4) return false;'); + lines.push(' if (s == null || s.Length < 4) return false;'); lines.push(' char c = s[0];'); lines.push(" if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) return false;"); lines.push(' int i = 1;'); diff --git a/src/emit-dart.ts b/src/emit-dart.ts index ed0dc75..7a11d42 100644 --- a/src/emit-dart.ts +++ b/src/emit-dart.ts @@ -232,11 +232,11 @@ function renderPatternCheckDart(check: PatternCheck, varExpr: string): { expr: s } case 'identifier': { helpers.add('_checkIdentifier'); - return { expr: `_checkIdentifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : ''})`, helpers }; + return { expr: `_checkIdentifier(${varExpr}, ${dartString(check.firstCharset)}, ${dartString(check.restCharset)}${check.optionalPrefix ? `, ${dartString(check.optionalPrefix)}` : ''})`, helpers }; } case 'space_separated_charset': { helpers.add('_checkSpaceSeparatedCharset'); - return { expr: `_checkSpaceSeparatedCharset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + return { expr: `_checkSpaceSeparatedCharset(${varExpr}, ${dartString(check.charset)})`, helpers }; } case 'uri_scheme': { helpers.add('_checkUriScheme'); diff --git a/src/emit-java.ts b/src/emit-java.ts index 41a4871..7ed67ee 100644 --- a/src/emit-java.ts +++ b/src/emit-java.ts @@ -1338,6 +1338,7 @@ function emitJavaHelpers(helpers: Set): string { if (helpers.has('checkIdentifier')) { if (lines.length > 0) lines.push(''); lines.push(' private static boolean checkIdentifier(String s, String firstCharset, String restCharset, char prefix) {'); + lines.push(' if (s == null) return false;'); lines.push(' int i = 0;'); lines.push(' if (prefix != 0 && i < s.length() && s.charAt(i) == prefix) i++;'); lines.push(' if (i >= s.length()) return false;'); @@ -1353,7 +1354,7 @@ function emitJavaHelpers(helpers: Set): string { if (helpers.has('checkSpaceSeparatedCharset')) { if (lines.length > 0) lines.push(''); lines.push(' private static boolean checkSpaceSeparatedCharset(String s, String charset) {'); - lines.push(' if (s.isEmpty()) return false;'); + lines.push(' if (s == null || s.isEmpty()) return false;'); lines.push(' int i = 0;'); lines.push(' if (charset.indexOf(s.charAt(i)) < 0) return false;'); lines.push(' while (i < s.length() && charset.indexOf(s.charAt(i)) >= 0) i++;'); @@ -1369,7 +1370,7 @@ function emitJavaHelpers(helpers: Set): string { if (helpers.has('checkUriScheme')) { if (lines.length > 0) lines.push(''); lines.push(' private static boolean checkUriScheme(String s) {'); - lines.push(' if (s.length() < 4) return false;'); + lines.push(' if (s == null || s.length() < 4) return false;'); lines.push(' char c = s.charAt(0);'); lines.push(" if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) return false;"); lines.push(' int i = 1;'); diff --git a/src/emit-php.ts b/src/emit-php.ts index bd96e32..ba75ff9 100644 --- a/src/emit-php.ts +++ b/src/emit-php.ts @@ -237,16 +237,16 @@ function renderPatternCheckPhp(check: PatternCheck, varExpr: string): { expr: st return { expr: `schemata_check_prefix_delim_rest(${varExpr}, ${phpString(check.charset)}, ${phpString(check.delimiter)})`, helpers }; } case 'identifier': { - helpers.add('check_identifier'); - return { expr: `check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, ${JSON.stringify(check.optionalPrefix)}` : ''})`, helpers }; + helpers.add('schemata_check_identifier'); + return { expr: `schemata_check_identifier(${varExpr}, ${phpString(check.firstCharset)}, ${phpString(check.restCharset)}${check.optionalPrefix ? `, ${phpString(check.optionalPrefix)}` : ''})`, helpers }; } case 'space_separated_charset': { - helpers.add('check_space_separated_charset'); - return { expr: `check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers }; + helpers.add('schemata_check_space_separated_charset'); + return { expr: `schemata_check_space_separated_charset(${varExpr}, ${phpString(check.charset)})`, helpers }; } case 'uri_scheme': { - helpers.add('check_uri_scheme'); - return { expr: `check_uri_scheme(${varExpr})`, helpers }; + helpers.add('schemata_check_uri_scheme'); + return { expr: `schemata_check_uri_scheme(${varExpr})`, helpers }; } case 'compound': { const allHelpers = new Set(); @@ -1431,9 +1431,9 @@ function emitPhpHelpers(helpers: Set): string { lines.push(''); } - if (helpers.has('check_identifier')) { + if (helpers.has('schemata_check_identifier')) { if (lines.length > 0) lines.push(''); - lines.push("function check_identifier(string $s, string $first_charset, string $rest_charset, string $prefix = ''): bool {"); + lines.push("function schemata_check_identifier(string $s, string $first_charset, string $rest_charset, string $prefix = ''): bool {"); lines.push(' $i = 0;'); lines.push(' $len = strlen($s);'); lines.push(" if ($prefix !== '' && $i < $len && $s[$i] === $prefix) $i++;"); @@ -1447,9 +1447,9 @@ function emitPhpHelpers(helpers: Set): string { lines.push('}'); } - if (helpers.has('check_space_separated_charset')) { + if (helpers.has('schemata_check_space_separated_charset')) { if (lines.length > 0) lines.push(''); - lines.push('function check_space_separated_charset(string $s, string $charset): bool {'); + lines.push('function schemata_check_space_separated_charset(string $s, string $charset): bool {'); lines.push(' $len = strlen($s);'); lines.push(' if ($len === 0) return false;'); lines.push(' $i = 0;'); @@ -1464,9 +1464,9 @@ function emitPhpHelpers(helpers: Set): string { lines.push('}'); } - if (helpers.has('check_uri_scheme')) { + if (helpers.has('schemata_check_uri_scheme')) { if (lines.length > 0) lines.push(''); - lines.push('function check_uri_scheme(string $s): bool {'); + lines.push('function schemata_check_uri_scheme(string $s): bool {'); lines.push(' $len = strlen($s);'); lines.push(' if ($len < 4) return false;'); lines.push(' $c = $s[0];'); diff --git a/src/emit-python.ts b/src/emit-python.ts index f8d1e2e..9f037d2 100644 --- a/src/emit-python.ts +++ b/src/emit-python.ts @@ -200,7 +200,7 @@ function renderPatternCheckPython(check: PatternCheck, varExpr: string): { expr: } case 'hex_alternation': { const fns = check.lengths.map(len => { - const fn = check.case === 'lower' ? `_check_hex${len}` : `_check_hex${len}_mixed`; + const fn = check.case === 'lower' ? `_check_hex_${len}` : `_check_hex_${len}_mixed`; helpers.add(fn); return `${fn}(${varExpr})`; }); diff --git a/src/emit-ruby.ts b/src/emit-ruby.ts index 565e90c..678b4c4 100644 --- a/src/emit-ruby.ts +++ b/src/emit-ruby.ts @@ -204,7 +204,7 @@ function renderPatternCheckRuby(check: PatternCheck, varExpr: string): { expr: s } case 'hex_alternation': { const fns = check.lengths.map(len => { - const fn = check.case === 'lower' ? `check_hex${len}` : `check_hex${len}_mixed`; + const fn = check.case === 'lower' ? `check_hex_${len}` : `check_hex_${len}_mixed`; helpers.add(fn); return `${fn}(${varExpr})`; }); diff --git a/src/emit-rust.ts b/src/emit-rust.ts index 49f0a0d..406d91b 100644 --- a/src/emit-rust.ts +++ b/src/emit-rust.ts @@ -258,7 +258,7 @@ function renderPatternCheckRust(check: PatternCheck, varExpr: string): { expr: s } case 'hex_alternation': { const fns = check.lengths.map(len => { - const fn = check.case === 'lower' ? `check_hex${len}` : `check_hex${len}_mixed`; + const fn = check.case === 'lower' ? `check_hex_${len}` : `check_hex_${len}_mixed`; helpers.add(fn); return `${fn}(${varExpr})`; });