Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions src/classify-pattern.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ export type PatternCheck =
| { op: 'space_separated_tokens' }
| { op: 'starts_with_charset'; charset: string }
| { op: 'base64' }
| { op: 'hex_alternation'; lengths: number[]; case: 'lower' | 'mixed' }
| { op: 'base64_2pad' }
| { op: 'identifier'; optionalPrefix?: string; firstCharset: string; restCharset: string }
| { op: 'space_separated_charset'; charset: string }
| { op: 'uri_scheme' }
| { op: 'nostr_uri' }
| { op: 'nip04_encrypted' }
| { op: 'nip05_identifier' }
Expand Down Expand Up @@ -102,6 +107,24 @@ export function classifyRegex(pattern: string): PatternCheck {
}
}

// Multi-length hex alternation: ^(?:[a-f0-9]{64}|[a-f0-9]{96}|[a-f0-9]{128})$
{
const m = pattern.match(
/^\^\(\?:(\[(?:a-f0-9|a-fA-F0-9|0-9a-f|0-9a-fA-F)\]\{\d+\}(?:\|\[(?:a-f0-9|a-fA-F0-9|0-9a-f|0-9a-fA-F)\]\{\d+\})+)\)\$$/
);
if (m) {
// Verify all branches share the same case policy — reject mixed policies
const branches = m[1].split('|');
const branchCases = branches.map(b => b.includes('A-F') ? 'mixed' : 'lower');
const allSame = branchCases.every(c => c === branchCases[0]);
if (allSame) {
const lengths = [...m[1].matchAll(/\{(\d+)\}/g)].map(mm => parseInt(mm[1], 10));
return { op: 'hex_alternation', lengths, case: branchCases[0] };
}
// Mixed case policies across branches — fall through to regex
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

// Range-length hex: ^[a-f0-9]{7,40}$
{
const m = pattern.match(/^\^\[(?:a-f0-9|a-fA-F0-9|0-9a-f|0-9a-fA-F)\]\{(\d+),(\d+)\}\$$/);
Expand Down Expand Up @@ -409,6 +432,11 @@ export function classifyRegex(pattern: string): PatternCheck {
return { op: 'base64' };
}

// Base64 strict 2-pad: ^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==)$
if (pattern === '^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==)$') {
return { op: 'base64_2pad' };
}

// Nostr URI: ^nostr:((npub|note)1[02-9ac-hj-np-z]{58}|(nprofile|nevent|naddr)1[02-9ac-hj-np-z]+)$
if (pattern === '^nostr:((npub|note)1[02-9ac-hj-np-z]{58}|(nprofile|nevent|naddr)1[02-9ac-hj-np-z]+)$') {
return { op: 'nostr_uri' };
Expand Down Expand Up @@ -439,6 +467,48 @@ export function classifyRegex(pattern: string): PatternCheck {
return { op: 'prefix_delim_rest', charset: expandCharset('a-zA-Z0-9') + '_-', delimiter: ': ' };
}

// Identifier: ^[optionalPrefix]?[firstCharset][restCharset]*$
// Covers: ^[a-z][a-z0-9]*$, ^[A-Z][a-zA-Z0-9]*$, ^[a-z][a-z0-9-]*$,
// ^!?[a-z][a-z0-9]*$, ^!?[0-9]+$
// A MANDATORY prefix (e.g. ^![a-z][a-z0-9]*$) is deliberately not matched here:
// the 'identifier' op can only express an optional prefix, so classifying such a
// pattern would silently drop the required '!'. Those patterns fall through to
// the later checks and ultimately the regex fallback.
{
  // Group 1 is '!' only when the complete "!?" token is present, otherwise undefined.
  // Group 2 is the first charset; group 3 is either "[rest]*" or "+"; group 4 is the
  // rest charset (only set for the "[rest]*" form).
  const m = pattern.match(/^\^(?:(!)\?)?\[([A-Za-z0-9-]+)\](\[([A-Za-z0-9-]+)\]\*|\+)\$$/);
  if (m) {
    const optionalPrefix: string | undefined = m[1];
    const firstCharset = expandCharset(m[2]);

    if (m[3] === '+') {
      // ^[charset]+$ or ^!?[charset]+$
      if (optionalPrefix !== undefined) {
        // ^!?[0-9]+$ — identifier with optional prefix, same first and rest charset
        return { op: 'identifier', optionalPrefix, firstCharset, restCharset: firstCharset };
      }
      // ^[charset]+$ — equivalent to chars_in, already handled above; skip
    } else {
      // ^[firstCharset][restCharset]*$ or ^!?[firstCharset][restCharset]*$
      const restCharset = expandCharset(m[4]);
      if (optionalPrefix !== undefined) {
        return { op: 'identifier', optionalPrefix, firstCharset, restCharset };
      }
      return { op: 'identifier', firstCharset, restCharset };
    }
  }
}

// Space-separated charset: ^[charset]+( [charset]+)*$
{
const m = pattern.match(/^\^\[([A-Za-z0-9_-]+)\]\+\( \[([A-Za-z0-9_-]+)\]\+\)\*\$$/);
if (m && m[1] === m[2]) {
return { op: 'space_separated_charset', charset: expandCharset(m[1]) };
}
}

// URI scheme: ^[A-Za-z][A-Za-z0-9+.-]*://
if (pattern === '^[A-Za-z][A-Za-z0-9+.-]*://') {
return { op: 'uri_scheme' };
}

// Fallback: preserve original regex
return { op: 'regex', pattern };
}
Expand Down Expand Up @@ -639,11 +709,16 @@ export function isNativeCheck(check: PatternCheck): boolean {
case 'space_separated_tokens':
case 'starts_with_charset':
case 'base64':
case 'hex_alternation':
case 'base64_2pad':
case 'nostr_uri':
case 'nip04_encrypted':
case 'nip05_identifier':
case 'mime_type_strict':
case 'prefix_delim_rest':
case 'identifier':
case 'space_separated_charset':
case 'uri_scheme':
return true;
case 'compound':
return check.checks.every(isNativeCheck);
Expand Down
92 changes: 92 additions & 0 deletions src/emit-c.ts
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,19 @@ function renderPatternCheckC(check: PatternCheck, varExpr: string): { expr: stri
helpers.add('schemata_check_base64');
return { expr: `schemata_check_base64(${varExpr})`, helpers };
}
case 'hex_alternation': {
const fns = check.lengths.map(len => {
const fn = check.case === 'lower' ? `schemata_check_hex${len}` : `schemata_check_hex${len}_mixed`;
helpers.add(fn);
return `${fn}(${varExpr})`;
});
return { expr: `(${fns.join(' || ')})`, helpers };
}
case 'base64_2pad': {
helpers.add('schemata_check_base64_2pad');
helpers.add('schemata_check_base64'); // for schemata_is_b64_char
return { expr: `schemata_check_base64_2pad(${varExpr})`, helpers };
}
case 'nostr_uri': {
helpers.add('schemata_check_nostr_uri');
return { expr: `schemata_check_nostr_uri(${varExpr})`, helpers };
Expand All @@ -330,6 +343,18 @@ function renderPatternCheckC(check: PatternCheck, varExpr: string): { expr: stri
helpers.add('schemata_check_prefix_delim_rest');
return { expr: `schemata_check_prefix_delim_rest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers };
}
case 'identifier': {
helpers.add('schemata_check_identifier');
return { expr: `schemata_check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : ', 0'})`, helpers };
}
case 'space_separated_charset': {
helpers.add('schemata_check_space_separated_charset');
return { expr: `schemata_check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers };
}
case 'uri_scheme': {
helpers.add('schemata_check_uri_scheme');
return { expr: `schemata_check_uri_scheme(${varExpr})`, helpers };
}
case 'compound': {
const allHelpers = new Set<string>();
const parts: string[] = [];
Expand Down Expand Up @@ -1496,6 +1521,22 @@ function emitHelperFunctions(helpers: Set<string>): string {
lines.push('');
}

if (helpers.has('schemata_check_base64_2pad')) {
lines.push('/* strict base64 with mandatory 2-char padding */');
lines.push('static int schemata_check_base64_2pad(const char *s) {');
lines.push(' if (!s) return 0;');
lines.push(' size_t len = strlen(s);');
lines.push(' if (len < 4 || len % 4 != 0) return 0;');
lines.push(" if (s[len - 1] != '=' || s[len - 2] != '=') return 0;");
lines.push(' size_t i;');
lines.push(' for (i = 0; i < len - 2; i++) {');
lines.push(' if (!schemata_is_b64_char(s[i])) return 0;');
lines.push(' }');
lines.push(' return 1;');
lines.push('}');
lines.push('');
}

if (helpers.has('schemata_check_nostr_uri')) {
lines.push('static int schemata_is_bech32_data_char(char c) {');
lines.push(" return (c >= '0' && c <= '9' && c != '1') || (c >= 'a' && c <= 'z' && c != 'b' && c != 'i' && c != 'o');");
Expand Down Expand Up @@ -1638,5 +1679,56 @@ function emitHelperFunctions(helpers: Set<string>): string {
lines.push('');
}

// Emits the C helper for the 'identifier' check: an optional single-character
// prefix, then one character from first_charset, then any number of characters
// from rest_charset. A prefix of 0 means "no prefix".
if (helpers.has('schemata_check_identifier')) {
  if (lines.length > 0) lines.push('');
  lines.push('static int schemata_check_identifier(const char *s, const char *first_charset, const char *rest_charset, char prefix) {');
  // NULL guard, matching schemata_check_base64_2pad: strlen(NULL) is undefined behavior.
  lines.push(' if (!s) return 0;');
  lines.push(' size_t len = strlen(s);');
  lines.push(' size_t i = 0;');
  lines.push(' if (prefix && i < len && s[i] == prefix) i++;');
  // At least one character must remain after the optional prefix.
  lines.push(' if (i >= len) return 0;');
  lines.push(' if (!strchr(first_charset, s[i])) return 0;');
  lines.push(' i++;');
  lines.push(' for (; i < len; i++) {');
  lines.push(' if (!strchr(rest_charset, s[i])) return 0;');
  lines.push(' }');
  lines.push(' return 1;');
  lines.push('}');
}

// Emits the C helper for the 'space_separated_charset' check: one or more
// non-empty runs of charset characters, separated by exactly one space each.
if (helpers.has('schemata_check_space_separated_charset')) {
  if (lines.length > 0) lines.push('');
  lines.push('static int schemata_check_space_separated_charset(const char *s, const char *charset) {');
  // NULL guard, matching schemata_check_base64_2pad: strlen(NULL) is undefined behavior.
  lines.push(' if (!s) return 0;');
  lines.push(' size_t len = strlen(s);');
  lines.push(' if (len == 0) return 0;');
  lines.push(' size_t i = 0;');
  // The first token must be non-empty.
  lines.push(' if (!strchr(charset, s[i])) return 0;');
  lines.push(' while (i < len && strchr(charset, s[i])) i++;');
  // Each space must be followed by another non-empty token.
  lines.push(" while (i < len && s[i] == ' ') {");
  lines.push(' i++;');
  lines.push(' if (i >= len || !strchr(charset, s[i])) return 0;');
  lines.push(' while (i < len && strchr(charset, s[i])) i++;');
  lines.push(' }');
  // Anything left over is a character outside the charset.
  lines.push(' return i == len;');
  lines.push('}');
}

// Emits the C helper for the 'uri_scheme' check: an ASCII letter, then any
// number of letters, digits, '+', '.' or '-', immediately followed by "://".
// Only this leading part is checked; whatever follows "://" is not inspected.
if (helpers.has('schemata_check_uri_scheme')) {
  if (lines.length > 0) lines.push('');
  lines.push('static int schemata_check_uri_scheme(const char *s) {');
  // NULL guard, matching schemata_check_base64_2pad: strlen(NULL) is undefined behavior.
  lines.push(' if (!s) return 0;');
  lines.push(' size_t len = strlen(s);');
  // Shortest possible match is one scheme letter plus "://".
  lines.push(' if (len < 4) return 0;');
  lines.push(' char c = s[0];');
  lines.push(" if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) return 0;");
  lines.push(' size_t i = 1;');
  lines.push(' while (i < len) {');
  lines.push(' c = s[i];');
  lines.push(" if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-') i++;");
  lines.push(' else break;');
  lines.push(' }');
  // Ensure the three characters of "://" are in bounds before indexing them.
  lines.push(" if (i + 3 > len) return 0;");
  lines.push(" return s[i] == ':' && s[i+1] == '/' && s[i+2] == '/';");
  lines.push('}');
}

return lines.join('\n');
}
86 changes: 86 additions & 0 deletions src/emit-cpp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,19 @@ function renderPatternCheckCpp(check: PatternCheck, varExpr: string): { expr: st
helpers.add('check_base64');
return { expr: `check_base64(${varExpr})`, helpers };
}
case 'hex_alternation': {
const fns = check.lengths.map(len => {
const fn = check.case === 'lower' ? `check_hex_${len}` : `check_hex_${len}_mixed`;
helpers.add(fn);
return `${fn}(${varExpr})`;
});
return { expr: `(${fns.join(' || ')})`, helpers };
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
case 'base64_2pad': {
helpers.add('check_base64_2pad');
helpers.add('check_base64'); // for is_b64_char
return { expr: `check_base64_2pad(${varExpr})`, helpers };
}
case 'nostr_uri': {
helpers.add('check_nostr_uri');
return { expr: `check_nostr_uri(${varExpr})`, helpers };
Expand All @@ -225,6 +238,18 @@ function renderPatternCheckCpp(check: PatternCheck, varExpr: string): { expr: st
helpers.add('check_prefix_delim_rest');
return { expr: `check_prefix_delim_rest(${varExpr}, ${JSON.stringify(check.charset)}, ${JSON.stringify(check.delimiter)})`, helpers };
}
case 'identifier': {
helpers.add('check_identifier');
return { expr: `check_identifier(${varExpr}, ${JSON.stringify(check.firstCharset)}, ${JSON.stringify(check.restCharset)}${check.optionalPrefix ? `, '${check.optionalPrefix}'` : ', 0'})`, helpers };
}
case 'space_separated_charset': {
helpers.add('check_space_separated_charset');
return { expr: `check_space_separated_charset(${varExpr}, ${JSON.stringify(check.charset)})`, helpers };
}
case 'uri_scheme': {
helpers.add('check_uri_scheme');
return { expr: `check_uri_scheme(${varExpr})`, helpers };
}
case 'compound': {
const allHelpers = new Set<string>();
const parts: string[] = [];
Expand Down Expand Up @@ -1136,6 +1161,19 @@ function emitCppHelpers(helpers: Set<string>): string {
lines.push('');
}

if (helpers.has('check_base64_2pad')) {
lines.push('/* strict base64 with mandatory 2-char padding */');
lines.push('inline bool check_base64_2pad(const std::string& s) {');
lines.push(' if (s.size() < 4 || s.size() % 4 != 0) return false;');
lines.push(" if (s[s.size() - 1] != '=' || s[s.size() - 2] != '=') return false;");
lines.push(' for (size_t i = 0; i < s.size() - 2; i++) {');
lines.push(' if (!is_b64_char(s[i])) return false;');
lines.push(' }');
lines.push(' return true;');
lines.push('}');
lines.push('');
}

if (helpers.has('check_nostr_uri')) {
lines.push('inline bool is_bech32_data_char(char c) {');
lines.push(" return (c >= '0' && c <= '9' && c != '1') || (c >= 'a' && c <= 'z' && c != 'b' && c != 'i' && c != 'o');");
Expand Down Expand Up @@ -1299,5 +1337,53 @@ function emitCppHelpers(helpers: Set<string>): string {
lines.push('');
}

// Emits the C++ helper for the 'identifier' check: an optional single-character
// prefix, then one character from first_charset, then any number of characters
// from rest_charset. A prefix of 0 (the default) means "no prefix".
if (helpers.has('check_identifier')) {
  if (lines.length > 0) lines.push('');
  // Emitted as `inline`, like check_base64_2pad and is_bech32_data_char, so all
  // generated helpers share one linkage convention.
  lines.push('inline bool check_identifier(const std::string& s, const std::string& first_charset, const std::string& rest_charset, char prefix = 0) {');
  lines.push(' size_t i = 0;');
  lines.push(' if (prefix && i < s.size() && s[i] == prefix) i++;');
  // At least one character must remain after the optional prefix.
  lines.push(' if (i >= s.size()) return false;');
  lines.push(' if (first_charset.find(s[i]) == std::string::npos) return false;');
  lines.push(' i++;');
  lines.push(' for (; i < s.size(); i++) {');
  lines.push(' if (rest_charset.find(s[i]) == std::string::npos) return false;');
  lines.push(' }');
  lines.push(' return true;');
  lines.push('}');
}

// Emits the C++ helper for the 'space_separated_charset' check: one or more
// non-empty runs of charset characters, separated by exactly one space each.
if (helpers.has('check_space_separated_charset')) {
  if (lines.length > 0) lines.push('');
  // Emitted as `inline`, like check_base64_2pad and is_bech32_data_char, so all
  // generated helpers share one linkage convention.
  lines.push('inline bool check_space_separated_charset(const std::string& s, const std::string& charset) {');
  lines.push(' if (s.empty()) return false;');
  lines.push(' size_t i = 0;');
  // The first token must be non-empty.
  lines.push(' if (charset.find(s[i]) == std::string::npos) return false;');
  lines.push(' while (i < s.size() && charset.find(s[i]) != std::string::npos) i++;');
  // Each space must be followed by another non-empty token.
  lines.push(" while (i < s.size() && s[i] == ' ') {");
  lines.push(' i++;');
  lines.push(' if (i >= s.size() || charset.find(s[i]) == std::string::npos) return false;');
  lines.push(' while (i < s.size() && charset.find(s[i]) != std::string::npos) i++;');
  lines.push(' }');
  // Anything left over is a character outside the charset.
  lines.push(' return i == s.size();');
  lines.push('}');
}

// Emits the C++ helper for the 'uri_scheme' check: an ASCII letter, then any
// number of letters, digits, '+', '.' or '-', immediately followed by "://".
// Only this leading part is checked; whatever follows "://" is not inspected.
if (helpers.has('check_uri_scheme')) {
  if (lines.length > 0) lines.push('');
  // Emitted as `inline`, like check_base64_2pad and is_bech32_data_char, so all
  // generated helpers share one linkage convention.
  lines.push('inline bool check_uri_scheme(const std::string& s) {');
  // Shortest possible match is one scheme letter plus "://".
  lines.push(' if (s.size() < 4) return false;');
  lines.push(' char c = s[0];');
  lines.push(" if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) return false;");
  lines.push(' size_t i = 1;');
  lines.push(' while (i < s.size()) {');
  lines.push(' c = s[i];');
  lines.push(" if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-') i++;");
  lines.push(' else break;');
  lines.push(' }');
  // Ensure the three characters of "://" are in bounds before indexing them.
  lines.push(' if (i + 3 > s.size()) return false;');
  lines.push(" return s[i] == ':' && s[i+1] == '/' && s[i+2] == '/';");
  lines.push('}');
}

return lines.join('\n');
}
Loading
Loading