From f301811db11bb21f9d92739526c28b3e116c31df Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 5 Jun 2026 12:50:10 +0200 Subject: [PATCH] fix: issues in atom extraction from regular expressions. (#673) This PR optimizes the atom extraction process to prevent generating an excessive number of atoms for patterns with optional suffixes. It also fixes a bug where exact prefix atoms could prematurely match and bypass the regex VM, leading to incorrect (non-greedy) match results. Summary of Changes: (1) Modified `concat_seq` in `lib/src/re/thompson/compiler.rs` to identify and ignore optional sequences at the tail of the concatenation when extracting candidate atoms. For example, in `(com|net)[^{}]{0,100}`, the `[^{}]{0,100}` suffix is optional and can be empty. We now stop concatenating before this suffix, avoiding the generation of a massive number of atoms. If any tail sequences are skipped, the resulting atoms are correctly marked as inexact to force the regex VM to run and verify the optional suffix. (2) Modified `seq_to_atoms` in the same file to prune the sorted atom list, comparing each atom with the one that follows it. If a shorter atom is a prefix of a longer atom (e.g., `ab` and `abc`), the shorter one is made inexact. This forces the engine to run the regex VM to ensure greedy matching (e.g., matching `abbb` instead of stopping at `abb` for `/a(bb|b)b/`). Since the shorter inexact atom will trigger the VM to match the longer path anyway, the longer atom (e.g., `abc`) is redundant and is completely removed from the set. If two atoms have the same bytes but different exactness (one exact and one inexact), the exact one is removed. 
--- lib/src/re/thompson/compiler.rs | 57 ++++++++++++++++++++++++++++++++- lib/src/re/thompson/tests.rs | 17 +++++----- lib/src/tests/mod.rs | 8 ----- 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/lib/src/re/thompson/compiler.rs b/lib/src/re/thompson/compiler.rs index 748f05495..8288aba9b 100644 --- a/lib/src/re/thompson/compiler.rs +++ b/lib/src/re/thompson/compiler.rs @@ -6,6 +6,7 @@ More specifically, the compiler produces two instruction sequences, one that matches the regexp left-to-right, and another one that matches right-to-left. */ +use itertools::Itertools; use std::collections::HashMap; use std::collections::hash_map::Entry; use std::fmt::{Display, Formatter}; @@ -1876,11 +1877,30 @@ fn concat_seq(seqs: &[Seq]) -> Option { _ => {} } + // Count of the number of sequences at the tail that can be empty. + // For instance, if we have sequences [s1, s2, s3], the result will + // be 2 if both s2 and s3 can be empty. + let empty_tail = seqs + .iter() + .rev() + .map_while(|seq| { + if matches!(seq.min_literal_len(), Some(x) if x == 0) { + Some(seq) + } else { + None + } + }) + .count(); + + // The sequences that can be empty at the tail won't be candidates for + // concatenation. + let seqs_considered = seqs.len() - empty_tail; + let mut seqs_added = 0; let mut total_min_literal_len = 0; let mut result = Seq::singleton(hir::literal::Literal::exact(vec![])); - for seq in seqs.iter() { + for seq in seqs.iter().take(seqs_considered) { match seq.min_literal_len() { Some(min_literal_len) => { // If the cross product of `result` with `seq` produces too many @@ -2010,6 +2030,41 @@ fn seq_to_atoms(seq: Seq) -> Option> { atoms.sort(); atoms.dedup(); + // For any pair of atoms, if one is a prefix of the other, the shorter + // one must be made inexact, and the longer one can be completely removed. + // + // Since the atoms are sorted lexicographically, any prefix of an atom + // must be adjacent to it in the sorted list. 
+ let mut to_make_inexact = Vec::new(); + let mut to_remove = Vec::new(); + + for ((atom_idx, atom), (next_idx, next)) in + atoms.iter().map(|atom| atom.as_ref()).enumerate().tuple_windows() + { + if atom == next { + // If they have the same bytes, the exact one (which sorts + // after the inexact one) must be removed. + to_remove.push(next_idx); + } else if next.starts_with(atom) { + // If the next atom contains the current one as a prefix, + // the next one must be removed and the current one marked + // as inexact. + to_make_inexact.push(atom_idx); + to_remove.push(next_idx); + } + } + + for idx in to_make_inexact { + atoms[idx].make_inexact(); + } + + // Since to_remove was populated in ascending order, by iterating it + // in reverse order we get indexes in descending order to safely remove + // elements without index shifting. + for idx in to_remove.into_iter().rev() { + atoms.remove(idx); + } + Some(atoms) } diff --git a/lib/src/re/thompson/tests.rs b/lib/src/re/thompson/tests.rs index 4d4fd5f39..d7b8773b6 100644 --- a/lib/src/re/thompson/tests.rs +++ b/lib/src/re/thompson/tests.rs @@ -887,19 +887,14 @@ fn re_code_17() { 00049: LIT 0x61 0004a: MATCH "#, - // Atoms vec![ RegexpAtom { atom: Atom::inexact(vec![]), - code_loc: CodeLoc { fwd: 0, bck_seq_id: 0, bck: 0x4A }, + code_loc: CodeLoc { fwd: 0x0D, bck_seq_id: 0, bck: 0x41 }, }, RegexpAtom { atom: Atom::inexact(vec![0x61, 0x62, 0x63]), - code_loc: CodeLoc { fwd: 0, bck_seq_id: 0, bck: 0x4A }, - }, - RegexpAtom { - atom: Atom::inexact(vec![0x61, 0x62, 0x63, 0x61]), - code_loc: CodeLoc { fwd: 0, bck_seq_id: 0, bck: 0x4A }, + code_loc: CodeLoc { fwd: 0x13, bck_seq_id: 0, bck: 0x4A }, }, ], // Epsilon closure starting at forward code 0. 
@@ -1337,7 +1332,6 @@ fn re_atoms() { r#"ab.*cd"#, vec![ Atom::inexact(b"ab"), - Atom::exact("abcd"), ] ); @@ -1521,6 +1515,13 @@ fn re_atoms() { v }); + assert_re_atoms!(r#"(com|net)[^{}]{0,100}"#, { + vec![ + Atom::inexact(b"com"), + Atom::inexact(b"net"), + ] + }); + assert_re_atoms!( r#"(?s)abc.d(((xy|xz)w.)|[a-c])(((xy|xz)w.)|[a-c])"#, vec![Atom::inexact(b"abc")] diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs index b0595e0df..bc5aa2206 100644 --- a/lib/src/tests/mod.rs +++ b/lib/src/tests/mod.rs @@ -1355,15 +1355,7 @@ fn regexp_patterns_1() { pattern_match!(r#"/a(.*)*/"#, b"a", b"a"); pattern_match!(r#"/a(.*){2}/"#, b"a", b"a"); pattern_match!(r#"/a(.*){2,4}/"#, b"a", b"a"); - - // TODO: known issue related to exact atoms. The matching string - // should be "abbb" and not "abb". When the `exact-atoms` feature - // is disabled it works correctly. - #[cfg(not(feature = "exact-atoms"))] pattern_match!(r#"/a(bb|b)b/"#, b"abbbbbbbb", b"abbb"); - #[cfg(feature = "exact-atoms")] - pattern_match!(r#"/a(bb|b)b/"#, b"abbbbbbbb", b"abb"); - pattern_match!(r#"/a(b|bb)b/"#, b"abbbbbbbb", b"abb"); pattern_match!(