Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 56 additions & 1 deletion lib/src/re/thompson/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ More specifically, the compiler produces two instruction sequences, one that
matches the regexp left-to-right, and another one that matches right-to-left.
*/

use itertools::Itertools;
use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::fmt::{Display, Formatter};
Expand Down Expand Up @@ -1876,11 +1877,30 @@ fn concat_seq(seqs: &[Seq]) -> Option<Seq> {
_ => {}
}

// Count of the number of sequences at the tail that can be empty.
// For instance, if we have sequences [s1, s2, s3], the result will
// be 2 if both s2 and s3 can be empty.
let empty_tail = seqs
.iter()
.rev()
.map_while(|seq| {
if matches!(seq.min_literal_len(), Some(x) if x == 0) {
Some(seq)
} else {
None
}
})
.count();

// The sequences that can be empty at the tail won't be candidates for
// concatenation.
let seqs_considered = seqs.len() - empty_tail;

let mut seqs_added = 0;
let mut total_min_literal_len = 0;
let mut result = Seq::singleton(hir::literal::Literal::exact(vec![]));

for seq in seqs.iter() {
for seq in seqs.iter().take(seqs_considered) {
match seq.min_literal_len() {
Some(min_literal_len) => {
// If the cross product of `result` with `seq` produces too many
Expand Down Expand Up @@ -2010,6 +2030,41 @@ fn seq_to_atoms(seq: Seq) -> Option<Vec<Atom>> {
atoms.sort();
atoms.dedup();

// For any pair of atoms, if one is a prefix of the other, the shorter
// one must be made inexact, and the longer one can be completely removed.
//
// Since the atoms are sorted lexicographically, a prefix of an atom always
// sorts before it; only adjacent pairs in the sorted list are compared here.
let mut to_make_inexact = Vec::new();
let mut to_remove = Vec::new();

for ((atom_idx, atom), (next_idx, next)) in
atoms.iter().map(|atom| atom.as_ref()).enumerate().tuple_windows()
{
if atom == next {
// If they have the same bytes, the exact one (which sorts
// after the inexact one) must be removed.
to_remove.push(next_idx);
} else if next.starts_with(atom) {
// If the next atom contains the current one as a prefix,
// the next one must be removed and the current one marked
// as inexact.
to_make_inexact.push(atom_idx);
to_remove.push(next_idx);
}
}

for idx in to_make_inexact {
atoms[idx].make_inexact();
}

// Since to_remove was populated in ascending order, by iterating it
// in reverse order we get indexes in descending order to safely remove
// elements without index shifting.
for idx in to_remove.into_iter().rev() {
atoms.remove(idx);
}

Some(atoms)
}

Expand Down
17 changes: 9 additions & 8 deletions lib/src/re/thompson/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -887,19 +887,14 @@ fn re_code_17() {
00049: LIT 0x61
0004a: MATCH
"#,
// Atoms
vec![
RegexpAtom {
atom: Atom::inexact(vec![]),
code_loc: CodeLoc { fwd: 0, bck_seq_id: 0, bck: 0x4A },
code_loc: CodeLoc { fwd: 0x0D, bck_seq_id: 0, bck: 0x41 },
},
RegexpAtom {
atom: Atom::inexact(vec![0x61, 0x62, 0x63]),
code_loc: CodeLoc { fwd: 0, bck_seq_id: 0, bck: 0x4A },
},
RegexpAtom {
atom: Atom::inexact(vec![0x61, 0x62, 0x63, 0x61]),
code_loc: CodeLoc { fwd: 0, bck_seq_id: 0, bck: 0x4A },
code_loc: CodeLoc { fwd: 0x13, bck_seq_id: 0, bck: 0x4A },
},
],
// Epsilon closure starting at forward code 0.
Expand Down Expand Up @@ -1337,7 +1332,6 @@ fn re_atoms() {
r#"ab.*cd"#,
vec![
Atom::inexact(b"ab"),
Atom::exact("abcd"),
]
);

Expand Down Expand Up @@ -1521,6 +1515,13 @@ fn re_atoms() {
v
});

assert_re_atoms!(r#"(com|net)[^{}]{0,100}"#, {
vec![
Atom::inexact(b"com"),
Atom::inexact(b"net"),
]
});

assert_re_atoms!(
r#"(?s)abc.d(((xy|xz)w.)|[a-c])(((xy|xz)w.)|[a-c])"#,
vec![Atom::inexact(b"abc")]
Expand Down
8 changes: 0 additions & 8 deletions lib/src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1355,15 +1355,7 @@ fn regexp_patterns_1() {
pattern_match!(r#"/a(.*)*/"#, b"a", b"a");
pattern_match!(r#"/a(.*){2}/"#, b"a", b"a");
pattern_match!(r#"/a(.*){2,4}/"#, b"a", b"a");

// TODO: known issue related to exact atoms. The matching string
// should be "abbb" and not "abb". When the `exact-atoms` feature
// is disabled it works correctly.
#[cfg(not(feature = "exact-atoms"))]
pattern_match!(r#"/a(bb|b)b/"#, b"abbbbbbbb", b"abbb");
#[cfg(feature = "exact-atoms")]
pattern_match!(r#"/a(bb|b)b/"#, b"abbbbbbbb", b"abb");

pattern_match!(r#"/a(b|bb)b/"#, b"abbbbbbbb", b"abb");

pattern_match!(
Expand Down
Loading