From b5952cfbb06ed7f3e838fca220234dfa0010f00b Mon Sep 17 00:00:00 2001 From: Testspieler09 Date: Sun, 4 May 2025 15:15:51 +0200 Subject: [PATCH 1/8] feat(glushkov): adjust base project to the new construction --- src/glushkov.rs | 50 +++++++ src/lib.rs | 3 +- src/regex_engine.rs | 257 ++++++++++++++++++++++++++++++++++-- src/{dfa.rs => thompson.rs} | 228 +------------------------------- tests/test_one.rs | 8 +- 5 files changed, 302 insertions(+), 244 deletions(-) create mode 100644 src/glushkov.rs rename src/{dfa.rs => thompson.rs} (78%) diff --git a/src/glushkov.rs b/src/glushkov.rs new file mode 100644 index 0000000..d6d7342 --- /dev/null +++ b/src/glushkov.rs @@ -0,0 +1,50 @@ +use crate::regex_engine::{is_valid_regex, normalise_regex}; +use std::collections::{HashMap, HashSet}; + +struct NFA { + transitions: HashMap<(u32, Option), Vec>, + accepting_state: u32, +} + +pub struct DFA { + transitions: HashMap<(u32, Option), u32>, + accepting_states: HashSet, +} + +// GLUSHKOV CONSTRUCTION +fn glushkov_construction(regex: &str) -> NFA { + // TODO: Step 1 (rename letters / index them) + // TODO: Step 2a () + // TODO: Step 2b () + // TODO: Step 3 () + // TODO: Step 4 () + todo!() +} + +fn nfa_no_epsilon_to_dfa() { + todo!() +} +// END GLUSHKOV CONSTRUCTION + +impl DFA { + pub fn new(regex: &str) -> Self { + if !is_valid_regex(regex) { + panic!("{} is not a valid regular expression!", regex); + } + + let normalised_regex = normalise_regex(®ex); + todo!() + } + + pub fn process(&self, input: &str) -> bool { + todo!() + } + + pub fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> { + todo!() + } + + pub fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> { + todo!() + } +} diff --git a/src/lib.rs b/src/lib.rs index 244eee8..1c11465 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,3 @@ -mod dfa; +mod glushkov; pub mod regex_engine; +mod thompson; diff --git a/src/regex_engine.rs b/src/regex_engine.rs index 2d528cf..77e15a5 100644 --- 
a/src/regex_engine.rs +++ b/src/regex_engine.rs @@ -1,36 +1,269 @@ -use crate::dfa::DFA; +use crate::glushkov::DFA as GlushkovDFA; +use crate::thompson::DFA as ThompsonDFA; + +pub enum ConstructionType { + Thompson, + Glushkov, +} + +enum DFAType { + Thompson(ThompsonDFA), + Glushkov(GlushkovDFA), +} pub struct Regex { - dfa: DFA, + dfa: DFAType, } impl Regex { - pub fn new(pattern: &str) -> Self { - Regex { - dfa: DFA::new(pattern), - } + pub fn new(pattern: &str, construction: ConstructionType) -> Self { + let dfa_type = match construction { + ConstructionType::Thompson => DFAType::Thompson(ThompsonDFA::new(pattern)), + ConstructionType::Glushkov => DFAType::Glushkov(GlushkovDFA::new(pattern)), + }; + Regex { dfa: dfa_type } } pub fn is_match(&self, text: &str) -> bool { - self.dfa.process(text) + match &self.dfa { + DFAType::Thompson(dfa) => dfa.process(text), + DFAType::Glushkov(dfa) => dfa.process(text), + } } pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> { - self.dfa.find_first_match(text) + match &self.dfa { + DFAType::Thompson(dfa) => dfa.find_first_match(text), + DFAType::Glushkov(dfa) => dfa.find_first_match(text), + } } pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> { - self.dfa.find_all_matches(text) + match &self.dfa { + DFAType::Thompson(dfa) => dfa.find_all_matches(text), + DFAType::Glushkov(dfa) => dfa.find_all_matches(text), + } + } +} + +pub fn is_valid_regex(regex: &str) -> bool { + if regex.is_empty() { + return false; + } + + let mut open_paren_count = 0; + let mut last_was_quantifier = false; + + let mut chars = regex.chars().peekable(); + while let Some(c) = chars.next() { + match c { + '(' => { + open_paren_count += 1; + last_was_quantifier = false; + } + + ')' => { + if open_paren_count == 0 { + return false; + } + open_paren_count -= 1; + last_was_quantifier = false; + } + + '*' | '+' => { + // Ensure quantifiers are not the first character and are not repeated + if last_was_quantifier || regex.starts_with('*') || 
regex.starts_with('+') { + return false; + } + last_was_quantifier = true; + } + + '|' => { + // Ensure alternation isn't the first or last character + if regex.starts_with('|') || chars.peek().is_none() { + return false; + } + last_was_quantifier = false; + } + + '\\' => { + // Handle escaped characters: ensure there's a character after the escape + if chars.peek().is_none() { + return false; + } + chars.next(); // Skip the escaped character + last_was_quantifier = false; + } + + _ => { + last_was_quantifier = false; + } + } + } + + open_paren_count == 0 +} + +pub fn normalise_regex(regex: &str) -> String { + let mut normalised = String::new(); + let mut escape_sequence = false; + let mut prev_char = '\0'; + + for curr_char in regex.chars() { + if escape_sequence { + // TODO: Implement further parsing features here (e.g. \w \d) + normalised.push(curr_char); + escape_sequence = false; + prev_char = curr_char; + continue; + } + + if curr_char == '\\' { + escape_sequence = true; + normalised.push(curr_char); + continue; + } + + if curr_char == '+' { + normalised.push(prev_char); + normalised.push('*'); + prev_char = curr_char; + continue; + } + if curr_char == '?' { + match prev_char { + ')' => { + let mut balance = 0; + + for j in (0..normalised.len()).rev() { + let ch = normalised.chars().nth(j).unwrap(); + if ch == ')' { + balance += 1; + } else if ch == '(' { + balance -= 1; + if balance == 0 { + normalised.insert(j, '('); + break; + } + } + } + } + _ => { + normalised.insert(normalised.len() - 1, '('); + } + } + normalised.push_str("|())"); + prev_char = curr_char; + continue; + } + if curr_char == '.' 
{ + normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"); + prev_char = curr_char; + continue; + } + + normalised.push(curr_char); + prev_char = curr_char; } + + normalised } #[cfg(test)] mod tests { use super::*; + #[test] + fn valid_regex_basic_test() { + let regex = "(a|b)*"; + assert!(is_valid_regex(regex), "Expected valid regex."); + } + + #[test] + fn invalid_empty_regex_test() { + let regex = ""; + assert!(!is_valid_regex(regex), "Expected invalid regex (empty)."); + } + + #[test] + fn invalid_unbalanced_parentheses_test() { + let regex1 = "(a|b"; + let regex2 = "a|b)"; + assert!( + !is_valid_regex(regex1), + "Expected invalid regex (unbalanced parentheses)." + ); + assert!( + !is_valid_regex(regex2), + "Expected invalid regex (unbalanced parentheses)." + ); + } + + #[test] + fn invalid_operator_placement_test() { + let regex1 = "*a"; + let regex2 = "|a|b"; + assert!( + !is_valid_regex(regex1), + "Expected invalid regex (invalid quantifier placement)." + ); + assert!( + !is_valid_regex(regex2), + "Expected invalid regex (invalid alternation placement)." + ); + } + + #[test] + fn valid_nested_parentheses_test() { + let regex = "((a|b)*c)"; + assert!( + is_valid_regex(regex), + "Expected valid regex with nested parentheses." + ); + } + + #[test] + fn valid_escape_sequence_test() { + let regex = "a\\*b"; + assert!( + is_valid_regex(regex), + "Expected valid regex with escape sequence." + ); + } + + #[test] + fn invalid_escape_sequence_test() { + let regex = "a\\"; + assert!( + !is_valid_regex(regex), + "Expected invalid regex with unpaired escape." 
+ ); + } + + #[test] + fn normalise_regex_test() { + let cases = [ + (r"a+", r"aa*"), + (r"a\+", r"a\+"), + (r"a?", r"(a|())"), + (r"a\?", r"a\?"), + (r"(ab)?", r"((ab)|())"), + (r".", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"), + ]; + + for (input, expected) in cases { + let result = normalise_regex(input); + assert_eq!( + result, expected, + "Normalisation failed for input '{}'", + input + ); + } + } + #[test] fn is_match_test() { - let regex_object = Regex::new("(a|b)*"); + let regex_object = Regex::new("(a|b)*", ConstructionType::Thompson); let success_strings = vec!["abababaaaababa", ""]; for string in success_strings { @@ -45,7 +278,7 @@ mod tests { #[test] fn find_test() { - let regex_object = Regex::new("abc"); + let regex_object = Regex::new("abc", ConstructionType::Thompson); let test_cases = vec![ ("abcd", Some("abc")), ("xyzabc", Some("abc")), @@ -63,7 +296,7 @@ mod tests { #[test] fn find_all_test() { - let regex_object = Regex::new("abc*"); + let regex_object = Regex::new("abc*", ConstructionType::Thompson); let test_cases = vec![ ("abcd", vec!["abc"]), ("ac", vec![]), diff --git a/src/dfa.rs b/src/thompson.rs similarity index 78% rename from src/dfa.rs rename to src/thompson.rs index 17019c6..e96fcca 100644 --- a/src/dfa.rs +++ b/src/thompson.rs @@ -1,4 +1,4 @@ -use core::panic; +use crate::regex_engine::{is_valid_regex, normalise_regex}; use std::collections::{HashMap, HashSet, VecDeque}; struct NFA { @@ -11,144 +11,6 @@ pub struct DFA { accepting_states: HashSet, } -fn is_valid_regex(regex: &str) -> bool { - if regex.is_empty() { - return false; - } - - let mut open_paren_count = 0; - let mut last_was_quantifier = false; - - let mut chars = regex.chars().peekable(); - while let Some(c) = chars.next() { - match c { - '(' => { - open_paren_count += 1; - last_was_quantifier = false; - } - - 
')' => { - if open_paren_count == 0 { - return false; - } - open_paren_count -= 1; - last_was_quantifier = false; - } - - '*' | '+' => { - // Ensure quantifiers are not the first character and are not repeated - if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') { - return false; - } - last_was_quantifier = true; - } - - '|' => { - // Ensure alternation isn't the first or last character - if regex.starts_with('|') || chars.peek().is_none() { - return false; - } - last_was_quantifier = false; - } - - '\\' => { - // Handle escaped characters: ensure there's a character after the escape - if chars.peek().is_none() { - return false; - } - chars.next(); // Skip the escaped character - last_was_quantifier = false; - } - - _ => { - last_was_quantifier = false; - } - } - } - - open_paren_count == 0 -} - -fn normalise_regex(regex: &str) -> String { - let mut normalised = String::new(); - let mut escape_sequence = false; - let mut prev_char = '\0'; - - for curr_char in regex.chars() { - if escape_sequence { - // TODO: Implement further parsing features here (e.g. \w \d) - normalised.push(curr_char); - escape_sequence = false; - prev_char = curr_char; - continue; - } - - if curr_char == '\\' { - escape_sequence = true; - normalised.push(curr_char); - continue; - } - - if curr_char == '+' { - normalised.push(prev_char); - normalised.push('*'); - prev_char = curr_char; - continue; - } - if curr_char == '?' { - match prev_char { - ')' => { - let mut balance = 0; - - for j in (0..normalised.len()).rev() { - let ch = normalised.chars().nth(j).unwrap(); - if ch == ')' { - balance += 1; - } else if ch == '(' { - balance -= 1; - if balance == 0 { - normalised.insert(j, '('); - break; - } - } - } - } - _ => { - normalised.insert(normalised.len() - 1, '('); - } - } - normalised.push_str("|())"); - prev_char = curr_char; - continue; - } - if curr_char == '.' 
{ - normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"); - prev_char = curr_char; - continue; - } - - normalised.push(curr_char); - prev_char = curr_char; - } - - normalised -} - -// GLUSHKOV CONSTRUCTION -fn glushkov_construction(regex: &str) -> NFA { - // TODO: Step 1 (rename letters / index them) - // TODO: Step 2a () - // TODO: Step 2b () - // TODO: Step 3 () - // TODO: Step 4 () - todo!() -} - -fn nfa_no_epsilon_to_dfa() { - todo!() -} -// END GLUSHKOV CONSTRUCTION - // THOMPSON CONSTRUCTION --- fn thompson_construction(normalised_regex: &str) -> NFA { fn apply_operator(nfa_stack: &mut Vec, operator: char) { @@ -681,94 +543,6 @@ impl DFA { mod tests { use super::*; - #[test] - fn valid_regex_basic_test() { - let regex = "(a|b)*"; - assert!(is_valid_regex(regex), "Expected valid regex."); - } - - #[test] - fn invalid_empty_regex_test() { - let regex = ""; - assert!(!is_valid_regex(regex), "Expected invalid regex (empty)."); - } - - #[test] - fn invalid_unbalanced_parentheses_test() { - let regex1 = "(a|b"; - let regex2 = "a|b)"; - assert!( - !is_valid_regex(regex1), - "Expected invalid regex (unbalanced parentheses)." - ); - assert!( - !is_valid_regex(regex2), - "Expected invalid regex (unbalanced parentheses)." - ); - } - - #[test] - fn invalid_operator_placement_test() { - let regex1 = "*a"; - let regex2 = "|a|b"; - assert!( - !is_valid_regex(regex1), - "Expected invalid regex (invalid quantifier placement)." - ); - assert!( - !is_valid_regex(regex2), - "Expected invalid regex (invalid alternation placement)." - ); - } - - #[test] - fn valid_nested_parentheses_test() { - let regex = "((a|b)*c)"; - assert!( - is_valid_regex(regex), - "Expected valid regex with nested parentheses." 
- ); - } - - #[test] - fn valid_escape_sequence_test() { - let regex = "a\\*b"; - assert!( - is_valid_regex(regex), - "Expected valid regex with escape sequence." - ); - } - - #[test] - fn invalid_escape_sequence_test() { - let regex = "a\\"; - assert!( - !is_valid_regex(regex), - "Expected invalid regex with unpaired escape." - ); - } - - #[test] - fn normalise_regex_test() { - let cases = [ - (r"a+", r"aa*"), - (r"a\+", r"a\+"), - (r"a?", r"(a|())"), - (r"a\?", r"a\?"), - (r"(ab)?", r"((ab)|())"), - (r".", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"), - ]; - - for (input, expected) in cases { - let result = normalise_regex(input); - assert_eq!( - result, expected, - "Normalisation failed for input '{}'", - input - ); - } - } - #[test] fn create_dfa_test() { let generated_dfa = DFA::new("(a|b)*"); diff --git a/tests/test_one.rs b/tests/test_one.rs index 47ec5fe..a166e8a 100644 --- a/tests/test_one.rs +++ b/tests/test_one.rs @@ -1,4 +1,4 @@ -use regex_engine::regex_engine::Regex; +use regex_engine::regex_engine::{ConstructionType, Regex}; #[test] fn test_escape_sequence_plus() { @@ -6,7 +6,7 @@ fn test_escape_sequence_plus() { let text = "aaab+b"; // should fail on match let text_success = "aaab+"; - let engine = Regex::new(pattern); + let engine = Regex::new(pattern, ConstructionType::Thompson); let expected_match = text_success; @@ -22,7 +22,7 @@ fn test_escape_sequence_slash() { let text = "aaab\\b"; // should fail on match let text_success = "aaab\\"; - let engine = Regex::new(pattern); + let engine = Regex::new(pattern, ConstructionType::Thompson); let expected_match = text_success; @@ -38,7 +38,7 @@ fn test_dot_wildcard() { let text = "cabbc"; // should fail on match let text_success = "abbc"; - let engine = Regex::new(pattern); + let engine = Regex::new(pattern, ConstructionType::Thompson); let 
expected_match = text_success; From 469fdb9f2759dc90585d77553c374ac0458953d2 Mon Sep 17 00:00:00 2001 From: Testspieler09 Date: Sat, 10 May 2025 16:54:47 +0200 Subject: [PATCH 2/8] chore: index states and fixes --- src/glushkov.rs | 150 ++++++++++++++++++++++++++++++++++++++++++++---- src/thompson.rs | 42 +++++++------- 2 files changed, 160 insertions(+), 32 deletions(-) diff --git a/src/glushkov.rs b/src/glushkov.rs index d6d7342..72615a5 100644 --- a/src/glushkov.rs +++ b/src/glushkov.rs @@ -1,31 +1,100 @@ use crate::regex_engine::{is_valid_regex, normalise_regex}; use std::collections::{HashMap, HashSet}; +#[derive(Clone, Debug, PartialEq)] +enum SymbolType { + Normal, + KleeneStar, + Escaped, +} + struct NFA { - transitions: HashMap<(u32, Option), Vec>, - accepting_state: u32, + transitions: HashMap<(u32, char), Vec>, + accepting_states: HashSet, } pub struct DFA { - transitions: HashMap<(u32, Option), u32>, + transitions: HashMap<(u32, char), u32>, accepting_states: HashSet, } // GLUSHKOV CONSTRUCTION fn glushkov_construction(regex: &str) -> NFA { - // TODO: Step 1 (rename letters / index them) - // TODO: Step 2a () - // TODO: Step 2b () - // TODO: Step 3 () - // TODO: Step 4 () - todo!() + let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); + let mut accepting_states: HashSet = HashSet::new(); + + let states: HashMap<(u32, char), (SymbolType, u32)> = index_states(regex); + + // TODO: Construct transitions and accepting states using position_index + NFA { + transitions, + accepting_states, + } } -fn nfa_no_epsilon_to_dfa() { - todo!() +fn index_states(regex: &str) -> HashMap<(u32, char), (SymbolType, u32)> { + let mut indexed_states: HashMap<(u32, char), (SymbolType, u32)> = HashMap::new(); + let mut symbol_type: SymbolType = SymbolType::Normal; + let mut union_count: Vec = vec![0]; + let mut idx: u32 = 0; + let mut group_index: u32 = 0; + + let mut chars = regex.chars().peekable(); + + while let Some(symbol) = chars.next() { + if symbol_type == 
SymbolType::Escaped { + indexed_states + .entry((idx as u32, symbol)) + .or_insert((symbol_type.clone(), group_index)); + + idx += 1; + symbol_type = SymbolType::Normal; + continue; + } + + match symbol { + '|' => { + if let Some(last_element) = union_count.last_mut() { + *last_element += 1; + } + group_index += 1; + } + '(' => { + union_count.push(0); + group_index += 1; + } + ')' => { + group_index -= union_count.pop().unwrap() + 1; + } + '*' => { + symbol_type = SymbolType::Normal; + continue; + } + '\\' => symbol_type = SymbolType::Escaped, + _ => { + if let Some(next_symbol) = chars.peek() { + if *next_symbol == '*' { + symbol_type = SymbolType::KleeneStar + } + } + + indexed_states + .entry((idx as u32, symbol)) + .or_insert((symbol_type.clone(), group_index)); + + idx += 1; + } + } + } + + indexed_states } // END GLUSHKOV CONSTRUCTION +fn nfa_no_epsilon_to_dfa(nfa: &NFA) -> DFA { + todo!() +} + impl DFA { pub fn new(regex: &str) -> Self { if !is_valid_regex(regex) { @@ -48,3 +117,62 @@ impl DFA { todo!() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_single_character() { + let expected = HashMap::from([((0, 'a'), (SymbolType::Normal, 0))]); + + let result = index_states("a"); + assert_eq!(result, expected, "Mismatch in single character test"); + } + + #[test] + fn test_kleene_star() { + let expected = HashMap::from([((0, 'a'), (SymbolType::KleeneStar, 0))]); + + let result = index_states("a*"); + assert_eq!(result, expected, "Mismatch in kleene star test"); + } + + #[test] + fn test_union_and_groups() { + let expected = HashMap::from([ + ((0, 'a'), (SymbolType::Normal, 1)), + ((1, 'b'), (SymbolType::Normal, 2)), + ]); + + let result = index_states("(a|b)"); + assert_eq!(result, expected, "Mismatch in union and groups test"); + } + + #[test] + fn test_escaped_character() { + let expected = HashMap::from([((0, 'a'), (SymbolType::Escaped, 0))]); + + let result = index_states("\\a"); + assert_eq!(result, expected, "Mismatch in escaped 
character test"); + } + + #[test] + fn test_mixed_regex() { + let expected = HashMap::from([ + ((0, 'a'), (SymbolType::Normal, 0)), + ((1, '*'), (SymbolType::Escaped, 0)), + ((2, 'b'), (SymbolType::Normal, 0)), + ((3, 'c'), (SymbolType::KleeneStar, 0)), + ((4, 'd'), (SymbolType::Normal, 0)), + ((5, 'e'), (SymbolType::Normal, 1)), + ((6, 'f'), (SymbolType::Normal, 2)), + ((7, 'g'), (SymbolType::Normal, 4)), + ((8, 'h'), (SymbolType::Normal, 5)), + ((9, 'i'), (SymbolType::Normal, 0)), + ]); + + let result = index_states("a\\*bc*d(e|f|(g|h))i"); + assert_eq!(result, expected, "Mismatch in mixed regex test"); + } +} diff --git a/src/thompson.rs b/src/thompson.rs index e96fcca..bb5c50d 100644 --- a/src/thompson.rs +++ b/src/thompson.rs @@ -7,7 +7,7 @@ struct NFA { } pub struct DFA { - transitions: HashMap<(u32, Option), u32>, + transitions: HashMap<(u32, char), u32>, accepting_states: HashSet, } @@ -289,7 +289,7 @@ fn nfa_to_dfa(nfa: &NFA) -> DFA { unmarked_states.push(move_closure); } - transitions.insert((current_dfa_state_id, Some(symbol)), state_map[&sorted_vec]); + transitions.insert((current_dfa_state_id, symbol), state_map[&sorted_vec]); } } @@ -340,7 +340,7 @@ fn optimise_dfa(dfa: &DFA) -> DFA { } while let Some(current_partition_index) = worklist.pop_front() { - let mut states_to_check: HashMap, HashSet> = HashMap::new(); + let mut states_to_check: HashMap> = HashMap::new(); for (&(source_state, symbol), &target_state) in &dfa.transitions { if partition[&target_state] == current_partition_index { states_to_check @@ -398,7 +398,7 @@ fn optimise_dfa(dfa: &DFA) -> DFA { } } - let mut minimal_transitions: HashMap<(u32, Option), u32> = HashMap::new(); + let mut minimal_transitions: HashMap<(u32, char), u32> = HashMap::new(); let mut minimal_accepting_states: HashSet = HashSet::new(); let mut new_state_map: HashMap = HashMap::new(); @@ -454,7 +454,7 @@ impl DFA { pub fn process(&self, input: &str) -> bool { let mut current_state = 0; for c in input.chars() { - if let 
Some(&next_state) = self.transitions.get(&(current_state, Some(c))) { + if let Some(&next_state) = self.transitions.get(&(current_state, c)) { current_state = next_state; } else { return false; @@ -472,7 +472,7 @@ impl DFA { let mut found_match = false; for (i, c) in text.chars().enumerate().skip(start_pos) { - if let Some(&next_state) = self.transitions.get(&(current_state, Some(c))) { + if let Some(&next_state) = self.transitions.get(&(current_state, c)) { current_state = next_state; match_start = match_start.or(Some(i)); @@ -510,7 +510,7 @@ impl DFA { let mut found_match = false; for (i, c) in input.chars().enumerate().skip(start_pos) { - if let Some(&next_state) = self.transitions.get(&(current_state, Some(c))) { + if let Some(&next_state) = self.transitions.get(&(current_state, c)) { current_state = next_state; match_start = match_start.or(Some(start_pos)); @@ -546,14 +546,14 @@ mod tests { #[test] fn create_dfa_test() { let generated_dfa = DFA::new("(a|b)*"); - let expected_transitions = HashMap::from([((0, Some('a')), 0), ((0, Some('b')), 0)]); + let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 0)]); let expected_accepting_states = HashSet::from([0]); assert_eq!(expected_transitions, generated_dfa.transitions); assert_eq!(expected_accepting_states, generated_dfa.accepting_states); let generated_dfa_2 = DFA::new("a|()"); - let expected_transitions_2 = HashMap::from([((0, Some('a')), 1)]); + let expected_transitions_2 = HashMap::from([((0, 'a'), 1)]); let expected_accepting_states_2 = HashSet::from([0, 1]); assert_eq!(expected_transitions_2, generated_dfa_2.transitions); @@ -671,20 +671,20 @@ mod tests { let expected_options = vec![ HashMap::from([ - ((0, Some('a')), 1), - ((0, Some('b')), 2), - ((1, Some('a')), 1), - ((1, Some('b')), 2), - ((2, Some('a')), 1), - ((2, Some('b')), 2), + ((0, 'a'), 1), + ((0, 'b'), 2), + ((1, 'a'), 1), + ((1, 'b'), 2), + ((2, 'a'), 1), + ((2, 'b'), 2), ]), HashMap::from([ - ((0, Some('a')), 2), - ((0, 
Some('b')), 1), - ((1, Some('a')), 2), - ((1, Some('b')), 1), - ((2, Some('a')), 2), - ((2, Some('b')), 1), + ((0, 'a'), 2), + ((0, 'b'), 1), + ((1, 'a'), 2), + ((1, 'b'), 1), + ((2, 'a'), 2), + ((2, 'b'), 1), ]), ]; let expected_accepting_states = HashSet::from([0, 1, 2]); From efdac98828bff4fc5ec6e1e7179ed82ae395f16a Mon Sep 17 00:00:00 2001 From: Testspieler09 Date: Mon, 21 Jul 2025 19:01:19 +0200 Subject: [PATCH 3/8] chore: refactor and restructure of codebase --- Cargo.lock | 599 ++++++++++++++++++++++++++++++++ Cargo.toml | 17 +- benches/glushkov_benchmark.rs | 0 benches/rust_regex_benchmark.rs | 0 benches/thompson_benchmark.rs | 0 src/glushkov.rs | 324 +++++++++++++---- src/lib.rs | 94 +++++ src/regex_engine.rs | 42 ++- src/thompson.rs | 194 ++++------- 9 files changed, 1053 insertions(+), 217 deletions(-) create mode 100644 benches/glushkov_benchmark.rs create mode 100644 benches/rust_regex_benchmark.rs create mode 100644 benches/thompson_benchmark.rs diff --git a/Cargo.lock b/Cargo.lock index ab61766..61f4459 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,605 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be92d32e80243a54711e5d7ce823c35c41c9d929dc4ab58e1276f625841aadf9" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707eab41e9622f9139419d573eca0900137718000c517d47da73045f54331c3d" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "half" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "is-terminal" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.174" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + [[package]] name = "regex_engine" version = "0.1.0" +dependencies = [ + "criterion", +] + +[[package]] +name = "rustversion" +version = 
"1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.141" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b9eff21ebe718216c6ec64e1d9ac57087aad11efc64e32002bce4a0d4c03d3" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + 
"js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + 
+[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml index e3ddfb0..6c7d598 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,19 @@ [package] name = "regex_engine" version = "0.1.0" -edition = "2021" +edition = "2024" -[dependencies] +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "thompson_benchmark" +harness = false + +[[bench]] +name = "glushkov_benchmark" +harness = false + +[[bench]] +name = "rust_regex_benchmark" +harness = false diff --git a/benches/glushkov_benchmark.rs b/benches/glushkov_benchmark.rs new file mode 100644 index 0000000..e69de29 diff --git a/benches/rust_regex_benchmark.rs b/benches/rust_regex_benchmark.rs new file mode 100644 index 0000000..e69de29 diff --git a/benches/thompson_benchmark.rs b/benches/thompson_benchmark.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/glushkov.rs b/src/glushkov.rs index 72615a5..ec35441 100644 --- a/src/glushkov.rs +++ b/src/glushkov.rs @@ -1,4 +1,7 @@ -use crate::regex_engine::{is_valid_regex, normalise_regex}; +use crate::{ + Dfa, + regex_engine::{is_valid_regex, normalise_regex}, +}; use std::collections::{HashMap, HashSet}; #[derive(Clone, Debug, PartialEq)] @@ -8,45 +11,73 @@ enum SymbolType { Escaped, } -struct NFA { +struct Nfa { transitions: HashMap<(u32, char), Vec>, accepting_states: HashSet, } -pub struct DFA { +pub struct 
GlushkovDfa { transitions: HashMap<(u32, char), u32>, accepting_states: HashSet, } +impl Dfa for GlushkovDfa { + fn new(regex: &str) -> Self { + if !is_valid_regex(regex) { + panic!("{regex} is not a valid regular expression!"); + } + + let normalised_regex = normalise_regex(®ex); + todo!() + } + + fn get_transitions(&self) -> &HashMap<(u32, char), u32> { + &self.transitions + } + + fn get_accepting_states(&self) -> &HashSet { + &self.accepting_states + } +} + // GLUSHKOV CONSTRUCTION -fn glushkov_construction(regex: &str) -> NFA { +fn glushkov_construction(regex: &str) -> Nfa { let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); let mut accepting_states: HashSet = HashSet::new(); - let states: HashMap<(u32, char), (SymbolType, u32)> = index_states(regex); + let states: HashMap = index_states(regex); + + let mut start_states: HashSet = HashSet::new(); + + fill_sets( + states, + &mut start_states, + &mut accepting_states, + &mut transitions, + ); // TODO: Construct transitions and accepting states using position_index - NFA { + Nfa { transitions, accepting_states, } } -fn index_states(regex: &str) -> HashMap<(u32, char), (SymbolType, u32)> { - let mut indexed_states: HashMap<(u32, char), (SymbolType, u32)> = HashMap::new(); - let mut symbol_type: SymbolType = SymbolType::Normal; +fn index_states(regex: &str) -> HashMap { + let mut indexed_states: HashMap = HashMap::new(); + let mut symbol_type = SymbolType::Normal; let mut union_count: Vec = vec![0]; let mut idx: u32 = 0; let mut group_index: u32 = 0; + // New stack to track if a group is meaningful + let mut group_stack: Vec> = vec![]; // Some(index) if real, None if ignored + let mut chars = regex.chars().peekable(); while let Some(symbol) = chars.next() { if symbol_type == SymbolType::Escaped { - indexed_states - .entry((idx as u32, symbol)) - .or_insert((symbol_type.clone(), group_index)); - + indexed_states.insert(idx, (symbol, symbol_type.clone(), group_index)); idx += 1; symbol_type = 
SymbolType::Normal; continue; @@ -54,17 +85,33 @@ fn index_states(regex: &str) -> HashMap<(u32, char), (SymbolType, u32)> { match symbol { '|' => { - if let Some(last_element) = union_count.last_mut() { - *last_element += 1; + if let Some(last_union) = union_count.last_mut() { + *last_union += 1; + } + if let Some(Some(_)) = group_stack.last_mut() { + // still a real group, do nothing here + } else if let Some(group) = group_stack.last_mut() { + // this group is now meaningful, assign it an index + *group = Some(group_index); + group_index += 1; } - group_index += 1; } '(' => { union_count.push(0); - group_index += 1; + group_stack.push(None); // not yet known if meaningful } ')' => { - group_index -= union_count.pop().unwrap() + 1; + union_count.pop(); + + match group_stack.pop() { + Some(Some(_)) => { + // it was meaningful, nothing to change + } + Some(None) => { + // the group was never promoted to real => do nothing + } + None => panic!("Mismatched parentheses"), + } } '*' => { symbol_type = SymbolType::Normal; @@ -72,51 +119,174 @@ fn index_states(regex: &str) -> HashMap<(u32, char), (SymbolType, u32)> { } '\\' => symbol_type = SymbolType::Escaped, _ => { - if let Some(next_symbol) = chars.peek() { - if *next_symbol == '*' { - symbol_type = SymbolType::KleeneStar + if let Some(next) = chars.peek() { + if *next == '*' { + symbol_type = SymbolType::KleeneStar; + } + } + + // if we're inside a group that hasn't been assigned an index yet, assign now + if let Some(group) = group_stack.last_mut() { + if group.is_none() { + *group = Some(group_index); + group_index += 1; } } - indexed_states - .entry((idx as u32, symbol)) - .or_insert((symbol_type.clone(), group_index)); + // get current group idx for this symbol + let current_group = group_stack.last().and_then(|g| *g).unwrap_or(group_index); + indexed_states.insert(idx, (symbol, symbol_type.clone(), current_group)); idx += 1; } } } + // let mut indexed_states: HashMap = HashMap::new(); + // let mut symbol_type: 
SymbolType = SymbolType::Normal; + // let mut union_count: Vec = vec![0]; + // let mut idx: u32 = 0; + // let mut group_index: u32 = 0; + // + // let mut chars = regex.chars().peekable(); + // + // while let Some(symbol) = chars.next() { + // if symbol_type == SymbolType::Escaped { + // indexed_states + // .entry(idx as u32) + // .or_insert((symbol, symbol_type.clone(), group_index)); + // + // idx += 1; + // symbol_type = SymbolType::Normal; + // continue; + // } + // + // println!("{:?}, {:?}", union_count, symbol,); + // match symbol { + // '|' => { + // if let Some(last_element) = union_count.last_mut() { + // *last_element += 1; + // } + // group_index += 1; + // } + // // FIX: the paranthasis are not working correctly e.g. x|(x|y)|x <=> x|x|y|x + // '(' => { + // union_count.push(0); + // group_index += 1; + // } + // ')' => { + // let unions_last_grouping = union_count.pop().unwrap(); + // if unions_last_grouping == 0 { + // continue; + // } + // group_index -= unions_last_grouping + 1; + // } + // '*' => { + // symbol_type = SymbolType::Normal; + // continue; + // } + // '\\' => symbol_type = SymbolType::Escaped, + // _ => { + // if let Some(next_symbol) = chars.peek() { + // if *next_symbol == '*' { + // symbol_type = SymbolType::KleeneStar + // } + // } + // + // indexed_states.entry(idx as u32).or_insert(( + // symbol, + // symbol_type.clone(), + // group_index, + // )); + // + // idx += 1; + // } + // } + // } + indexed_states } -// END GLUSHKOV CONSTRUCTION -fn nfa_no_epsilon_to_dfa(nfa: &NFA) -> DFA { - todo!() -} +fn fill_sets( + states: HashMap, + start_states: &mut HashSet, + finite_states: &mut HashSet, + tranisitions: &mut HashMap<(u32, char), Vec>, +) { + let mut idx: u32 = 1; + let amount_states: u32 = states.len() as u32; -impl DFA { - pub fn new(regex: &str) -> Self { - if !is_valid_regex(regex) { - panic!("{} is not a valid regular expression!", regex); + if amount_states == 0 { + return; + } + + // tranisitions + // .entry((amount_states, 
states[&0].0)) + // .or_insert(vec![0]); + start_states.insert(0); + + let mut last_symbol_type: &SymbolType = &states[&0].1; + let mut last_group_idx: u32 = 0; + let mut check_next_group: bool = false; // NOTE: can also be thought of as group_is_exhausted + + loop { + let (_symbol, symbol_type, group_idx) = &states[&idx]; + // Skip forwards to next group + if check_next_group { + if *group_idx != last_group_idx { + start_states.insert(idx); + last_symbol_type = symbol_type; + last_group_idx = *group_idx; + check_next_group = false; + // continue; + } + + if idx < amount_states - 1 { + idx += 1; + continue; + } else { + break; + } } - let normalised_regex = normalise_regex(®ex); - todo!() - } + if *group_idx != last_group_idx { + start_states.insert(idx); + last_group_idx = *group_idx; + last_symbol_type = symbol_type; + check_next_group = true; - pub fn process(&self, input: &str) -> bool { - todo!() - } + if idx < amount_states - 1 { + idx += 1; + continue; + } else { + break; + } + } - pub fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> { - todo!() - } + match last_symbol_type { + SymbolType::Normal | SymbolType::Escaped => { + check_next_group = true; + } + SymbolType::KleeneStar => { + start_states.insert(idx); + check_next_group = false; + } + } - pub fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> { - todo!() + last_symbol_type = symbol_type; + + if idx < amount_states - 1 { + idx += 1; + } else { + break; + } } } +// END GLUSHKOV CONSTRUCTION + +fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa { + todo!() +} #[cfg(test)] mod tests { @@ -124,7 +294,7 @@ mod tests { #[test] fn test_single_character() { - let expected = HashMap::from([((0, 'a'), (SymbolType::Normal, 0))]); + let expected = HashMap::from([(0, ('a', SymbolType::Normal, 0))]); let result = index_states("a"); assert_eq!(result, expected, "Mismatch in single character test"); @@ -132,7 +302,7 @@ mod tests { #[test] fn test_kleene_star() { - let expected = 
HashMap::from([((0, 'a'), (SymbolType::KleeneStar, 0))]); + let expected = HashMap::from([(0, ('a', SymbolType::KleeneStar, 0))]); let result = index_states("a*"); assert_eq!(result, expected, "Mismatch in kleene star test"); @@ -141,8 +311,8 @@ mod tests { #[test] fn test_union_and_groups() { let expected = HashMap::from([ - ((0, 'a'), (SymbolType::Normal, 1)), - ((1, 'b'), (SymbolType::Normal, 2)), + (0, ('a', SymbolType::Normal, 1)), + (1, ('b', SymbolType::Normal, 2)), ]); let result = index_states("(a|b)"); @@ -151,7 +321,7 @@ mod tests { #[test] fn test_escaped_character() { - let expected = HashMap::from([((0, 'a'), (SymbolType::Escaped, 0))]); + let expected = HashMap::from([(0, ('a', SymbolType::Escaped, 0))]); let result = index_states("\\a"); assert_eq!(result, expected, "Mismatch in escaped character test"); @@ -160,19 +330,57 @@ mod tests { #[test] fn test_mixed_regex() { let expected = HashMap::from([ - ((0, 'a'), (SymbolType::Normal, 0)), - ((1, '*'), (SymbolType::Escaped, 0)), - ((2, 'b'), (SymbolType::Normal, 0)), - ((3, 'c'), (SymbolType::KleeneStar, 0)), - ((4, 'd'), (SymbolType::Normal, 0)), - ((5, 'e'), (SymbolType::Normal, 1)), - ((6, 'f'), (SymbolType::Normal, 2)), - ((7, 'g'), (SymbolType::Normal, 4)), - ((8, 'h'), (SymbolType::Normal, 5)), - ((9, 'i'), (SymbolType::Normal, 0)), + (0, ('a', SymbolType::Normal, 0)), + (1, ('*', SymbolType::Escaped, 0)), + (2, ('b', SymbolType::Normal, 0)), + (3, ('c', SymbolType::KleeneStar, 0)), + (4, ('d', SymbolType::Normal, 0)), + (5, ('e', SymbolType::Normal, 1)), + (6, ('f', SymbolType::Normal, 2)), + (7, ('g', SymbolType::Normal, 4)), + (8, ('h', SymbolType::Normal, 5)), + (9, ('i', SymbolType::Normal, 0)), ]); let result = index_states("a\\*bc*d(e|f|(g|h))i"); assert_eq!(result, expected, "Mismatch in mixed regex test"); } + + #[test] + fn test_to_many_brackets() { + let expected = HashMap::from([ + (0, ('a', SymbolType::KleeneStar, 0)), + (1, ('b', SymbolType::Normal, 0)), + (2, ('c', 
SymbolType::Normal, 3)), + (3, ('d', SymbolType::Normal, 4)), + (4, ('e', SymbolType::Normal, 5)), + (5, ('f', SymbolType::Normal, 5)), + ]); + + let result = index_states("a*b|((c|d))|ef"); + assert_eq!(result, expected, "Mismatch in mixed regex test"); + } + + #[test] + fn test_fill_sets() { + let states = index_states("a*b|(c|d)|ef"); + let mut start_states: HashSet = HashSet::new(); + let mut finite_states: HashSet = HashSet::new(); + let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); + + let expected_start_set: HashSet = HashSet::from([0, 1, 2, 3, 4]); + let expected_finite_set: HashSet = HashSet::new(); + let expected_transions: HashMap<(u32, char), Vec> = HashMap::new(); + + fill_sets( + states, + &mut start_states, + &mut finite_states, + &mut transitions, + ); + + assert_eq!(start_states, expected_start_set); + assert_eq!(finite_states, expected_finite_set); + assert_eq!(transitions, expected_transions); + } } diff --git a/src/lib.rs b/src/lib.rs index 1c11465..6f2fff7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,97 @@ +use std::collections::{HashMap, HashSet}; + mod glushkov; pub mod regex_engine; mod thompson; + +trait Dfa { + fn new(regex: &str) -> Self; + fn get_transitions(&self) -> &HashMap<(u32, char), u32>; + fn get_accepting_states(&self) -> &HashSet; + fn process(&self, input: &str) -> bool { + let mut current_state = 0; + for c in input.chars() { + if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) { + current_state = next_state; + } else { + return false; + } + } + self.get_accepting_states().contains(¤t_state) + } + + fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> { + let mut start_pos = 0; + while start_pos < text.len() { + let mut current_state = 0; + let mut match_start = None; + let mut match_end = None; + let mut found_match = false; + + for (i, c) in text.chars().enumerate().skip(start_pos) { + if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) { + 
current_state = next_state; + match_start = match_start.or(Some(i)); + + if self.get_accepting_states().contains(¤t_state) { + found_match = true; + match_end = Some(i) + } + + if i == text.len() - 1 && found_match { + break; + } + } else { + break; + } + } + + if let (Some(start), Some(end)) = (match_start, match_end) { + return Some(&text[start..=end]); + } else { + start_pos += 1; + } + } + + None + } + + fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> { + let mut matches: Vec<&str> = Vec::new(); + + let mut start_pos = 0; + while start_pos < input.len() { + let mut current_state = 0; + let mut match_start: Option = None; + let mut match_end: Option = None; + let mut found_match = false; + + for (i, c) in input.chars().enumerate().skip(start_pos) { + if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) { + current_state = next_state; + match_start = match_start.or(Some(start_pos)); + + if self.get_accepting_states().contains(¤t_state) { + match_end = Some(i); + found_match = true; + } + + if i == input.len() - 1 && found_match { + break; + } + } else { + break; + } + } + + if let (Some(start), Some(end)) = (match_start, match_end) { + matches.push(&input[start..=end]); + start_pos = end; + } else { + start_pos += 1; + } + } + + matches + } +} diff --git a/src/regex_engine.rs b/src/regex_engine.rs index 77e15a5..f628c55 100644 --- a/src/regex_engine.rs +++ b/src/regex_engine.rs @@ -1,47 +1,46 @@ -use crate::glushkov::DFA as GlushkovDFA; -use crate::thompson::DFA as ThompsonDFA; +use crate::{Dfa, glushkov::GlushkovDfa, thompson::ThompsonDfa}; pub enum ConstructionType { Thompson, Glushkov, } -enum DFAType { - Thompson(ThompsonDFA), - Glushkov(GlushkovDFA), +enum DfaType { + Thompson(ThompsonDfa), + Glushkov(GlushkovDfa), } pub struct Regex { - dfa: DFAType, + dfa: DfaType, } impl Regex { pub fn new(pattern: &str, construction: ConstructionType) -> Self { let dfa_type = match construction { - ConstructionType::Thompson => 
DFAType::Thompson(ThompsonDFA::new(pattern)), - ConstructionType::Glushkov => DFAType::Glushkov(GlushkovDFA::new(pattern)), + ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)), + ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)), }; Regex { dfa: dfa_type } } pub fn is_match(&self, text: &str) -> bool { match &self.dfa { - DFAType::Thompson(dfa) => dfa.process(text), - DFAType::Glushkov(dfa) => dfa.process(text), + DfaType::Thompson(dfa) => dfa.process(text), + DfaType::Glushkov(dfa) => dfa.process(text), } } pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> { match &self.dfa { - DFAType::Thompson(dfa) => dfa.find_first_match(text), - DFAType::Glushkov(dfa) => dfa.find_first_match(text), + DfaType::Thompson(dfa) => dfa.find_first_match(text), + DfaType::Glushkov(dfa) => dfa.find_first_match(text), } } pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> { match &self.dfa { - DFAType::Thompson(dfa) => dfa.find_all_matches(text), - DFAType::Glushkov(dfa) => dfa.find_all_matches(text), + DfaType::Thompson(dfa) => dfa.find_all_matches(text), + DfaType::Glushkov(dfa) => dfa.find_all_matches(text), } } } @@ -248,16 +247,15 @@ mod tests { (r"a?", r"(a|())"), (r"a\?", r"a\?"), (r"(ab)?", r"((ab)|())"), - (r".", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"), + ( + r".", + "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)", + ), ]; for (input, expected) in cases { let result = normalise_regex(input); - assert_eq!( - result, expected, - "Normalisation failed for input '{}'", - input - ); + assert_eq!(result, expected, "Normalisation failed for input '{input}'"); } } @@ -290,7 +288,7 @@ mod tests { for (text, 
expected) in test_cases { let result = regex_object.find(text); - assert_eq!(result, expected, "Failed for input: {}", text); + assert_eq!(result, expected, "Failed for input: {text}"); } } @@ -305,7 +303,7 @@ mod tests { for (text, expected) in test_cases { let result = regex_object.findall(text); - assert_eq!(result, expected, "Failed for input: {}", text); + assert_eq!(result, expected, "Failed for input: {text}"); } } } diff --git a/src/thompson.rs b/src/thompson.rs index bb5c50d..1ad648c 100644 --- a/src/thompson.rs +++ b/src/thompson.rs @@ -1,19 +1,43 @@ -use crate::regex_engine::{is_valid_regex, normalise_regex}; +use crate::{ + Dfa, + regex_engine::{is_valid_regex, normalise_regex}, +}; use std::collections::{HashMap, HashSet, VecDeque}; -struct NFA { +struct Nfa { transitions: HashMap<(u32, Option), Vec>, accepting_state: u32, // the thompson construction always has one accepting_state } -pub struct DFA { +pub struct ThompsonDfa { transitions: HashMap<(u32, char), u32>, accepting_states: HashSet, } +impl Dfa for ThompsonDfa { + fn new(regex: &str) -> Self { + if !is_valid_regex(regex) { + panic!("{regex} is not a valid regular expression!"); + } + + let normalised_regex = normalise_regex(regex); + let regex_nfa: Nfa = thompson_construction(&normalised_regex); + let regex_dfa = nfa_to_dfa(®ex_nfa); + optimise_dfa(®ex_dfa) + } + + fn get_transitions(&self) -> &HashMap<(u32, char), u32> { + &self.transitions + } + + fn get_accepting_states(&self) -> &HashSet { + &self.accepting_states + } +} + // THOMPSON CONSTRUCTION --- -fn thompson_construction(normalised_regex: &str) -> NFA { - fn apply_operator(nfa_stack: &mut Vec, operator: char) { +fn thompson_construction(normalised_regex: &str) -> Nfa { + fn apply_operator(nfa_stack: &mut Vec, operator: char) { match operator { '|' => { let nfa_right = nfa_stack.pop().expect("Expected NFA for union"); @@ -25,12 +49,12 @@ fn thompson_construction(normalised_regex: &str) -> NFA { let nfa_left = 
nfa_stack.pop().expect("Expected NFA for concatenation"); nfa_stack.push(concatenate(&nfa_left, &nfa_right)); } - _ => panic!("Unknown operator {:?}", operator), + _ => panic!("Unknown operator {operator:?}"), } } let mut operators: Vec = Vec::new(); - let mut nfa_stack: Vec = Vec::new(); + let mut nfa_stack: Vec = Vec::new(); let mut concat_flag = false; let mut escape_sequence = false; @@ -99,7 +123,7 @@ fn thompson_construction(normalised_regex: &str) -> NFA { nfa_stack.pop().unwrap() } -fn apply_kleene_star(last_nfa: &NFA) -> NFA { +fn apply_kleene_star(last_nfa: &Nfa) -> Nfa { let mut transitions = HashMap::new(); let new_accepting = last_nfa.accepting_state + 2; @@ -130,13 +154,13 @@ fn apply_kleene_star(last_nfa: &NFA) -> NFA { .or_insert_with(Vec::new) .push(new_accepting); - NFA { + Nfa { transitions, accepting_state: new_accepting, } } -fn union(left: &NFA, right: &NFA) -> NFA { +fn union(left: &Nfa, right: &Nfa) -> Nfa { let mut transitions = HashMap::new(); let num_states_left_nfa = left.accepting_state; @@ -162,21 +186,21 @@ fn union(left: &NFA, right: &NFA) -> NFA { transitions.insert((0, None), vec![1, num_states_left_nfa + 2]); transitions - .entry((&left.accepting_state + 1, None)) + .entry((left.accepting_state + 1, None)) .or_insert_with(Vec::new) .push(new_accepting_state); transitions - .entry((&right.accepting_state + num_states_left_nfa + 2, None)) + .entry((right.accepting_state + num_states_left_nfa + 2, None)) .or_insert_with(Vec::new) .push(new_accepting_state); - NFA { + Nfa { transitions, accepting_state: new_accepting_state, } } -fn concatenate(left: &NFA, right: &NFA) -> NFA { +fn concatenate(left: &Nfa, right: &Nfa) -> Nfa { let mut transitions: HashMap<(u32, Option), Vec> = left.transitions.clone(); // HACK: The accepting states are (based on the implementation) the last ones of the NFA @@ -190,21 +214,21 @@ fn concatenate(left: &NFA, right: &NFA) -> NFA { ); } - NFA { + Nfa { transitions, accepting_state: right.accepting_state + 
num_states_left_nfa, } } -fn create_basic_nfa(letter: &char) -> NFA { - NFA { +fn create_basic_nfa(letter: &char) -> Nfa { + Nfa { transitions: HashMap::from([((0, Some(*letter)), vec![1])]), accepting_state: 1, } } -fn create_basic_epsilon_nfa() -> NFA { - NFA { +fn create_basic_epsilon_nfa() -> Nfa { + Nfa { transitions: HashMap::from([((0, None), vec![1])]), accepting_state: 1, } @@ -212,7 +236,7 @@ fn create_basic_epsilon_nfa() -> NFA { // END THOMPSON CONSTRUCTION --- // NFA to DFA functions --- -fn epsilon_closure(nfa: &NFA, states: &mut HashSet) { +fn epsilon_closure(nfa: &Nfa, states: &mut HashSet) { let mut stack = states.clone(); while let Some(&state_id) = stack.iter().next() { @@ -227,7 +251,7 @@ fn epsilon_closure(nfa: &NFA, states: &mut HashSet) { } } -fn move_nfa(nfa: &NFA, states: &HashSet, symbol: char) -> HashSet { +fn move_nfa(nfa: &Nfa, states: &HashSet, symbol: char) -> HashSet { let mut move_states = HashSet::new(); for &state in states { @@ -245,7 +269,7 @@ fn hash_set_to_sorted_vec(set: &HashSet) -> Vec { vec } -fn nfa_to_dfa(nfa: &NFA) -> DFA { +fn nfa_to_dfa(nfa: &Nfa) -> ThompsonDfa { // Start from the initial state of the NFA, assuming it's state 0 let mut start_closure = HashSet::from([0]); epsilon_closure(nfa, &mut start_closure); @@ -293,20 +317,20 @@ fn nfa_to_dfa(nfa: &NFA) -> DFA { } } - DFA { + ThompsonDfa { transitions, accepting_states: dfa_accepting_states, } } // END NFA to DFA functions --- -fn optimise_dfa(dfa: &DFA) -> DFA { +fn optimise_dfa(dfa: &ThompsonDfa) -> ThompsonDfa { let mut partition: HashMap = HashMap::new(); let mut accepting_states_set: HashSet = dfa.accepting_states.clone(); let mut non_accepting_states: HashSet = HashSet::new(); let mut all_states: HashSet = HashSet::new(); - for (&(state, _), _) in &dfa.transitions { + for &(state, _) in dfa.transitions.keys() { all_states.insert(state); if dfa.accepting_states.contains(&state) { accepting_states_set.insert(state); @@ -332,10 +356,10 @@ fn optimise_dfa(dfa: 
&DFA) -> DFA { partition_list.push(non_accepting_states); let mut worklist: VecDeque = VecDeque::new(); - if partition_list[0].len() > 0 { + if !partition_list[0].is_empty() { worklist.push_back(0); } - if partition_list.len() > 1 && partition_list[1].len() > 0 { + if partition_list.len() > 1 && !partition_list[1].is_empty() { worklist.push_back(1); } @@ -345,7 +369,7 @@ fn optimise_dfa(dfa: &DFA) -> DFA { if partition[&target_state] == current_partition_index { states_to_check .entry(symbol) - .or_insert_with(HashSet::new) + .or_default() .insert(source_state); } } @@ -410,8 +434,8 @@ fn optimise_dfa(dfa: &DFA) -> DFA { } for (_, &partition_index) in partition.iter() { - if !new_state_map.contains_key(&partition_index) { - new_state_map.insert(partition_index, next_state_id); + if let std::collections::hash_map::Entry::Vacant(e) = new_state_map.entry(partition_index) { + e.insert(next_state_id); next_state_id += 1; } } @@ -433,126 +457,26 @@ fn optimise_dfa(dfa: &DFA) -> DFA { minimal_transitions.insert((new_source_state, symbol), new_target_state); } - DFA { + ThompsonDfa { transitions: minimal_transitions, accepting_states: minimal_accepting_states, } } -impl DFA { - pub fn new(regex: &str) -> Self { - if !is_valid_regex(regex) { - panic!("{} is not a valid regular expression!", regex); - } - - let normalised_regex = normalise_regex(®ex); - let regex_nfa: NFA = thompson_construction(&normalised_regex); - let regex_dfa = nfa_to_dfa(®ex_nfa); - optimise_dfa(®ex_dfa) - } - - pub fn process(&self, input: &str) -> bool { - let mut current_state = 0; - for c in input.chars() { - if let Some(&next_state) = self.transitions.get(&(current_state, c)) { - current_state = next_state; - } else { - return false; - } - } - self.accepting_states.contains(¤t_state) - } - - pub fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> { - let mut start_pos = 0; - while start_pos < text.len() { - let mut current_state = 0; - let mut match_start = None; - let mut match_end = 
None; - let mut found_match = false; - - for (i, c) in text.chars().enumerate().skip(start_pos) { - if let Some(&next_state) = self.transitions.get(&(current_state, c)) { - current_state = next_state; - match_start = match_start.or(Some(i)); - - if self.accepting_states.contains(¤t_state) { - found_match = true; - match_end = Some(i) - } - - if i == text.len() - 1 && found_match { - break; - } - } else { - break; - } - } - - if let (Some(start), Some(end)) = (match_start, match_end) { - return Some(&text[start..=end]); - } else { - start_pos += 1; - } - } - - None - } - - pub fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> { - let mut matches: Vec<&str> = Vec::new(); - - let mut start_pos = 0; - while start_pos < input.len() { - let mut current_state = 0; - let mut match_start: Option = None; - let mut match_end: Option = None; - let mut found_match = false; - - for (i, c) in input.chars().enumerate().skip(start_pos) { - if let Some(&next_state) = self.transitions.get(&(current_state, c)) { - current_state = next_state; - match_start = match_start.or(Some(start_pos)); - - if self.accepting_states.contains(¤t_state) { - match_end = Some(i); - found_match = true; - } - - if i == input.len() - 1 && found_match { - break; - } - } else { - break; - } - } - - if let (Some(start), Some(end)) = (match_start, match_end) { - matches.push(&input[start..=end]); - start_pos = end; - } else { - start_pos += 1; - } - } - - matches - } -} - #[cfg(test)] mod tests { use super::*; #[test] fn create_dfa_test() { - let generated_dfa = DFA::new("(a|b)*"); + let generated_dfa = ThompsonDfa::new("(a|b)*"); let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 0)]); let expected_accepting_states = HashSet::from([0]); assert_eq!(expected_transitions, generated_dfa.transitions); assert_eq!(expected_accepting_states, generated_dfa.accepting_states); - let generated_dfa_2 = DFA::new("a|()"); + let generated_dfa_2 = ThompsonDfa::new("a|()"); let 
expected_transitions_2 = HashMap::from([((0, 'a'), 1)]); let expected_accepting_states_2 = HashSet::from([0, 1]); @@ -565,7 +489,7 @@ mod tests { #[test] fn prozess_regex_test() { - let generated_dfa = DFA::new("(a|b)*"); + let generated_dfa = ThompsonDfa::new("(a|b)*"); let test_strings = vec!["abbbababaaaa", ""]; for string in test_strings { assert!(generated_dfa.process(string)); @@ -654,7 +578,7 @@ mod tests { #[test] fn nfa_to_dfa_test() { - let input_nfa = NFA { + let input_nfa = Nfa { transitions: HashMap::from([ ((0, None), vec![1, 7]), ((1, None), vec![2, 4]), From 4aefba5d82e728c1ecc610274d4a4035492f8613 Mon Sep 17 00:00:00 2001 From: Testspieler09 Date: Mon, 21 Jul 2025 21:53:13 +0200 Subject: [PATCH 4/8] bench: add benchmarks --- Cargo.lock | 1 + Cargo.toml | 7 +- benches/bench_cases.rs | 76 ++++++++ benches/glushkov_benchmark.rs | 62 +++++++ benches/rust_regex_benchmark.rs | 56 ++++++ benches/thompson_benchmark.rs | 62 +++++++ src/glushkov.rs | 19 +- src/lib.rs | 310 +++++++++++++++++++++++++++++++- src/regex_engine.rs | 309 ------------------------------- src/thompson.rs | 7 +- 10 files changed, 579 insertions(+), 330 deletions(-) create mode 100644 benches/bench_cases.rs delete mode 100644 src/regex_engine.rs diff --git a/Cargo.lock b/Cargo.lock index 61f4459..4f0180a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -363,6 +363,7 @@ name = "regex_engine" version = "0.1.0" dependencies = [ "criterion", + "regex", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6c7d598..4a0c696 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,14 +5,15 @@ edition = "2024" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } +regex = "1.11.1" [[bench]] name = "thompson_benchmark" harness = false -[[bench]] -name = "glushkov_benchmark" -harness = false +# [[bench]] +# name = "glushkov_benchmark" +# harness = false [[bench]] name = "rust_regex_benchmark" diff --git a/benches/bench_cases.rs b/benches/bench_cases.rs new file mode 100644 index 
0000000..7df2b4c --- /dev/null +++ b/benches/bench_cases.rs @@ -0,0 +1,76 @@ +#[allow(dead_code)] +struct BenchCase<'a> { + pub regex: &'a str, + pub input: String, +} + +// This function is used in the `benchmark` files +#[allow(dead_code)] +fn get_bench_cases() -> Vec> { + vec![ + BenchCase { + regex: r"a.b", + input: "abcd abef abgh ijk".to_string(), + }, + BenchCase { + regex: r"a*b", + input: "aaaaaaaaab".to_string(), + }, + BenchCase { + regex: r"a+b", + input: "aabab".to_string(), + }, + BenchCase { + regex: r"a?b", + input: "b aaab ab".to_string(), + }, + BenchCase { + regex: r"a|b", + input: "xxaxybxx".to_string(), + }, + // Group and escape sequences + BenchCase { + regex: r"(a|b)c", + input: "abc ac bc bbcc".to_string(), + }, + BenchCase { + regex: r"\.", + input: "Find . within this !?. sentence.".to_string(), + }, + // Larger and more complex patterns + BenchCase { + regex: r"(hel+o|wor?ld)", + input: "hello helolllo world worlld helloworld".to_string(), + }, + BenchCase { + regex: r"ab*c+", + input: "abbc abbbbbbbcc bccaaabbabc".to_string(), + }, + BenchCase { + regex: r"(a(bc|de)+)", + input: "abc abcbc abcdedef".to_string(), + }, + // Realistic text patterns and larger inputs + BenchCase { + regex: r"\b[0-9]{2}\b", + input: "There are 99 bottles of soda and 45 cans of juice".to_string(), + }, + BenchCase { + regex: r"\b\w{5,}\b", + input: "Rust is great for systems programming but can be challenging".to_string(), + }, + BenchCase { + regex: r"(https?|ftp)://[^\s/$.?#].[^\s]*", + input: "Check https://example.com out and ftp://fileserver.net as well".to_string(), + }, + // Pathological case to test limits + BenchCase { + regex: r"(a|b)*c", + input: format!("{}{}", "a".repeat(1000), "bc"), + }, + BenchCase { + regex: r"x{3}(y|z)", + input: "xxxxyxxxzxxxy".to_string(), + }, + ] +} diff --git a/benches/glushkov_benchmark.rs b/benches/glushkov_benchmark.rs index e69de29..1f6522a 100644 --- a/benches/glushkov_benchmark.rs +++ 
b/benches/glushkov_benchmark.rs @@ -0,0 +1,62 @@ +include!("bench_cases.rs"); +use criterion::{Criterion, criterion_group, criterion_main}; +use regex_engine::{ConstructionType, Regex}; + +fn benchmark_glushkov_regex_process(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Glushkov); + + c.bench_function( + &format!("Glushkov is_match - pattern: {}", case.regex), + |b| { + b.iter(|| { + regex.is_match(&case.input); + }) + }, + ); + } +} + +fn benchmark_glushkov_regex_find_first(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Glushkov); + + c.bench_function( + &format!("Glushkov find match - pattern: {}", case.regex), + |b| { + b.iter(|| { + regex.find(&case.input); + }) + }, + ); + } +} + +fn benchmark_glushkov_regex_find_all(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Glushkov); + + c.bench_function( + &format!("Glushkov findall matches - pattern: {}", case.regex), + |b| { + b.iter(|| { + regex.findall(&case.input); + }) + }, + ); + } +} + +criterion_group!( + benches, + benchmark_glushkov_regex_process, + benchmark_glushkov_regex_find_first, + benchmark_glushkov_regex_find_all +); +criterion_main!(benches); diff --git a/benches/rust_regex_benchmark.rs b/benches/rust_regex_benchmark.rs index e69de29..63d05a3 100644 --- a/benches/rust_regex_benchmark.rs +++ b/benches/rust_regex_benchmark.rs @@ -0,0 +1,56 @@ +include!("bench_cases.rs"); +use criterion::{Criterion, criterion_group, criterion_main}; +use regex::Regex; + +fn benchmark_rust_regex_process(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in cases { + let regex = Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + c.bench_function(&format!("Rust process match: {}", case.regex), |b| { 
+ b.iter(|| { + regex.is_match(&case.input); + }) + }); + } +} + +fn benchmark_rust_regex_find_first(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in cases { + let regex = Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + c.bench_function(&format!("Rust find first match: {}", case.regex), |b| { + b.iter(|| { + regex.find(&case.input).map(|m| m.as_str()); + }) + }); + } +} + +fn benchmark_rust_regex_find_all(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in cases { + let regex = Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + c.bench_function(&format!("Rust find all matches: {}", case.regex), |b| { + b.iter(|| { + regex.find_iter(&case.input); + }) + }); + } +} + +criterion_group!( + benches, + benchmark_rust_regex_process, + benchmark_rust_regex_find_first, + benchmark_rust_regex_find_all +); +criterion_main!(benches); diff --git a/benches/thompson_benchmark.rs b/benches/thompson_benchmark.rs index e69de29..e38a502 100644 --- a/benches/thompson_benchmark.rs +++ b/benches/thompson_benchmark.rs @@ -0,0 +1,62 @@ +include!("bench_cases.rs"); +use criterion::{Criterion, criterion_group, criterion_main}; +use regex_engine::{ConstructionType, Regex}; + +fn benchmark_thompson_regex_process(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Thompson); + + c.bench_function( + &format!("Thompson is_match - pattern: {}", case.regex), + |b| { + b.iter(|| { + regex.is_match(&case.input); + }) + }, + ); + } +} + +fn benchmark_thompson_regex_find_first(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Thompson); + + c.bench_function( + &format!("Thompson find match - pattern: {}", case.regex), + |b| { + b.iter(|| { + regex.find(&case.input); + }) + }, + ); + } +} + +fn 
benchmark_thompson_regex_find_all(c: &mut Criterion) { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Thompson); + + c.bench_function( + &format!("Thompson findall matches - pattern: {}", case.regex), + |b| { + b.iter(|| { + regex.findall(&case.input); + }) + }, + ); + } +} + +criterion_group!( + benches, + benchmark_thompson_regex_process, + benchmark_thompson_regex_find_first, + benchmark_thompson_regex_find_all +); +criterion_main!(benches); diff --git a/src/glushkov.rs b/src/glushkov.rs index ec35441..17914ef 100644 --- a/src/glushkov.rs +++ b/src/glushkov.rs @@ -1,7 +1,4 @@ -use crate::{ - Dfa, - regex_engine::{is_valid_regex, normalise_regex}, -}; +use crate::{Dfa, is_valid_regex, normalise_regex}; use std::collections::{HashMap, HashSet}; #[derive(Clone, Debug, PartialEq)] @@ -153,7 +150,7 @@ fn index_states(regex: &str) -> HashMap { // while let Some(symbol) = chars.next() { // if symbol_type == SymbolType::Escaped { // indexed_states - // .entry(idx as u32) + // .entry(idx) // .or_insert((symbol, symbol_type.clone(), group_index)); // // idx += 1; @@ -161,7 +158,7 @@ fn index_states(regex: &str) -> HashMap { // continue; // } // - // println!("{:?}, {:?}", union_count, symbol,); + // println!("{union_count:?}, {symbol:?}"); // match symbol { // '|' => { // if let Some(last_element) = union_count.last_mut() { @@ -193,11 +190,9 @@ fn index_states(regex: &str) -> HashMap { // } // } // - // indexed_states.entry(idx as u32).or_insert(( - // symbol, - // symbol_type.clone(), - // group_index, - // )); + // indexed_states + // .entry(idx) + // .or_insert((symbol, symbol_type.clone(), group_index)); // // idx += 1; // } @@ -347,7 +342,7 @@ mod tests { } #[test] - fn test_to_many_brackets() { + fn test_too_many_brackets() { let expected = HashMap::from([ (0, ('a', SymbolType::KleeneStar, 0)), (1, ('b', SymbolType::Normal, 0)), diff --git a/src/lib.rs b/src/lib.rs index 6f2fff7..7d17879 100644 --- 
a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ +use crate::{glushkov::GlushkovDfa, thompson::ThompsonDfa}; use std::collections::{HashMap, HashSet}; mod glushkov; -pub mod regex_engine; mod thompson; trait Dfa { @@ -95,3 +95,311 @@ trait Dfa { matches } } + +pub enum ConstructionType { + Thompson, + Glushkov, +} + +enum DfaType { + Thompson(ThompsonDfa), + Glushkov(GlushkovDfa), +} + +pub struct Regex { + dfa: DfaType, +} + +impl Regex { + pub fn new(pattern: &str, construction: ConstructionType) -> Self { + let dfa_type = match construction { + ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)), + ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)), + }; + Regex { dfa: dfa_type } + } + + pub fn is_match(&self, text: &str) -> bool { + match &self.dfa { + DfaType::Thompson(dfa) => dfa.process(text), + DfaType::Glushkov(dfa) => dfa.process(text), + } + } + + pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> { + match &self.dfa { + DfaType::Thompson(dfa) => dfa.find_first_match(text), + DfaType::Glushkov(dfa) => dfa.find_first_match(text), + } + } + + pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> { + match &self.dfa { + DfaType::Thompson(dfa) => dfa.find_all_matches(text), + DfaType::Glushkov(dfa) => dfa.find_all_matches(text), + } + } +} + +pub fn is_valid_regex(regex: &str) -> bool { + if regex.is_empty() { + return false; + } + + let mut open_paren_count = 0; + let mut last_was_quantifier = false; + + let mut chars = regex.chars().peekable(); + while let Some(c) = chars.next() { + match c { + '(' => { + open_paren_count += 1; + last_was_quantifier = false; + } + + ')' => { + if open_paren_count == 0 { + return false; + } + open_paren_count -= 1; + last_was_quantifier = false; + } + + '*' | '+' => { + // Ensure quantifiers are not the first character and are not repeated + if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') { + return false; + } + last_was_quantifier = true; + } + 
+ '|' => { + // Ensure alternation isn't the first or last character + if regex.starts_with('|') || chars.peek().is_none() { + return false; + } + last_was_quantifier = false; + } + + '\\' => { + // Handle escaped characters: ensure there's a character after the escape + if chars.peek().is_none() { + return false; + } + chars.next(); // Skip the escaped character + last_was_quantifier = false; + } + + _ => { + last_was_quantifier = false; + } + } + } + + open_paren_count == 0 +} + +pub fn normalise_regex(regex: &str) -> String { + let mut normalised = String::new(); + let mut escape_sequence = false; + let mut prev_char = '\0'; + + for curr_char in regex.chars() { + if escape_sequence { + // TODO: Implement further parsing features here (e.g. \w \d) + normalised.push(curr_char); + escape_sequence = false; + prev_char = curr_char; + continue; + } + + if curr_char == '\\' { + escape_sequence = true; + normalised.push(curr_char); + continue; + } + + if curr_char == '+' { + normalised.push(prev_char); + normalised.push('*'); + prev_char = curr_char; + continue; + } + if curr_char == '?' { + match prev_char { + ')' => { + let mut balance = 0; + + for j in (0..normalised.len()).rev() { + let ch = normalised.chars().nth(j).unwrap(); + if ch == ')' { + balance += 1; + } else if ch == '(' { + balance -= 1; + if balance == 0 { + normalised.insert(j, '('); + break; + } + } + } + } + _ => { + normalised.insert(normalised.len() - 1, '('); + } + } + normalised.push_str("|())"); + prev_char = curr_char; + continue; + } + if curr_char == '.' 
{ + normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"); + prev_char = curr_char; + continue; + } + + normalised.push(curr_char); + prev_char = curr_char; + } + + normalised +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn valid_regex_basic_test() { + let regex = "(a|b)*"; + assert!(is_valid_regex(regex), "Expected valid regex."); + } + + #[test] + fn invalid_empty_regex_test() { + let regex = ""; + assert!(!is_valid_regex(regex), "Expected invalid regex (empty)."); + } + + #[test] + fn invalid_unbalanced_parentheses_test() { + let regex1 = "(a|b"; + let regex2 = "a|b)"; + assert!( + !is_valid_regex(regex1), + "Expected invalid regex (unbalanced parentheses)." + ); + assert!( + !is_valid_regex(regex2), + "Expected invalid regex (unbalanced parentheses)." + ); + } + + #[test] + fn invalid_operator_placement_test() { + let regex1 = "*a"; + let regex2 = "|a|b"; + assert!( + !is_valid_regex(regex1), + "Expected invalid regex (invalid quantifier placement)." + ); + assert!( + !is_valid_regex(regex2), + "Expected invalid regex (invalid alternation placement)." + ); + } + + #[test] + fn valid_nested_parentheses_test() { + let regex = "((a|b)*c)"; + assert!( + is_valid_regex(regex), + "Expected valid regex with nested parentheses." + ); + } + + #[test] + fn valid_escape_sequence_test() { + let regex = "a\\*b"; + assert!( + is_valid_regex(regex), + "Expected valid regex with escape sequence." + ); + } + + #[test] + fn invalid_escape_sequence_test() { + let regex = "a\\"; + assert!( + !is_valid_regex(regex), + "Expected invalid regex with unpaired escape." 
+ ); + } + + #[test] + fn normalise_regex_test() { + let cases = [ + (r"a+", r"aa*"), + (r"a\+", r"a\+"), + (r"a?", r"(a|())"), + (r"a\?", r"a\?"), + (r"(ab)?", r"((ab)|())"), + ( + r".", + "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)", + ), + ]; + + for (input, expected) in cases { + let result = normalise_regex(input); + assert_eq!(result, expected, "Normalisation failed for input '{input}'"); + } + } + + #[test] + fn is_match_test() { + let regex_object = Regex::new("(a|b)*", ConstructionType::Thompson); + + let success_strings = vec!["abababaaaababa", ""]; + for string in success_strings { + assert!(regex_object.is_match(string)); + } + + let failing_strings = vec!["abc", "x"]; + for string in failing_strings { + assert!(!regex_object.is_match(string)); + } + } + + #[test] + fn find_test() { + let regex_object = Regex::new("abc", ConstructionType::Thompson); + let test_cases = vec![ + ("abcd", Some("abc")), + ("xyzabc", Some("abc")), + ("abc", Some("abc")), + ("ac", None), + ("def", None), + ("aabc", Some("abc")), + ]; + + for (text, expected) in test_cases { + let result = regex_object.find(text); + assert_eq!(result, expected, "Failed for input: {text}"); + } + } + + #[test] + fn find_all_test() { + let regex_object = Regex::new("abc*", ConstructionType::Thompson); + let test_cases = vec![ + ("abcd", vec!["abc"]), + ("ac", vec![]), + ("abcab", vec!["abc", "ab"]), + ]; + + for (text, expected) in test_cases { + let result = regex_object.findall(text); + assert_eq!(result, expected, "Failed for input: {text}"); + } + } +} diff --git a/src/regex_engine.rs b/src/regex_engine.rs deleted file mode 100644 index f628c55..0000000 --- a/src/regex_engine.rs +++ /dev/null @@ -1,309 +0,0 @@ -use crate::{Dfa, glushkov::GlushkovDfa, thompson::ThompsonDfa}; - -pub enum ConstructionType { - Thompson, - Glushkov, -} - 
-enum DfaType { - Thompson(ThompsonDfa), - Glushkov(GlushkovDfa), -} - -pub struct Regex { - dfa: DfaType, -} - -impl Regex { - pub fn new(pattern: &str, construction: ConstructionType) -> Self { - let dfa_type = match construction { - ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)), - ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)), - }; - Regex { dfa: dfa_type } - } - - pub fn is_match(&self, text: &str) -> bool { - match &self.dfa { - DfaType::Thompson(dfa) => dfa.process(text), - DfaType::Glushkov(dfa) => dfa.process(text), - } - } - - pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> { - match &self.dfa { - DfaType::Thompson(dfa) => dfa.find_first_match(text), - DfaType::Glushkov(dfa) => dfa.find_first_match(text), - } - } - - pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> { - match &self.dfa { - DfaType::Thompson(dfa) => dfa.find_all_matches(text), - DfaType::Glushkov(dfa) => dfa.find_all_matches(text), - } - } -} - -pub fn is_valid_regex(regex: &str) -> bool { - if regex.is_empty() { - return false; - } - - let mut open_paren_count = 0; - let mut last_was_quantifier = false; - - let mut chars = regex.chars().peekable(); - while let Some(c) = chars.next() { - match c { - '(' => { - open_paren_count += 1; - last_was_quantifier = false; - } - - ')' => { - if open_paren_count == 0 { - return false; - } - open_paren_count -= 1; - last_was_quantifier = false; - } - - '*' | '+' => { - // Ensure quantifiers are not the first character and are not repeated - if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') { - return false; - } - last_was_quantifier = true; - } - - '|' => { - // Ensure alternation isn't the first or last character - if regex.starts_with('|') || chars.peek().is_none() { - return false; - } - last_was_quantifier = false; - } - - '\\' => { - // Handle escaped characters: ensure there's a character after the escape - if chars.peek().is_none() { - return 
false; - } - chars.next(); // Skip the escaped character - last_was_quantifier = false; - } - - _ => { - last_was_quantifier = false; - } - } - } - - open_paren_count == 0 -} - -pub fn normalise_regex(regex: &str) -> String { - let mut normalised = String::new(); - let mut escape_sequence = false; - let mut prev_char = '\0'; - - for curr_char in regex.chars() { - if escape_sequence { - // TODO: Implement further parsing features here (e.g. \w \d) - normalised.push(curr_char); - escape_sequence = false; - prev_char = curr_char; - continue; - } - - if curr_char == '\\' { - escape_sequence = true; - normalised.push(curr_char); - continue; - } - - if curr_char == '+' { - normalised.push(prev_char); - normalised.push('*'); - prev_char = curr_char; - continue; - } - if curr_char == '?' { - match prev_char { - ')' => { - let mut balance = 0; - - for j in (0..normalised.len()).rev() { - let ch = normalised.chars().nth(j).unwrap(); - if ch == ')' { - balance += 1; - } else if ch == '(' { - balance -= 1; - if balance == 0 { - normalised.insert(j, '('); - break; - } - } - } - } - _ => { - normalised.insert(normalised.len() - 1, '('); - } - } - normalised.push_str("|())"); - prev_char = curr_char; - continue; - } - if curr_char == '.' 
{ - normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"); - prev_char = curr_char; - continue; - } - - normalised.push(curr_char); - prev_char = curr_char; - } - - normalised -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn valid_regex_basic_test() { - let regex = "(a|b)*"; - assert!(is_valid_regex(regex), "Expected valid regex."); - } - - #[test] - fn invalid_empty_regex_test() { - let regex = ""; - assert!(!is_valid_regex(regex), "Expected invalid regex (empty)."); - } - - #[test] - fn invalid_unbalanced_parentheses_test() { - let regex1 = "(a|b"; - let regex2 = "a|b)"; - assert!( - !is_valid_regex(regex1), - "Expected invalid regex (unbalanced parentheses)." - ); - assert!( - !is_valid_regex(regex2), - "Expected invalid regex (unbalanced parentheses)." - ); - } - - #[test] - fn invalid_operator_placement_test() { - let regex1 = "*a"; - let regex2 = "|a|b"; - assert!( - !is_valid_regex(regex1), - "Expected invalid regex (invalid quantifier placement)." - ); - assert!( - !is_valid_regex(regex2), - "Expected invalid regex (invalid alternation placement)." - ); - } - - #[test] - fn valid_nested_parentheses_test() { - let regex = "((a|b)*c)"; - assert!( - is_valid_regex(regex), - "Expected valid regex with nested parentheses." - ); - } - - #[test] - fn valid_escape_sequence_test() { - let regex = "a\\*b"; - assert!( - is_valid_regex(regex), - "Expected valid regex with escape sequence." - ); - } - - #[test] - fn invalid_escape_sequence_test() { - let regex = "a\\"; - assert!( - !is_valid_regex(regex), - "Expected invalid regex with unpaired escape." 
- ); - } - - #[test] - fn normalise_regex_test() { - let cases = [ - (r"a+", r"aa*"), - (r"a\+", r"a\+"), - (r"a?", r"(a|())"), - (r"a\?", r"a\?"), - (r"(ab)?", r"((ab)|())"), - ( - r".", - "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)", - ), - ]; - - for (input, expected) in cases { - let result = normalise_regex(input); - assert_eq!(result, expected, "Normalisation failed for input '{input}'"); - } - } - - #[test] - fn is_match_test() { - let regex_object = Regex::new("(a|b)*", ConstructionType::Thompson); - - let success_strings = vec!["abababaaaababa", ""]; - for string in success_strings { - assert!(regex_object.is_match(string)); - } - - let failing_strings = vec!["abc", "x"]; - for string in failing_strings { - assert!(!regex_object.is_match(string)); - } - } - - #[test] - fn find_test() { - let regex_object = Regex::new("abc", ConstructionType::Thompson); - let test_cases = vec![ - ("abcd", Some("abc")), - ("xyzabc", Some("abc")), - ("abc", Some("abc")), - ("ac", None), - ("def", None), - ("aabc", Some("abc")), - ]; - - for (text, expected) in test_cases { - let result = regex_object.find(text); - assert_eq!(result, expected, "Failed for input: {text}"); - } - } - - #[test] - fn find_all_test() { - let regex_object = Regex::new("abc*", ConstructionType::Thompson); - let test_cases = vec![ - ("abcd", vec!["abc"]), - ("ac", vec![]), - ("abcab", vec!["abc", "ab"]), - ]; - - for (text, expected) in test_cases { - let result = regex_object.findall(text); - assert_eq!(result, expected, "Failed for input: {text}"); - } - } -} diff --git a/src/thompson.rs b/src/thompson.rs index 1ad648c..5f0ede8 100644 --- a/src/thompson.rs +++ b/src/thompson.rs @@ -1,7 +1,4 @@ -use crate::{ - Dfa, - regex_engine::{is_valid_regex, normalise_regex}, -}; +use crate::{Dfa, is_valid_regex, normalise_regex}; use 
std::collections::{HashMap, HashSet, VecDeque}; struct Nfa { @@ -593,7 +590,7 @@ mod tests { let generated_dfa = nfa_to_dfa(&input_nfa); - let expected_options = vec![ + let expected_options = [ HashMap::from([ ((0, 'a'), 1), ((0, 'b'), 2), From 4b624b4bc5c01a2b50e8c65879ad55b5a5fe2b41 Mon Sep 17 00:00:00 2001 From: Testspieler09 Date: Sun, 27 Jul 2025 20:07:56 +0200 Subject: [PATCH 5/8] unstable(glushkov): pushing progress --- Cargo.toml | 10 +- benches/bench_cases.rs | 81 +++- benches/glushkov_benchmark.rs | 62 --- benches/regex_benchmark.rs | 181 ++++++++ benches/rust_regex_benchmark.rs | 56 --- benches/thompson_benchmark.rs | 62 --- src/glushkov.rs | 727 +++++++++++++++++++++++--------- src/lib.rs | 291 +++++++++++-- src/thompson.rs | 213 +++------- tests/glushkov_test.rs | 17 + tests/rust_regex_test.rs | 27 ++ tests/test_one.rs | 49 --- tests/thompson_test.rs | 18 + 13 files changed, 1149 insertions(+), 645 deletions(-) delete mode 100644 benches/glushkov_benchmark.rs create mode 100644 benches/regex_benchmark.rs delete mode 100644 benches/rust_regex_benchmark.rs delete mode 100644 benches/thompson_benchmark.rs create mode 100644 tests/glushkov_test.rs create mode 100644 tests/rust_regex_test.rs delete mode 100644 tests/test_one.rs create mode 100644 tests/thompson_test.rs diff --git a/Cargo.toml b/Cargo.toml index 4a0c696..4cdf3fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,13 +8,5 @@ criterion = { version = "0.5", features = ["html_reports"] } regex = "1.11.1" [[bench]] -name = "thompson_benchmark" -harness = false - -# [[bench]] -# name = "glushkov_benchmark" -# harness = false - -[[bench]] -name = "rust_regex_benchmark" +name = "regex_benchmark" harness = false diff --git a/benches/bench_cases.rs b/benches/bench_cases.rs index 7df2b4c..e76cd8a 100644 --- a/benches/bench_cases.rs +++ b/benches/bench_cases.rs @@ -1,76 +1,113 @@ +use std::hint::black_box; + #[allow(dead_code)] struct BenchCase<'a> { pub regex: &'a str, pub input: String, + pub 
expected_is_match: bool, + pub expected_first_match: Option, + pub expected_all_matches: Vec, } // This function is used in the `benchmark` files #[allow(dead_code)] fn get_bench_cases() -> Vec> { - vec![ + black_box(vec![ BenchCase { regex: r"a.b", input: "abcd abef abgh ijk".to_string(), + expected_is_match: false, + expected_first_match: None, + expected_all_matches: vec![], }, BenchCase { regex: r"a*b", input: "aaaaaaaaab".to_string(), + expected_is_match: true, + expected_first_match: Some("aaaaaaaaab".to_string()), + expected_all_matches: vec!["aaaaaaaaab".to_string()], }, BenchCase { regex: r"a+b", input: "aabab".to_string(), + expected_is_match: false, + expected_first_match: Some("aab".to_string()), + expected_all_matches: vec!["aab".to_string(), "ab".to_string()], }, BenchCase { regex: r"a?b", input: "b aaab ab".to_string(), + expected_is_match: false, + expected_first_match: Some("b".to_string()), + expected_all_matches: vec!["b".to_string(), "ab".to_string(), "ab".to_string()], }, BenchCase { regex: r"a|b", input: "xxaxybxx".to_string(), + expected_is_match: false, + expected_first_match: Some("a".to_string()), + expected_all_matches: vec!["a".to_string(), "b".to_string()], }, - // Group and escape sequences BenchCase { regex: r"(a|b)c", input: "abc ac bc bbcc".to_string(), + expected_is_match: false, + expected_first_match: Some("bc".to_string()), + expected_all_matches: vec![ + "bc".to_string(), + "ac".to_string(), + "bc".to_string(), + "bc".to_string(), + ], }, BenchCase { regex: r"\.", input: "Find . within this !?. 
sentence.".to_string(), + expected_is_match: false, + expected_first_match: Some(".".to_string()), + expected_all_matches: vec![".".to_string(), ".".to_string(), ".".to_string()], }, - // Larger and more complex patterns BenchCase { regex: r"(hel+o|wor?ld)", input: "hello helolllo world worlld helloworld".to_string(), + expected_is_match: false, + expected_first_match: Some("hello".to_string()), + expected_all_matches: vec![ + "hello".to_string(), + "helo".to_string(), + "world".to_string(), + "hello".to_string(), + "world".to_string(), + ], }, BenchCase { regex: r"ab*c+", input: "abbc abbbbbbbcc bccaaabbabc".to_string(), + expected_is_match: false, + expected_first_match: Some("abbc".to_string()), + expected_all_matches: vec![ + "abbc".to_string(), + "abbbbbbbcc".to_string(), + "abc".to_string(), + ], }, BenchCase { regex: r"(a(bc|de)+)", input: "abc abcbc abcdedef".to_string(), - }, - // Realistic text patterns and larger inputs - BenchCase { - regex: r"\b[0-9]{2}\b", - input: "There are 99 bottles of soda and 45 cans of juice".to_string(), - }, - BenchCase { - regex: r"\b\w{5,}\b", - input: "Rust is great for systems programming but can be challenging".to_string(), + expected_is_match: false, + expected_first_match: Some("abc".to_string()), + expected_all_matches: vec![ + "abc".to_string(), + "abcbc".to_string(), + "abcdede".to_string(), + ], }, - BenchCase { - regex: r"(https?|ftp)://[^\s/$.?#].[^\s]*", - input: "Check https://example.com out and ftp://fileserver.net as well".to_string(), - }, - // Pathological case to test limits BenchCase { regex: r"(a|b)*c", input: format!("{}{}", "a".repeat(1000), "bc"), + expected_is_match: true, + expected_first_match: Some(format!("{}{}", "a".repeat(1000), "bc")), + expected_all_matches: vec![format!("{}{}", "a".repeat(1000), "bc")], }, - BenchCase { - regex: r"x{3}(y|z)", - input: "xxxxyxxxzxxxy".to_string(), - }, - ] + ]) } diff --git a/benches/glushkov_benchmark.rs b/benches/glushkov_benchmark.rs deleted file mode 
100644 index 1f6522a..0000000 --- a/benches/glushkov_benchmark.rs +++ /dev/null @@ -1,62 +0,0 @@ -include!("bench_cases.rs"); -use criterion::{Criterion, criterion_group, criterion_main}; -use regex_engine::{ConstructionType, Regex}; - -fn benchmark_glushkov_regex_process(c: &mut Criterion) { - let cases = get_bench_cases(); - - for case in &cases { - let regex = Regex::new(case.regex, ConstructionType::Glushkov); - - c.bench_function( - &format!("Glushkov is_match - pattern: {}", case.regex), - |b| { - b.iter(|| { - regex.is_match(&case.input); - }) - }, - ); - } -} - -fn benchmark_glushkov_regex_find_first(c: &mut Criterion) { - let cases = get_bench_cases(); - - for case in &cases { - let regex = Regex::new(case.regex, ConstructionType::Glushkov); - - c.bench_function( - &format!("Glushkov find match - pattern: {}", case.regex), - |b| { - b.iter(|| { - regex.find(&case.input); - }) - }, - ); - } -} - -fn benchmark_glushkov_regex_find_all(c: &mut Criterion) { - let cases = get_bench_cases(); - - for case in &cases { - let regex = Regex::new(case.regex, ConstructionType::Glushkov); - - c.bench_function( - &format!("Glushkov findall matches - pattern: {}", case.regex), - |b| { - b.iter(|| { - regex.findall(&case.input); - }) - }, - ); - } -} - -criterion_group!( - benches, - benchmark_glushkov_regex_process, - benchmark_glushkov_regex_find_first, - benchmark_glushkov_regex_find_all -); -criterion_main!(benches); diff --git a/benches/regex_benchmark.rs b/benches/regex_benchmark.rs new file mode 100644 index 0000000..8629579 --- /dev/null +++ b/benches/regex_benchmark.rs @@ -0,0 +1,181 @@ +include!("bench_cases.rs"); +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use regex as rust_regex; +use regex_engine::{ConstructionType, Regex}; + +fn benchmark_regex_compile_time(c: &mut Criterion) { + let cases = get_bench_cases(); + let mut group = c.benchmark_group("Regex Compile Time"); + + for case in cases { + group.bench_with_input( + 
BenchmarkId::new("Thompson", case.regex), + &case.regex, + |b, regex| { + b.iter(|| { + Regex::new(regex, ConstructionType::Thompson); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Glushkov", case.regex), + &case.regex, + |b, regex| { + b.iter(|| { + Regex::new(regex, ConstructionType::Glushkov); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Rust", case.regex), + &case.regex, + |b, regex| { + b.iter(|| { + rust_regex::Regex::new(regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {regex}")); + }) + }, + ); + } + group.finish(); +} + +fn benchmark_regex_is_match(c: &mut Criterion) { + let cases = get_bench_cases(); + let mut group = c.benchmark_group("Regex Is Match"); + + for case in &cases { + let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson); + let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov); + let rust_regex = rust_regex::Regex::new(&format!("^{}$", case.regex)) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + group.bench_with_input( + BenchmarkId::new("Thompson", case.regex), + &case.input, + |b, input| { + b.iter(|| { + thompson_regex.is_match(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Glushkov", case.regex), + &case.input, + |b, input| { + b.iter(|| { + glushkov_regex.is_match(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Rust", case.regex), + &case.input, + |b, input| { + b.iter(|| { + rust_regex.is_match(input); + }) + }, + ); + } + group.finish(); +} + +fn benchmark_regex_find_first(c: &mut Criterion) { + let cases = get_bench_cases(); + let mut group = c.benchmark_group("Regex Find First"); + + for case in &cases { + let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson); + let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov); + let rust_regex = rust_regex::Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", 
case.regex)); + + group.bench_with_input( + BenchmarkId::new("Thompson", case.regex), + &case.input, + |b, input| { + b.iter(|| { + thompson_regex.find(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Glushkov", case.regex), + &case.input, + |b, input| { + b.iter(|| { + glushkov_regex.find(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Rust", case.regex), + &case.input, + |b, input| { + b.iter(|| { + rust_regex.find(input).map(|m| m.as_str()); + }) + }, + ); + } + group.finish(); +} + +fn benchmark_regex_find_all(c: &mut Criterion) { + let cases = get_bench_cases(); + let mut group = c.benchmark_group("Regex Find All"); + + for case in &cases { + let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson); + let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov); + let rust_regex = rust_regex::Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + group.bench_with_input( + BenchmarkId::new("Thompson", case.regex), + &case.input, + |b, input| { + b.iter(|| { + thompson_regex.findall(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Glushkov", case.regex), + &case.input, + |b, input| { + b.iter(|| { + glushkov_regex.findall(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Rust", case.regex), + &case.input, + |b, input| { + b.iter(|| { + rust_regex.find_iter(input); + }) + }, + ); + } + group.finish(); +} + +criterion_group!( + benches, + benchmark_regex_compile_time, + benchmark_regex_is_match, + benchmark_regex_find_first, + benchmark_regex_find_all +); +criterion_main!(benches); diff --git a/benches/rust_regex_benchmark.rs b/benches/rust_regex_benchmark.rs deleted file mode 100644 index 63d05a3..0000000 --- a/benches/rust_regex_benchmark.rs +++ /dev/null @@ -1,56 +0,0 @@ -include!("bench_cases.rs"); -use criterion::{Criterion, criterion_group, criterion_main}; -use regex::Regex; - -fn 
benchmark_rust_regex_process(c: &mut Criterion) { - let cases = get_bench_cases(); - - for case in cases { - let regex = Regex::new(case.regex) - .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); - - c.bench_function(&format!("Rust process match: {}", case.regex), |b| { - b.iter(|| { - regex.is_match(&case.input); - }) - }); - } -} - -fn benchmark_rust_regex_find_first(c: &mut Criterion) { - let cases = get_bench_cases(); - - for case in cases { - let regex = Regex::new(case.regex) - .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); - - c.bench_function(&format!("Rust find first match: {}", case.regex), |b| { - b.iter(|| { - regex.find(&case.input).map(|m| m.as_str()); - }) - }); - } -} - -fn benchmark_rust_regex_find_all(c: &mut Criterion) { - let cases = get_bench_cases(); - - for case in cases { - let regex = Regex::new(case.regex) - .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); - - c.bench_function(&format!("Rust find all matches: {}", case.regex), |b| { - b.iter(|| { - regex.find_iter(&case.input); - }) - }); - } -} - -criterion_group!( - benches, - benchmark_rust_regex_process, - benchmark_rust_regex_find_first, - benchmark_rust_regex_find_all -); -criterion_main!(benches); diff --git a/benches/thompson_benchmark.rs b/benches/thompson_benchmark.rs deleted file mode 100644 index e38a502..0000000 --- a/benches/thompson_benchmark.rs +++ /dev/null @@ -1,62 +0,0 @@ -include!("bench_cases.rs"); -use criterion::{Criterion, criterion_group, criterion_main}; -use regex_engine::{ConstructionType, Regex}; - -fn benchmark_thompson_regex_process(c: &mut Criterion) { - let cases = get_bench_cases(); - - for case in &cases { - let regex = Regex::new(case.regex, ConstructionType::Thompson); - - c.bench_function( - &format!("Thompson is_match - pattern: {}", case.regex), - |b| { - b.iter(|| { - regex.is_match(&case.input); - }) - }, - ); - } -} - -fn benchmark_thompson_regex_find_first(c: &mut Criterion) 
{ - let cases = get_bench_cases(); - - for case in &cases { - let regex = Regex::new(case.regex, ConstructionType::Thompson); - - c.bench_function( - &format!("Thompson find match - pattern: {}", case.regex), - |b| { - b.iter(|| { - regex.find(&case.input); - }) - }, - ); - } -} - -fn benchmark_thompson_regex_find_all(c: &mut Criterion) { - let cases = get_bench_cases(); - - for case in &cases { - let regex = Regex::new(case.regex, ConstructionType::Thompson); - - c.bench_function( - &format!("Thompson findall matches - pattern: {}", case.regex), - |b| { - b.iter(|| { - regex.findall(&case.input); - }) - }, - ); - } -} - -criterion_group!( - benches, - benchmark_thompson_regex_process, - benchmark_thompson_regex_find_first, - benchmark_thompson_regex_find_all -); -criterion_main!(benches); diff --git a/src/glushkov.rs b/src/glushkov.rs index 17914ef..1d75081 100644 --- a/src/glushkov.rs +++ b/src/glushkov.rs @@ -1,5 +1,5 @@ use crate::{Dfa, is_valid_regex, normalise_regex}; -use std::collections::{HashMap, HashSet}; +use std::collections::{HashMap, HashSet, VecDeque}; #[derive(Clone, Debug, PartialEq)] enum SymbolType { @@ -8,11 +8,13 @@ enum SymbolType { Escaped, } +#[derive(Debug)] struct Nfa { transitions: HashMap<(u32, char), Vec>, accepting_states: HashSet, } +#[derive(Debug)] pub struct GlushkovDfa { transitions: HashMap<(u32, char), u32>, accepting_states: HashSet, @@ -24,8 +26,12 @@ impl Dfa for GlushkovDfa { panic!("{regex} is not a valid regular expression!"); } - let normalised_regex = normalise_regex(®ex); - todo!() + let normalised_regex = normalise_regex(regex); + let regex_nfa = glushkov_construction(&normalised_regex); + dbg!(®ex_nfa); + let mut regex_dfa = nfa_no_epsilon_to_dfa(®ex_nfa); + ::optimise_dfa(&mut regex_dfa); + regex_dfa } fn get_transitions(&self) -> &HashMap<(u32, char), u32> { @@ -35,25 +41,27 @@ impl Dfa for GlushkovDfa { fn get_accepting_states(&self) -> &HashSet { &self.accepting_states } + + fn get_transitions_mut(&mut self) -> 
&mut HashMap<(u32, char), u32> { + &mut self.transitions + } + + fn get_accepting_states_mut(&mut self) -> &mut HashSet { + &mut self.accepting_states + } } // GLUSHKOV CONSTRUCTION fn glushkov_construction(regex: &str) -> Nfa { + dbg!(®ex); let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); let mut accepting_states: HashSet = HashSet::new(); let states: HashMap = index_states(regex); + dbg!(&states); - let mut start_states: HashSet = HashSet::new(); + fill_sets(states, &mut accepting_states, &mut transitions); - fill_sets( - states, - &mut start_states, - &mut accepting_states, - &mut transitions, - ); - - // TODO: Construct transitions and accepting states using position_index Nfa { transitions, accepting_states, @@ -62,19 +70,19 @@ fn glushkov_construction(regex: &str) -> Nfa { fn index_states(regex: &str) -> HashMap { let mut indexed_states: HashMap = HashMap::new(); - let mut symbol_type = SymbolType::Normal; - let mut union_count: Vec = vec![0]; + let mut symbol_type: SymbolType = SymbolType::Normal; + let mut group_stack: Vec = vec![0]; let mut idx: u32 = 0; - let mut group_index: u32 = 0; - - // New stack to track if a group is meaningful - let mut group_stack: Vec> = vec![]; // Some(index) if real, None if ignored - + let mut next_group_id: u32 = 1; let mut chars = regex.chars().peekable(); while let Some(symbol) = chars.next() { if symbol_type == SymbolType::Escaped { - indexed_states.insert(idx, (symbol, symbol_type.clone(), group_index)); + indexed_states.entry(idx).or_insert(( + symbol, + symbol_type.clone(), + *group_stack.last().unwrap(), + )); idx += 1; symbol_type = SymbolType::Normal; continue; @@ -82,33 +90,23 @@ fn index_states(regex: &str) -> HashMap { match symbol { '|' => { - if let Some(last_union) = union_count.last_mut() { - *last_union += 1; - } - if let Some(Some(_)) = group_stack.last_mut() { - // still a real group, do nothing here - } else if let Some(group) = group_stack.last_mut() { - // this group is now meaningful, 
assign it an index - *group = Some(group_index); - group_index += 1; + // Start a new group for the next alternative + let new_group_id = next_group_id; + next_group_id += 1; + // Replace the current group on the stack with the new one + if let Some(last) = group_stack.last_mut() { + *last = new_group_id; } } '(' => { - union_count.push(0); - group_stack.push(None); // not yet known if meaningful + // Push the next group ID onto the stack for this grouping level + let new_group_id = next_group_id; + next_group_id += 1; + group_stack.push(new_group_id); } ')' => { - union_count.pop(); - - match group_stack.pop() { - Some(Some(_)) => { - // it was meaningful, nothing to change - } - Some(None) => { - // the group was never promoted to real => do nothing - } - None => panic!("Mismatched parentheses"), - } + // Pop the current group and return to parent group + group_stack.pop(); } '*' => { symbol_type = SymbolType::Normal; @@ -116,179 +114,332 @@ fn index_states(regex: &str) -> HashMap { } '\\' => symbol_type = SymbolType::Escaped, _ => { - if let Some(next) = chars.peek() { - if *next == '*' { - symbol_type = SymbolType::KleeneStar; - } - } - - // if we're inside a group that hasn't been assigned an index yet, assign now - if let Some(group) = group_stack.last_mut() { - if group.is_none() { - *group = Some(group_index); - group_index += 1; - } + if let Some(next_symbol) = chars.peek() + && matches!(*next_symbol, '*') + { + symbol_type = SymbolType::KleeneStar } - - // get current group idx for this symbol - let current_group = group_stack.last().and_then(|g| *g).unwrap_or(group_index); - - indexed_states.insert(idx, (symbol, symbol_type.clone(), current_group)); + indexed_states.entry(idx).or_insert(( + symbol, + symbol_type.clone(), + *group_stack.last().unwrap(), + )); idx += 1; } } } - - // let mut indexed_states: HashMap = HashMap::new(); - // let mut symbol_type: SymbolType = SymbolType::Normal; - // let mut union_count: Vec = vec![0]; - // let mut idx: u32 = 0; 
- // let mut group_index: u32 = 0; - // - // let mut chars = regex.chars().peekable(); - // - // while let Some(symbol) = chars.next() { - // if symbol_type == SymbolType::Escaped { - // indexed_states - // .entry(idx) - // .or_insert((symbol, symbol_type.clone(), group_index)); - // - // idx += 1; - // symbol_type = SymbolType::Normal; - // continue; - // } - // - // println!("{union_count:?}, {symbol:?}"); - // match symbol { - // '|' => { - // if let Some(last_element) = union_count.last_mut() { - // *last_element += 1; - // } - // group_index += 1; - // } - // // FIX: the paranthasis are not working correctly e.g. x|(x|y)|x <=> x|x|y|x - // '(' => { - // union_count.push(0); - // group_index += 1; - // } - // ')' => { - // let unions_last_grouping = union_count.pop().unwrap(); - // if unions_last_grouping == 0 { - // continue; - // } - // group_index -= unions_last_grouping + 1; - // } - // '*' => { - // symbol_type = SymbolType::Normal; - // continue; - // } - // '\\' => symbol_type = SymbolType::Escaped, - // _ => { - // if let Some(next_symbol) = chars.peek() { - // if *next_symbol == '*' { - // symbol_type = SymbolType::KleeneStar - // } - // } - // - // indexed_states - // .entry(idx) - // .or_insert((symbol, symbol_type.clone(), group_index)); - // - // idx += 1; - // } - // } - // } - indexed_states } fn fill_sets( states: HashMap, - start_states: &mut HashSet, - finite_states: &mut HashSet, - tranisitions: &mut HashMap<(u32, char), Vec>, + accepting_states: &mut HashSet, + transitions: &mut HashMap<(u32, char), Vec>, ) { - let mut idx: u32 = 1; - let amount_states: u32 = states.len() as u32; + let mut start_states = HashSet::new(); + let amount_states = states.len() as u32; if amount_states == 0 { return; } - // tranisitions - // .entry((amount_states, states[&0].0)) - // .or_insert(vec![0]); - start_states.insert(0); - - let mut last_symbol_type: &SymbolType = &states[&0].1; - let mut last_group_idx: u32 = 0; - let mut check_next_group: bool = false; 
// NOTE: can also be thought of as group_is_exhausted - - loop { - let (_symbol, symbol_type, group_idx) = &states[&idx]; - // Skip forwards to next group - if check_next_group { - if *group_idx != last_group_idx { - start_states.insert(idx); - last_symbol_type = symbol_type; - last_group_idx = *group_idx; - check_next_group = false; - // continue; - } + // Group states by their group index + let mut groups: HashMap> = HashMap::new(); + for (state_id, (_, _, group_idx)) in &states { + groups.entry(*group_idx).or_default().push(*state_id); + } - if idx < amount_states - 1 { - idx += 1; - continue; - } else { - break; - } + // Sort states within each group + for group in groups.values_mut() { + group.sort(); + } + + // Determine start states (first state of each group) + for group in groups.values() { + if group.is_empty() { + continue; } - if *group_idx != last_group_idx { - start_states.insert(idx); - last_group_idx = *group_idx; - last_symbol_type = symbol_type; - check_next_group = true; + start_states.insert(group[0]); - if idx < amount_states - 1 { - idx += 1; - continue; - } else { - break; + for i in 0..group.len() { + let state = group[i]; + if let Some((_, symbol_type, _)) = states.get(&state) { + if symbol_type == &SymbolType::KleeneStar && i + 1 < group.len() { + start_states.insert(group[i + 1]); + } } } + } - match last_symbol_type { + // Build transitions and determine accepting states + for (state_id, (symbol, symbol_type, group_idx)) in &states { + let current_group = &groups[group_idx]; + let pos_in_group = current_group.iter().position(|&x| x == *state_id).unwrap(); + + match symbol_type { SymbolType::Normal | SymbolType::Escaped => { - check_next_group = true; + if pos_in_group + 1 < current_group.len() { + let next_state = current_group[pos_in_group + 1]; + transitions + .entry((*state_id, *symbol)) + .or_default() + .push(next_state); + } else { + accepting_states.insert(*state_id); + } } SymbolType::KleeneStar => { - start_states.insert(idx); - 
check_next_group = false; + transitions + .entry((*state_id, *symbol)) + .or_default() + .push(*state_id); + + if pos_in_group + 1 < current_group.len() { + for next_state in current_group.iter().skip(pos_in_group + 1) { + transitions + .entry((*state_id, *symbol)) + .or_default() + .push(*next_state); + } + } else { + accepting_states.insert(*state_id); + } } } + } - last_symbol_type = symbol_type; + // Setup virtual (start-)state + let virtual_start = states.keys().max().copied().unwrap_or(0) + 1; - if idx < amount_states - 1 { - idx += 1; - } else { - break; - } + let symbol_to_first_state: Vec<(u32, char)> = start_states + .iter() + .map(|&s| (s, states.get(&s).expect("Expected an entry").0)) + .collect(); + + for (first_state, symbol) in symbol_to_first_state { + transitions + .entry((virtual_start, symbol)) + .or_default() + .push(first_state); } } // END GLUSHKOV CONSTRUCTION fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa { - todo!() + let mut dfa_transitions = HashMap::new(); + let mut dfa_accepting_states = HashSet::new(); + + // Map from sorted vector of NFA states to DFA state ID (for hashable key) + let mut nfa_states_to_dfa_state: HashMap, u32> = HashMap::new(); + let mut next_dfa_state_id = 0u32; + let mut work_queue = VecDeque::new(); + + // Helper function to convert HashSet to sorted Vec for use as HashMap key + let set_to_sorted_vec = |set: &HashSet| -> Vec { + let mut vec: Vec = set.iter().cloned().collect(); + vec.sort_unstable(); + vec + }; + + // Get all possible input symbols from NFA transitions + let alphabet: HashSet = nfa.transitions.keys().map(|(_, symbol)| *symbol).collect(); + + // Find all states that exist in the NFA + let mut all_nfa_states = HashSet::new(); + for &(state, _) in nfa.transitions.keys() { + all_nfa_states.insert(state); + } + for target_states in nfa.transitions.values() { + for &state in target_states { + all_nfa_states.insert(state); + } + } + for &state in &nfa.accepting_states { + all_nfa_states.insert(state); 
+ } + + // In a Glushkov NFA, state 0 is always the start state + let start_state = 0; + + // Verify that state 0 exists in the NFA + if !all_nfa_states.contains(&start_state) { + panic!("Expected start state 0 not found in NFA states: {all_nfa_states:?}"); + } + + let start_state_set = { + let mut set = HashSet::new(); + set.insert(start_state); + set + }; + + // Create initial DFA state + let start_dfa_state = next_dfa_state_id; + next_dfa_state_id += 1; + + let start_state_key = set_to_sorted_vec(&start_state_set); + nfa_states_to_dfa_state.insert(start_state_key, start_dfa_state); + work_queue.push_back(start_state_set); + + // Process each DFA state + while let Some(current_nfa_states) = work_queue.pop_front() { + let current_state_key = set_to_sorted_vec(¤t_nfa_states); + let current_dfa_state = nfa_states_to_dfa_state[¤t_state_key]; + + // Check if this DFA state should be accepting + if current_nfa_states + .iter() + .any(|&state| nfa.accepting_states.contains(&state)) + { + dfa_accepting_states.insert(current_dfa_state); + } + + // For each symbol in the alphabet + for &symbol in &alphabet { + let mut next_nfa_states = HashSet::new(); + + // Collect all states reachable from current_nfa_states via symbol + for &nfa_state in ¤t_nfa_states { + if let Some(target_states) = nfa.transitions.get(&(nfa_state, symbol)) { + for &target_state in target_states { + next_nfa_states.insert(target_state); + } + } + } + + // Skip if no transitions exist for this symbol + if next_nfa_states.is_empty() { + continue; + } + + // Get or create DFA state for this set of NFA states + let next_state_key = set_to_sorted_vec(&next_nfa_states); + let next_dfa_state = + if let Some(&existing_state) = nfa_states_to_dfa_state.get(&next_state_key) { + existing_state + } else { + let new_state = next_dfa_state_id; + next_dfa_state_id += 1; + + nfa_states_to_dfa_state.insert(next_state_key.clone(), new_state); + work_queue.push_back(next_nfa_states); + + new_state + }; + + // Add 
transition to DFA + dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state); + } + } + + GlushkovDfa { + transitions: dfa_transitions, + accepting_states: dfa_accepting_states, + } } +// fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa { +// let mut dfa_transitions = HashMap::new(); +// let mut dfa_accepting_states = HashSet::new(); +// +// // Map from DFA state ID to the set of NFA states it represents +// let mut dfa_state_to_nfa_states: HashMap> = HashMap::new(); +// // Map from sorted vector of NFA states to DFA state ID (for hashable key) +// let mut nfa_states_to_dfa_state: HashMap, u32> = HashMap::new(); +// +// let mut next_dfa_state_id = 0u32; +// let mut work_queue = VecDeque::new(); +// +// // Helper function to convert HashSet to sorted Vec for use as HashMap key +// let set_to_sorted_vec = |set: &HashSet| -> Vec { +// let mut vec: Vec = set.iter().cloned().collect(); +// vec.sort_unstable(); +// vec +// }; +// +// // Get all possible input symbols from NFA transitions +// let alphabet: HashSet = nfa.transitions.keys().map(|(_, symbol)| *symbol).collect(); +// +// // Find the start state (assuming state 0 is the start state) +// let start_state_set = { +// let mut set = HashSet::new(); +// set.insert(0u32); +// set +// }; +// +// // Create initial DFA state +// let start_dfa_state = next_dfa_state_id; +// next_dfa_state_id += 1; +// +// let start_state_key = set_to_sorted_vec(&start_state_set); +// dfa_state_to_nfa_states.insert(start_dfa_state, start_state_set.clone()); +// nfa_states_to_dfa_state.insert(start_state_key, start_dfa_state); +// work_queue.push_back(start_state_set); +// +// // Process each DFA state +// while let Some(current_nfa_states) = work_queue.pop_front() { +// let current_state_key = set_to_sorted_vec(¤t_nfa_states); +// let current_dfa_state = nfa_states_to_dfa_state[¤t_state_key]; +// +// // Check if this DFA state should be accepting +// if current_nfa_states +// .iter() +// .any(|&state| 
nfa.accepting_states.contains(&state)) +// { +// dfa_accepting_states.insert(current_dfa_state); +// } +// +// // For each symbol in the alphabet +// for &symbol in &alphabet { +// let mut next_nfa_states = HashSet::new(); +// +// // Collect all states reachable from current_nfa_states via symbol +// for &nfa_state in ¤t_nfa_states { +// if let Some(target_states) = nfa.transitions.get(&(nfa_state, symbol)) { +// for &target_state in target_states { +// next_nfa_states.insert(target_state); +// } +// } +// } +// +// // Skip if no transitions exist for this symbol +// if next_nfa_states.is_empty() { +// continue; +// } +// +// // Get or create DFA state for this set of NFA states +// let next_state_key = set_to_sorted_vec(&next_nfa_states); +// let next_dfa_state = +// if let Some(&existing_state) = nfa_states_to_dfa_state.get(&next_state_key) { +// existing_state +// } else { +// let new_state = next_dfa_state_id; +// next_dfa_state_id += 1; +// +// dfa_state_to_nfa_states.insert(new_state, next_nfa_states.clone()); +// nfa_states_to_dfa_state.insert(next_state_key, new_state); +// work_queue.push_back(next_nfa_states); +// +// new_state +// }; +// +// // Add transition to DFA +// dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state); +// } +// } +// +// GlushkovDfa { +// transitions: dfa_transitions, +// accepting_states: dfa_accepting_states, +// } +// } + #[cfg(test)] mod tests { use super::*; #[test] - fn test_single_character() { + fn test_index_single_character() { let expected = HashMap::from([(0, ('a', SymbolType::Normal, 0))]); let result = index_states("a"); @@ -296,7 +447,41 @@ mod tests { } #[test] - fn test_kleene_star() { + fn test_nfa_single_character() { + let expected_finite = HashSet::from([0]); + let expected_transitions: HashMap<(u32, char), Vec> = + HashMap::from([((1, 'a'), vec![0])]); + + let result = glushkov_construction("a"); + assert_eq!( + result.transitions, expected_transitions, + "Mismatch in single character test" + ); 
+ assert_eq!( + result.accepting_states, expected_finite, + "Mismatch in single character test" + ); + } + + #[test] + fn test_nfa_single_character_kleene_star() { + let expected_finite = HashSet::from([0]); + let expected_transitions: HashMap<(u32, char), Vec> = + HashMap::from([((0, 'a'), vec![0]), ((1, 'a'), vec![0])]); + + let result = glushkov_construction("a*"); + assert_eq!( + result.transitions, expected_transitions, + "Mismatch in single character test" + ); + assert_eq!( + result.accepting_states, expected_finite, + "Mismatch in single character test" + ); + } + + #[test] + fn test_index_kleene_star() { let expected = HashMap::from([(0, ('a', SymbolType::KleeneStar, 0))]); let result = index_states("a*"); @@ -304,7 +489,7 @@ mod tests { } #[test] - fn test_union_and_groups() { + fn test_index_union_and_groups() { let expected = HashMap::from([ (0, ('a', SymbolType::Normal, 1)), (1, ('b', SymbolType::Normal, 2)), @@ -315,7 +500,7 @@ mod tests { } #[test] - fn test_escaped_character() { + fn test_index_escaped_character() { let expected = HashMap::from([(0, ('a', SymbolType::Escaped, 0))]); let result = index_states("\\a"); @@ -323,7 +508,7 @@ mod tests { } #[test] - fn test_mixed_regex() { + fn test_index_mixed_regex() { let expected = HashMap::from([ (0, ('a', SymbolType::Normal, 0)), (1, ('*', SymbolType::Escaped, 0)), @@ -342,7 +527,7 @@ mod tests { } #[test] - fn test_too_many_brackets() { + fn test_index_too_many_brackets() { let expected = HashMap::from([ (0, ('a', SymbolType::KleeneStar, 0)), (1, ('b', SymbolType::Normal, 0)), @@ -357,25 +542,185 @@ mod tests { } #[test] - fn test_fill_sets() { + fn test_fill_sets_too_many_brackets() { let states = index_states("a*b|(c|d)|ef"); - let mut start_states: HashSet = HashSet::new(); let mut finite_states: HashSet = HashSet::new(); let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); - let expected_start_set: HashSet = HashSet::from([0, 1, 2, 3, 4]); - let expected_finite_set: HashSet = 
HashSet::new(); - let expected_transions: HashMap<(u32, char), Vec> = HashMap::new(); + let expected_finite_set: HashSet = HashSet::from([1, 2, 3, 5]); + let expected_transitions: HashMap<(u32, char), Vec> = HashMap::from([ + ((6, 'a'), vec![0]), + ((6, 'b'), vec![1]), + ((6, 'c'), vec![2]), + ((6, 'd'), vec![3]), + ((6, 'e'), vec![4]), + ((0, 'a'), vec![0, 1]), + ((4, 'e'), vec![5]), + ]); - fill_sets( - states, - &mut start_states, - &mut finite_states, - &mut transitions, - ); + fill_sets(states, &mut finite_states, &mut transitions); - assert_eq!(start_states, expected_start_set); assert_eq!(finite_states, expected_finite_set); - assert_eq!(transitions, expected_transions); + assert_eq!(transitions, expected_transitions); + } + + #[test] + fn test_fill_sets_complex() { + let states = index_states("a*b*c|d*e"); + let mut finite_states: HashSet = HashSet::new(); + let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); + + let expected_finite_set: HashSet = HashSet::from([2, 4]); + let expected_transitions = HashMap::from([ + ((5, 'a'), vec![0]), + ((5, 'b'), vec![1]), + ((5, 'c'), vec![2]), + ((5, 'd'), vec![3]), + ((5, 'e'), vec![4]), + ((0, 'a'), vec![0, 1, 2]), + ((1, 'b'), vec![1, 2]), + ((3, 'd'), vec![3, 4]), + ]); + + fill_sets(states, &mut finite_states, &mut transitions); + + assert_eq!(finite_states, expected_finite_set); + assert_eq!(transitions, expected_transitions); + } + + #[test] + fn nfa_to_dfa_simple_test() { + // NFA that accepts exactly "a" + // State 0 --a--> State 1 (accepting) + let input_nfa = Nfa { + transitions: HashMap::from([((0, 'a'), vec![1])]), + accepting_states: HashSet::from([1]), + }; + + let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); + + let expected_transitions = HashMap::from([((0, 'a'), 1)]); + let expected_accepting_states = HashSet::from([1]); + + assert_eq!(expected_transitions, generated_dfa.transitions); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + } + + #[test] + fn 
nfa_to_dfa_sequence_test() { + // NFA that accepts exactly "ab" + // State 0 --a--> State 1 --b--> State 2 (accepting) + let input_nfa = Nfa { + transitions: HashMap::from([((0, 'a'), vec![1]), ((1, 'b'), vec![2])]), + accepting_states: HashSet::from([2]), + }; + + let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); + + let expected_transitions = HashMap::from([((0, 'a'), 1), ((1, 'b'), 2)]); + let expected_accepting_states = HashSet::from([2]); + + assert_eq!(expected_transitions, generated_dfa.transitions); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + } + + #[test] + fn nfa_to_dfa_alternation_test() { + // NFA that accepts "a" or "b" + // State 0 --a--> State 1 (accepting) + // State 0 --b--> State 2 (accepting) + let input_nfa = Nfa { + transitions: HashMap::from([((0, 'a'), vec![1]), ((0, 'b'), vec![2])]), + accepting_states: HashSet::from([1, 2]), + }; + + let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); + + let expected_transitions = [ + HashMap::from([((0, 'a'), 1), ((0, 'b'), 2)]), + HashMap::from([((0, 'a'), 2), ((0, 'b'), 1)]), + ]; + let expected_accepting_states = HashSet::from([1, 2]); + + assert!( + generated_dfa.transitions == expected_transitions[0] + || generated_dfa.transitions == expected_transitions[1], + "generated_dfa.transitions did not match either expected set" + ); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + } + + #[test] + fn nfa_to_dfa_nondeterministic_test() { + // NFA with nondeterministic transition + // State 0 --a--> State 1, State 2 + // State 1 --b--> State 3 (accepting) + // State 2 --c--> State 3 (accepting) + let input_nfa = Nfa { + transitions: HashMap::from([ + ((0, 'a'), vec![1, 2]), + ((1, 'b'), vec![3]), + ((2, 'c'), vec![3]), + ]), + accepting_states: HashSet::from([3]), + }; + + let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); + + // After 'a' from state 0, we should be in a state representing {1, 2} + // Let's call this combined state "1" in the 
DFA + let expected_transitions = HashMap::from([ + ((0, 'a'), 1), // {0} --a--> {1,2} (DFA state 1) + ((1, 'b'), 2), // {1,2} --b--> {3} (DFA state 2) + ((1, 'c'), 2), // {1,2} --c--> {3} (DFA state 2) + ]); + let expected_accepting_states = HashSet::from([2]); // DFA state 2 represents {3} + + assert_eq!(expected_transitions, generated_dfa.transitions); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + } + + #[test] + fn nfa_to_dfa_multiple_accepting_test() { + // NFA where multiple paths lead to accepting states + // State 0 --a--> State 1 (accepting) + // State 0 --a--> State 2 --b--> State 3 (accepting) + let input_nfa = Nfa { + transitions: HashMap::from([((0, 'a'), vec![1, 2]), ((2, 'b'), vec![3])]), + accepting_states: HashSet::from([1, 3]), + }; + + let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); + + // After 'a' from state 0, we're in state representing {1, 2} + // This should be accepting because it contains state 1 + let expected_transitions = HashMap::from([ + ((0, 'a'), 1), // {0} --a--> {1,2} (DFA state 1) + ((1, 'b'), 2), // {1,2} --b--> {3} (DFA state 2) + ]); + let expected_accepting_states = HashSet::from([1, 2]); // Both DFA states are accepting + + assert_eq!(expected_transitions, generated_dfa.transitions); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + } + + #[test] + fn nfa_to_dfa_self_loop_test() { + // NFA with self-loop: accepts a* + // State 0 (accepting) --a--> State 0 + let input_nfa = Nfa { + transitions: HashMap::from([((0, 'a'), vec![0])]), + accepting_states: HashSet::from([0]), + }; + + let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); + + let expected_transitions = HashMap::from([ + ((0, 'a'), 0), // Self-loop + ]); + let expected_accepting_states = HashSet::from([0]); + + assert_eq!(expected_transitions, generated_dfa.transitions); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); } } diff --git a/src/lib.rs b/src/lib.rs index 7d17879..a57b628 
100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ use crate::{glushkov::GlushkovDfa, thompson::ThompsonDfa}; -use std::collections::{HashMap, HashSet}; +use std::collections::{HashMap, HashSet, VecDeque}; mod glushkov; mod thompson; @@ -8,6 +8,166 @@ trait Dfa { fn new(regex: &str) -> Self; fn get_transitions(&self) -> &HashMap<(u32, char), u32>; fn get_accepting_states(&self) -> &HashSet; + fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32>; + fn get_accepting_states_mut(&mut self) -> &mut HashSet; + fn optimise_dfa(&mut self) { + let mut partition: HashMap = HashMap::new(); + let mut accepting_states_set: HashSet = self.get_accepting_states().clone(); + let mut non_accepting_states: HashSet = HashSet::new(); + let mut all_states: HashSet = HashSet::new(); + + for &(state, _) in self.get_transitions().keys() { + all_states.insert(state); + if self.get_accepting_states().contains(&state) { + accepting_states_set.insert(state); + } else { + non_accepting_states.insert(state); + } + } + + for state in self.get_accepting_states().iter() { + all_states.insert(*state); + } + + for state in all_states.iter() { + if self.get_accepting_states().contains(state) { + partition.insert(*state, 0); + } else { + partition.insert(*state, 1); + } + } + + let mut partition_list: Vec> = Vec::new(); + partition_list.push(accepting_states_set); + partition_list.push(non_accepting_states); + + let mut worklist: VecDeque = VecDeque::new(); + if !partition_list[0].is_empty() { + worklist.push_back(0); + } + if partition_list.len() > 1 && !partition_list[1].is_empty() { + worklist.push_back(1); + } + + while let Some(current_partition_index) = worklist.pop_front() { + let mut states_to_check: HashMap> = HashMap::new(); + for (&(source_state, symbol), &target_state) in self.get_transitions() { + if partition[&target_state] == current_partition_index { + states_to_check + .entry(symbol) + .or_default() + .insert(source_state); + } + } + + for (_, states_to_split) in 
states_to_check.iter() { + let mut partitions_to_split: HashSet = HashSet::new(); + + for &state in states_to_split.iter() { + let partition_index = partition[&state]; + if partition_list[partition_index].len() > 1 { + partitions_to_split.insert(partition_index); + } + } + + for &partition_index_to_split in partitions_to_split.iter() { + let mut intersection: HashSet = HashSet::new(); + let mut difference: HashSet = HashSet::new(); + + for &state in partition_list[partition_index_to_split].iter() { + if states_to_split.contains(&state) { + intersection.insert(state); + } else { + difference.insert(state); + } + } + + if !intersection.is_empty() && !difference.is_empty() { + let new_partition_index = partition_list.len(); + + for &state in intersection.iter() { + partition.insert(state, new_partition_index); + } + + partition_list.push(intersection); + + for &state in &difference { + partition.insert(state, partition_index_to_split); + } + partition_list[partition_index_to_split] = difference; + + if partition_list[new_partition_index].len() + < partition_list[partition_index_to_split].len() + { + worklist.push_back(new_partition_index); + } else { + worklist.push_back(partition_index_to_split); + } + } + } + } + } + + // Build new transitions and accepting states + let mut minimal_transitions: HashMap<(u32, char), u32> = HashMap::new(); + let mut minimal_accepting_states: HashSet = HashSet::new(); + let mut new_state_map: HashMap = HashMap::new(); + + let mut next_state_id: u32 = 0; + + if let Some(partition_index) = partition.get(&0) { + new_state_map.insert(*partition_index, next_state_id); + next_state_id += 1; + } + + for (_, &partition_index) in partition.iter() { + if let std::collections::hash_map::Entry::Vacant(e) = + new_state_map.entry(partition_index) + { + e.insert(next_state_id); + next_state_id += 1; + } + } + + for (original_state, &partition_index) in partition.iter() { + let new_state_id = new_state_map[&partition_index]; + if 
self.get_accepting_states().contains(original_state) { + minimal_accepting_states.insert(new_state_id); + } + } + + for (&(source_state, symbol), &target_state) in self.get_transitions() { + let source_partition = partition[&source_state]; + let target_partition = partition[&target_state]; + + let new_source_state = new_state_map[&source_partition]; + let new_target_state = new_state_map[&target_partition]; + + minimal_transitions.insert((new_source_state, symbol), new_target_state); + } + + // Modify the existing DFA in-place + *self.get_transitions_mut() = minimal_transitions; + *self.get_accepting_states_mut() = minimal_accepting_states; + } + + /// Determines if the given input string exactly matches the regex pattern. + /// + /// This function processes the input as though it is surrounded by start (`^`) and + /// end (`$`) position anchors, ensuring that the entire input must conform to the pattern. + /// + /// # Parameters + /// + /// - `input`: A string slice representing the text to be checked against the regex. + /// + /// # Returns + /// + /// Returns `true` if the entire input string matches the regex pattern exactly, + /// considering implicit start and end anchors. + /// + /// e.g., for the regex pattern "(a|b)*", the function checks if the input matches + /// the pattern from start to finish, equivalent to "^(a|b)*$". 
+ /// fn process(&self, input: &str) -> bool { let mut current_state = 0; for c in input.chars() { @@ -26,7 +186,6 @@ trait Dfa { let mut current_state = 0; let mut match_start = None; let mut match_end = None; - let mut found_match = false; for (i, c) in text.chars().enumerate().skip(start_pos) { if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) { @@ -34,13 +193,8 @@ trait Dfa { match_start = match_start.or(Some(i)); if self.get_accepting_states().contains(¤t_state) { - found_match = true; match_end = Some(i) } - - if i == text.len() - 1 && found_match { - break; - } } else { break; } @@ -64,7 +218,6 @@ trait Dfa { let mut current_state = 0; let mut match_start: Option = None; let mut match_end: Option = None; - let mut found_match = false; for (i, c) in input.chars().enumerate().skip(start_pos) { if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) { @@ -73,11 +226,6 @@ trait Dfa { if self.get_accepting_states().contains(¤t_state) { match_end = Some(i); - found_match = true; - } - - if i == input.len() - 1 && found_match { - break; } } else { break; @@ -86,7 +234,7 @@ trait Dfa { if let (Some(start), Some(end)) = (match_start, match_end) { matches.push(&input[start..=end]); - start_pos = end; + start_pos = end + 1; } else { start_pos += 1; } @@ -119,6 +267,29 @@ impl Regex { Regex { dfa: dfa_type } } + /// Determines if the provided `text` is an exact match for the regex pattern. + /// + /// This method interprets the regex pattern as though it is bracketed by start (`^`) + /// and end (`$`) anchors, requiring the entire `text` to conform to the pattern. + /// + /// # Parameters + /// + /// - `text`: A string slice that represents the text to be verified against the regex. + /// + /// # Returns + /// + /// Returns `true` if the `text` completely matches the regex pattern encompassed by implicit + /// anchors, otherwise returns `false`. 
+ /// + /// # Example + /// + /// ```rust + /// use regex_engine::{Regex, ConstructionType}; + /// + /// let regex = Regex::new("(a|b)*", ConstructionType::Thompson); + /// assert!(regex.is_match("abba")); + /// assert!(!regex.is_match("abc")); + /// ``` pub fn is_match(&self, text: &str) -> bool { match &self.dfa { DfaType::Thompson(dfa) => dfa.process(text), @@ -126,6 +297,31 @@ impl Regex { } } + /// Searches for the first occurrence of a sequence in `text` that matches the regex pattern. + /// + /// This method locates and returns the first substring of `text` that matches the regex, + /// if such a substring exists. + /// + /// # Parameters + /// + /// - `text`: A string slice in which to search for the regex pattern. + /// + /// # Returns + /// + /// Returns an `Option<&str>` which contains the first matching substring if a match is found, + /// or `None` if no match occurs. + /// + /// # Example + /// + /// ```rust + /// use regex_engine::{Regex, ConstructionType}; + /// + /// let regex = Regex::new("ab+", ConstructionType::Thompson); + /// if let Some(matched) = regex.find("aabbcc") { + /// println!("Found: {}", matched); + /// } + /// // Output: Found: abb + /// ``` pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> { match &self.dfa { DfaType::Thompson(dfa) => dfa.find_first_match(text), @@ -147,16 +343,15 @@ pub fn is_valid_regex(regex: &str) -> bool { } let mut open_paren_count = 0; - let mut last_was_quantifier = false; + let mut last_was_quantifier = true; let mut chars = regex.chars().peekable(); while let Some(c) = chars.next() { match c { '(' => { open_paren_count += 1; - last_was_quantifier = false; + last_was_quantifier = true; } - ')' => { if open_paren_count == 0 { return false; @@ -164,23 +359,13 @@ pub fn is_valid_regex(regex: &str) -> bool { open_paren_count -= 1; last_was_quantifier = false; } - '*' | '+' => { // Ensure quantifiers are not the first character and are not repeated - if last_was_quantifier || regex.starts_with('*') || 
regex.starts_with('+') { + if last_was_quantifier { return false; } last_was_quantifier = true; } - - '|' => { - // Ensure alternation isn't the first or last character - if regex.starts_with('|') || chars.peek().is_none() { - return false; - } - last_was_quantifier = false; - } - '\\' => { // Handle escaped characters: ensure there's a character after the escape if chars.peek().is_none() { @@ -203,7 +388,6 @@ pub fn normalise_regex(regex: &str) -> String { let mut normalised = String::new(); let mut escape_sequence = false; let mut prev_char = '\0'; - for curr_char in regex.chars() { if escape_sequence { // TODO: Implement further parsing features here (e.g. \w \d) @@ -212,24 +396,45 @@ pub fn normalise_regex(regex: &str) -> String { prev_char = curr_char; continue; } - if curr_char == '\\' { escape_sequence = true; normalised.push(curr_char); continue; } - if curr_char == '+' { - normalised.push(prev_char); + match prev_char { + ')' => { + let mut balance = 0; + let mut group_start = 0; + + for j in (0..normalised.len()).rev() { + let ch = normalised.chars().nth(j).unwrap(); + if ch == ')' { + balance += 1; + } else if ch == '(' { + balance -= 1; + if balance == 0 { + group_start = j; + break; + } + } + } + + let group = String::from(&normalised[group_start..normalised.len()]); + normalised.push_str(&group); + } + _ => { + normalised.push(prev_char); + } + } normalised.push('*'); - prev_char = curr_char; + prev_char = '*'; continue; } if curr_char == '?' { match prev_char { ')' => { let mut balance = 0; - for j in (0..normalised.len()).rev() { let ch = normalised.chars().nth(j).unwrap(); if ch == ')' { @@ -247,20 +452,18 @@ pub fn normalise_regex(regex: &str) -> String { normalised.insert(normalised.len() - 1, '('); } } - normalised.push_str("|())"); - prev_char = curr_char; + normalised.push_str("|)"); + prev_char = ')'; continue; } if curr_char == '.' 
{ normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"); - prev_char = curr_char; + prev_char = ')'; continue; } - normalised.push(curr_char); prev_char = curr_char; } - normalised } @@ -297,7 +500,7 @@ mod tests { #[test] fn invalid_operator_placement_test() { let regex1 = "*a"; - let regex2 = "|a|b"; + let regex2 = "(+abc|x)"; assert!( !is_valid_regex(regex1), "Expected invalid regex (invalid quantifier placement)." @@ -340,9 +543,9 @@ mod tests { let cases = [ (r"a+", r"aa*"), (r"a\+", r"a\+"), - (r"a?", r"(a|())"), + (r"a?", r"(a|)"), (r"a\?", r"a\?"), - (r"(ab)?", r"((ab)|())"), + (r"(ab)?", r"((ab)|)"), ( r".", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)", @@ -357,9 +560,9 @@ mod tests { #[test] fn is_match_test() { - let regex_object = Regex::new("(a|b)*", ConstructionType::Thompson); + let regex_object = Regex::new("a(a|b)*", ConstructionType::Thompson); - let success_strings = vec!["abababaaaababa", ""]; + let success_strings = vec!["abababaaaababa", "a"]; for string in success_strings { assert!(regex_object.is_match(string)); } diff --git a/src/thompson.rs b/src/thompson.rs index 5f0ede8..dcf54d7 100644 --- a/src/thompson.rs +++ b/src/thompson.rs @@ -1,5 +1,5 @@ use crate::{Dfa, is_valid_regex, normalise_regex}; -use std::collections::{HashMap, HashSet, VecDeque}; +use std::collections::{HashMap, HashSet}; struct Nfa { transitions: HashMap<(u32, Option), Vec>, @@ -19,8 +19,9 @@ impl Dfa for ThompsonDfa { let normalised_regex = normalise_regex(regex); let regex_nfa: Nfa = thompson_construction(&normalised_regex); - let regex_dfa = nfa_to_dfa(®ex_nfa); - optimise_dfa(®ex_dfa) + let mut regex_dfa = nfa_to_dfa(®ex_nfa); + 
::optimise_dfa(&mut regex_dfa); + regex_dfa } fn get_transitions(&self) -> &HashMap<(u32, char), u32> { @@ -30,6 +31,14 @@ impl Dfa for ThompsonDfa { fn get_accepting_states(&self) -> &HashSet { &self.accepting_states } + + fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32> { + &mut self.transitions + } + + fn get_accepting_states_mut(&mut self) -> &mut HashSet { + &mut self.accepting_states + } } // THOMPSON CONSTRUCTION --- @@ -46,7 +55,7 @@ fn thompson_construction(normalised_regex: &str) -> Nfa { let nfa_left = nfa_stack.pop().expect("Expected NFA for concatenation"); nfa_stack.push(concatenate(&nfa_left, &nfa_right)); } - _ => panic!("Unknown operator {operator:?}"), + _ => unreachable!("Unknown operator {}", operator), } } @@ -65,6 +74,7 @@ fn thompson_construction(normalised_regex: &str) -> Nfa { escape_sequence = false; continue; } + match symbol { '(' => { if concat_flag { @@ -74,17 +84,24 @@ fn thompson_construction(normalised_regex: &str) -> Nfa { concat_flag = false; } ')' => { - let mut is_epsilon = true; + // If concat_flag is false, we have an empty right operand for union + if !concat_flag { + nfa_stack.push(create_basic_epsilon_nfa()); + } + + // Process all operators until we hit the matching '(' while let Some(op) = operators.pop() { - if op == '(' && is_epsilon { - nfa_stack.push(create_basic_epsilon_nfa()); - break; - } else if op == '(' { + if op == '(' { break; } - is_epsilon = false; apply_operator(&mut nfa_stack, op); } + + // If stack is empty after processing, we had completely empty parentheses + if nfa_stack.is_empty() { + nfa_stack.push(create_basic_epsilon_nfa()); + } + concat_flag = true; } '*' => { @@ -93,6 +110,20 @@ fn thompson_construction(normalised_regex: &str) -> Nfa { concat_flag = true; } '|' => { + // Process all concatenation operators (higher precedence than union) + while let Some(&op) = operators.last() { + if op == '(' || op == '|' { + break; + } + operators.pop(); + apply_operator(&mut nfa_stack, op); 
+ } + + // If we have no operand for the left side of union, create epsilon + if !concat_flag { + nfa_stack.push(create_basic_epsilon_nfa()); + } + operators.push('|'); concat_flag = false; } @@ -109,12 +140,26 @@ fn thompson_construction(normalised_regex: &str) -> Nfa { } } + // Handle case where regex ends with '|' (empty right operand) + if let Some(&'|') = operators.last() { + if nfa_stack.len() < 2 { + nfa_stack.push(create_basic_epsilon_nfa()); + } + } + + // Process remaining operators while let Some(op) = operators.pop() { + if op == '(' { + panic!("Unmatched opening parenthesis"); + } apply_operator(&mut nfa_stack, op); } if nfa_stack.len() != 1 { - panic!("Invalid Regex, unexpected final NFA stack size"); + panic!( + "Invalid Regex, unexpected final NFA stack size: {}", + nfa_stack.len() + ); } nfa_stack.pop().unwrap() @@ -321,145 +366,6 @@ fn nfa_to_dfa(nfa: &Nfa) -> ThompsonDfa { } // END NFA to DFA functions --- -fn optimise_dfa(dfa: &ThompsonDfa) -> ThompsonDfa { - let mut partition: HashMap = HashMap::new(); - let mut accepting_states_set: HashSet = dfa.accepting_states.clone(); - let mut non_accepting_states: HashSet = HashSet::new(); - let mut all_states: HashSet = HashSet::new(); - - for &(state, _) in dfa.transitions.keys() { - all_states.insert(state); - if dfa.accepting_states.contains(&state) { - accepting_states_set.insert(state); - } else { - non_accepting_states.insert(state); - } - } - - for state in dfa.accepting_states.iter() { - all_states.insert(*state); - } - - for state in all_states.iter() { - if dfa.accepting_states.contains(state) { - partition.insert(*state, 0); - } else { - partition.insert(*state, 1); - } - } - - let mut partition_list: Vec> = Vec::new(); - partition_list.push(accepting_states_set); - partition_list.push(non_accepting_states); - - let mut worklist: VecDeque = VecDeque::new(); - if !partition_list[0].is_empty() { - worklist.push_back(0); - } - if partition_list.len() > 1 && !partition_list[1].is_empty() { - 
worklist.push_back(1); - } - - while let Some(current_partition_index) = worklist.pop_front() { - let mut states_to_check: HashMap> = HashMap::new(); - for (&(source_state, symbol), &target_state) in &dfa.transitions { - if partition[&target_state] == current_partition_index { - states_to_check - .entry(symbol) - .or_default() - .insert(source_state); - } - } - - for (_, states_to_split) in states_to_check.iter() { - let mut partitions_to_split: HashSet = HashSet::new(); - - for &state in states_to_split.iter() { - let partition_index = partition[&state]; - if partition_list[partition_index].len() > 1 { - partitions_to_split.insert(partition_index); - } - } - - for &partition_index_to_split in partitions_to_split.iter() { - let mut intersection: HashSet = HashSet::new(); - let mut difference: HashSet = HashSet::new(); - - for &state in partition_list[partition_index_to_split].iter() { - if states_to_split.contains(&state) { - intersection.insert(state); - } else { - difference.insert(state); - } - } - - if !intersection.is_empty() && !difference.is_empty() { - let new_partition_index = partition_list.len(); - - for &state in intersection.iter() { - partition.insert(state, new_partition_index); - } - - partition_list.push(intersection); - - for &state in &difference { - partition.insert(state, partition_index_to_split); - } - partition_list[partition_index_to_split] = difference; - - if partition_list[new_partition_index].len() - < partition_list[partition_index_to_split].len() - { - worklist.push_back(new_partition_index); - } else { - worklist.push_back(partition_index_to_split); - } - } - } - } - } - - let mut minimal_transitions: HashMap<(u32, char), u32> = HashMap::new(); - let mut minimal_accepting_states: HashSet = HashSet::new(); - let mut new_state_map: HashMap = HashMap::new(); - - let mut next_state_id: u32 = 0; - - if let Some(partition_index) = partition.get(&0) { - new_state_map.insert(*partition_index, next_state_id); - next_state_id += 1; - } - - for 
(_, &partition_index) in partition.iter() { - if let std::collections::hash_map::Entry::Vacant(e) = new_state_map.entry(partition_index) { - e.insert(next_state_id); - next_state_id += 1; - } - } - - for (original_state, &partition_index) in partition.iter() { - let new_state_id = new_state_map[&partition_index]; - if dfa.accepting_states.contains(original_state) { - minimal_accepting_states.insert(new_state_id); - } - } - - for (&(source_state, symbol), &target_state) in &dfa.transitions { - let source_partition = partition[&source_state]; - let target_partition = partition[&target_state]; - - let new_source_state = new_state_map[&source_partition]; - let new_target_state = new_state_map[&target_partition]; - - minimal_transitions.insert((new_source_state, symbol), new_target_state); - } - - ThompsonDfa { - transitions: minimal_transitions, - accepting_states: minimal_accepting_states, - } -} - #[cfg(test)] mod tests { use super::*; @@ -482,6 +388,13 @@ mod tests { expected_accepting_states_2, generated_dfa_2.accepting_states ); + + let generated_dfa = ThompsonDfa::new("a*b"); + let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 1)]); + let expected_accepting_states = HashSet::from([1]); + + assert_eq!(expected_transitions, generated_dfa.transitions); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); } #[test] diff --git a/tests/glushkov_test.rs b/tests/glushkov_test.rs new file mode 100644 index 0000000..20c5306 --- /dev/null +++ b/tests/glushkov_test.rs @@ -0,0 +1,17 @@ +include!("../benches/bench_cases.rs"); +use regex_engine::{ConstructionType, Regex}; + +#[test] +fn test_all_bench_cases() { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Glushkov); + assert_eq!(regex.is_match(&case.input), case.expected_is_match); + assert_eq!( + regex.find(&case.input), + case.expected_first_match.as_deref() + ); + assert_eq!(regex.findall(&case.input), 
case.expected_all_matches); + } +} diff --git a/tests/rust_regex_test.rs b/tests/rust_regex_test.rs new file mode 100644 index 0000000..9142f6a --- /dev/null +++ b/tests/rust_regex_test.rs @@ -0,0 +1,27 @@ +include!("../benches/bench_cases.rs"); +use regex::Regex; + +#[test] +fn test_all_bench_cases() { + let cases = get_bench_cases(); + + for case in &cases { + let match_regex = Regex::new(format!("^{}$", case.regex).as_str()) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + let regex = Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + assert_eq!(match_regex.is_match(&case.input), case.expected_is_match); + assert_eq!( + regex.find(&case.input).map(|s| s.as_str()), + case.expected_first_match.as_deref() + ); + assert_eq!( + regex + .find_iter(&case.input) + .map(|s| s.as_str()) + .collect::>(), + case.expected_all_matches + ); + } +} diff --git a/tests/test_one.rs b/tests/test_one.rs deleted file mode 100644 index a166e8a..0000000 --- a/tests/test_one.rs +++ /dev/null @@ -1,49 +0,0 @@ -use regex_engine::regex_engine::{ConstructionType, Regex}; - -#[test] -fn test_escape_sequence_plus() { - let pattern = r"a*b\+"; - let text = "aaab+b"; // should fail on match - let text_success = "aaab+"; - - let engine = Regex::new(pattern, ConstructionType::Thompson); - - let expected_match = text_success; - - assert!(!engine.is_match(text)); - assert!(engine.is_match(text_success)); - assert_eq!(engine.find(text), Some(expected_match)); - assert_eq!(engine.findall(text), vec![expected_match]); -} - -#[test] -fn test_escape_sequence_slash() { - let pattern = r"a*b\\"; - let text = "aaab\\b"; // should fail on match - let text_success = "aaab\\"; - - let engine = Regex::new(pattern, ConstructionType::Thompson); - - let expected_match = text_success; - - assert!(!engine.is_match(text)); - assert!(engine.is_match(text_success)); - assert_eq!(engine.find(text), Some(expected_match)); - 
assert_eq!(engine.findall(text), vec![expected_match]); -} - -#[test] -fn test_dot_wildcard() { - let pattern = r"a.*"; - let text = "cabbc"; // should fail on match - let text_success = "abbc"; - - let engine = Regex::new(pattern, ConstructionType::Thompson); - - let expected_match = text_success; - - assert!(!engine.is_match(text)); - assert!(engine.is_match(text_success)); - assert_eq!(engine.find(text), Some(expected_match)); - assert_eq!(engine.findall(text), vec![expected_match]); -} diff --git a/tests/thompson_test.rs b/tests/thompson_test.rs new file mode 100644 index 0000000..d2cce3a --- /dev/null +++ b/tests/thompson_test.rs @@ -0,0 +1,18 @@ +include!("../benches/bench_cases.rs"); +use regex_engine::{ConstructionType, Regex}; + +#[test] +fn test_all_bench_cases() { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Thompson); + + assert_eq!(regex.is_match(&case.input), case.expected_is_match); + assert_eq!( + regex.find(&case.input), + case.expected_first_match.as_deref() + ); + assert_eq!(regex.findall(&case.input), case.expected_all_matches); + } +} From 58fd71847e1928593fe2ba5445152b836548976c Mon Sep 17 00:00:00 2001 From: Pepe Hanisch <142326461+Testspieler09@users.noreply.github.com> Date: Mon, 18 Aug 2025 20:34:57 +0200 Subject: [PATCH 6/8] progress --- benches/regex_benchmark.rs | 22 ++++--- src/glushkov.rs | 117 +++++++++++++++++++++++++++++-------- src/lib.rs | 18 +++--- src/thompson.rs | 22 +++---- tests/glushkov_test.rs | 2 +- tests/thompson_test.rs | 2 +- 6 files changed, 130 insertions(+), 53 deletions(-) diff --git a/benches/regex_benchmark.rs b/benches/regex_benchmark.rs index 8629579..7105fa5 100644 --- a/benches/regex_benchmark.rs +++ b/benches/regex_benchmark.rs @@ -13,7 +13,7 @@ fn benchmark_regex_compile_time(c: &mut Criterion) { &case.regex, |b, regex| { b.iter(|| { - Regex::new(regex, ConstructionType::Thompson); + let _ = Regex::new(regex, ConstructionType::Thompson); }) 
}, ); @@ -23,7 +23,7 @@ fn benchmark_regex_compile_time(c: &mut Criterion) { &case.regex, |b, regex| { b.iter(|| { - Regex::new(regex, ConstructionType::Glushkov); + let _ = Regex::new(regex, ConstructionType::Glushkov); }) }, ); @@ -47,8 +47,10 @@ fn benchmark_regex_is_match(c: &mut Criterion) { let mut group = c.benchmark_group("Regex Is Match"); for case in &cases { - let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson); - let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov); + let thompson_regex = + Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex"); + let glushkov_regex = + Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex"); let rust_regex = rust_regex::Regex::new(&format!("^{}$", case.regex)) .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); @@ -90,8 +92,10 @@ fn benchmark_regex_find_first(c: &mut Criterion) { let mut group = c.benchmark_group("Regex Find First"); for case in &cases { - let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson); - let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov); + let thompson_regex = + Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex"); + let glushkov_regex = + Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex"); let rust_regex = rust_regex::Regex::new(case.regex) .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); @@ -133,8 +137,10 @@ fn benchmark_regex_find_all(c: &mut Criterion) { let mut group = c.benchmark_group("Regex Find All"); for case in &cases { - let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson); - let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov); + let thompson_regex = + Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex"); + let glushkov_regex = + Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex"); let rust_regex = 
rust_regex::Regex::new(case.regex) .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); diff --git a/src/glushkov.rs b/src/glushkov.rs index 1d75081..b0360dc 100644 --- a/src/glushkov.rs +++ b/src/glushkov.rs @@ -21,17 +21,19 @@ pub struct GlushkovDfa { } impl Dfa for GlushkovDfa { - fn new(regex: &str) -> Self { + fn new(regex: &str) -> Result { if !is_valid_regex(regex) { - panic!("{regex} is not a valid regular expression!"); + return Err("{regex} is not a valid regular expression!".to_string()); } let normalised_regex = normalise_regex(regex); let regex_nfa = glushkov_construction(&normalised_regex); dbg!(®ex_nfa); let mut regex_dfa = nfa_no_epsilon_to_dfa(®ex_nfa); + // dbg!(®ex_dfa); ::optimise_dfa(&mut regex_dfa); - regex_dfa + // dbg!(®ex_dfa); + Ok(regex_dfa) } fn get_transitions(&self) -> &HashMap<(u32, char), u32> { @@ -53,14 +55,12 @@ impl Dfa for GlushkovDfa { // GLUSHKOV CONSTRUCTION fn glushkov_construction(regex: &str) -> Nfa { - dbg!(®ex); let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); - let mut accepting_states: HashSet = HashSet::new(); + let accepting_states: HashSet = compute_accepting_states(regex); let states: HashMap = index_states(regex); - dbg!(&states); - fill_sets(states, &mut accepting_states, &mut transitions); + fill_sets(states, &mut transitions); Nfa { transitions, @@ -68,6 +68,59 @@ fn glushkov_construction(regex: &str) -> Nfa { } } +fn compute_accepting_states(regex: &str) -> HashSet { + // also need handling of escape sequence?! 
+ let mut accepting_states = HashSet::new(); + let mut number_of_accepting_states_in_group = 0; + let mut group_is_exhausted = false; + let mut last_element_was_seperator = false; + let num_unions = regex.chars().filter(|&c| matches!(c, '|')); + let mut position: u32 = regex + .chars() + .filter(|&c| !matches!(c, '(' | ')' | '|' | '*')) + .count() as u32; + + dbg!(regex); + + // check if after none ) a | is present + for ch in regex.chars().rev() { + match ch { + ')' => { + if !last_element_was_seperator { + group_is_exhausted = number_of_accepting_states_in_group != 0; + } + last_element_was_seperator = true; + } + '|' => { + group_is_exhausted = false; + last_element_was_seperator = true; + } + '(' => group_is_exhausted = true, + '*' => { + // Should account for ba* -> b and a are accepting + last_element_was_seperator = false; + } + _ => { + if position != 0 { + position -= 1; + } else { + break; + } + + if group_is_exhausted { + continue; + } + dbg!(&ch, &position); + accepting_states.insert(position); + number_of_accepting_states_in_group += 1; + group_is_exhausted = true; + } + } + } + + accepting_states +} + fn index_states(regex: &str) -> HashMap { let mut indexed_states: HashMap = HashMap::new(); let mut symbol_type: SymbolType = SymbolType::Normal; @@ -131,11 +184,12 @@ fn index_states(regex: &str) -> HashMap { indexed_states } +// TODO: remove the unused param later fn fill_sets( states: HashMap, - accepting_states: &mut HashSet, transitions: &mut HashMap<(u32, char), Vec>, ) { + dbg!(&states); let mut start_states = HashSet::new(); let amount_states = states.len() as u32; @@ -164,10 +218,11 @@ fn fill_sets( for i in 0..group.len() { let state = group[i]; - if let Some((_, symbol_type, _)) = states.get(&state) { - if symbol_type == &SymbolType::KleeneStar && i + 1 < group.len() { - start_states.insert(group[i + 1]); - } + if let Some((_, symbol_type, _)) = states.get(&state) + && symbol_type == &SymbolType::KleeneStar + && i + 1 < group.len() + { + 
start_states.insert(group[i + 1]); } } } @@ -185,8 +240,6 @@ fn fill_sets( .entry((*state_id, *symbol)) .or_default() .push(next_state); - } else { - accepting_states.insert(*state_id); } } SymbolType::KleeneStar => { @@ -202,8 +255,6 @@ fn fill_sets( .or_default() .push(*next_state); } - } else { - accepting_states.insert(*state_id); } } } @@ -541,13 +592,35 @@ mod tests { assert_eq!(result, expected, "Mismatch in mixed regex test"); } + #[test] + fn test_compute_accepting_states_too_many_brackets() { + let regex = "a*b|(c|d)|ef"; + let accepting_states = compute_accepting_states(regex); + + assert_eq!(accepting_states, HashSet::from([1, 2, 3, 5])) + } + + #[test] + fn test_compute_accepting_states_escape_sequence() { + let regex = r"a\*b|cd\*|sdfe\|f"; + let accepting_states = compute_accepting_states(regex); + + assert_eq!(accepting_states, HashSet::from([3, 6, 12])) + } + + #[test] + fn test_compute_accepting_states_complex() { + let regex = "a*b*c|d*e"; + let accepting_states = compute_accepting_states(regex); + + assert_eq!(accepting_states, HashSet::from([2, 4])) + } + #[test] fn test_fill_sets_too_many_brackets() { let states = index_states("a*b|(c|d)|ef"); - let mut finite_states: HashSet = HashSet::new(); let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); - let expected_finite_set: HashSet = HashSet::from([1, 2, 3, 5]); let expected_transitions: HashMap<(u32, char), Vec> = HashMap::from([ ((6, 'a'), vec![0]), ((6, 'b'), vec![1]), @@ -558,19 +631,16 @@ mod tests { ((4, 'e'), vec![5]), ]); - fill_sets(states, &mut finite_states, &mut transitions); + fill_sets(states, &mut transitions); - assert_eq!(finite_states, expected_finite_set); assert_eq!(transitions, expected_transitions); } #[test] fn test_fill_sets_complex() { let states = index_states("a*b*c|d*e"); - let mut finite_states: HashSet = HashSet::new(); let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); - let expected_finite_set: HashSet = HashSet::from([2, 4]); let 
expected_transitions = HashMap::from([ ((5, 'a'), vec![0]), ((5, 'b'), vec![1]), @@ -582,9 +652,8 @@ mod tests { ((3, 'd'), vec![3, 4]), ]); - fill_sets(states, &mut finite_states, &mut transitions); + fill_sets(states, &mut transitions); - assert_eq!(finite_states, expected_finite_set); assert_eq!(transitions, expected_transitions); } diff --git a/src/lib.rs b/src/lib.rs index a57b628..a6adb31 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,9 @@ mod glushkov; mod thompson; trait Dfa { - fn new(regex: &str) -> Self; + fn new(regex: &str) -> Result + where + Self: std::marker::Sized; fn get_transitions(&self) -> &HashMap<(u32, char), u32>; fn get_accepting_states(&self) -> &HashSet; fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32>; @@ -259,12 +261,12 @@ pub struct Regex { } impl Regex { - pub fn new(pattern: &str, construction: ConstructionType) -> Self { + pub fn new(pattern: &str, construction: ConstructionType) -> Result { let dfa_type = match construction { - ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)), - ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)), + ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)?), + ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)?), }; - Regex { dfa: dfa_type } + Ok(Regex { dfa: dfa_type }) } /// Determines if the provided `text` is an exact match for the regex pattern. 
@@ -560,7 +562,7 @@ mod tests { #[test] fn is_match_test() { - let regex_object = Regex::new("a(a|b)*", ConstructionType::Thompson); + let regex_object = Regex::new("a(a|b)*", ConstructionType::Thompson).expect("Valid regex"); let success_strings = vec!["abababaaaababa", "a"]; for string in success_strings { @@ -575,7 +577,7 @@ mod tests { #[test] fn find_test() { - let regex_object = Regex::new("abc", ConstructionType::Thompson); + let regex_object = Regex::new("abc", ConstructionType::Thompson).expect("Valid regex"); let test_cases = vec![ ("abcd", Some("abc")), ("xyzabc", Some("abc")), @@ -593,7 +595,7 @@ mod tests { #[test] fn find_all_test() { - let regex_object = Regex::new("abc*", ConstructionType::Thompson); + let regex_object = Regex::new("abc*", ConstructionType::Thompson).expect("Valid regex"); let test_cases = vec![ ("abcd", vec!["abc"]), ("ac", vec![]), diff --git a/src/thompson.rs b/src/thompson.rs index dcf54d7..a74b138 100644 --- a/src/thompson.rs +++ b/src/thompson.rs @@ -12,16 +12,16 @@ pub struct ThompsonDfa { } impl Dfa for ThompsonDfa { - fn new(regex: &str) -> Self { + fn new(regex: &str) -> Result { if !is_valid_regex(regex) { - panic!("{regex} is not a valid regular expression!"); + return Err("{regex} is not a valid regular expression!".to_string()); } let normalised_regex = normalise_regex(regex); let regex_nfa: Nfa = thompson_construction(&normalised_regex); let mut regex_dfa = nfa_to_dfa(®ex_nfa); ::optimise_dfa(&mut regex_dfa); - regex_dfa + Ok(regex_dfa) } fn get_transitions(&self) -> &HashMap<(u32, char), u32> { @@ -141,10 +141,10 @@ fn thompson_construction(normalised_regex: &str) -> Nfa { } // Handle case where regex ends with '|' (empty right operand) - if let Some(&'|') = operators.last() { - if nfa_stack.len() < 2 { - nfa_stack.push(create_basic_epsilon_nfa()); - } + if let Some(&'|') = operators.last() + && nfa_stack.len() < 2 + { + nfa_stack.push(create_basic_epsilon_nfa()); } // Process remaining operators @@ -372,14 +372,14 
@@ mod tests { #[test] fn create_dfa_test() { - let generated_dfa = ThompsonDfa::new("(a|b)*"); + let generated_dfa = ThompsonDfa::new("(a|b)*").expect("Valid dfa"); let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 0)]); let expected_accepting_states = HashSet::from([0]); assert_eq!(expected_transitions, generated_dfa.transitions); assert_eq!(expected_accepting_states, generated_dfa.accepting_states); - let generated_dfa_2 = ThompsonDfa::new("a|()"); + let generated_dfa_2 = ThompsonDfa::new("a|()").expect("Valid dfa"); let expected_transitions_2 = HashMap::from([((0, 'a'), 1)]); let expected_accepting_states_2 = HashSet::from([0, 1]); @@ -389,7 +389,7 @@ mod tests { generated_dfa_2.accepting_states ); - let generated_dfa = ThompsonDfa::new("a*b"); + let generated_dfa = ThompsonDfa::new("a*b").expect("Valid dfa"); let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 1)]); let expected_accepting_states = HashSet::from([1]); @@ -399,7 +399,7 @@ mod tests { #[test] fn prozess_regex_test() { - let generated_dfa = ThompsonDfa::new("(a|b)*"); + let generated_dfa = ThompsonDfa::new("(a|b)*").expect("Valid dfa"); let test_strings = vec!["abbbababaaaa", ""]; for string in test_strings { assert!(generated_dfa.process(string)); diff --git a/tests/glushkov_test.rs b/tests/glushkov_test.rs index 20c5306..545218e 100644 --- a/tests/glushkov_test.rs +++ b/tests/glushkov_test.rs @@ -6,7 +6,7 @@ fn test_all_bench_cases() { let cases = get_bench_cases(); for case in &cases { - let regex = Regex::new(case.regex, ConstructionType::Glushkov); + let regex = Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex"); assert_eq!(regex.is_match(&case.input), case.expected_is_match); assert_eq!( regex.find(&case.input), diff --git a/tests/thompson_test.rs b/tests/thompson_test.rs index d2cce3a..5532768 100644 --- a/tests/thompson_test.rs +++ b/tests/thompson_test.rs @@ -6,7 +6,7 @@ fn test_all_bench_cases() { let cases = get_bench_cases(); for 
case in &cases { - let regex = Regex::new(case.regex, ConstructionType::Thompson); + let regex = Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex"); assert_eq!(regex.is_match(&case.input), case.expected_is_match); assert_eq!( From 7a8c7eb84daf924231c3410e23b0f8a95a3745c3 Mon Sep 17 00:00:00 2001 From: Pepe Hanisch <142326461+Testspieler09@users.noreply.github.com> Date: Sun, 24 Aug 2025 20:44:58 +0200 Subject: [PATCH 7/8] feat: glushkov construction --- src/glushkov.rs | 1067 ++++++++++++++++++----------------------------- src/lib.rs | 4 +- 2 files changed, 415 insertions(+), 656 deletions(-) diff --git a/src/glushkov.rs b/src/glushkov.rs index b0360dc..85b3093 100644 --- a/src/glushkov.rs +++ b/src/glushkov.rs @@ -1,11 +1,12 @@ use crate::{Dfa, is_valid_regex, normalise_regex}; -use std::collections::{HashMap, HashSet, VecDeque}; - -#[derive(Clone, Debug, PartialEq)] -enum SymbolType { - Normal, - KleeneStar, - Escaped, +use std::collections::{BTreeSet, HashMap, HashSet, VecDeque}; + +#[derive(Debug, Clone)] +enum RegexAst { + Char(char), + Concat(Vec), + Alternation(Vec), + KleeneStar(Box), } #[derive(Debug)] @@ -23,16 +24,15 @@ pub struct GlushkovDfa { impl Dfa for GlushkovDfa { fn new(regex: &str) -> Result { if !is_valid_regex(regex) { - return Err("{regex} is not a valid regular expression!".to_string()); + return Err(format!("{regex} is not a valid regular expression!")); } let normalised_regex = normalise_regex(regex); - let regex_nfa = glushkov_construction(&normalised_regex); - dbg!(®ex_nfa); - let mut regex_dfa = nfa_no_epsilon_to_dfa(®ex_nfa); - // dbg!(®ex_dfa); + let ast = parse_regex(&normalised_regex)?; + let nfa = glushkov_construction(ast)?; + let mut regex_dfa = nfa_to_dfa(nfa); + ::optimise_dfa(&mut regex_dfa); - // dbg!(®ex_dfa); Ok(regex_dfa) } @@ -53,743 +53,502 @@ impl Dfa for GlushkovDfa { } } -// GLUSHKOV CONSTRUCTION -fn glushkov_construction(regex: &str) -> Nfa { - let mut transitions: HashMap<(u32, char), Vec> = 
HashMap::new(); - let accepting_states: HashSet = compute_accepting_states(regex); +// Parser for regex string to AST +fn parse_regex(regex: &str) -> Result { + let chars: Vec = regex.chars().collect(); + let (ast, pos) = parse_alternation(&chars, 0)?; - let states: HashMap = index_states(regex); + if pos != chars.len() { + return Err("Unexpected characters at end of regex".to_string()); + } - fill_sets(states, &mut transitions); + Ok(ast) +} - Nfa { - transitions, - accepting_states, +fn parse_alternation(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> { + let mut alternatives = Vec::new(); + + let (first_alt, new_pos) = parse_concatenation(chars, pos)?; + alternatives.push(first_alt); + pos = new_pos; + + while pos < chars.len() && chars[pos] == '|' { + pos += 1; // skip '|' + let (alt, new_pos) = parse_concatenation(chars, pos)?; + alternatives.push(alt); + pos = new_pos; + } + + if alternatives.len() == 1 { + Ok((alternatives.into_iter().next().unwrap(), pos)) + } else { + Ok((RegexAst::Alternation(alternatives), pos)) } } -fn compute_accepting_states(regex: &str) -> HashSet { - // also need handling of escape sequence?! 
- let mut accepting_states = HashSet::new(); - let mut number_of_accepting_states_in_group = 0; - let mut group_is_exhausted = false; - let mut last_element_was_seperator = false; - let num_unions = regex.chars().filter(|&c| matches!(c, '|')); - let mut position: u32 = regex - .chars() - .filter(|&c| !matches!(c, '(' | ')' | '|' | '*')) - .count() as u32; - - dbg!(regex); - - // check if after none ) a | is present - for ch in regex.chars().rev() { - match ch { - ')' => { - if !last_element_was_seperator { - group_is_exhausted = number_of_accepting_states_in_group != 0; - } - last_element_was_seperator = true; - } - '|' => { - group_is_exhausted = false; - last_element_was_seperator = true; +fn parse_concatenation(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> { + let mut elements = Vec::new(); + + while pos < chars.len() && chars[pos] != '|' && chars[pos] != ')' { + let (element, new_pos) = parse_factor(chars, pos)?; + elements.push(element); + pos = new_pos; + } + + // Handle empty concatenation (empty alternative) + if elements.is_empty() { + // Return an epsilon (empty string) represented as an empty concatenation + return Ok((RegexAst::Concat(vec![]), pos)); + } + + if elements.len() == 1 { + Ok((elements.into_iter().next().unwrap(), pos)) + } else { + Ok((RegexAst::Concat(elements), pos)) + } +} + +fn parse_factor(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> { + if pos >= chars.len() { + return Err("Unexpected end of regex".to_string()); + } + + let (base, new_pos) = match chars[pos] { + '(' => { + pos += 1; // skip '(' + let (inner, inner_pos) = parse_alternation(chars, pos)?; + if inner_pos >= chars.len() || chars[inner_pos] != ')' { + return Err("Unmatched opening parenthesis".to_string()); } - '(' => group_is_exhausted = true, - '*' => { - // Should account for ba* -> b and a are accepting - last_element_was_seperator = false; + (inner, inner_pos + 1) // skip ')' + } + '\\' => { + if pos + 1 >= chars.len() { + 
return Err("Invalid escape sequence".to_string()); } - _ => { - if position != 0 { - position -= 1; - } else { - break; - } + pos += 1; // skip '\' + (RegexAst::Char(chars[pos]), pos + 1) + } + c if c.is_ascii() && !"()|*+\\".contains(c) => (RegexAst::Char(c), pos + 1), + _ => { + return Err(format!("Unexpected character: {}", chars[pos])); + } + }; - if group_is_exhausted { - continue; - } - dbg!(&ch, &position); - accepting_states.insert(position); - number_of_accepting_states_in_group += 1; - group_is_exhausted = true; + pos = new_pos; + + // Check for Kleene star + if pos < chars.len() && chars[pos] == '*' { + pos += 1; + Ok((RegexAst::KleeneStar(Box::new(base)), pos)) + } else { + Ok((base, pos)) + } +} + +fn glushkov_construction(ast: RegexAst) -> Result { + let mut state_counter = 0u32; + let mut state_to_char: HashMap = HashMap::new(); + + // Assign unique state numbers to each character occurrence + assign_positions(&ast, &mut state_counter, &mut state_to_char); + + let start_state = state_counter; + + // Compute First, Last, Follow sets - each with fresh position counter + let first_set = first(&ast); + let last_set = last(&ast); + let follow_map = follow(&ast); + + // Build NFA + let mut transitions = HashMap::new(); + let mut accepting_states = HashSet::new(); + + // Transitions from start state + for &state in &first_set { + if let Some(&ch) = state_to_char.get(&state) { + transitions + .entry((start_state, ch)) + .or_insert_with(Vec::new) + .push(state); + } + } + + // Internal transitions based on follow sets + for (state, follow_states) in follow_map { + for &follow_state in &follow_states { + if let Some(&ch) = state_to_char.get(&follow_state) { + transitions + .entry((state, ch)) + .or_insert_with(Vec::new) + .push(follow_state); } } } - accepting_states + // Accepting states + if nullable(&ast) { + accepting_states.insert(start_state); + } + for &state in &last_set { + accepting_states.insert(state); + } + + Ok(Nfa { + transitions, + 
accepting_states, + }) +} + +fn first(ast: &RegexAst) -> HashSet { + let mut positions = HashMap::new(); + let mut counter = 0; + map_ast_to_positions(ast, &mut counter, &mut positions); + first_positions(ast, &positions) } -fn index_states(regex: &str) -> HashMap { - let mut indexed_states: HashMap = HashMap::new(); - let mut symbol_type: SymbolType = SymbolType::Normal; - let mut group_stack: Vec = vec![0]; - let mut idx: u32 = 0; - let mut next_group_id: u32 = 1; - let mut chars = regex.chars().peekable(); - - while let Some(symbol) = chars.next() { - if symbol_type == SymbolType::Escaped { - indexed_states.entry(idx).or_insert(( - symbol, - symbol_type.clone(), - *group_stack.last().unwrap(), - )); - idx += 1; - symbol_type = SymbolType::Normal; - continue; - } +fn last(ast: &RegexAst) -> HashSet { + let mut positions = HashMap::new(); + let mut counter = 0; + map_ast_to_positions(ast, &mut counter, &mut positions); + last_positions(ast, &positions) +} - match symbol { - '|' => { - // Start a new group for the next alternative - let new_group_id = next_group_id; - next_group_id += 1; - // Replace the current group on the stack with the new one - if let Some(last) = group_stack.last_mut() { - *last = new_group_id; - } +fn follow(ast: &RegexAst) -> HashMap> { + let mut positions = HashMap::new(); + let mut counter = 0; + map_ast_to_positions(ast, &mut counter, &mut positions); + + let mut result = HashMap::new(); + follow_positions(ast, &positions, &mut result); + result +} + +// Helper function to create a mapping from AST nodes to their position ranges +fn map_ast_to_positions( + ast: &RegexAst, + counter: &mut u32, + positions: &mut HashMap<*const RegexAst, (u32, u32)>, +) { + let start_pos = *counter; + + match ast { + RegexAst::Char(_) => { + *counter += 1; + } + RegexAst::Concat(elements) => { + for element in elements { + map_ast_to_positions(element, counter, positions); } - '(' => { - // Push the next group ID onto the stack for this grouping level - let 
new_group_id = next_group_id; - next_group_id += 1; - group_stack.push(new_group_id); + } + RegexAst::Alternation(alternatives) => { + for alt in alternatives { + map_ast_to_positions(alt, counter, positions); } - ')' => { - // Pop the current group and return to parent group - group_stack.pop(); + } + RegexAst::KleeneStar(inner) => { + map_ast_to_positions(inner, counter, positions); + } + } + + positions.insert(ast as *const RegexAst, (start_pos, *counter)); +} + +fn first_positions( + ast: &RegexAst, + positions: &HashMap<*const RegexAst, (u32, u32)>, +) -> HashSet { + match ast { + RegexAst::Char(_) => { + let (start_pos, _) = positions[&(ast as *const RegexAst)]; + let mut result = HashSet::new(); + result.insert(start_pos); + result + } + RegexAst::Concat(elements) => { + let mut result = HashSet::new(); + for element in elements { + result.extend(first_positions(element, positions)); + if !nullable(element) { + break; + } } - '*' => { - symbol_type = SymbolType::Normal; - continue; + result + } + RegexAst::Alternation(alternatives) => { + let mut result = HashSet::new(); + for alt in alternatives { + result.extend(first_positions(alt, positions)); } - '\\' => symbol_type = SymbolType::Escaped, - _ => { - if let Some(next_symbol) = chars.peek() - && matches!(*next_symbol, '*') - { - symbol_type = SymbolType::KleeneStar + result + } + RegexAst::KleeneStar(inner) => first_positions(inner, positions), + } +} + +fn last_positions( + ast: &RegexAst, + positions: &HashMap<*const RegexAst, (u32, u32)>, +) -> HashSet { + match ast { + RegexAst::Char(_) => { + let (start_pos, _) = positions[&(ast as *const RegexAst)]; + let mut result = HashSet::new(); + result.insert(start_pos); + result + } + RegexAst::Concat(elements) => { + let mut result = HashSet::new(); + for element in elements.iter().rev() { + result.extend(last_positions(element, positions)); + if !nullable(element) { + break; } - indexed_states.entry(idx).or_insert(( - symbol, - symbol_type.clone(), - 
*group_stack.last().unwrap(), - )); - idx += 1; } + result + } + RegexAst::Alternation(alternatives) => { + let mut result = HashSet::new(); + for alt in alternatives { + result.extend(last_positions(alt, positions)); + } + result } + RegexAst::KleeneStar(inner) => last_positions(inner, positions), } - indexed_states } -// TODO: remove the unused param later -fn fill_sets( - states: HashMap, - transitions: &mut HashMap<(u32, char), Vec>, +fn follow_positions( + ast: &RegexAst, + positions: &HashMap<*const RegexAst, (u32, u32)>, + result: &mut HashMap>, ) { - dbg!(&states); - let mut start_states = HashSet::new(); + match ast { + RegexAst::Char(_) => { + // Base case - no follow computation needed + } + RegexAst::Concat(elements) => { + // Process each element recursively + for element in elements { + follow_positions(element, positions, result); + } - let amount_states = states.len() as u32; - if amount_states == 0 { - return; - } + // Add follow relationships between consecutive elements + for i in 0..elements.len() { + let last_i = last_positions(&elements[i], positions); - // Group states by their group index - let mut groups: HashMap> = HashMap::new(); - for (state_id, (_, _, group_idx)) in &states { - groups.entry(*group_idx).or_default().push(*state_id); - } + // For each subsequent element j > i + for j in (i + 1)..elements.len() { + // Check if all elements between i and j are nullable + let all_between_nullable = elements[(i + 1)..j].iter().all(nullable); - // Sort states within each group - for group in groups.values_mut() { - group.sort(); - } + if j == i + 1 || all_between_nullable { + let first_j = first_positions(&elements[j], positions); + + // Add follow relationships from last(i) to first(j) + for &last_state in &last_i { + result.entry(last_state).or_default().extend(&first_j); + } + } - // Determine start states (first state of each group) - for group in groups.values() { - if group.is_empty() { - continue; + // If element j is not nullable, we 
can't skip further + if !nullable(&elements[j]) { + break; + } + } + } + } + RegexAst::Alternation(alternatives) => { + for alt in alternatives { + follow_positions(alt, positions, result); + } } + RegexAst::KleeneStar(inner) => { + follow_positions(inner, positions, result); - start_states.insert(group[0]); + // Kleene star: last positions can loop back to first positions + let inner_last = last_positions(inner, positions); + let inner_first = first_positions(inner, positions); - for i in 0..group.len() { - let state = group[i]; - if let Some((_, symbol_type, _)) = states.get(&state) - && symbol_type == &SymbolType::KleeneStar - && i + 1 < group.len() - { - start_states.insert(group[i + 1]); + for &last_state in &inner_last { + result.entry(last_state).or_default().extend(&inner_first); } } } +} - // Build transitions and determine accepting states - for (state_id, (symbol, symbol_type, group_idx)) in &states { - let current_group = &groups[group_idx]; - let pos_in_group = current_group.iter().position(|&x| x == *state_id).unwrap(); +fn nullable(ast: &RegexAst) -> bool { + match ast { + RegexAst::Char(_) => false, + RegexAst::Concat(elements) => { + // Empty concat is nullable (represents epsilon) + elements.is_empty() || elements.iter().all(nullable) + } + RegexAst::Alternation(alternatives) => alternatives.iter().any(nullable), + RegexAst::KleeneStar(_) => true, + } +} - match symbol_type { - SymbolType::Normal | SymbolType::Escaped => { - if pos_in_group + 1 < current_group.len() { - let next_state = current_group[pos_in_group + 1]; - transitions - .entry((*state_id, *symbol)) - .or_default() - .push(next_state); - } +fn assign_positions(ast: &RegexAst, counter: &mut u32, state_to_char: &mut HashMap) { + match ast { + RegexAst::Char(ch) => { + let state = *counter; + *counter += 1; + state_to_char.insert(state, *ch); + } + RegexAst::Concat(elements) => { + for element in elements { + assign_positions(element, counter, state_to_char); } - SymbolType::KleeneStar 
=> { - transitions - .entry((*state_id, *symbol)) - .or_default() - .push(*state_id); - - if pos_in_group + 1 < current_group.len() { - for next_state in current_group.iter().skip(pos_in_group + 1) { - transitions - .entry((*state_id, *symbol)) - .or_default() - .push(*next_state); - } - } + } + RegexAst::Alternation(alternatives) => { + for alt in alternatives { + assign_positions(alt, counter, state_to_char); } } - } - - // Setup virtual (start-)state - let virtual_start = states.keys().max().copied().unwrap_or(0) + 1; - - let symbol_to_first_state: Vec<(u32, char)> = start_states - .iter() - .map(|&s| (s, states.get(&s).expect("Expected an entry").0)) - .collect(); - - for (first_state, symbol) in symbol_to_first_state { - transitions - .entry((virtual_start, symbol)) - .or_default() - .push(first_state); + RegexAst::KleeneStar(inner) => { + assign_positions(inner, counter, state_to_char); + } } } -// END GLUSHKOV CONSTRUCTION -fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa { +fn nfa_to_dfa(nfa: Nfa) -> GlushkovDfa { let mut dfa_transitions = HashMap::new(); let mut dfa_accepting_states = HashSet::new(); + let mut state_sets_to_dfa_state: HashMap, u32> = HashMap::new(); + let mut queue = VecDeque::new(); + let mut next_dfa_state = 0u32; - // Map from sorted vector of NFA states to DFA state ID (for hashable key) - let mut nfa_states_to_dfa_state: HashMap, u32> = HashMap::new(); - let mut next_dfa_state_id = 0u32; - let mut work_queue = VecDeque::new(); - - // Helper function to convert HashSet to sorted Vec for use as HashMap key - let set_to_sorted_vec = |set: &HashSet| -> Vec { - let mut vec: Vec = set.iter().cloned().collect(); - vec.sort_unstable(); - vec - }; - - // Get all possible input symbols from NFA transitions - let alphabet: HashSet = nfa.transitions.keys().map(|(_, symbol)| *symbol).collect(); + // Get alphabet from NFA + let alphabet: HashSet = nfa.transitions.keys().map(|(_, ch)| *ch).collect(); - // Find all states that exist in the NFA + // 
Find start state (highest numbered state in NFA) let mut all_nfa_states = HashSet::new(); - for &(state, _) in nfa.transitions.keys() { - all_nfa_states.insert(state); + + for &(from_state, _) in nfa.transitions.keys() { + all_nfa_states.insert(from_state); } for target_states in nfa.transitions.values() { - for &state in target_states { - all_nfa_states.insert(state); + for &to_state in target_states { + all_nfa_states.insert(to_state); } } - for &state in &nfa.accepting_states { - all_nfa_states.insert(state); + for &accepting_state in &nfa.accepting_states { + all_nfa_states.insert(accepting_state); } - // In a Glushkov NFA, state 0 is always the start state - let start_state = 0; + let start_state = all_nfa_states.iter().max().copied().unwrap_or(0); - // Verify that state 0 exists in the NFA - if !all_nfa_states.contains(&start_state) { - panic!("Expected start state 0 not found in NFA states: {all_nfa_states:?}"); - } - - let start_state_set = { - let mut set = HashSet::new(); + let start_set: BTreeSet = { + let mut set = BTreeSet::new(); set.insert(start_state); set }; - // Create initial DFA state - let start_dfa_state = next_dfa_state_id; - next_dfa_state_id += 1; - - let start_state_key = set_to_sorted_vec(&start_state_set); - nfa_states_to_dfa_state.insert(start_state_key, start_dfa_state); - work_queue.push_back(start_state_set); + state_sets_to_dfa_state.insert(start_set.clone(), next_dfa_state); + queue.push_back(start_set); + next_dfa_state += 1; - // Process each DFA state - while let Some(current_nfa_states) = work_queue.pop_front() { - let current_state_key = set_to_sorted_vec(¤t_nfa_states); - let current_dfa_state = nfa_states_to_dfa_state[¤t_state_key]; + while let Some(current_set) = queue.pop_front() { + let current_dfa_state = state_sets_to_dfa_state[¤t_set]; // Check if this DFA state should be accepting - if current_nfa_states + if current_set .iter() - .any(|&state| nfa.accepting_states.contains(&state)) + .any(|&s| 
nfa.accepting_states.contains(&s)) { dfa_accepting_states.insert(current_dfa_state); } - // For each symbol in the alphabet + // For each symbol in alphabet for &symbol in &alphabet { - let mut next_nfa_states = HashSet::new(); + let mut next_set = BTreeSet::new(); - // Collect all states reachable from current_nfa_states via symbol - for &nfa_state in ¤t_nfa_states { - if let Some(target_states) = nfa.transitions.get(&(nfa_state, symbol)) { - for &target_state in target_states { - next_nfa_states.insert(target_state); - } + // Collect all states reachable via this symbol + for &state in ¤t_set { + if let Some(targets) = nfa.transitions.get(&(state, symbol)) { + next_set.extend(targets); } } - // Skip if no transitions exist for this symbol - if next_nfa_states.is_empty() { - continue; - } - - // Get or create DFA state for this set of NFA states - let next_state_key = set_to_sorted_vec(&next_nfa_states); - let next_dfa_state = - if let Some(&existing_state) = nfa_states_to_dfa_state.get(&next_state_key) { - existing_state + if !next_set.is_empty() { + let next_dfa_state = if let Some(&existing) = state_sets_to_dfa_state.get(&next_set) + { + existing } else { - let new_state = next_dfa_state_id; - next_dfa_state_id += 1; - - nfa_states_to_dfa_state.insert(next_state_key.clone(), new_state); - work_queue.push_back(next_nfa_states); - + let new_state = next_dfa_state; + next_dfa_state += 1; + state_sets_to_dfa_state.insert(next_set.clone(), new_state); + queue.push_back(next_set.clone()); new_state }; - // Add transition to DFA - dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state); + dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state); + } } } - GlushkovDfa { - transitions: dfa_transitions, - accepting_states: dfa_accepting_states, - } + // Normalize to start from state 0 + normalize_dfa_states(dfa_transitions, dfa_accepting_states) } -// fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa { -// let mut dfa_transitions = HashMap::new(); 
-// let mut dfa_accepting_states = HashSet::new(); -// -// // Map from DFA state ID to the set of NFA states it represents -// let mut dfa_state_to_nfa_states: HashMap> = HashMap::new(); -// // Map from sorted vector of NFA states to DFA state ID (for hashable key) -// let mut nfa_states_to_dfa_state: HashMap, u32> = HashMap::new(); -// -// let mut next_dfa_state_id = 0u32; -// let mut work_queue = VecDeque::new(); -// -// // Helper function to convert HashSet to sorted Vec for use as HashMap key -// let set_to_sorted_vec = |set: &HashSet| -> Vec { -// let mut vec: Vec = set.iter().cloned().collect(); -// vec.sort_unstable(); -// vec -// }; -// -// // Get all possible input symbols from NFA transitions -// let alphabet: HashSet = nfa.transitions.keys().map(|(_, symbol)| *symbol).collect(); -// -// // Find the start state (assuming state 0 is the start state) -// let start_state_set = { -// let mut set = HashSet::new(); -// set.insert(0u32); -// set -// }; -// -// // Create initial DFA state -// let start_dfa_state = next_dfa_state_id; -// next_dfa_state_id += 1; -// -// let start_state_key = set_to_sorted_vec(&start_state_set); -// dfa_state_to_nfa_states.insert(start_dfa_state, start_state_set.clone()); -// nfa_states_to_dfa_state.insert(start_state_key, start_dfa_state); -// work_queue.push_back(start_state_set); -// -// // Process each DFA state -// while let Some(current_nfa_states) = work_queue.pop_front() { -// let current_state_key = set_to_sorted_vec(¤t_nfa_states); -// let current_dfa_state = nfa_states_to_dfa_state[¤t_state_key]; -// -// // Check if this DFA state should be accepting -// if current_nfa_states -// .iter() -// .any(|&state| nfa.accepting_states.contains(&state)) -// { -// dfa_accepting_states.insert(current_dfa_state); -// } -// -// // For each symbol in the alphabet -// for &symbol in &alphabet { -// let mut next_nfa_states = HashSet::new(); -// -// // Collect all states reachable from current_nfa_states via symbol -// for &nfa_state in 
¤t_nfa_states { -// if let Some(target_states) = nfa.transitions.get(&(nfa_state, symbol)) { -// for &target_state in target_states { -// next_nfa_states.insert(target_state); -// } -// } -// } -// -// // Skip if no transitions exist for this symbol -// if next_nfa_states.is_empty() { -// continue; -// } -// -// // Get or create DFA state for this set of NFA states -// let next_state_key = set_to_sorted_vec(&next_nfa_states); -// let next_dfa_state = -// if let Some(&existing_state) = nfa_states_to_dfa_state.get(&next_state_key) { -// existing_state -// } else { -// let new_state = next_dfa_state_id; -// next_dfa_state_id += 1; -// -// dfa_state_to_nfa_states.insert(new_state, next_nfa_states.clone()); -// nfa_states_to_dfa_state.insert(next_state_key, new_state); -// work_queue.push_back(next_nfa_states); -// -// new_state -// }; -// -// // Add transition to DFA -// dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state); -// } -// } -// -// GlushkovDfa { -// transitions: dfa_transitions, -// accepting_states: dfa_accepting_states, -// } -// } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_index_single_character() { - let expected = HashMap::from([(0, ('a', SymbolType::Normal, 0))]); - - let result = index_states("a"); - assert_eq!(result, expected, "Mismatch in single character test"); - } - - #[test] - fn test_nfa_single_character() { - let expected_finite = HashSet::from([0]); - let expected_transitions: HashMap<(u32, char), Vec> = - HashMap::from([((1, 'a'), vec![0])]); - - let result = glushkov_construction("a"); - assert_eq!( - result.transitions, expected_transitions, - "Mismatch in single character test" - ); - assert_eq!( - result.accepting_states, expected_finite, - "Mismatch in single character test" - ); - } - - #[test] - fn test_nfa_single_character_kleene_star() { - let expected_finite = HashSet::from([0]); - let expected_transitions: HashMap<(u32, char), Vec> = - HashMap::from([((0, 'a'), vec![0]), ((1, 'a'), 
vec![0])]); - - let result = glushkov_construction("a*"); - assert_eq!( - result.transitions, expected_transitions, - "Mismatch in single character test" - ); - assert_eq!( - result.accepting_states, expected_finite, - "Mismatch in single character test" - ); - } - - #[test] - fn test_index_kleene_star() { - let expected = HashMap::from([(0, ('a', SymbolType::KleeneStar, 0))]); - - let result = index_states("a*"); - assert_eq!(result, expected, "Mismatch in kleene star test"); - } - - #[test] - fn test_index_union_and_groups() { - let expected = HashMap::from([ - (0, ('a', SymbolType::Normal, 1)), - (1, ('b', SymbolType::Normal, 2)), - ]); - - let result = index_states("(a|b)"); - assert_eq!(result, expected, "Mismatch in union and groups test"); - } - - #[test] - fn test_index_escaped_character() { - let expected = HashMap::from([(0, ('a', SymbolType::Escaped, 0))]); - - let result = index_states("\\a"); - assert_eq!(result, expected, "Mismatch in escaped character test"); - } - - #[test] - fn test_index_mixed_regex() { - let expected = HashMap::from([ - (0, ('a', SymbolType::Normal, 0)), - (1, ('*', SymbolType::Escaped, 0)), - (2, ('b', SymbolType::Normal, 0)), - (3, ('c', SymbolType::KleeneStar, 0)), - (4, ('d', SymbolType::Normal, 0)), - (5, ('e', SymbolType::Normal, 1)), - (6, ('f', SymbolType::Normal, 2)), - (7, ('g', SymbolType::Normal, 4)), - (8, ('h', SymbolType::Normal, 5)), - (9, ('i', SymbolType::Normal, 0)), - ]); - - let result = index_states("a\\*bc*d(e|f|(g|h))i"); - assert_eq!(result, expected, "Mismatch in mixed regex test"); - } - - #[test] - fn test_index_too_many_brackets() { - let expected = HashMap::from([ - (0, ('a', SymbolType::KleeneStar, 0)), - (1, ('b', SymbolType::Normal, 0)), - (2, ('c', SymbolType::Normal, 3)), - (3, ('d', SymbolType::Normal, 4)), - (4, ('e', SymbolType::Normal, 5)), - (5, ('f', SymbolType::Normal, 5)), - ]); - - let result = index_states("a*b|((c|d))|ef"); - assert_eq!(result, expected, "Mismatch in mixed regex 
test"); - } - - #[test] - fn test_compute_accepting_states_too_many_brackets() { - let regex = "a*b|(c|d)|ef"; - let accepting_states = compute_accepting_states(regex); - - assert_eq!(accepting_states, HashSet::from([1, 2, 3, 5])) - } - - #[test] - fn test_compute_accepting_states_escape_sequence() { - let regex = r"a\*b|cd\*|sdfe\|f"; - let accepting_states = compute_accepting_states(regex); - - assert_eq!(accepting_states, HashSet::from([3, 6, 12])) - } - - #[test] - fn test_compute_accepting_states_complex() { - let regex = "a*b*c|d*e"; - let accepting_states = compute_accepting_states(regex); - - assert_eq!(accepting_states, HashSet::from([2, 4])) +fn normalize_dfa_states( + transitions: HashMap<(u32, char), u32>, + accepting_states: HashSet, +) -> GlushkovDfa { + if transitions.is_empty() && accepting_states.is_empty() { + return GlushkovDfa { + transitions, + accepting_states, + }; } - #[test] - fn test_fill_sets_too_many_brackets() { - let states = index_states("a*b|(c|d)|ef"); - let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); - - let expected_transitions: HashMap<(u32, char), Vec> = HashMap::from([ - ((6, 'a'), vec![0]), - ((6, 'b'), vec![1]), - ((6, 'c'), vec![2]), - ((6, 'd'), vec![3]), - ((6, 'e'), vec![4]), - ((0, 'a'), vec![0, 1]), - ((4, 'e'), vec![5]), - ]); - - fill_sets(states, &mut transitions); - - assert_eq!(transitions, expected_transitions); + // Find all states + let mut all_states = HashSet::new(); + for &(from, _) in transitions.keys() { + all_states.insert(from); } - - #[test] - fn test_fill_sets_complex() { - let states = index_states("a*b*c|d*e"); - let mut transitions: HashMap<(u32, char), Vec> = HashMap::new(); - - let expected_transitions = HashMap::from([ - ((5, 'a'), vec![0]), - ((5, 'b'), vec![1]), - ((5, 'c'), vec![2]), - ((5, 'd'), vec![3]), - ((5, 'e'), vec![4]), - ((0, 'a'), vec![0, 1, 2]), - ((1, 'b'), vec![1, 2]), - ((3, 'd'), vec![3, 4]), - ]); - - fill_sets(states, &mut transitions); - - 
assert_eq!(transitions, expected_transitions); + for &to in transitions.values() { + all_states.insert(to); } + all_states.extend(&accepting_states); - #[test] - fn nfa_to_dfa_simple_test() { - // NFA that accepts exactly "a" - // State 0 --a--> State 1 (accepting) - let input_nfa = Nfa { - transitions: HashMap::from([((0, 'a'), vec![1])]), - accepting_states: HashSet::from([1]), + if all_states.is_empty() { + return GlushkovDfa { + transitions, + accepting_states, }; - - let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); - - let expected_transitions = HashMap::from([((0, 'a'), 1)]); - let expected_accepting_states = HashSet::from([1]); - - assert_eq!(expected_transitions, generated_dfa.transitions); - assert_eq!(expected_accepting_states, generated_dfa.accepting_states); } - #[test] - fn nfa_to_dfa_sequence_test() { - // NFA that accepts exactly "ab" - // State 0 --a--> State 1 --b--> State 2 (accepting) - let input_nfa = Nfa { - transitions: HashMap::from([((0, 'a'), vec![1]), ((1, 'b'), vec![2])]), - accepting_states: HashSet::from([2]), - }; - - let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); + // Create mapping with 0 as start state + let start_state = *all_states.iter().min().unwrap(); + let mut state_mapping = HashMap::new(); + state_mapping.insert(start_state, 0); - let expected_transitions = HashMap::from([((0, 'a'), 1), ((1, 'b'), 2)]); - let expected_accepting_states = HashSet::from([2]); - - assert_eq!(expected_transitions, generated_dfa.transitions); - assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + let mut next_state = 1; + for &state in &all_states { + if state != start_state { + state_mapping.insert(state, next_state); + next_state += 1; + } } - #[test] - fn nfa_to_dfa_alternation_test() { - // NFA that accepts "a" or "b" - // State 0 --a--> State 1 (accepting) - // State 0 --b--> State 2 (accepting) - let input_nfa = Nfa { - transitions: HashMap::from([((0, 'a'), vec![1]), ((0, 'b'), vec![2])]), - 
accepting_states: HashSet::from([1, 2]), - }; - - let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); - - let expected_transitions = [ - HashMap::from([((0, 'a'), 1), ((0, 'b'), 2)]), - HashMap::from([((0, 'a'), 2), ((0, 'b'), 1)]), - ]; - let expected_accepting_states = HashSet::from([1, 2]); - - assert!( - generated_dfa.transitions == expected_transitions[0] - || generated_dfa.transitions == expected_transitions[1], - "generated_dfa.transitions did not match either expected set" - ); - assert_eq!(expected_accepting_states, generated_dfa.accepting_states); - } - - #[test] - fn nfa_to_dfa_nondeterministic_test() { - // NFA with nondeterministic transition - // State 0 --a--> State 1, State 2 - // State 1 --b--> State 3 (accepting) - // State 2 --c--> State 3 (accepting) - let input_nfa = Nfa { - transitions: HashMap::from([ - ((0, 'a'), vec![1, 2]), - ((1, 'b'), vec![3]), - ((2, 'c'), vec![3]), - ]), - accepting_states: HashSet::from([3]), - }; - - let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); - - // After 'a' from state 0, we should be in a state representing {1, 2} - // Let's call this combined state "1" in the DFA - let expected_transitions = HashMap::from([ - ((0, 'a'), 1), // {0} --a--> {1,2} (DFA state 1) - ((1, 'b'), 2), // {1,2} --b--> {3} (DFA state 2) - ((1, 'c'), 2), // {1,2} --c--> {3} (DFA state 2) - ]); - let expected_accepting_states = HashSet::from([2]); // DFA state 2 represents {3} - - assert_eq!(expected_transitions, generated_dfa.transitions); - assert_eq!(expected_accepting_states, generated_dfa.accepting_states); - } - - #[test] - fn nfa_to_dfa_multiple_accepting_test() { - // NFA where multiple paths lead to accepting states - // State 0 --a--> State 1 (accepting) - // State 0 --a--> State 2 --b--> State 3 (accepting) - let input_nfa = Nfa { - transitions: HashMap::from([((0, 'a'), vec![1, 2]), ((2, 'b'), vec![3])]), - accepting_states: HashSet::from([1, 3]), - }; - - let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); - - // 
After 'a' from state 0, we're in state representing {1, 2} - // This should be accepting because it contains state 1 - let expected_transitions = HashMap::from([ - ((0, 'a'), 1), // {0} --a--> {1,2} (DFA state 1) - ((1, 'b'), 2), // {1,2} --b--> {3} (DFA state 2) - ]); - let expected_accepting_states = HashSet::from([1, 2]); // Both DFA states are accepting - - assert_eq!(expected_transitions, generated_dfa.transitions); - assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + // Remap transitions + let mut new_transitions = HashMap::new(); + for ((from, symbol), to) in transitions { + let new_from = state_mapping[&from]; + let new_to = state_mapping[&to]; + new_transitions.insert((new_from, symbol), new_to); } - #[test] - fn nfa_to_dfa_self_loop_test() { - // NFA with self-loop: accepts a* - // State 0 (accepting) --a--> State 0 - let input_nfa = Nfa { - transitions: HashMap::from([((0, 'a'), vec![0])]), - accepting_states: HashSet::from([0]), - }; - - let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa); - - let expected_transitions = HashMap::from([ - ((0, 'a'), 0), // Self-loop - ]); - let expected_accepting_states = HashSet::from([0]); + // Remap accepting states + let mut new_accepting_states = HashSet::new(); + for state in accepting_states { + new_accepting_states.insert(state_mapping[&state]); + } - assert_eq!(expected_transitions, generated_dfa.transitions); - assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + GlushkovDfa { + transitions: new_transitions, + accepting_states: new_accepting_states, } } diff --git a/src/lib.rs b/src/lib.rs index a6adb31..482ee9a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -288,7 +288,7 @@ impl Regex { /// ```rust /// use regex_engine::{Regex, ConstructionType}; /// - /// let regex = Regex::new("(a|b)*", ConstructionType::Thompson); + /// let regex = Regex::new("(a|b)*", ConstructionType::Thompson).expect("Valid regex"); /// assert!(regex.is_match("abba")); ///
assert!(!regex.is_match("abc")); /// ``` @@ -318,7 +318,7 @@ impl Regex { /// ```rust /// use regex_engine::{Regex, ConstructionType}; /// - /// let regex = Regex::new("ab+", ConstructionType::Thompson); + /// let regex = Regex::new("ab+", ConstructionType::Thompson).expect("Valid regex"); /// if let Some(matched) = regex.find("aabbcc") { /// println!("Found: {}", matched); /// } From 13e48c31228b5273ec04ad49d020caaf28e1ca7c Mon Sep 17 00:00:00 2001 From: Pepe Hanisch <142326461+Testspieler09@users.noreply.github.com> Date: Sun, 24 Aug 2025 20:59:22 +0200 Subject: [PATCH 8/8] feat: github pages for html report --- .github/workflows/benchmark.yml | 37 +++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/benchmark.yml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..7a9335e --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,37 @@ +name: Benchmark + +on: + push: + branches: [main] + +# Add permissions for Pages +permissions: + contents: read + pages: write + id-token: write + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + + - name: Run benchmarks + run: cargo bench + + - name: Setup Pages + uses: actions/configure-pages@v3 + + - name: Upload artifact + uses: actions/upload-pages-artifact@v2 + with: + path: './target/criterion' + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2