From b5952cfbb06ed7f3e838fca220234dfa0010f00b Mon Sep 17 00:00:00 2001
From: Testspieler09 <pepehanisch06@gmail.com>
Date: Sun, 4 May 2025 15:15:51 +0200
Subject: [PATCH 1/8] feat(glushkov): adjust base project to the new
 construction

---
 src/glushkov.rs             |  50 +++++++
 src/lib.rs                  |   3 +-
 src/regex_engine.rs         | 257 ++++++++++++++++++++++++++++++++++--
 src/{dfa.rs => thompson.rs} | 228 +-------------------------------
 tests/test_one.rs           |   8 +-
 5 files changed, 302 insertions(+), 244 deletions(-)
 create mode 100644 src/glushkov.rs
 rename src/{dfa.rs => thompson.rs} (78%)
diff --git a/src/glushkov.rs b/src/glushkov.rs
new file mode 100644
index 0000000..d6d7342
--- /dev/null
+++ b/src/glushkov.rs
@@ -0,0 +1,50 @@
+use crate::regex_engine::{is_valid_regex, normalise_regex};
+use std::collections::{HashMap, HashSet};
+
+struct NFA {
+    transitions: HashMap<(u32, Option<char>), Vec<u32>>,
+    accepting_state: u32,
+}
+
+pub struct DFA {
+    transitions: HashMap<(u32, Option<char>), u32>,
+    accepting_states: HashSet<u32>,
+}
+
+// GLUSHKOV CONSTRUCTION
+fn glushkov_construction(regex: &str) -> NFA {
+    // TODO: Step 1 (rename letters / index them)
+    // TODO: Step 2a ()
+    // TODO: Step 2b ()
+    // TODO: Step 3 ()
+    // TODO: Step 4 ()
+    todo!()
+}
+
+fn nfa_no_epsilon_to_dfa() {
+    todo!()
+}
+// END GLUSHKOV CONSTRUCTION
+
+impl DFA {
+    pub fn new(regex: &str) -> Self {
+        if !is_valid_regex(regex) {
+            panic!("{} is not a valid regular expression!", regex);
+        }
+
+        let normalised_regex = normalise_regex(&regex);
+        todo!()
+    }
+
+    pub fn process(&self, input: &str) -> bool {
+        todo!()
+    }
+
+    pub fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> {
+        todo!()
+    }
+
+    pub fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> {
+        todo!()
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 244eee8..1c11465 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,2 +1,3 @@
-mod dfa;
+mod glushkov;
 pub mod regex_engine;
+mod thompson;
diff --git a/src/regex_engine.rs b/src/regex_engine.rs
index 2d528cf..77e15a5 100644
--- a/src/regex_engine.rs
+++ b/src/regex_engine.rs
@@ -1,36 +1,269 @@
-use crate::dfa::DFA;
+use crate::glushkov::DFA as GlushkovDFA;
+use crate::thompson::DFA as ThompsonDFA;
+
+pub enum ConstructionType {
+    Thompson,
+    Glushkov,
+}
+
+enum DFAType {
+    Thompson(ThompsonDFA),
+    Glushkov(GlushkovDFA),
+}
 
 pub struct Regex {
-    dfa: DFA,
+    dfa: DFAType,
 }
 
 impl Regex {
-    pub fn new(pattern: &str) -> Self {
-        Regex {
-            dfa: DFA::new(pattern),
-        }
+    pub fn new(pattern: &str, construction: ConstructionType) -> Self {
+        let dfa_type = match construction {
+            ConstructionType::Thompson => DFAType::Thompson(ThompsonDFA::new(pattern)),
+            ConstructionType::Glushkov => DFAType::Glushkov(GlushkovDFA::new(pattern)),
+        };
+        Regex { dfa: dfa_type }
     }
 
     pub fn is_match(&self, text: &str) -> bool {
-        self.dfa.process(text)
+        match &self.dfa {
+            DFAType::Thompson(dfa) => dfa.process(text),
+            DFAType::Glushkov(dfa) => dfa.process(text),
+        }
     }
 
     pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> {
-        self.dfa.find_first_match(text)
+        match &self.dfa {
+            DFAType::Thompson(dfa) => dfa.find_first_match(text),
+            DFAType::Glushkov(dfa) => dfa.find_first_match(text),
+        }
     }
 
     pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> {
-        self.dfa.find_all_matches(text)
+        match &self.dfa {
+            DFAType::Thompson(dfa) => dfa.find_all_matches(text),
+            DFAType::Glushkov(dfa) => dfa.find_all_matches(text),
+        }
+    }
+}
+
+pub fn is_valid_regex(regex: &str) -> bool {
+    if regex.is_empty() {
+        return false;
+    }
+
+    let mut open_paren_count = 0;
+    let mut last_was_quantifier = false;
+
+    let mut chars = regex.chars().peekable();
+    while let Some(c) = chars.next() {
+        match c {
+            '(' => {
+                open_paren_count += 1;
+                last_was_quantifier = false;
+            }
+
+            ')' => {
+                if open_paren_count == 0 {
+                    return false;
+                }
+                open_paren_count -= 1;
+                last_was_quantifier = false;
+            }
+
+            '*' | '+' => {
+                // Ensure quantifiers are not the first character and are not repeated
+                if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') {
+                    return false;
+                }
+                last_was_quantifier = true;
+            }
+
+            '|' => {
+                // Ensure alternation isn't the first or last character
+                if regex.starts_with('|') || chars.peek().is_none() {
+                    return false;
+                }
+                last_was_quantifier = false;
+            }
+
+            '\\' => {
+                // Handle escaped characters: ensure there's a character after the escape
+                if chars.peek().is_none() {
+                    return false;
+                }
+                chars.next(); // Skip the escaped character
+                last_was_quantifier = false;
+            }
+
+            _ => {
+                last_was_quantifier = false;
+            }
+        }
+    }
+
+    open_paren_count == 0
+}
+
+pub fn normalise_regex(regex: &str) -> String {
+    let mut normalised = String::new();
+    let mut escape_sequence = false;
+    let mut prev_char = '\0';
+
+    for curr_char in regex.chars() {
+        if escape_sequence {
+            // TODO: Implement further parsing features here (e.g. \w \d)
+            normalised.push(curr_char);
+            escape_sequence = false;
+            prev_char = curr_char;
+            continue;
+        }
+
+        if curr_char == '\\' {
+            escape_sequence = true;
+            normalised.push(curr_char);
+            continue;
+        }
+
+        if curr_char == '+' {
+            normalised.push(prev_char);
+            normalised.push('*');
+            prev_char = curr_char;
+            continue;
+        }
+        if curr_char == '?' {
+            match prev_char {
+                ')' => {
+                    let mut balance = 0;
+
+                    for j in (0..normalised.len()).rev() {
+                        let ch = normalised.chars().nth(j).unwrap();
+                        if ch == ')' {
+                            balance += 1;
+                        } else if ch == '(' {
+                            balance -= 1;
+                            if balance == 0 {
+                                normalised.insert(j, '(');
+                                break;
+                            }
+                        }
+                    }
+                }
+                _ => {
+                    normalised.insert(normalised.len() - 1, '(');
+                }
+            }
+            normalised.push_str("|())");
+            prev_char = curr_char;
+            continue;
+        }
+        if curr_char == '.' {
+            normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)");
+            prev_char = curr_char;
+            continue;
+        }
+
+        normalised.push(curr_char);
+        prev_char = curr_char;
     }
+
+    normalised
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
 
+    #[test]
+    fn valid_regex_basic_test() {
+        let regex = "(a|b)*";
+        assert!(is_valid_regex(regex), "Expected valid regex.");
+    }
+
+    #[test]
+    fn invalid_empty_regex_test() {
+        let regex = "";
+        assert!(!is_valid_regex(regex), "Expected invalid regex (empty).");
+    }
+
+    #[test]
+    fn invalid_unbalanced_parentheses_test() {
+        let regex1 = "(a|b";
+        let regex2 = "a|b)";
+        assert!(
+            !is_valid_regex(regex1),
+            "Expected invalid regex (unbalanced parentheses)."
+        );
+        assert!(
+            !is_valid_regex(regex2),
+            "Expected invalid regex (unbalanced parentheses)."
+        );
+    }
+
+    #[test]
+    fn invalid_operator_placement_test() {
+        let regex1 = "*a";
+        let regex2 = "|a|b";
+        assert!(
+            !is_valid_regex(regex1),
+            "Expected invalid regex (invalid quantifier placement)."
+        );
+        assert!(
+            !is_valid_regex(regex2),
+            "Expected invalid regex (invalid alternation placement)."
+        );
+    }
+
+    #[test]
+    fn valid_nested_parentheses_test() {
+        let regex = "((a|b)*c)";
+        assert!(
+            is_valid_regex(regex),
+            "Expected valid regex with nested parentheses."
+        );
+    }
+
+    #[test]
+    fn valid_escape_sequence_test() {
+        let regex = "a\\*b";
+        assert!(
+            is_valid_regex(regex),
+            "Expected valid regex with escape sequence."
+        );
+    }
+
+    #[test]
+    fn invalid_escape_sequence_test() {
+        let regex = "a\\";
+        assert!(
+            !is_valid_regex(regex),
+            "Expected invalid regex with unpaired escape."
+        );
+    }
+
+    #[test]
+    fn normalise_regex_test() {
+        let cases = [
+            (r"a+", r"aa*"),
+            (r"a\+", r"a\+"),
+            (r"a?", r"(a|())"),
+            (r"a\?", r"a\?"),
+            (r"(ab)?", r"((ab)|())"),
+            (r".", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"),
+        ];
+
+        for (input, expected) in cases {
+            let result = normalise_regex(input);
+            assert_eq!(
+                result, expected,
+                "Normalisation failed for input '{}'",
+                input
+            );
+        }
+    }
+
     #[test]
     fn is_match_test() {
-        let regex_object = Regex::new("(a|b)*");
+        let regex_object = Regex::new("(a|b)*", ConstructionType::Thompson);
 
         let success_strings = vec!["abababaaaababa", ""];
         for string in success_strings {
@@ -45,7 +278,7 @@ mod tests {
 
     #[test]
     fn find_test() {
-        let regex_object = Regex::new("abc");
+        let regex_object = Regex::new("abc", ConstructionType::Thompson);
         let test_cases = vec![
             ("abcd", Some("abc")),
             ("xyzabc", Some("abc")),
@@ -63,7 +296,7 @@ mod tests {
 
     #[test]
     fn find_all_test() {
-        let regex_object = Regex::new("abc*");
+        let regex_object = Regex::new("abc*", ConstructionType::Thompson);
         let test_cases = vec![
             ("abcd", vec!["abc"]),
             ("ac", vec![]),
diff --git a/src/dfa.rs b/src/thompson.rs
similarity index 78%
rename from src/dfa.rs
rename to src/thompson.rs
index 17019c6..e96fcca 100644
--- a/src/dfa.rs
+++ b/src/thompson.rs
@@ -1,4 +1,4 @@
-use core::panic;
+use crate::regex_engine::{is_valid_regex, normalise_regex};
 use std::collections::{HashMap, HashSet, VecDeque};
 
 struct NFA {
@@ -11,144 +11,6 @@ pub struct DFA {
     accepting_states: HashSet<u32>,
 }
 
-fn is_valid_regex(regex: &str) -> bool {
-    if regex.is_empty() {
-        return false;
-    }
-
-    let mut open_paren_count = 0;
-    let mut last_was_quantifier = false;
-
-    let mut chars = regex.chars().peekable();
-    while let Some(c) = chars.next() {
-        match c {
-            '(' => {
-                open_paren_count += 1;
-                last_was_quantifier = false;
-            }
-
-            ')' => {
-                if open_paren_count == 0 {
-                    return false;
-                }
-                open_paren_count -= 1;
-                last_was_quantifier = false;
-            }
-
-            '*' | '+' => {
-                // Ensure quantifiers are not the first character and are not repeated
-                if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') {
-                    return false;
-                }
-                last_was_quantifier = true;
-            }
-
-            '|' => {
-                // Ensure alternation isn't the first or last character
-                if regex.starts_with('|') || chars.peek().is_none() {
-                    return false;
-                }
-                last_was_quantifier = false;
-            }
-
-            '\\' => {
-                // Handle escaped characters: ensure there's a character after the escape
-                if chars.peek().is_none() {
-                    return false;
-                }
-                chars.next(); // Skip the escaped character
-                last_was_quantifier = false;
-            }
-
-            _ => {
-                last_was_quantifier = false;
-            }
-        }
-    }
-
-    open_paren_count == 0
-}
-
-fn normalise_regex(regex: &str) -> String {
-    let mut normalised = String::new();
-    let mut escape_sequence = false;
-    let mut prev_char = '\0';
-
-    for curr_char in regex.chars() {
-        if escape_sequence {
-            // TODO: Implement further parsing features here (e.g. \w \d)
-            normalised.push(curr_char);
-            escape_sequence = false;
-            prev_char = curr_char;
-            continue;
-        }
-
-        if curr_char == '\\' {
-            escape_sequence = true;
-            normalised.push(curr_char);
-            continue;
-        }
-
-        if curr_char == '+' {
-            normalised.push(prev_char);
-            normalised.push('*');
-            prev_char = curr_char;
-            continue;
-        }
-        if curr_char == '?' {
-            match prev_char {
-                ')' => {
-                    let mut balance = 0;
-
-                    for j in (0..normalised.len()).rev() {
-                        let ch = normalised.chars().nth(j).unwrap();
-                        if ch == ')' {
-                            balance += 1;
-                        } else if ch == '(' {
-                            balance -= 1;
-                            if balance == 0 {
-                                normalised.insert(j, '(');
-                                break;
-                            }
-                        }
-                    }
-                }
-                _ => {
-                    normalised.insert(normalised.len() - 1, '(');
-                }
-            }
-            normalised.push_str("|())");
-            prev_char = curr_char;
-            continue;
-        }
-        if curr_char == '.' {
-            normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)");
-            prev_char = curr_char;
-            continue;
-        }
-
-        normalised.push(curr_char);
-        prev_char = curr_char;
-    }
-
-    normalised
-}
-
-// GLUSHKOV CONSTRUCTION
-fn glushkov_construction(regex: &str) -> NFA {
-    // TODO: Step 1 (rename letters / index them)
-    // TODO: Step 2a ()
-    // TODO: Step 2b ()
-    // TODO: Step 3 ()
-    // TODO: Step 4 ()
-    todo!()
-}
-
-fn nfa_no_epsilon_to_dfa() {
-    todo!()
-}
-// END GLUSHKOV CONSTRUCTION
-
 // THOMPSON CONSTRUCTION ---
 fn thompson_construction(normalised_regex: &str) -> NFA {
     fn apply_operator(nfa_stack: &mut Vec<NFA>, operator: char) {
@@ -681,94 +543,6 @@ impl DFA {
 mod tests {
     use super::*;
 
-    #[test]
-    fn valid_regex_basic_test() {
-        let regex = "(a|b)*";
-        assert!(is_valid_regex(regex), "Expected valid regex.");
-    }
-
-    #[test]
-    fn invalid_empty_regex_test() {
-        let regex = "";
-        assert!(!is_valid_regex(regex), "Expected invalid regex (empty).");
-    }
-
-    #[test]
-    fn invalid_unbalanced_parentheses_test() {
-        let regex1 = "(a|b";
-        let regex2 = "a|b)";
-        assert!(
-            !is_valid_regex(regex1),
-            "Expected invalid regex (unbalanced parentheses)."
-        );
-        assert!(
-            !is_valid_regex(regex2),
-            "Expected invalid regex (unbalanced parentheses)."
-        );
-    }
-
-    #[test]
-    fn invalid_operator_placement_test() {
-        let regex1 = "*a";
-        let regex2 = "|a|b";
-        assert!(
-            !is_valid_regex(regex1),
-            "Expected invalid regex (invalid quantifier placement)."
-        );
-        assert!(
-            !is_valid_regex(regex2),
-            "Expected invalid regex (invalid alternation placement)."
-        );
-    }
-
-    #[test]
-    fn valid_nested_parentheses_test() {
-        let regex = "((a|b)*c)";
-        assert!(
-            is_valid_regex(regex),
-            "Expected valid regex with nested parentheses."
-        );
-    }
-
-    #[test]
-    fn valid_escape_sequence_test() {
-        let regex = "a\\*b";
-        assert!(
-            is_valid_regex(regex),
-            "Expected valid regex with escape sequence."
-        );
-    }
-
-    #[test]
-    fn invalid_escape_sequence_test() {
-        let regex = "a\\";
-        assert!(
-            !is_valid_regex(regex),
-            "Expected invalid regex with unpaired escape."
-        );
-    }
-
-    #[test]
-    fn normalise_regex_test() {
-        let cases = [
-            (r"a+", r"aa*"),
-            (r"a\+", r"a\+"),
-            (r"a?", r"(a|())"),
-            (r"a\?", r"a\?"),
-            (r"(ab)?", r"((ab)|())"),
-            (r".", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"),
-        ];
-
-        for (input, expected) in cases {
-            let result = normalise_regex(input);
-            assert_eq!(
-                result, expected,
-                "Normalisation failed for input '{}'",
-                input
-            );
-        }
-    }
-
     #[test]
     fn create_dfa_test() {
         let generated_dfa = DFA::new("(a|b)*");
diff --git a/tests/test_one.rs b/tests/test_one.rs
index 47ec5fe..a166e8a 100644
--- a/tests/test_one.rs
+++ b/tests/test_one.rs
@@ -1,4 +1,4 @@
-use regex_engine::regex_engine::Regex;
+use regex_engine::regex_engine::{ConstructionType, Regex};
 
 #[test]
 fn test_escape_sequence_plus() {
@@ -6,7 +6,7 @@ fn test_escape_sequence_plus() {
     let text = "aaab+b"; // should fail on match
     let text_success = "aaab+";
 
-    let engine = Regex::new(pattern);
+    let engine = Regex::new(pattern, ConstructionType::Thompson);
 
     let expected_match = text_success;
 
@@ -22,7 +22,7 @@ fn test_escape_sequence_slash() {
     let text = "aaab\\b"; // should fail on match
     let text_success = "aaab\\";
 
-    let engine = Regex::new(pattern);
+    let engine = Regex::new(pattern, ConstructionType::Thompson);
 
     let expected_match = text_success;
 
@@ -38,7 +38,7 @@ fn test_dot_wildcard() {
     let text = "cabbc"; // should fail on match
     let text_success = "abbc";
 
-    let engine = Regex::new(pattern);
+    let engine = Regex::new(pattern, ConstructionType::Thompson);
 
     let expected_match = text_success;
 

From 469fdb9f2759dc90585d77553c374ac0458953d2 Mon Sep 17 00:00:00 2001
From: Testspieler09 <pepehanisch06@gmail.com>
Date: Sat, 10 May 2025 16:54:47 +0200
Subject: [PATCH 2/8] chore: index Glushkov states, key DFA transitions by char

---
 src/glushkov.rs | 150 ++++++++++++++++++++++++++++++++++++++++++++----
 src/thompson.rs |  42 +++++++-------
 2 files changed, 160 insertions(+), 32 deletions(-)

diff --git a/src/glushkov.rs b/src/glushkov.rs
index d6d7342..72615a5 100644
--- a/src/glushkov.rs
+++ b/src/glushkov.rs
@@ -1,31 +1,100 @@
 use crate::regex_engine::{is_valid_regex, normalise_regex};
 use std::collections::{HashMap, HashSet};
 
+#[derive(Clone, Debug, PartialEq)]
+enum SymbolType {
+    Normal,
+    KleeneStar,
+    Escaped,
+}
+
 struct NFA {
-    transitions: HashMap<(u32, Option<char>), Vec<u32>>,
-    accepting_state: u32,
+    transitions: HashMap<(u32, char), Vec<u32>>,
+    accepting_states: HashSet<u32>,
 }
 
 pub struct DFA {
-    transitions: HashMap<(u32, Option<char>), u32>,
+    transitions: HashMap<(u32, char), u32>,
     accepting_states: HashSet<u32>,
 }
 
 // GLUSHKOV CONSTRUCTION
 fn glushkov_construction(regex: &str) -> NFA {
-    // TODO: Step 1 (rename letters / index them)
-    // TODO: Step 2a ()
-    // TODO: Step 2b ()
-    // TODO: Step 3 ()
-    // TODO: Step 4 ()
-    todo!()
+    let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
+    let mut accepting_states: HashSet<u32> = HashSet::new();
+
+    let states: HashMap<(u32, char), (SymbolType, u32)> = index_states(regex);
+
+    // TODO: Construct transitions and accepting states using position_index
+    NFA {
+        transitions,
+        accepting_states,
+    }
 }
 
-fn nfa_no_epsilon_to_dfa() {
-    todo!()
+fn index_states(regex: &str) -> HashMap<(u32, char), (SymbolType, u32)> {
+    let mut indexed_states: HashMap<(u32, char), (SymbolType, u32)> = HashMap::new();
+    let mut symbol_type: SymbolType = SymbolType::Normal;
+    let mut union_count: Vec<u32> = vec![0];
+    let mut idx: u32 = 0;
+    let mut group_index: u32 = 0;
+
+    let mut chars = regex.chars().peekable();
+
+    while let Some(symbol) = chars.next() {
+        if symbol_type == SymbolType::Escaped {
+            indexed_states
+                .entry((idx as u32, symbol))
+                .or_insert((symbol_type.clone(), group_index));
+
+            idx += 1;
+            symbol_type = SymbolType::Normal;
+            continue;
+        }
+
+        match symbol {
+            '|' => {
+                if let Some(last_element) = union_count.last_mut() {
+                    *last_element += 1;
+                }
+                group_index += 1;
+            }
+            '(' => {
+                union_count.push(0);
+                group_index += 1;
+            }
+            ')' => {
+                group_index -= union_count.pop().unwrap() + 1;
+            }
+            '*' => {
+                symbol_type = SymbolType::Normal;
+                continue;
+            }
+            '\\' => symbol_type = SymbolType::Escaped,
+            _ => {
+                if let Some(next_symbol) = chars.peek() {
+                    if *next_symbol == '*' {
+                        symbol_type = SymbolType::KleeneStar
+                    }
+                }
+
+                indexed_states
+                    .entry((idx as u32, symbol))
+                    .or_insert((symbol_type.clone(), group_index));
+
+                idx += 1;
+            }
+        }
+    }
+
+    indexed_states
 }
 // END GLUSHKOV CONSTRUCTION
 
+fn nfa_no_epsilon_to_dfa(nfa: &NFA) -> DFA {
+    todo!()
+}
+
 impl DFA {
     pub fn new(regex: &str) -> Self {
         if !is_valid_regex(regex) {
@@ -48,3 +117,62 @@ impl DFA {
         todo!()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_single_character() {
+        let expected = HashMap::from([((0, 'a'), (SymbolType::Normal, 0))]);
+
+        let result = index_states("a");
+        assert_eq!(result, expected, "Mismatch in single character test");
+    }
+
+    #[test]
+    fn test_kleene_star() {
+        let expected = HashMap::from([((0, 'a'), (SymbolType::KleeneStar, 0))]);
+
+        let result = index_states("a*");
+        assert_eq!(result, expected, "Mismatch in kleene star test");
+    }
+
+    #[test]
+    fn test_union_and_groups() {
+        let expected = HashMap::from([
+            ((0, 'a'), (SymbolType::Normal, 1)),
+            ((1, 'b'), (SymbolType::Normal, 2)),
+        ]);
+
+        let result = index_states("(a|b)");
+        assert_eq!(result, expected, "Mismatch in union and groups test");
+    }
+
+    #[test]
+    fn test_escaped_character() {
+        let expected = HashMap::from([((0, 'a'), (SymbolType::Escaped, 0))]);
+
+        let result = index_states("\\a");
+        assert_eq!(result, expected, "Mismatch in escaped character test");
+    }
+
+    #[test]
+    fn test_mixed_regex() {
+        let expected = HashMap::from([
+            ((0, 'a'), (SymbolType::Normal, 0)),
+            ((1, '*'), (SymbolType::Escaped, 0)),
+            ((2, 'b'), (SymbolType::Normal, 0)),
+            ((3, 'c'), (SymbolType::KleeneStar, 0)),
+            ((4, 'd'), (SymbolType::Normal, 0)),
+            ((5, 'e'), (SymbolType::Normal, 1)),
+            ((6, 'f'), (SymbolType::Normal, 2)),
+            ((7, 'g'), (SymbolType::Normal, 4)),
+            ((8, 'h'), (SymbolType::Normal, 5)),
+            ((9, 'i'), (SymbolType::Normal, 0)),
+        ]);
+
+        let result = index_states("a\\*bc*d(e|f|(g|h))i");
+        assert_eq!(result, expected, "Mismatch in mixed regex test");
+    }
+}
diff --git a/src/thompson.rs b/src/thompson.rs
index e96fcca..bb5c50d 100644
--- a/src/thompson.rs
+++ b/src/thompson.rs
@@ -7,7 +7,7 @@ struct NFA {
 }
 
 pub struct DFA {
-    transitions: HashMap<(u32, Option<char>), u32>,
+    transitions: HashMap<(u32, char), u32>,
     accepting_states: HashSet<u32>,
 }
 
@@ -289,7 +289,7 @@ fn nfa_to_dfa(nfa: &NFA) -> DFA {
                 unmarked_states.push(move_closure);
             }
 
-            transitions.insert((current_dfa_state_id, Some(symbol)), state_map[&sorted_vec]);
+            transitions.insert((current_dfa_state_id, symbol), state_map[&sorted_vec]);
         }
     }
 
@@ -340,7 +340,7 @@ fn optimise_dfa(dfa: &DFA) -> DFA {
     }
 
     while let Some(current_partition_index) = worklist.pop_front() {
-        let mut states_to_check: HashMap<Option<char>, HashSet<u32>> = HashMap::new();
+        let mut states_to_check: HashMap<char, HashSet<u32>> = HashMap::new();
         for (&(source_state, symbol), &target_state) in &dfa.transitions {
             if partition[&target_state] == current_partition_index {
                 states_to_check
@@ -398,7 +398,7 @@ fn optimise_dfa(dfa: &DFA) -> DFA {
         }
     }
 
-    let mut minimal_transitions: HashMap<(u32, Option<char>), u32> = HashMap::new();
+    let mut minimal_transitions: HashMap<(u32, char), u32> = HashMap::new();
     let mut minimal_accepting_states: HashSet<u32> = HashSet::new();
     let mut new_state_map: HashMap<usize, u32> = HashMap::new();
 
@@ -454,7 +454,7 @@ impl DFA {
     pub fn process(&self, input: &str) -> bool {
         let mut current_state = 0;
         for c in input.chars() {
-            if let Some(&next_state) = self.transitions.get(&(current_state, Some(c))) {
+            if let Some(&next_state) = self.transitions.get(&(current_state, c)) {
                 current_state = next_state;
             } else {
                 return false;
@@ -472,7 +472,7 @@ impl DFA {
             let mut found_match = false;
 
             for (i, c) in text.chars().enumerate().skip(start_pos) {
-                if let Some(&next_state) = self.transitions.get(&(current_state, Some(c))) {
+                if let Some(&next_state) = self.transitions.get(&(current_state, c)) {
                     current_state = next_state;
                     match_start = match_start.or(Some(i));
 
@@ -510,7 +510,7 @@ impl DFA {
             let mut found_match = false;
 
             for (i, c) in input.chars().enumerate().skip(start_pos) {
-                if let Some(&next_state) = self.transitions.get(&(current_state, Some(c))) {
+                if let Some(&next_state) = self.transitions.get(&(current_state, c)) {
                     current_state = next_state;
                     match_start = match_start.or(Some(start_pos));
 
@@ -546,14 +546,14 @@ mod tests {
     #[test]
     fn create_dfa_test() {
         let generated_dfa = DFA::new("(a|b)*");
-        let expected_transitions = HashMap::from([((0, Some('a')), 0), ((0, Some('b')), 0)]);
+        let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 0)]);
         let expected_accepting_states = HashSet::from([0]);
 
         assert_eq!(expected_transitions, generated_dfa.transitions);
         assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
 
         let generated_dfa_2 = DFA::new("a|()");
-        let expected_transitions_2 = HashMap::from([((0, Some('a')), 1)]);
+        let expected_transitions_2 = HashMap::from([((0, 'a'), 1)]);
         let expected_accepting_states_2 = HashSet::from([0, 1]);
 
         assert_eq!(expected_transitions_2, generated_dfa_2.transitions);
@@ -671,20 +671,20 @@ mod tests {
 
         let expected_options = vec![
             HashMap::from([
-                ((0, Some('a')), 1),
-                ((0, Some('b')), 2),
-                ((1, Some('a')), 1),
-                ((1, Some('b')), 2),
-                ((2, Some('a')), 1),
-                ((2, Some('b')), 2),
+                ((0, 'a'), 1),
+                ((0, 'b'), 2),
+                ((1, 'a'), 1),
+                ((1, 'b'), 2),
+                ((2, 'a'), 1),
+                ((2, 'b'), 2),
             ]),
             HashMap::from([
-                ((0, Some('a')), 2),
-                ((0, Some('b')), 1),
-                ((1, Some('a')), 2),
-                ((1, Some('b')), 1),
-                ((2, Some('a')), 2),
-                ((2, Some('b')), 1),
+                ((0, 'a'), 2),
+                ((0, 'b'), 1),
+                ((1, 'a'), 2),
+                ((1, 'b'), 1),
+                ((2, 'a'), 2),
+                ((2, 'b'), 1),
             ]),
         ];
         let expected_accepting_states = HashSet::from([0, 1, 2]);

From efdac98828bff4fc5ec6e1e7179ed82ae395f16a Mon Sep 17 00:00:00 2001
From: Testspieler09 <pepehanisch06@gmail.com>
Date: Mon, 21 Jul 2025 19:01:19 +0200
Subject: [PATCH 3/8] chore: refactor and restructure the codebase

---
 Cargo.lock                      | 599 ++++++++++++++++++++++++++++++++
 Cargo.toml                      |  17 +-
 benches/glushkov_benchmark.rs   |   0
 benches/rust_regex_benchmark.rs |   0
 benches/thompson_benchmark.rs   |   0
 src/glushkov.rs                 | 324 +++++++++++++----
 src/lib.rs                      |  94 +++++
 src/regex_engine.rs             |  42 ++-
 src/thompson.rs                 | 194 ++++-------
 9 files changed, 1053 insertions(+), 217 deletions(-)
 create mode 100644 benches/glushkov_benchmark.rs
 create mode 100644 benches/rust_regex_benchmark.rs
 create mode 100644 benches/thompson_benchmark.rs

diff --git a/Cargo.lock b/Cargo.lock
index ab61766..61f4459 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,605 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
+[[package]]
+name = "anstyle"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd"
+
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+
+[[package]]
+name = "bumpalo"
+version = "3.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
+
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268"
+
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
+[[package]]
+name = "clap"
+version = "4.5.41"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be92d32e80243a54711e5d7ce823c35c41c9d929dc4ab58e1276f625841aadf9"
+dependencies = [
+ "clap_builder",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.41"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "707eab41e9622f9139419d573eca0900137718000c517d47da73045f54331c3d"
+dependencies = [
+ "anstyle",
+ "clap_lex",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
+
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
+[[package]]
+name = "half"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
+[[package]]
+name = "is-terminal"
+version = "0.4.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys",
+]
+
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+
+[[package]]
+name = "js-sys"
+version = "0.3.77"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f"
+dependencies = [
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.174"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
+
+[[package]]
+name = "log"
+version = "0.4.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
+
+[[package]]
+name = "memchr"
+version = "2.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+
+[[package]]
+name = "plotters"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "plotters-backend"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
+
+[[package]]
+name = "plotters-svg"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
+dependencies = [
+ "plotters-backend",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "regex"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+
 [[package]]
 name = "regex_engine"
 version = "0.1.0"
+dependencies = [
+ "criterion",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d"
+
+[[package]]
+name = "ryu"
+version = "1.0.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.141"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30b9eff21ebe718216c6ec64e1d9ac57087aad11efc64e32002bce4a0d4c03d3"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.104"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6"
+dependencies = [
+ "bumpalo",
+ "log",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.77"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "winapi-util"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
diff --git a/Cargo.toml b/Cargo.toml
index e3ddfb0..6c7d598 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,19 @@
 [package]
 name = "regex_engine"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 
-[dependencies]
+[dev-dependencies]
+criterion = { version = "0.5", features = ["html_reports"] }
+
+[[bench]]
+name = "thompson_benchmark"
+harness = false
+
+[[bench]]
+name = "glushkov_benchmark"
+harness = false
+
+[[bench]]
+name = "rust_regex_benchmark"
+harness = false
diff --git a/benches/glushkov_benchmark.rs b/benches/glushkov_benchmark.rs
new file mode 100644
index 0000000..e69de29
diff --git a/benches/rust_regex_benchmark.rs b/benches/rust_regex_benchmark.rs
new file mode 100644
index 0000000..e69de29
diff --git a/benches/thompson_benchmark.rs b/benches/thompson_benchmark.rs
new file mode 100644
index 0000000..e69de29
diff --git a/src/glushkov.rs b/src/glushkov.rs
index 72615a5..ec35441 100644
--- a/src/glushkov.rs
+++ b/src/glushkov.rs
@@ -1,4 +1,7 @@
-use crate::regex_engine::{is_valid_regex, normalise_regex};
+use crate::{
+    Dfa,
+    regex_engine::{is_valid_regex, normalise_regex},
+};
 use std::collections::{HashMap, HashSet};
 
 #[derive(Clone, Debug, PartialEq)]
@@ -8,45 +11,73 @@ enum SymbolType {
     Escaped,
 }
 
-struct NFA {
+struct Nfa {
     transitions: HashMap<(u32, char), Vec<u32>>,
     accepting_states: HashSet<u32>,
 }
 
-pub struct DFA {
+pub struct GlushkovDfa {
     transitions: HashMap<(u32, char), u32>,
     accepting_states: HashSet<u32>,
 }
 
+impl Dfa for GlushkovDfa {
+    fn new(regex: &str) -> Self {
+        if !is_valid_regex(regex) {
+            panic!("{regex} is not a valid regular expression!");
+        }
+        // Construction is not implemented yet: after normalisation execution hits `todo!()`.
+        let normalised_regex = normalise_regex(&regex);
+        todo!()
+    }
+    /// Returns a reference to this automaton's transition table.
+    fn get_transitions(&self) -> &HashMap<(u32, char), u32> {
+        &self.transitions
+    }
+    /// Returns a reference to this automaton's set of accepting states.
+    fn get_accepting_states(&self) -> &HashSet<u32> {
+        &self.accepting_states
+    }
+}
+
 // GLUSHKOV CONSTRUCTION
-fn glushkov_construction(regex: &str) -> NFA {
+fn glushkov_construction(regex: &str) -> Nfa {
     let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
     let mut accepting_states: HashSet<u32> = HashSet::new();
 
-    let states: HashMap<(u32, char), (SymbolType, u32)> = index_states(regex);
+    let states: HashMap<u32, (char, SymbolType, u32)> = index_states(regex);
+
+    let mut start_states: HashSet<u32> = HashSet::new();
+
+    fill_sets(
+        states,
+        &mut start_states,
+        &mut accepting_states,
+        &mut transitions,
+    );
 
     // TODO: Construct transitions and accepting states using position_index
-    NFA {
+    Nfa {
         transitions,
         accepting_states,
     }
 }
 
-fn index_states(regex: &str) -> HashMap<(u32, char), (SymbolType, u32)> {
-    let mut indexed_states: HashMap<(u32, char), (SymbolType, u32)> = HashMap::new();
-    let mut symbol_type: SymbolType = SymbolType::Normal;
+fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
+    let mut indexed_states: HashMap<u32, (char, SymbolType, u32)> = HashMap::new();
+    let mut symbol_type = SymbolType::Normal;
     let mut union_count: Vec<u32> = vec![0];
     let mut idx: u32 = 0;
     let mut group_index: u32 = 0;
 
+    // New stack to track if a group is meaningful
+    let mut group_stack: Vec<Option<u32>> = vec![]; // Some(index) if real, None if ignored
+
     let mut chars = regex.chars().peekable();
 
     while let Some(symbol) = chars.next() {
         if symbol_type == SymbolType::Escaped {
-            indexed_states
-                .entry((idx as u32, symbol))
-                .or_insert((symbol_type.clone(), group_index));
-
+            indexed_states.insert(idx, (symbol, symbol_type.clone(), group_index));
             idx += 1;
             symbol_type = SymbolType::Normal;
             continue;
@@ -54,17 +85,33 @@ fn index_states(regex: &str) -> HashMap<(u32, char), (SymbolType, u32)> {
 
         match symbol {
             '|' => {
-                if let Some(last_element) = union_count.last_mut() {
-                    *last_element += 1;
+                if let Some(last_union) = union_count.last_mut() {
+                    *last_union += 1;
+                }
+                if let Some(Some(_)) = group_stack.last_mut() {
+                    // still a real group, do nothing here
+                } else if let Some(group) = group_stack.last_mut() {
+                    // this group is now meaningful, assign it an index
+                    *group = Some(group_index);
+                    group_index += 1;
                 }
-                group_index += 1;
             }
             '(' => {
                 union_count.push(0);
-                group_index += 1;
+                group_stack.push(None); // not yet known if meaningful
             }
             ')' => {
-                group_index -= union_count.pop().unwrap() + 1;
+                union_count.pop();
+
+                match group_stack.pop() {
+                    Some(Some(_)) => {
+                        // it was meaningful, nothing to change
+                    }
+                    Some(None) => {
+                        // the group was never promoted to real => do nothing
+                    }
+                    None => panic!("Mismatched parentheses"),
+                }
             }
             '*' => {
                 symbol_type = SymbolType::Normal;
@@ -72,51 +119,174 @@ fn index_states(regex: &str) -> HashMap<(u32, char), (SymbolType, u32)> {
             }
             '\\' => symbol_type = SymbolType::Escaped,
             _ => {
-                if let Some(next_symbol) = chars.peek() {
-                    if *next_symbol == '*' {
-                        symbol_type = SymbolType::KleeneStar
+                if let Some(next) = chars.peek() {
+                    if *next == '*' {
+                        symbol_type = SymbolType::KleeneStar;
+                    }
+                }
+
+                // if we're inside a group that hasn't been assigned an index yet, assign now
+                if let Some(group) = group_stack.last_mut() {
+                    if group.is_none() {
+                        *group = Some(group_index);
+                        group_index += 1;
                     }
                 }
 
-                indexed_states
-                    .entry((idx as u32, symbol))
-                    .or_insert((symbol_type.clone(), group_index));
+                // get current group idx for this symbol
+                let current_group = group_stack.last().and_then(|g| *g).unwrap_or(group_index);
 
+                indexed_states.insert(idx, (symbol, symbol_type.clone(), current_group));
                 idx += 1;
             }
         }
     }
 
+    // let mut indexed_states: HashMap<u32, (char, SymbolType, u32)> = HashMap::new();
+    // let mut symbol_type: SymbolType = SymbolType::Normal;
+    // let mut union_count: Vec<u32> = vec![0];
+    // let mut idx: u32 = 0;
+    // let mut group_index: u32 = 0;
+    //
+    // let mut chars = regex.chars().peekable();
+    //
+    // while let Some(symbol) = chars.next() {
+    //     if symbol_type == SymbolType::Escaped {
+    //         indexed_states
+    //             .entry(idx as u32)
+    //             .or_insert((symbol, symbol_type.clone(), group_index));
+    //
+    //         idx += 1;
+    //         symbol_type = SymbolType::Normal;
+    //         continue;
+    //     }
+    //
+    //     println!("{:?}, {:?}", union_count, symbol,);
+    //     match symbol {
+    //         '|' => {
+    //             if let Some(last_element) = union_count.last_mut() {
+    //                 *last_element += 1;
+    //             }
+    //             group_index += 1;
+    //         }
+    //         // FIX: the parentheses are not working correctly e.g. x|(x|y)|x <=> x|x|y|x
+    //         '(' => {
+    //             union_count.push(0);
+    //             group_index += 1;
+    //         }
+    //         ')' => {
+    //             let unions_last_grouping = union_count.pop().unwrap();
+    //             if unions_last_grouping == 0 {
+    //                 continue;
+    //             }
+    //             group_index -= unions_last_grouping + 1;
+    //         }
+    //         '*' => {
+    //             symbol_type = SymbolType::Normal;
+    //             continue;
+    //         }
+    //         '\\' => symbol_type = SymbolType::Escaped,
+    //         _ => {
+    //             if let Some(next_symbol) = chars.peek() {
+    //                 if *next_symbol == '*' {
+    //                     symbol_type = SymbolType::KleeneStar
+    //                 }
+    //             }
+    //
+    //             indexed_states.entry(idx as u32).or_insert((
+    //                 symbol,
+    //                 symbol_type.clone(),
+    //                 group_index,
+    //             ));
+    //
+    //             idx += 1;
+    //         }
+    //     }
+    // }
+
     indexed_states
 }
-// END GLUSHKOV CONSTRUCTION
 
-fn nfa_no_epsilon_to_dfa(nfa: &NFA) -> DFA {
-    todo!()
-}
+/// Scans the indexed positions in ascending order and records in `start_states` those
+/// that the group / Kleene-star bookkeeping below treats as possible first symbols.
+/// For a non-empty map position 0 is always recorded; `finite_states` and `tranisitions`
+/// are not filled yet. Returns early for an empty map and for a single position.
+fn fill_sets(
+    states: HashMap<u32, (char, SymbolType, u32)>,
+    start_states: &mut HashSet<u32>,
+    finite_states: &mut HashSet<u32>,
+    tranisitions: &mut HashMap<(u32, char), Vec<u32>>,
+) {
+    let mut idx: u32 = 1;
+    let amount_states: u32 = states.len() as u32;
 
-impl DFA {
-    pub fn new(regex: &str) -> Self {
-        if !is_valid_regex(regex) {
-            panic!("{} is not a valid regular expression!", regex);
+    if amount_states == 0 {
+        return;
+    }
+    start_states.insert(0);
+    if amount_states == 1 {
+        // Only position 0 exists; the `states[&idx]` lookup below (idx = 1) would panic.
+        return;
+    }
+    let mut last_symbol_type: &SymbolType = &states[&0].1;
+    let mut last_group_idx: u32 = 0;
+    let mut check_next_group: bool = false; // NOTE: can also be thought of as group_is_exhausted
+    loop {
+        let (_symbol, symbol_type, group_idx) = &states[&idx];
+        // Skip forwards to next group
+        if check_next_group {
+            if *group_idx != last_group_idx {
+                start_states.insert(idx);
+                last_symbol_type = symbol_type;
+                last_group_idx = *group_idx;
+                check_next_group = false;
+            }
+            if idx < amount_states - 1 {
+                idx += 1;
+                continue;
+            } else {
+                break;
+            }
         }
 
-        let normalised_regex = normalise_regex(&regex);
-        todo!()
-    }
+        if *group_idx != last_group_idx {
+            start_states.insert(idx);
+            last_group_idx = *group_idx;
+            last_symbol_type = symbol_type;
+            check_next_group = true;
 
-    pub fn process(&self, input: &str) -> bool {
-        todo!()
-    }
+            if idx < amount_states - 1 {
+                idx += 1;
+                continue;
+            } else {
+                break;
+            }
+        }
 
-    pub fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> {
-        todo!()
-    }
+        match last_symbol_type {
+            SymbolType::Normal | SymbolType::Escaped => {
+                check_next_group = true;
+            }
+            SymbolType::KleeneStar => {
+                start_states.insert(idx);
+                check_next_group = false;
+            }
+        }
 
-    pub fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> {
-        todo!()
+        last_symbol_type = symbol_type;
+
+        if idx < amount_states - 1 {
+            idx += 1;
+        } else {
+            break;
+        }
     }
 }
+// END GLUSHKOV CONSTRUCTION
+
+fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa {
+    todo!()
+}
 
 #[cfg(test)]
 mod tests {
@@ -124,7 +294,7 @@ mod tests {
 
     #[test]
     fn test_single_character() {
-        let expected = HashMap::from([((0, 'a'), (SymbolType::Normal, 0))]);
+        let expected = HashMap::from([(0, ('a', SymbolType::Normal, 0))]);
 
         let result = index_states("a");
         assert_eq!(result, expected, "Mismatch in single character test");
@@ -132,7 +302,7 @@ mod tests {
 
     #[test]
     fn test_kleene_star() {
-        let expected = HashMap::from([((0, 'a'), (SymbolType::KleeneStar, 0))]);
+        let expected = HashMap::from([(0, ('a', SymbolType::KleeneStar, 0))]);
 
         let result = index_states("a*");
         assert_eq!(result, expected, "Mismatch in kleene star test");
@@ -141,8 +311,8 @@ mod tests {
     #[test]
     fn test_union_and_groups() {
         let expected = HashMap::from([
-            ((0, 'a'), (SymbolType::Normal, 1)),
-            ((1, 'b'), (SymbolType::Normal, 2)),
+            (0, ('a', SymbolType::Normal, 1)),
+            (1, ('b', SymbolType::Normal, 2)),
         ]);
 
         let result = index_states("(a|b)");
@@ -151,7 +321,7 @@ mod tests {
 
     #[test]
     fn test_escaped_character() {
-        let expected = HashMap::from([((0, 'a'), (SymbolType::Escaped, 0))]);
+        let expected = HashMap::from([(0, ('a', SymbolType::Escaped, 0))]);
 
         let result = index_states("\\a");
         assert_eq!(result, expected, "Mismatch in escaped character test");
@@ -160,19 +330,57 @@ mod tests {
     #[test]
     fn test_mixed_regex() {
         let expected = HashMap::from([
-            ((0, 'a'), (SymbolType::Normal, 0)),
-            ((1, '*'), (SymbolType::Escaped, 0)),
-            ((2, 'b'), (SymbolType::Normal, 0)),
-            ((3, 'c'), (SymbolType::KleeneStar, 0)),
-            ((4, 'd'), (SymbolType::Normal, 0)),
-            ((5, 'e'), (SymbolType::Normal, 1)),
-            ((6, 'f'), (SymbolType::Normal, 2)),
-            ((7, 'g'), (SymbolType::Normal, 4)),
-            ((8, 'h'), (SymbolType::Normal, 5)),
-            ((9, 'i'), (SymbolType::Normal, 0)),
+            (0, ('a', SymbolType::Normal, 0)),
+            (1, ('*', SymbolType::Escaped, 0)),
+            (2, ('b', SymbolType::Normal, 0)),
+            (3, ('c', SymbolType::KleeneStar, 0)),
+            (4, ('d', SymbolType::Normal, 0)),
+            (5, ('e', SymbolType::Normal, 1)),
+            (6, ('f', SymbolType::Normal, 2)),
+            (7, ('g', SymbolType::Normal, 4)),
+            (8, ('h', SymbolType::Normal, 5)),
+            (9, ('i', SymbolType::Normal, 0)),
         ]);
 
         let result = index_states("a\\*bc*d(e|f|(g|h))i");
         assert_eq!(result, expected, "Mismatch in mixed regex test");
     }
+
+    #[test]
+    fn test_to_many_brackets() {
+        let expected = HashMap::from([
+            (0, ('a', SymbolType::KleeneStar, 0)),
+            (1, ('b', SymbolType::Normal, 0)),
+            (2, ('c', SymbolType::Normal, 3)),
+            (3, ('d', SymbolType::Normal, 4)),
+            (4, ('e', SymbolType::Normal, 5)),
+            (5, ('f', SymbolType::Normal, 5)),
+        ]);
+        // The doubled parentheses around `c|d` are the redundant brackets under test.
+        let result = index_states("a*b|((c|d))|ef");
+        assert_eq!(result, expected, "Mismatch in too many brackets test");
+    }
+
+    #[test]
+    fn test_fill_sets() {
+        // Index the positions of a regex mixing a starred symbol, unions and a group.
+        let indexed = index_states("a*b|(c|d)|ef");
+
+        // Output collections handed to `fill_sets`; all start out empty.
+        let mut starts = HashSet::<u32>::new();
+        let mut finals = HashSet::<u32>::new();
+        let mut edges = HashMap::<(u32, char), Vec<u32>>::new();
+
+        fill_sets(indexed, &mut starts, &mut finals, &mut edges);
+
+        // Positions 0 through 4 are expected to be start states.
+        let wanted_starts: HashSet<u32> = (0..=4).collect();
+        assert_eq!(starts, wanted_starts);
+
+        // Final states and transitions are expected to stay empty.
+        let wanted_finals: HashSet<u32> = HashSet::new();
+        let wanted_edges: HashMap<(u32, char), Vec<u32>> = HashMap::new();
+        assert_eq!(finals, wanted_finals);
+        assert_eq!(edges, wanted_edges);
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 1c11465..6f2fff7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,97 @@
+use std::collections::{HashMap, HashSet};
+
 mod glushkov;
 pub mod regex_engine;
 mod thompson;
+
+trait Dfa {
+    // Shared matching logic for every DFA construction. Implementors provide
+    // the transition table and accepting states; state 0 is the start state.
+
+    /// Builds the automaton for `regex`.
+    fn new(regex: &str) -> Self;
+
+    /// Transition table: `(state, input char) -> next state`.
+    fn get_transitions(&self) -> &HashMap<(u32, char), u32>;
+
+    /// States in which the automaton accepts.
+    fn get_accepting_states(&self) -> &HashSet<u32>;
+
+    /// Returns `true` when the automaton, started in state 0, consumes all of
+    /// `input` and ends in an accepting state.
+    ///
+    /// A character without a transition from the current state rejects the
+    /// input immediately.
+    fn process(&self, input: &str) -> bool {
+        let mut current_state = 0;
+        for c in input.chars() {
+            if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) {
+                current_state = next_state;
+            } else {
+                return false;
+            }
+        }
+        self.get_accepting_states().contains(&current_state)
+    }
+
+    /// Finds the longest non-empty match that begins exactly at byte offset
+    /// `start` of `text` and returns the byte offset one past its last character.
+    ///
+    /// `start` must lie on a `char` boundary. Returns `None` when no prefix of
+    /// `text[start..]` drives the automaton into an accepting state.
+    fn longest_match_end(&self, text: &str, start: usize) -> Option<usize> {
+        let mut current_state = 0;
+        let mut match_end = None;
+
+        // `char_indices` yields byte offsets, so the result can be used to slice
+        // `text` directly, even when it contains multi-byte characters.
+        for (offset, c) in text[start..].char_indices() {
+            let Some(&next_state) = self.get_transitions().get(&(current_state, c)) else {
+                break;
+            };
+            current_state = next_state;
+
+            // Keep scanning after an accepting state: a longer match may follow.
+            if self.get_accepting_states().contains(&current_state) {
+                match_end = Some(start + offset + c.len_utf8());
+            }
+        }
+
+        match_end
+    }
+
+    /// Returns the leftmost match in `text`, extended as far as possible, or
+    /// `None` when the pattern matches nowhere. Empty matches are not reported.
+    ///
+    /// The returned slice borrows from `text`.
+    fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> {
+        // Try every character boundary as a start and stop at the first hit.
+        text.char_indices().find_map(|(start, _)| {
+            self.longest_match_end(text, start)
+                .map(|end| &text[start..end])
+        })
+    }
+
+    /// Returns all non-overlapping, non-empty matches in `input`, left to right.
+    ///
+    /// After a match the scan resumes directly behind it, so the search always
+    /// makes progress and no character is part of two matches.
+    fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> {
+        let mut matches: Vec<&str> = Vec::new();
+
+        let mut start_pos = 0;
+        while start_pos < input.len() {
+            if let Some(end) = self.longest_match_end(input, start_pos) {
+                matches.push(&input[start_pos..end]);
+                // `end` is exclusive and > `start_pos`, so this always advances.
+                start_pos = end;
+            } else {
+                // No match here: step over one whole character, not one byte.
+                let step = input[start_pos..].chars().next().map_or(1, char::len_utf8);
+                start_pos += step;
+            }
+        }
+
+        matches
+    }
+}
diff --git a/src/regex_engine.rs b/src/regex_engine.rs
index 77e15a5..f628c55 100644
--- a/src/regex_engine.rs
+++ b/src/regex_engine.rs
@@ -1,47 +1,46 @@
-use crate::glushkov::DFA as GlushkovDFA;
-use crate::thompson::DFA as ThompsonDFA;
+use crate::{Dfa, glushkov::GlushkovDfa, thompson::ThompsonDfa};
 
 pub enum ConstructionType {
     Thompson,
     Glushkov,
 }
 
-enum DFAType {
-    Thompson(ThompsonDFA),
-    Glushkov(GlushkovDFA),
+enum DfaType {
+    Thompson(ThompsonDfa),
+    Glushkov(GlushkovDfa),
 }
 
 pub struct Regex {
-    dfa: DFAType,
+    dfa: DfaType,
 }
 
 impl Regex {
     pub fn new(pattern: &str, construction: ConstructionType) -> Self {
         let dfa_type = match construction {
-            ConstructionType::Thompson => DFAType::Thompson(ThompsonDFA::new(pattern)),
-            ConstructionType::Glushkov => DFAType::Glushkov(GlushkovDFA::new(pattern)),
+            ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)),
+            ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)),
         };
         Regex { dfa: dfa_type }
     }
 
     pub fn is_match(&self, text: &str) -> bool {
         match &self.dfa {
-            DFAType::Thompson(dfa) => dfa.process(text),
-            DFAType::Glushkov(dfa) => dfa.process(text),
+            DfaType::Thompson(dfa) => dfa.process(text),
+            DfaType::Glushkov(dfa) => dfa.process(text),
         }
     }
 
     pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> {
         match &self.dfa {
-            DFAType::Thompson(dfa) => dfa.find_first_match(text),
-            DFAType::Glushkov(dfa) => dfa.find_first_match(text),
+            DfaType::Thompson(dfa) => dfa.find_first_match(text),
+            DfaType::Glushkov(dfa) => dfa.find_first_match(text),
         }
     }
 
     pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> {
         match &self.dfa {
-            DFAType::Thompson(dfa) => dfa.find_all_matches(text),
-            DFAType::Glushkov(dfa) => dfa.find_all_matches(text),
+            DfaType::Thompson(dfa) => dfa.find_all_matches(text),
+            DfaType::Glushkov(dfa) => dfa.find_all_matches(text),
         }
     }
 }
@@ -248,16 +247,15 @@ mod tests {
             (r"a?", r"(a|())"),
             (r"a\?", r"a\?"),
             (r"(ab)?", r"((ab)|())"),
-            (r".", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"),
+            (
+                r".",
+                "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)",
+            ),
         ];
 
         for (input, expected) in cases {
             let result = normalise_regex(input);
-            assert_eq!(
-                result, expected,
-                "Normalisation failed for input '{}'",
-                input
-            );
+            assert_eq!(result, expected, "Normalisation failed for input '{input}'");
         }
     }
 
@@ -290,7 +288,7 @@ mod tests {
 
         for (text, expected) in test_cases {
             let result = regex_object.find(text);
-            assert_eq!(result, expected, "Failed for input: {}", text);
+            assert_eq!(result, expected, "Failed for input: {text}");
         }
     }
 
@@ -305,7 +303,7 @@ mod tests {
 
         for (text, expected) in test_cases {
             let result = regex_object.findall(text);
-            assert_eq!(result, expected, "Failed for input: {}", text);
+            assert_eq!(result, expected, "Failed for input: {text}");
         }
     }
 }
diff --git a/src/thompson.rs b/src/thompson.rs
index bb5c50d..1ad648c 100644
--- a/src/thompson.rs
+++ b/src/thompson.rs
@@ -1,19 +1,43 @@
-use crate::regex_engine::{is_valid_regex, normalise_regex};
+use crate::{
+    Dfa,
+    regex_engine::{is_valid_regex, normalise_regex},
+};
 use std::collections::{HashMap, HashSet, VecDeque};
 
-struct NFA {
+struct Nfa {
     transitions: HashMap<(u32, Option<char>), Vec<u32>>,
     accepting_state: u32, // the thompson construction always has one accepting_state
 }
 
-pub struct DFA {
+pub struct ThompsonDfa {
     transitions: HashMap<(u32, char), u32>,
     accepting_states: HashSet<u32>,
 }
 
+impl Dfa for ThompsonDfa {
+    fn new(regex: &str) -> Self {
+        if !is_valid_regex(regex) {
+            panic!("{regex} is not a valid regular expression!");
+        }
+
+        let normalised_regex = normalise_regex(regex);
+        let regex_nfa: Nfa = thompson_construction(&normalised_regex);
+        let regex_dfa = nfa_to_dfa(&regex_nfa);
+        optimise_dfa(&regex_dfa)
+    }
+
+    fn get_transitions(&self) -> &HashMap<(u32, char), u32> {
+        &self.transitions
+    }
+
+    fn get_accepting_states(&self) -> &HashSet<u32> {
+        &self.accepting_states
+    }
+}
+
 // THOMPSON CONSTRUCTION ---
-fn thompson_construction(normalised_regex: &str) -> NFA {
-    fn apply_operator(nfa_stack: &mut Vec<NFA>, operator: char) {
+fn thompson_construction(normalised_regex: &str) -> Nfa {
+    fn apply_operator(nfa_stack: &mut Vec<Nfa>, operator: char) {
         match operator {
             '|' => {
                 let nfa_right = nfa_stack.pop().expect("Expected NFA for union");
@@ -25,12 +49,12 @@ fn thompson_construction(normalised_regex: &str) -> NFA {
                 let nfa_left = nfa_stack.pop().expect("Expected NFA for concatenation");
                 nfa_stack.push(concatenate(&nfa_left, &nfa_right));
             }
-            _ => panic!("Unknown operator {:?}", operator),
+            _ => panic!("Unknown operator {operator:?}"),
         }
     }
 
     let mut operators: Vec<char> = Vec::new();
-    let mut nfa_stack: Vec<NFA> = Vec::new();
+    let mut nfa_stack: Vec<Nfa> = Vec::new();
     let mut concat_flag = false;
     let mut escape_sequence = false;
 
@@ -99,7 +123,7 @@ fn thompson_construction(normalised_regex: &str) -> NFA {
     nfa_stack.pop().unwrap()
 }
 
-fn apply_kleene_star(last_nfa: &NFA) -> NFA {
+fn apply_kleene_star(last_nfa: &Nfa) -> Nfa {
     let mut transitions = HashMap::new();
 
     let new_accepting = last_nfa.accepting_state + 2;
@@ -130,13 +154,13 @@ fn apply_kleene_star(last_nfa: &NFA) -> NFA {
         .or_insert_with(Vec::new)
         .push(new_accepting);
 
-    NFA {
+    Nfa {
         transitions,
         accepting_state: new_accepting,
     }
 }
 
-fn union(left: &NFA, right: &NFA) -> NFA {
+fn union(left: &Nfa, right: &Nfa) -> Nfa {
     let mut transitions = HashMap::new();
 
     let num_states_left_nfa = left.accepting_state;
@@ -162,21 +186,21 @@ fn union(left: &NFA, right: &NFA) -> NFA {
 
     transitions.insert((0, None), vec![1, num_states_left_nfa + 2]);
     transitions
-        .entry((&left.accepting_state + 1, None))
+        .entry((left.accepting_state + 1, None))
         .or_insert_with(Vec::new)
         .push(new_accepting_state);
     transitions
-        .entry((&right.accepting_state + num_states_left_nfa + 2, None))
+        .entry((right.accepting_state + num_states_left_nfa + 2, None))
         .or_insert_with(Vec::new)
         .push(new_accepting_state);
 
-    NFA {
+    Nfa {
         transitions,
         accepting_state: new_accepting_state,
     }
 }
 
-fn concatenate(left: &NFA, right: &NFA) -> NFA {
+fn concatenate(left: &Nfa, right: &Nfa) -> Nfa {
     let mut transitions: HashMap<(u32, Option<char>), Vec<u32>> = left.transitions.clone();
 
     // HACK: The accepting states are (based on the implementation) the last ones of the NFA
@@ -190,21 +214,21 @@ fn concatenate(left: &NFA, right: &NFA) -> NFA {
         );
     }
 
-    NFA {
+    Nfa {
         transitions,
         accepting_state: right.accepting_state + num_states_left_nfa,
     }
 }
 
-fn create_basic_nfa(letter: &char) -> NFA {
-    NFA {
+fn create_basic_nfa(letter: &char) -> Nfa {
+    Nfa {
         transitions: HashMap::from([((0, Some(*letter)), vec![1])]),
         accepting_state: 1,
     }
 }
 
-fn create_basic_epsilon_nfa() -> NFA {
-    NFA {
+fn create_basic_epsilon_nfa() -> Nfa {
+    Nfa {
         transitions: HashMap::from([((0, None), vec![1])]),
         accepting_state: 1,
     }
@@ -212,7 +236,7 @@ fn create_basic_epsilon_nfa() -> NFA {
 // END THOMPSON CONSTRUCTION ---
 
 // NFA to DFA functions ---
-fn epsilon_closure(nfa: &NFA, states: &mut HashSet<u32>) {
+fn epsilon_closure(nfa: &Nfa, states: &mut HashSet<u32>) {
     let mut stack = states.clone();
 
     while let Some(&state_id) = stack.iter().next() {
@@ -227,7 +251,7 @@ fn epsilon_closure(nfa: &NFA, states: &mut HashSet<u32>) {
     }
 }
 
-fn move_nfa(nfa: &NFA, states: &HashSet<u32>, symbol: char) -> HashSet<u32> {
+fn move_nfa(nfa: &Nfa, states: &HashSet<u32>, symbol: char) -> HashSet<u32> {
     let mut move_states = HashSet::new();
 
     for &state in states {
@@ -245,7 +269,7 @@ fn hash_set_to_sorted_vec(set: &HashSet<u32>) -> Vec<u32> {
     vec
 }
 
-fn nfa_to_dfa(nfa: &NFA) -> DFA {
+fn nfa_to_dfa(nfa: &Nfa) -> ThompsonDfa {
     // Start from the initial state of the NFA, assuming it's state 0
     let mut start_closure = HashSet::from([0]);
     epsilon_closure(nfa, &mut start_closure);
@@ -293,20 +317,20 @@ fn nfa_to_dfa(nfa: &NFA) -> DFA {
         }
     }
 
-    DFA {
+    ThompsonDfa {
         transitions,
         accepting_states: dfa_accepting_states,
     }
 }
 // END NFA to DFA functions ---
 
-fn optimise_dfa(dfa: &DFA) -> DFA {
+fn optimise_dfa(dfa: &ThompsonDfa) -> ThompsonDfa {
     let mut partition: HashMap<u32, usize> = HashMap::new();
     let mut accepting_states_set: HashSet<u32> = dfa.accepting_states.clone();
     let mut non_accepting_states: HashSet<u32> = HashSet::new();
     let mut all_states: HashSet<u32> = HashSet::new();
 
-    for (&(state, _), _) in &dfa.transitions {
+    for &(state, _) in dfa.transitions.keys() {
         all_states.insert(state);
         if dfa.accepting_states.contains(&state) {
             accepting_states_set.insert(state);
@@ -332,10 +356,10 @@ fn optimise_dfa(dfa: &DFA) -> DFA {
     partition_list.push(non_accepting_states);
 
     let mut worklist: VecDeque<usize> = VecDeque::new();
-    if partition_list[0].len() > 0 {
+    if !partition_list[0].is_empty() {
         worklist.push_back(0);
     }
-    if partition_list.len() > 1 && partition_list[1].len() > 0 {
+    if partition_list.len() > 1 && !partition_list[1].is_empty() {
         worklist.push_back(1);
     }
 
@@ -345,7 +369,7 @@ fn optimise_dfa(dfa: &DFA) -> DFA {
             if partition[&target_state] == current_partition_index {
                 states_to_check
                     .entry(symbol)
-                    .or_insert_with(HashSet::new)
+                    .or_default()
                     .insert(source_state);
             }
         }
@@ -410,8 +434,8 @@ fn optimise_dfa(dfa: &DFA) -> DFA {
     }
 
     for (_, &partition_index) in partition.iter() {
-        if !new_state_map.contains_key(&partition_index) {
-            new_state_map.insert(partition_index, next_state_id);
+        if let std::collections::hash_map::Entry::Vacant(e) = new_state_map.entry(partition_index) {
+            e.insert(next_state_id);
             next_state_id += 1;
         }
     }
@@ -433,126 +457,26 @@ fn optimise_dfa(dfa: &DFA) -> DFA {
         minimal_transitions.insert((new_source_state, symbol), new_target_state);
     }
 
-    DFA {
+    ThompsonDfa {
         transitions: minimal_transitions,
         accepting_states: minimal_accepting_states,
     }
 }
 
-impl DFA {
-    pub fn new(regex: &str) -> Self {
-        if !is_valid_regex(regex) {
-            panic!("{} is not a valid regular expression!", regex);
-        }
-
-        let normalised_regex = normalise_regex(&regex);
-        let regex_nfa: NFA = thompson_construction(&normalised_regex);
-        let regex_dfa = nfa_to_dfa(&regex_nfa);
-        optimise_dfa(&regex_dfa)
-    }
-
-    pub fn process(&self, input: &str) -> bool {
-        let mut current_state = 0;
-        for c in input.chars() {
-            if let Some(&next_state) = self.transitions.get(&(current_state, c)) {
-                current_state = next_state;
-            } else {
-                return false;
-            }
-        }
-        self.accepting_states.contains(&current_state)
-    }
-
-    pub fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> {
-        let mut start_pos = 0;
-        while start_pos < text.len() {
-            let mut current_state = 0;
-            let mut match_start = None;
-            let mut match_end = None;
-            let mut found_match = false;
-
-            for (i, c) in text.chars().enumerate().skip(start_pos) {
-                if let Some(&next_state) = self.transitions.get(&(current_state, c)) {
-                    current_state = next_state;
-                    match_start = match_start.or(Some(i));
-
-                    if self.accepting_states.contains(&current_state) {
-                        found_match = true;
-                        match_end = Some(i)
-                    }
-
-                    if i == text.len() - 1 && found_match {
-                        break;
-                    }
-                } else {
-                    break;
-                }
-            }
-
-            if let (Some(start), Some(end)) = (match_start, match_end) {
-                return Some(&text[start..=end]);
-            } else {
-                start_pos += 1;
-            }
-        }
-
-        None
-    }
-
-    pub fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> {
-        let mut matches: Vec<&str> = Vec::new();
-
-        let mut start_pos = 0;
-        while start_pos < input.len() {
-            let mut current_state = 0;
-            let mut match_start: Option<usize> = None;
-            let mut match_end: Option<usize> = None;
-            let mut found_match = false;
-
-            for (i, c) in input.chars().enumerate().skip(start_pos) {
-                if let Some(&next_state) = self.transitions.get(&(current_state, c)) {
-                    current_state = next_state;
-                    match_start = match_start.or(Some(start_pos));
-
-                    if self.accepting_states.contains(&current_state) {
-                        match_end = Some(i);
-                        found_match = true;
-                    }
-
-                    if i == input.len() - 1 && found_match {
-                        break;
-                    }
-                } else {
-                    break;
-                }
-            }
-
-            if let (Some(start), Some(end)) = (match_start, match_end) {
-                matches.push(&input[start..=end]);
-                start_pos = end;
-            } else {
-                start_pos += 1;
-            }
-        }
-
-        matches
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
     fn create_dfa_test() {
-        let generated_dfa = DFA::new("(a|b)*");
+        let generated_dfa = ThompsonDfa::new("(a|b)*");
         let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 0)]);
         let expected_accepting_states = HashSet::from([0]);
 
         assert_eq!(expected_transitions, generated_dfa.transitions);
         assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
 
-        let generated_dfa_2 = DFA::new("a|()");
+        let generated_dfa_2 = ThompsonDfa::new("a|()");
         let expected_transitions_2 = HashMap::from([((0, 'a'), 1)]);
         let expected_accepting_states_2 = HashSet::from([0, 1]);
 
@@ -565,7 +489,7 @@ mod tests {
 
     #[test]
     fn prozess_regex_test() {
-        let generated_dfa = DFA::new("(a|b)*");
+        let generated_dfa = ThompsonDfa::new("(a|b)*");
         let test_strings = vec!["abbbababaaaa", ""];
         for string in test_strings {
             assert!(generated_dfa.process(string));
@@ -654,7 +578,7 @@ mod tests {
 
     #[test]
     fn nfa_to_dfa_test() {
-        let input_nfa = NFA {
+        let input_nfa = Nfa {
             transitions: HashMap::from([
                 ((0, None), vec![1, 7]),
                 ((1, None), vec![2, 4]),

From 4aefba5d82e728c1ecc610274d4a4035492f8613 Mon Sep 17 00:00:00 2001
From: Testspieler09 <pepehanisch06@gmail.com>
Date: Mon, 21 Jul 2025 21:53:13 +0200
Subject: [PATCH 4/8] bench: add benchmarks

---
 Cargo.lock                      |   1 +
 Cargo.toml                      |   7 +-
 benches/bench_cases.rs          |  76 ++++++++
 benches/glushkov_benchmark.rs   |  62 +++++++
 benches/rust_regex_benchmark.rs |  56 ++++++
 benches/thompson_benchmark.rs   |  62 +++++++
 src/glushkov.rs                 |  19 +-
 src/lib.rs                      | 310 +++++++++++++++++++++++++++++++-
 src/regex_engine.rs             | 309 -------------------------------
 src/thompson.rs                 |   7 +-
 10 files changed, 579 insertions(+), 330 deletions(-)
 create mode 100644 benches/bench_cases.rs
 delete mode 100644 src/regex_engine.rs

diff --git a/Cargo.lock b/Cargo.lock
index 61f4459..4f0180a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -363,6 +363,7 @@ name = "regex_engine"
 version = "0.1.0"
 dependencies = [
  "criterion",
+ "regex",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 6c7d598..4a0c696 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,14 +5,15 @@ edition = "2024"
 
 [dev-dependencies]
 criterion = { version = "0.5", features = ["html_reports"] }
+regex = "1.11.1"
 
 [[bench]]
 name = "thompson_benchmark"
 harness = false
 
-[[bench]]
-name = "glushkov_benchmark"
-harness = false
+# [[bench]]
+# name = "glushkov_benchmark"
+# harness = false
 
 [[bench]]
 name = "rust_regex_benchmark"
diff --git a/benches/bench_cases.rs b/benches/bench_cases.rs
new file mode 100644
index 0000000..7df2b4c
--- /dev/null
+++ b/benches/bench_cases.rs
@@ -0,0 +1,76 @@
+#[allow(dead_code)]
+struct BenchCase<'a> {
+    pub regex: &'a str,
+    pub input: String,
+}
+
+// Shared by the benchmark files, each of which pulls this file in via `include!`.
+#[allow(dead_code)]
+fn get_bench_cases() -> Vec<BenchCase<'static>> {
+    vec![
+        BenchCase {
+            regex: r"a.b",
+            input: "abcd abef abgh ijk".to_string(),
+        },
+        BenchCase {
+            regex: r"a*b",
+            input: "aaaaaaaaab".to_string(),
+        },
+        BenchCase {
+            regex: r"a+b",
+            input: "aabab".to_string(),
+        },
+        BenchCase {
+            regex: r"a?b",
+            input: "b aaab ab".to_string(),
+        },
+        BenchCase {
+            regex: r"a|b",
+            input: "xxaxybxx".to_string(),
+        },
+        // Group and escape sequences
+        BenchCase {
+            regex: r"(a|b)c",
+            input: "abc ac bc bbcc".to_string(),
+        },
+        BenchCase {
+            regex: r"\.",
+            input: "Find . within this !?. sentence.".to_string(),
+        },
+        // Larger and more complex patterns
+        BenchCase {
+            regex: r"(hel+o|wor?ld)",
+            input: "hello helolllo world worlld helloworld".to_string(),
+        },
+        BenchCase {
+            regex: r"ab*c+",
+            input: "abbc abbbbbbbcc bccaaabbabc".to_string(),
+        },
+        BenchCase {
+            regex: r"(a(bc|de)+)",
+            input: "abc abcbc abcdedef".to_string(),
+        },
+        // Realistic text patterns and larger inputs
+        BenchCase {
+            regex: r"\b[0-9]{2}\b",
+            input: "There are 99 bottles of soda and 45 cans of juice".to_string(),
+        },
+        BenchCase {
+            regex: r"\b\w{5,}\b",
+            input: "Rust is great for systems programming but can be challenging".to_string(),
+        },
+        BenchCase {
+            regex: r"(https?|ftp)://[^\s/$.?#].[^\s]*",
+            input: "Check https://example.com out and ftp://fileserver.net as well".to_string(),
+        },
+        // Pathological case to test limits
+        BenchCase {
+            regex: r"(a|b)*c",
+            input: format!("{}{}", "a".repeat(1000), "bc"),
+        },
+        BenchCase {
+            regex: r"x{3}(y|z)",
+            input: "xxxxyxxxzxxxy".to_string(),
+        },
+    ]
+}
diff --git a/benches/glushkov_benchmark.rs b/benches/glushkov_benchmark.rs
index e69de29..1f6522a 100644
--- a/benches/glushkov_benchmark.rs
+++ b/benches/glushkov_benchmark.rs
@@ -0,0 +1,62 @@
+include!("bench_cases.rs");
+use criterion::{Criterion, criterion_group, criterion_main};
+use regex_engine::{ConstructionType, Regex};
+
+/// Benchmarks `Regex::is_match` with the Glushkov construction on every shared case.
+fn benchmark_glushkov_regex_process(c: &mut Criterion) {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        // Compile once, outside the timed closure, so only matching is measured.
+        let regex = Regex::new(case.regex, ConstructionType::Glushkov);
+
+        c.bench_function(
+            &format!("Glushkov is_match - pattern: {}", case.regex),
+            // Return the result (no trailing `;`) so Criterion's `black_box` on the
+            // closure's output keeps the optimiser from discarding the call.
+            |b| b.iter(|| regex.is_match(&case.input)),
+        );
+    }
+}
+
+/// Benchmarks `Regex::find` with the Glushkov construction on every shared case.
+fn benchmark_glushkov_regex_find_first(c: &mut Criterion) {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        // Compile once, outside the timed closure, so only the search is measured.
+        let regex = Regex::new(case.regex, ConstructionType::Glushkov);
+
+        c.bench_function(
+            &format!("Glushkov find match - pattern: {}", case.regex),
+            // Return the match (no trailing `;`) so Criterion's `black_box` on the
+            // closure's output keeps the optimiser from discarding the call.
+            |b| b.iter(|| regex.find(&case.input)),
+        );
+    }
+}
+
+/// Benchmarks `Regex::findall` with the Glushkov construction on every shared case.
+fn benchmark_glushkov_regex_find_all(c: &mut Criterion) {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        // Compile once, outside the timed closure, so only the search is measured.
+        let regex = Regex::new(case.regex, ConstructionType::Glushkov);
+
+        c.bench_function(
+            &format!("Glushkov findall matches - pattern: {}", case.regex),
+            // Return the matches (no trailing `;`) so Criterion's `black_box` on the
+            // closure's output keeps the optimiser from discarding the call.
+            |b| b.iter(|| regex.findall(&case.input)),
+        );
+    }
+}
+
+criterion_group!(
+    benches,
+    benchmark_glushkov_regex_process,
+    benchmark_glushkov_regex_find_first,
+    benchmark_glushkov_regex_find_all
+);
+criterion_main!(benches);
diff --git a/benches/rust_regex_benchmark.rs b/benches/rust_regex_benchmark.rs
index e69de29..63d05a3 100644
--- a/benches/rust_regex_benchmark.rs
+++ b/benches/rust_regex_benchmark.rs
@@ -0,0 +1,56 @@
+include!("bench_cases.rs");
+use criterion::{Criterion, criterion_group, criterion_main};
+use regex::Regex;
+
+/// Benchmarks `regex::Regex::is_match` on every shared case.
+fn benchmark_rust_regex_process(c: &mut Criterion) {
+    for case in get_bench_cases() {
+        // Compile outside the timed closure so only matching is measured.
+        let regex = Regex::new(case.regex)
+            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
+
+        c.bench_function(&format!("Rust process match: {}", case.regex), |b| {
+            // Returning the result (no trailing `;`) lets Criterion `black_box` it,
+            // so the optimiser cannot discard the call.
+            b.iter(|| regex.is_match(&case.input))
+        });
+    }
+}
+
+/// Benchmarks `regex::Regex::find` on every shared case.
+fn benchmark_rust_regex_find_first(c: &mut Criterion) {
+    for case in get_bench_cases() {
+        // Compile outside the timed closure so only the search is measured.
+        let regex = Regex::new(case.regex)
+            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
+
+        c.bench_function(&format!("Rust find first match: {}", case.regex), |b| {
+            // Returning the match (no trailing `;`) lets Criterion `black_box` it,
+            // so the optimiser cannot discard the search.
+            b.iter(|| regex.find(&case.input).map(|m| m.as_str()))
+        });
+    }
+}
+
+/// Benchmarks collecting every match of `regex::Regex::find_iter` per shared case.
+fn benchmark_rust_regex_find_all(c: &mut Criterion) {
+    for case in get_bench_cases() {
+        let regex = Regex::new(case.regex)
+            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
+
+        c.bench_function(&format!("Rust find all matches: {}", case.regex), |b| {
+            // `find_iter` is lazy: merely creating the iterator searches nothing.
+            // Collect the matches so the whole input is scanned, and return the
+            // `Vec` so Criterion's `black_box` keeps the work from being elided.
+            b.iter(|| regex.find_iter(&case.input).map(|m| m.as_str()).collect::<Vec<_>>())
+        });
+    }
+}
+
+criterion_group!(
+    benches,
+    benchmark_rust_regex_process,
+    benchmark_rust_regex_find_first,
+    benchmark_rust_regex_find_all
+);
+criterion_main!(benches);
diff --git a/benches/thompson_benchmark.rs b/benches/thompson_benchmark.rs
index e69de29..e38a502 100644
--- a/benches/thompson_benchmark.rs
+++ b/benches/thompson_benchmark.rs
@@ -0,0 +1,62 @@
+include!("bench_cases.rs");
+use criterion::{Criterion, criterion_group, criterion_main};
+use regex_engine::{ConstructionType, Regex};
+
+/// Benchmarks `Regex::is_match` with the Thompson construction on every shared case.
+fn benchmark_thompson_regex_process(c: &mut Criterion) {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        // Compile once, outside the timed closure, so only matching is measured.
+        let regex = Regex::new(case.regex, ConstructionType::Thompson);
+
+        c.bench_function(
+            &format!("Thompson is_match - pattern: {}", case.regex),
+            // Return the result (no trailing `;`) so Criterion's `black_box` on the
+            // closure's output keeps the optimiser from discarding the call.
+            |b| b.iter(|| regex.is_match(&case.input)),
+        );
+    }
+}
+
+/// Benchmarks `Regex::find` with the Thompson construction on every shared case.
+fn benchmark_thompson_regex_find_first(c: &mut Criterion) {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        // Compile once, outside the timed closure, so only the search is measured.
+        let regex = Regex::new(case.regex, ConstructionType::Thompson);
+
+        c.bench_function(
+            &format!("Thompson find match - pattern: {}", case.regex),
+            // Return the match (no trailing `;`) so Criterion's `black_box` on the
+            // closure's output keeps the optimiser from discarding the call.
+            |b| b.iter(|| regex.find(&case.input)),
+        );
+    }
+}
+
+/// Benchmarks `Regex::findall` with the Thompson construction on every shared case.
+fn benchmark_thompson_regex_find_all(c: &mut Criterion) {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        // Compile once, outside the timed closure, so only the search is measured.
+        let regex = Regex::new(case.regex, ConstructionType::Thompson);
+
+        c.bench_function(
+            &format!("Thompson findall matches - pattern: {}", case.regex),
+            // Return the matches (no trailing `;`) so Criterion's `black_box` on the
+            // closure's output keeps the optimiser from discarding the call.
+            |b| b.iter(|| regex.findall(&case.input)),
+        );
+    }
+}
+
+criterion_group!(
+    benches,
+    benchmark_thompson_regex_process,
+    benchmark_thompson_regex_find_first,
+    benchmark_thompson_regex_find_all
+);
+criterion_main!(benches);
diff --git a/src/glushkov.rs b/src/glushkov.rs
index ec35441..17914ef 100644
--- a/src/glushkov.rs
+++ b/src/glushkov.rs
@@ -1,7 +1,4 @@
-use crate::{
-    Dfa,
-    regex_engine::{is_valid_regex, normalise_regex},
-};
+use crate::{Dfa, is_valid_regex, normalise_regex};
 use std::collections::{HashMap, HashSet};
 
 #[derive(Clone, Debug, PartialEq)]
@@ -153,7 +150,7 @@ fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
     // while let Some(symbol) = chars.next() {
     //     if symbol_type == SymbolType::Escaped {
     //         indexed_states
-    //             .entry(idx as u32)
+    //             .entry(idx)
     //             .or_insert((symbol, symbol_type.clone(), group_index));
     //
     //         idx += 1;
@@ -161,7 +158,7 @@ fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
     //         continue;
     //     }
     //
-    //     println!("{:?}, {:?}", union_count, symbol,);
+    //     println!("{union_count:?}, {symbol:?}");
     //     match symbol {
     //         '|' => {
     //             if let Some(last_element) = union_count.last_mut() {
@@ -193,11 +190,9 @@ fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
     //                 }
     //             }
     //
-    //             indexed_states.entry(idx as u32).or_insert((
-    //                 symbol,
-    //                 symbol_type.clone(),
-    //                 group_index,
-    //             ));
+    //             indexed_states
+    //                 .entry(idx)
+    //                 .or_insert((symbol, symbol_type.clone(), group_index));
     //
     //             idx += 1;
     //         }
@@ -347,7 +342,7 @@ mod tests {
     }
 
     #[test]
-    fn test_to_many_brackets() {
+    fn test_too_many_brackets() {
         let expected = HashMap::from([
             (0, ('a', SymbolType::KleeneStar, 0)),
             (1, ('b', SymbolType::Normal, 0)),
diff --git a/src/lib.rs b/src/lib.rs
index 6f2fff7..7d17879 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,7 +1,7 @@
+use crate::{glushkov::GlushkovDfa, thompson::ThompsonDfa};
 use std::collections::{HashMap, HashSet};
 
 mod glushkov;
-pub mod regex_engine;
 mod thompson;
 
 trait Dfa {
@@ -95,3 +95,311 @@ trait Dfa {
         matches
     }
 }
+
+pub enum ConstructionType {
+    Thompson,
+    Glushkov,
+}
+
+enum DfaType {
+    Thompson(ThompsonDfa),
+    Glushkov(GlushkovDfa),
+}
+
+pub struct Regex {
+    dfa: DfaType,
+}
+
+impl Regex {
+    pub fn new(pattern: &str, construction: ConstructionType) -> Self {
+        let dfa_type = match construction {
+            ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)),
+            ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)),
+        };
+        Regex { dfa: dfa_type }
+    }
+
+    pub fn is_match(&self, text: &str) -> bool {
+        match &self.dfa {
+            DfaType::Thompson(dfa) => dfa.process(text),
+            DfaType::Glushkov(dfa) => dfa.process(text),
+        }
+    }
+
+    pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> {
+        match &self.dfa {
+            DfaType::Thompson(dfa) => dfa.find_first_match(text),
+            DfaType::Glushkov(dfa) => dfa.find_first_match(text),
+        }
+    }
+
+    pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> {
+        match &self.dfa {
+            DfaType::Thompson(dfa) => dfa.find_all_matches(text),
+            DfaType::Glushkov(dfa) => dfa.find_all_matches(text),
+        }
+    }
+}
+
+pub fn is_valid_regex(regex: &str) -> bool {
+    if regex.is_empty() {
+        return false;
+    }
+
+    let mut open_paren_count = 0;
+    let mut last_was_quantifier = false;
+
+    let mut chars = regex.chars().peekable();
+    while let Some(c) = chars.next() {
+        match c {
+            '(' => {
+                open_paren_count += 1;
+                last_was_quantifier = false;
+            }
+
+            ')' => {
+                if open_paren_count == 0 {
+                    return false;
+                }
+                open_paren_count -= 1;
+                last_was_quantifier = false;
+            }
+
+            '*' | '+' => {
+                // Ensure quantifiers are not the first character and are not repeated
+                if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') {
+                    return false;
+                }
+                last_was_quantifier = true;
+            }
+
+            '|' => {
+                // Ensure alternation isn't the first or last character
+                if regex.starts_with('|') || chars.peek().is_none() {
+                    return false;
+                }
+                last_was_quantifier = false;
+            }
+
+            '\\' => {
+                // Handle escaped characters: ensure there's a character after the escape
+                if chars.peek().is_none() {
+                    return false;
+                }
+                chars.next(); // Skip the escaped character
+                last_was_quantifier = false;
+            }
+
+            _ => {
+                last_was_quantifier = false;
+            }
+        }
+    }
+
+    open_paren_count == 0
+}
+
+pub fn normalise_regex(regex: &str) -> String {
+    let mut normalised = String::new();
+    let mut escape_sequence = false;
+    let mut prev_char = '\0';
+
+    for curr_char in regex.chars() {
+        if escape_sequence {
+            // TODO: Implement further parsing features here (e.g. \w \d)
+            normalised.push(curr_char);
+            escape_sequence = false;
+            prev_char = curr_char;
+            continue;
+        }
+
+        if curr_char == '\\' {
+            escape_sequence = true;
+            normalised.push(curr_char);
+            continue;
+        }
+
+        if curr_char == '+' {
+            normalised.push(prev_char);
+            normalised.push('*');
+            prev_char = curr_char;
+            continue;
+        }
+        if curr_char == '?' {
+            match prev_char {
+                ')' => {
+                    let mut balance = 0;
+
+                    for j in (0..normalised.len()).rev() {
+                        let ch = normalised.chars().nth(j).unwrap();
+                        if ch == ')' {
+                            balance += 1;
+                        } else if ch == '(' {
+                            balance -= 1;
+                            if balance == 0 {
+                                normalised.insert(j, '(');
+                                break;
+                            }
+                        }
+                    }
+                }
+                _ => {
+                    normalised.insert(normalised.len() - 1, '(');
+                }
+            }
+            normalised.push_str("|())");
+            prev_char = curr_char;
+            continue;
+        }
+        if curr_char == '.' {
+            normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)");
+            prev_char = curr_char;
+            continue;
+        }
+
+        normalised.push(curr_char);
+        prev_char = curr_char;
+    }
+
+    normalised
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn valid_regex_basic_test() {
+        let regex = "(a|b)*";
+        assert!(is_valid_regex(regex), "Expected valid regex.");
+    }
+
+    #[test]
+    fn invalid_empty_regex_test() {
+        let regex = "";
+        assert!(!is_valid_regex(regex), "Expected invalid regex (empty).");
+    }
+
+    #[test]
+    fn invalid_unbalanced_parentheses_test() {
+        let regex1 = "(a|b";
+        let regex2 = "a|b)";
+        assert!(
+            !is_valid_regex(regex1),
+            "Expected invalid regex (unbalanced parentheses)."
+        );
+        assert!(
+            !is_valid_regex(regex2),
+            "Expected invalid regex (unbalanced parentheses)."
+        );
+    }
+
+    #[test]
+    fn invalid_operator_placement_test() {
+        let regex1 = "*a";
+        let regex2 = "|a|b";
+        assert!(
+            !is_valid_regex(regex1),
+            "Expected invalid regex (invalid quantifier placement)."
+        );
+        assert!(
+            !is_valid_regex(regex2),
+            "Expected invalid regex (invalid alternation placement)."
+        );
+    }
+
+    #[test]
+    fn valid_nested_parentheses_test() {
+        let regex = "((a|b)*c)";
+        assert!(
+            is_valid_regex(regex),
+            "Expected valid regex with nested parentheses."
+        );
+    }
+
+    #[test]
+    fn valid_escape_sequence_test() {
+        let regex = "a\\*b";
+        assert!(
+            is_valid_regex(regex),
+            "Expected valid regex with escape sequence."
+        );
+    }
+
+    #[test]
+    fn invalid_escape_sequence_test() {
+        let regex = "a\\";
+        assert!(
+            !is_valid_regex(regex),
+            "Expected invalid regex with unpaired escape."
+        );
+    }
+
+    #[test]
+    fn normalise_regex_test() {
+        let cases = [
+            (r"a+", r"aa*"),
+            (r"a\+", r"a\+"),
+            (r"a?", r"(a|())"),
+            (r"a\?", r"a\?"),
+            (r"(ab)?", r"((ab)|())"),
+            (
+                r".",
+                "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)",
+            ),
+        ];
+
+        for (input, expected) in cases {
+            let result = normalise_regex(input);
+            assert_eq!(result, expected, "Normalisation failed for input '{input}'");
+        }
+    }
+
+    #[test]
+    fn is_match_test() {
+        let regex_object = Regex::new("(a|b)*", ConstructionType::Thompson);
+
+        let success_strings = vec!["abababaaaababa", ""];
+        for string in success_strings {
+            assert!(regex_object.is_match(string));
+        }
+
+        let failing_strings = vec!["abc", "x"];
+        for string in failing_strings {
+            assert!(!regex_object.is_match(string));
+        }
+    }
+
+    #[test]
+    fn find_test() {
+        let regex_object = Regex::new("abc", ConstructionType::Thompson);
+        let test_cases = vec![
+            ("abcd", Some("abc")),
+            ("xyzabc", Some("abc")),
+            ("abc", Some("abc")),
+            ("ac", None),
+            ("def", None),
+            ("aabc", Some("abc")),
+        ];
+
+        for (text, expected) in test_cases {
+            let result = regex_object.find(text);
+            assert_eq!(result, expected, "Failed for input: {text}");
+        }
+    }
+
+    #[test]
+    fn find_all_test() {
+        let regex_object = Regex::new("abc*", ConstructionType::Thompson);
+        let test_cases = vec![
+            ("abcd", vec!["abc"]),
+            ("ac", vec![]),
+            ("abcab", vec!["abc", "ab"]),
+        ];
+
+        for (text, expected) in test_cases {
+            let result = regex_object.findall(text);
+            assert_eq!(result, expected, "Failed for input: {text}");
+        }
+    }
+}
diff --git a/src/regex_engine.rs b/src/regex_engine.rs
deleted file mode 100644
index f628c55..0000000
--- a/src/regex_engine.rs
+++ /dev/null
@@ -1,309 +0,0 @@
-use crate::{Dfa, glushkov::GlushkovDfa, thompson::ThompsonDfa};
-
-pub enum ConstructionType {
-    Thompson,
-    Glushkov,
-}
-
-enum DfaType {
-    Thompson(ThompsonDfa),
-    Glushkov(GlushkovDfa),
-}
-
-pub struct Regex {
-    dfa: DfaType,
-}
-
-impl Regex {
-    pub fn new(pattern: &str, construction: ConstructionType) -> Self {
-        let dfa_type = match construction {
-            ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)),
-            ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)),
-        };
-        Regex { dfa: dfa_type }
-    }
-
-    pub fn is_match(&self, text: &str) -> bool {
-        match &self.dfa {
-            DfaType::Thompson(dfa) => dfa.process(text),
-            DfaType::Glushkov(dfa) => dfa.process(text),
-        }
-    }
-
-    pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> {
-        match &self.dfa {
-            DfaType::Thompson(dfa) => dfa.find_first_match(text),
-            DfaType::Glushkov(dfa) => dfa.find_first_match(text),
-        }
-    }
-
-    pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> {
-        match &self.dfa {
-            DfaType::Thompson(dfa) => dfa.find_all_matches(text),
-            DfaType::Glushkov(dfa) => dfa.find_all_matches(text),
-        }
-    }
-}
-
-pub fn is_valid_regex(regex: &str) -> bool {
-    if regex.is_empty() {
-        return false;
-    }
-
-    let mut open_paren_count = 0;
-    let mut last_was_quantifier = false;
-
-    let mut chars = regex.chars().peekable();
-    while let Some(c) = chars.next() {
-        match c {
-            '(' => {
-                open_paren_count += 1;
-                last_was_quantifier = false;
-            }
-
-            ')' => {
-                if open_paren_count == 0 {
-                    return false;
-                }
-                open_paren_count -= 1;
-                last_was_quantifier = false;
-            }
-
-            '*' | '+' => {
-                // Ensure quantifiers are not the first character and are not repeated
-                if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') {
-                    return false;
-                }
-                last_was_quantifier = true;
-            }
-
-            '|' => {
-                // Ensure alternation isn't the first or last character
-                if regex.starts_with('|') || chars.peek().is_none() {
-                    return false;
-                }
-                last_was_quantifier = false;
-            }
-
-            '\\' => {
-                // Handle escaped characters: ensure there's a character after the escape
-                if chars.peek().is_none() {
-                    return false;
-                }
-                chars.next(); // Skip the escaped character
-                last_was_quantifier = false;
-            }
-
-            _ => {
-                last_was_quantifier = false;
-            }
-        }
-    }
-
-    open_paren_count == 0
-}
-
-pub fn normalise_regex(regex: &str) -> String {
-    let mut normalised = String::new();
-    let mut escape_sequence = false;
-    let mut prev_char = '\0';
-
-    for curr_char in regex.chars() {
-        if escape_sequence {
-            // TODO: Implement further parsing features here (e.g. \w \d)
-            normalised.push(curr_char);
-            escape_sequence = false;
-            prev_char = curr_char;
-            continue;
-        }
-
-        if curr_char == '\\' {
-            escape_sequence = true;
-            normalised.push(curr_char);
-            continue;
-        }
-
-        if curr_char == '+' {
-            normalised.push(prev_char);
-            normalised.push('*');
-            prev_char = curr_char;
-            continue;
-        }
-        if curr_char == '?' {
-            match prev_char {
-                ')' => {
-                    let mut balance = 0;
-
-                    for j in (0..normalised.len()).rev() {
-                        let ch = normalised.chars().nth(j).unwrap();
-                        if ch == ')' {
-                            balance += 1;
-                        } else if ch == '(' {
-                            balance -= 1;
-                            if balance == 0 {
-                                normalised.insert(j, '(');
-                                break;
-                            }
-                        }
-                    }
-                }
-                _ => {
-                    normalised.insert(normalised.len() - 1, '(');
-                }
-            }
-            normalised.push_str("|())");
-            prev_char = curr_char;
-            continue;
-        }
-        if curr_char == '.' {
-            normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)");
-            prev_char = curr_char;
-            continue;
-        }
-
-        normalised.push(curr_char);
-        prev_char = curr_char;
-    }
-
-    normalised
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn valid_regex_basic_test() {
-        let regex = "(a|b)*";
-        assert!(is_valid_regex(regex), "Expected valid regex.");
-    }
-
-    #[test]
-    fn invalid_empty_regex_test() {
-        let regex = "";
-        assert!(!is_valid_regex(regex), "Expected invalid regex (empty).");
-    }
-
-    #[test]
-    fn invalid_unbalanced_parentheses_test() {
-        let regex1 = "(a|b";
-        let regex2 = "a|b)";
-        assert!(
-            !is_valid_regex(regex1),
-            "Expected invalid regex (unbalanced parentheses)."
-        );
-        assert!(
-            !is_valid_regex(regex2),
-            "Expected invalid regex (unbalanced parentheses)."
-        );
-    }
-
-    #[test]
-    fn invalid_operator_placement_test() {
-        let regex1 = "*a";
-        let regex2 = "|a|b";
-        assert!(
-            !is_valid_regex(regex1),
-            "Expected invalid regex (invalid quantifier placement)."
-        );
-        assert!(
-            !is_valid_regex(regex2),
-            "Expected invalid regex (invalid alternation placement)."
-        );
-    }
-
-    #[test]
-    fn valid_nested_parentheses_test() {
-        let regex = "((a|b)*c)";
-        assert!(
-            is_valid_regex(regex),
-            "Expected valid regex with nested parentheses."
-        );
-    }
-
-    #[test]
-    fn valid_escape_sequence_test() {
-        let regex = "a\\*b";
-        assert!(
-            is_valid_regex(regex),
-            "Expected valid regex with escape sequence."
-        );
-    }
-
-    #[test]
-    fn invalid_escape_sequence_test() {
-        let regex = "a\\";
-        assert!(
-            !is_valid_regex(regex),
-            "Expected invalid regex with unpaired escape."
-        );
-    }
-
-    #[test]
-    fn normalise_regex_test() {
-        let cases = [
-            (r"a+", r"aa*"),
-            (r"a\+", r"a\+"),
-            (r"a?", r"(a|())"),
-            (r"a\?", r"a\?"),
-            (r"(ab)?", r"((ab)|())"),
-            (
-                r".",
-                "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)",
-            ),
-        ];
-
-        for (input, expected) in cases {
-            let result = normalise_regex(input);
-            assert_eq!(result, expected, "Normalisation failed for input '{input}'");
-        }
-    }
-
-    #[test]
-    fn is_match_test() {
-        let regex_object = Regex::new("(a|b)*", ConstructionType::Thompson);
-
-        let success_strings = vec!["abababaaaababa", ""];
-        for string in success_strings {
-            assert!(regex_object.is_match(string));
-        }
-
-        let failing_strings = vec!["abc", "x"];
-        for string in failing_strings {
-            assert!(!regex_object.is_match(string));
-        }
-    }
-
-    #[test]
-    fn find_test() {
-        let regex_object = Regex::new("abc", ConstructionType::Thompson);
-        let test_cases = vec![
-            ("abcd", Some("abc")),
-            ("xyzabc", Some("abc")),
-            ("abc", Some("abc")),
-            ("ac", None),
-            ("def", None),
-            ("aabc", Some("abc")),
-        ];
-
-        for (text, expected) in test_cases {
-            let result = regex_object.find(text);
-            assert_eq!(result, expected, "Failed for input: {text}");
-        }
-    }
-
-    #[test]
-    fn find_all_test() {
-        let regex_object = Regex::new("abc*", ConstructionType::Thompson);
-        let test_cases = vec![
-            ("abcd", vec!["abc"]),
-            ("ac", vec![]),
-            ("abcab", vec!["abc", "ab"]),
-        ];
-
-        for (text, expected) in test_cases {
-            let result = regex_object.findall(text);
-            assert_eq!(result, expected, "Failed for input: {text}");
-        }
-    }
-}
diff --git a/src/thompson.rs b/src/thompson.rs
index 1ad648c..5f0ede8 100644
--- a/src/thompson.rs
+++ b/src/thompson.rs
@@ -1,7 +1,4 @@
-use crate::{
-    Dfa,
-    regex_engine::{is_valid_regex, normalise_regex},
-};
+use crate::{Dfa, is_valid_regex, normalise_regex};
 use std::collections::{HashMap, HashSet, VecDeque};
 
 struct Nfa {
@@ -593,7 +590,7 @@ mod tests {
 
         let generated_dfa = nfa_to_dfa(&input_nfa);
 
-        let expected_options = vec![
+        let expected_options = [
             HashMap::from([
                 ((0, 'a'), 1),
                 ((0, 'b'), 2),

From 4b624b4bc5c01a2b50e8c65879ad55b5a5fe2b41 Mon Sep 17 00:00:00 2001
From: Testspieler09 <pepehanisch06@gmail.com>
Date: Sun, 27 Jul 2025 20:07:56 +0200
Subject: [PATCH 5/8] unstable(glushkov): pushing progress

---
 Cargo.toml                      |  10 +-
 benches/bench_cases.rs          |  81 +++-
 benches/glushkov_benchmark.rs   |  62 ---
 benches/regex_benchmark.rs      | 181 ++++++++
 benches/rust_regex_benchmark.rs |  56 ---
 benches/thompson_benchmark.rs   |  62 ---
 src/glushkov.rs                 | 727 +++++++++++++++++++++++---------
 src/lib.rs                      | 291 +++++++++++--
 src/thompson.rs                 | 213 +++-------
 tests/glushkov_test.rs          |  17 +
 tests/rust_regex_test.rs        |  27 ++
 tests/test_one.rs               |  49 ---
 tests/thompson_test.rs          |  18 +
 13 files changed, 1149 insertions(+), 645 deletions(-)
 delete mode 100644 benches/glushkov_benchmark.rs
 create mode 100644 benches/regex_benchmark.rs
 delete mode 100644 benches/rust_regex_benchmark.rs
 delete mode 100644 benches/thompson_benchmark.rs
 create mode 100644 tests/glushkov_test.rs
 create mode 100644 tests/rust_regex_test.rs
 delete mode 100644 tests/test_one.rs
 create mode 100644 tests/thompson_test.rs

diff --git a/Cargo.toml b/Cargo.toml
index 4a0c696..4cdf3fd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,13 +8,5 @@ criterion = { version = "0.5", features = ["html_reports"] }
 regex = "1.11.1"
 
 [[bench]]
-name = "thompson_benchmark"
-harness = false
-
-# [[bench]]
-# name = "glushkov_benchmark"
-# harness = false
-
-[[bench]]
-name = "rust_regex_benchmark"
+name = "regex_benchmark"
 harness = false
diff --git a/benches/bench_cases.rs b/benches/bench_cases.rs
index 7df2b4c..e76cd8a 100644
--- a/benches/bench_cases.rs
+++ b/benches/bench_cases.rs
@@ -1,76 +1,113 @@
+use std::hint::black_box;
+
 #[allow(dead_code)]
 struct BenchCase<'a> {
     pub regex: &'a str,
     pub input: String,
+    pub expected_is_match: bool,
+    pub expected_first_match: Option<String>,
+    pub expected_all_matches: Vec<String>,
 }
 
 // This function is used in the `benchmark` files
 #[allow(dead_code)]
 fn get_bench_cases() -> Vec<BenchCase<'static>> {
-    vec![
+    black_box(vec![
         BenchCase {
             regex: r"a.b",
             input: "abcd abef abgh ijk".to_string(),
+            expected_is_match: false,
+            expected_first_match: None,
+            expected_all_matches: vec![],
         },
         BenchCase {
             regex: r"a*b",
             input: "aaaaaaaaab".to_string(),
+            expected_is_match: true,
+            expected_first_match: Some("aaaaaaaaab".to_string()),
+            expected_all_matches: vec!["aaaaaaaaab".to_string()],
         },
         BenchCase {
             regex: r"a+b",
             input: "aabab".to_string(),
+            expected_is_match: false,
+            expected_first_match: Some("aab".to_string()),
+            expected_all_matches: vec!["aab".to_string(), "ab".to_string()],
         },
         BenchCase {
             regex: r"a?b",
             input: "b aaab ab".to_string(),
+            expected_is_match: false,
+            expected_first_match: Some("b".to_string()),
+            expected_all_matches: vec!["b".to_string(), "ab".to_string(), "ab".to_string()],
         },
         BenchCase {
             regex: r"a|b",
             input: "xxaxybxx".to_string(),
+            expected_is_match: false,
+            expected_first_match: Some("a".to_string()),
+            expected_all_matches: vec!["a".to_string(), "b".to_string()],
         },
-        // Group and escape sequences
         BenchCase {
             regex: r"(a|b)c",
             input: "abc ac bc bbcc".to_string(),
+            expected_is_match: false,
+            expected_first_match: Some("bc".to_string()),
+            expected_all_matches: vec![
+                "bc".to_string(),
+                "ac".to_string(),
+                "bc".to_string(),
+                "bc".to_string(),
+            ],
         },
         BenchCase {
             regex: r"\.",
             input: "Find . within this !?. sentence.".to_string(),
+            expected_is_match: false,
+            expected_first_match: Some(".".to_string()),
+            expected_all_matches: vec![".".to_string(), ".".to_string(), ".".to_string()],
         },
-        // Larger and more complex patterns
         BenchCase {
             regex: r"(hel+o|wor?ld)",
             input: "hello helolllo world worlld helloworld".to_string(),
+            expected_is_match: false,
+            expected_first_match: Some("hello".to_string()),
+            expected_all_matches: vec![
+                "hello".to_string(),
+                "helo".to_string(),
+                "world".to_string(),
+                "hello".to_string(),
+                "world".to_string(),
+            ],
         },
         BenchCase {
             regex: r"ab*c+",
             input: "abbc abbbbbbbcc bccaaabbabc".to_string(),
+            expected_is_match: false,
+            expected_first_match: Some("abbc".to_string()),
+            expected_all_matches: vec![
+                "abbc".to_string(),
+                "abbbbbbbcc".to_string(),
+                "abc".to_string(),
+            ],
         },
         BenchCase {
             regex: r"(a(bc|de)+)",
             input: "abc abcbc abcdedef".to_string(),
-        },
-        // Realistic text patterns and larger inputs
-        BenchCase {
-            regex: r"\b[0-9]{2}\b",
-            input: "There are 99 bottles of soda and 45 cans of juice".to_string(),
-        },
-        BenchCase {
-            regex: r"\b\w{5,}\b",
-            input: "Rust is great for systems programming but can be challenging".to_string(),
+            expected_is_match: false,
+            expected_first_match: Some("abc".to_string()),
+            expected_all_matches: vec![
+                "abc".to_string(),
+                "abcbc".to_string(),
+                "abcdede".to_string(),
+            ],
         },
-        BenchCase {
-            regex: r"(https?|ftp)://[^\s/$.?#].[^\s]*",
-            input: "Check https://example.com out and ftp://fileserver.net as well".to_string(),
-        },
-        // Pathological case to test limits
         BenchCase {
             regex: r"(a|b)*c",
             input: format!("{}{}", "a".repeat(1000), "bc"),
+            expected_is_match: true,
+            expected_first_match: Some(format!("{}{}", "a".repeat(1000), "bc")),
+            expected_all_matches: vec![format!("{}{}", "a".repeat(1000), "bc")],
         },
-        BenchCase {
-            regex: r"x{3}(y|z)",
-            input: "xxxxyxxxzxxxy".to_string(),
-        },
-    ]
+    ])
 }
diff --git a/benches/glushkov_benchmark.rs b/benches/glushkov_benchmark.rs
deleted file mode 100644
index 1f6522a..0000000
--- a/benches/glushkov_benchmark.rs
+++ /dev/null
@@ -1,62 +0,0 @@
-include!("bench_cases.rs");
-use criterion::{Criterion, criterion_group, criterion_main};
-use regex_engine::{ConstructionType, Regex};
-
-fn benchmark_glushkov_regex_process(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in &cases {
-        let regex = Regex::new(case.regex, ConstructionType::Glushkov);
-
-        c.bench_function(
-            &format!("Glushkov is_match - pattern: {}", case.regex),
-            |b| {
-                b.iter(|| {
-                    regex.is_match(&case.input);
-                })
-            },
-        );
-    }
-}
-
-fn benchmark_glushkov_regex_find_first(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in &cases {
-        let regex = Regex::new(case.regex, ConstructionType::Glushkov);
-
-        c.bench_function(
-            &format!("Glushkov find match - pattern: {}", case.regex),
-            |b| {
-                b.iter(|| {
-                    regex.find(&case.input);
-                })
-            },
-        );
-    }
-}
-
-fn benchmark_glushkov_regex_find_all(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in &cases {
-        let regex = Regex::new(case.regex, ConstructionType::Glushkov);
-
-        c.bench_function(
-            &format!("Glushkov findall matches - pattern: {}", case.regex),
-            |b| {
-                b.iter(|| {
-                    regex.findall(&case.input);
-                })
-            },
-        );
-    }
-}
-
-criterion_group!(
-    benches,
-    benchmark_glushkov_regex_process,
-    benchmark_glushkov_regex_find_first,
-    benchmark_glushkov_regex_find_all
-);
-criterion_main!(benches);
diff --git a/benches/regex_benchmark.rs b/benches/regex_benchmark.rs
new file mode 100644
index 0000000..8629579
--- /dev/null
+++ b/benches/regex_benchmark.rs
@@ -0,0 +1,181 @@
+include!("bench_cases.rs");
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use regex as rust_regex;
+use regex_engine::{ConstructionType, Regex};
+
+fn benchmark_regex_compile_time(c: &mut Criterion) {
+    let cases = get_bench_cases();
+    let mut group = c.benchmark_group("Regex Compile Time");
+    // Each closure returns the compiled regex so Criterion black-boxes it (no dead-code elision).
+    for case in cases {
+        group.bench_with_input(
+            BenchmarkId::new("Thompson", case.regex),
+            &case.regex,
+            |b, regex| {
+                b.iter(|| {
+                    Regex::new(regex, ConstructionType::Thompson)
+                })
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Glushkov", case.regex),
+            &case.regex,
+            |b, regex| {
+                b.iter(|| {
+                    Regex::new(regex, ConstructionType::Glushkov)
+                })
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Rust", case.regex),
+            &case.regex,
+            |b, regex| {
+                b.iter(|| {
+                    rust_regex::Regex::new(regex)
+                        .unwrap_or_else(|_| panic!("Failed to create pattern: {regex}"))
+                })
+            },
+        );
+    }
+    group.finish();
+}
+
+fn benchmark_regex_is_match(c: &mut Criterion) {
+    let cases = get_bench_cases();
+    let mut group = c.benchmark_group("Regex Is Match");
+    // Full-match anchors wrap a group: a bare "^a|b$" would parse as "(^a)|(b$)".
+    for case in &cases {
+        let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson);
+        let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov);
+        let rust_regex = rust_regex::Regex::new(&format!("^(?:{})$", case.regex))
+            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
+
+        group.bench_with_input(
+            BenchmarkId::new("Thompson", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    thompson_regex.is_match(input) // returned so Criterion black-boxes it
+                })
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Glushkov", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    glushkov_regex.is_match(input) // returned so Criterion black-boxes it
+                })
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Rust", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    rust_regex.is_match(input) // returned so Criterion black-boxes it
+                })
+            },
+        );
+    }
+    group.finish();
+}
+
+fn benchmark_regex_find_first(c: &mut Criterion) {
+    let cases = get_bench_cases();
+    let mut group = c.benchmark_group("Regex Find First");
+    // Each closure returns the match so Criterion black-boxes it instead of discarding it.
+    for case in &cases {
+        let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson);
+        let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov);
+        let rust_regex = rust_regex::Regex::new(case.regex)
+            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
+
+        group.bench_with_input(
+            BenchmarkId::new("Thompson", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    thompson_regex.find(input)
+                })
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Glushkov", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    glushkov_regex.find(input)
+                })
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Rust", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    rust_regex.find(input).map(|m| m.as_str())
+                })
+            },
+        );
+    }
+    group.finish();
+}
+
+fn benchmark_regex_find_all(c: &mut Criterion) {
+    let cases = get_bench_cases();
+    let mut group = c.benchmark_group("Regex Find All");
+    // All three engines return a collected Vec of matches so the measured work is comparable.
+    for case in &cases {
+        let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson);
+        let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov);
+        let rust_regex = rust_regex::Regex::new(case.regex)
+            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
+
+        group.bench_with_input(
+            BenchmarkId::new("Thompson", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    thompson_regex.findall(input)
+                })
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Glushkov", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    glushkov_regex.findall(input)
+                })
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("Rust", case.regex),
+            &case.input,
+            |b, input| {
+                b.iter(|| {
+                    rust_regex.find_iter(input).map(|m| m.as_str()).collect::<Vec<_>>() // find_iter is lazy
+                })
+            },
+        );
+    }
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    benchmark_regex_compile_time,
+    benchmark_regex_is_match,
+    benchmark_regex_find_first,
+    benchmark_regex_find_all
+);
+criterion_main!(benches);
diff --git a/benches/rust_regex_benchmark.rs b/benches/rust_regex_benchmark.rs
deleted file mode 100644
index 63d05a3..0000000
--- a/benches/rust_regex_benchmark.rs
+++ /dev/null
@@ -1,56 +0,0 @@
-include!("bench_cases.rs");
-use criterion::{Criterion, criterion_group, criterion_main};
-use regex::Regex;
-
-fn benchmark_rust_regex_process(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in cases {
-        let regex = Regex::new(case.regex)
-            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
-
-        c.bench_function(&format!("Rust process match: {}", case.regex), |b| {
-            b.iter(|| {
-                regex.is_match(&case.input);
-            })
-        });
-    }
-}
-
-fn benchmark_rust_regex_find_first(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in cases {
-        let regex = Regex::new(case.regex)
-            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
-
-        c.bench_function(&format!("Rust find first match: {}", case.regex), |b| {
-            b.iter(|| {
-                regex.find(&case.input).map(|m| m.as_str());
-            })
-        });
-    }
-}
-
-fn benchmark_rust_regex_find_all(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in cases {
-        let regex = Regex::new(case.regex)
-            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
-
-        c.bench_function(&format!("Rust find all matches: {}", case.regex), |b| {
-            b.iter(|| {
-                regex.find_iter(&case.input);
-            })
-        });
-    }
-}
-
-criterion_group!(
-    benches,
-    benchmark_rust_regex_process,
-    benchmark_rust_regex_find_first,
-    benchmark_rust_regex_find_all
-);
-criterion_main!(benches);
diff --git a/benches/thompson_benchmark.rs b/benches/thompson_benchmark.rs
deleted file mode 100644
index e38a502..0000000
--- a/benches/thompson_benchmark.rs
+++ /dev/null
@@ -1,62 +0,0 @@
-include!("bench_cases.rs");
-use criterion::{Criterion, criterion_group, criterion_main};
-use regex_engine::{ConstructionType, Regex};
-
-fn benchmark_thompson_regex_process(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in &cases {
-        let regex = Regex::new(case.regex, ConstructionType::Thompson);
-
-        c.bench_function(
-            &format!("Thompson is_match - pattern: {}", case.regex),
-            |b| {
-                b.iter(|| {
-                    regex.is_match(&case.input);
-                })
-            },
-        );
-    }
-}
-
-fn benchmark_thompson_regex_find_first(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in &cases {
-        let regex = Regex::new(case.regex, ConstructionType::Thompson);
-
-        c.bench_function(
-            &format!("Thompson find match - pattern: {}", case.regex),
-            |b| {
-                b.iter(|| {
-                    regex.find(&case.input);
-                })
-            },
-        );
-    }
-}
-
-fn benchmark_thompson_regex_find_all(c: &mut Criterion) {
-    let cases = get_bench_cases();
-
-    for case in &cases {
-        let regex = Regex::new(case.regex, ConstructionType::Thompson);
-
-        c.bench_function(
-            &format!("Thompson findall matches - pattern: {}", case.regex),
-            |b| {
-                b.iter(|| {
-                    regex.findall(&case.input);
-                })
-            },
-        );
-    }
-}
-
-criterion_group!(
-    benches,
-    benchmark_thompson_regex_process,
-    benchmark_thompson_regex_find_first,
-    benchmark_thompson_regex_find_all
-);
-criterion_main!(benches);
diff --git a/src/glushkov.rs b/src/glushkov.rs
index 17914ef..1d75081 100644
--- a/src/glushkov.rs
+++ b/src/glushkov.rs
@@ -1,5 +1,5 @@
 use crate::{Dfa, is_valid_regex, normalise_regex};
-use std::collections::{HashMap, HashSet};
+use std::collections::{HashMap, HashSet, VecDeque};
 
 #[derive(Clone, Debug, PartialEq)]
 enum SymbolType {
@@ -8,11 +8,13 @@ enum SymbolType {
     Escaped,
 }
 
+#[derive(Debug)]
 struct Nfa {
     transitions: HashMap<(u32, char), Vec<u32>>,
     accepting_states: HashSet<u32>,
 }
 
+#[derive(Debug)]
 pub struct GlushkovDfa {
     transitions: HashMap<(u32, char), u32>,
     accepting_states: HashSet<u32>,
@@ -24,8 +26,12 @@ impl Dfa for GlushkovDfa {
             panic!("{regex} is not a valid regular expression!");
         }
 
-        let normalised_regex = normalise_regex(&regex);
-        todo!()
+        let normalised_regex = normalise_regex(regex);
+        let regex_nfa = glushkov_construction(&normalised_regex);
+        dbg!(&regex_nfa);
+        let mut regex_dfa = nfa_no_epsilon_to_dfa(&regex_nfa);
+        <Self as Dfa>::optimise_dfa(&mut regex_dfa);
+        regex_dfa
     }
 
     fn get_transitions(&self) -> &HashMap<(u32, char), u32> {
@@ -35,25 +41,27 @@ impl Dfa for GlushkovDfa {
     fn get_accepting_states(&self) -> &HashSet<u32> {
         &self.accepting_states
     }
+
+    fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32> {
+        &mut self.transitions
+    }
+
+    fn get_accepting_states_mut(&mut self) -> &mut HashSet<u32> {
+        &mut self.accepting_states
+    }
 }
 
 // GLUSHKOV CONSTRUCTION
 fn glushkov_construction(regex: &str) -> Nfa {
+    dbg!(&regex);
     let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
     let mut accepting_states: HashSet<u32> = HashSet::new();
 
     let states: HashMap<u32, (char, SymbolType, u32)> = index_states(regex);
+    dbg!(&states);
 
-    let mut start_states: HashSet<u32> = HashSet::new();
+    fill_sets(states, &mut accepting_states, &mut transitions);
 
-    fill_sets(
-        states,
-        &mut start_states,
-        &mut accepting_states,
-        &mut transitions,
-    );
-
-    // TODO: Construct transitions and accepting states using position_index
     Nfa {
         transitions,
         accepting_states,
@@ -62,19 +70,19 @@ fn glushkov_construction(regex: &str) -> Nfa {
 
 fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
     let mut indexed_states: HashMap<u32, (char, SymbolType, u32)> = HashMap::new();
-    let mut symbol_type = SymbolType::Normal;
-    let mut union_count: Vec<u32> = vec![0];
+    let mut symbol_type: SymbolType = SymbolType::Normal;
+    let mut group_stack: Vec<u32> = vec![0];
     let mut idx: u32 = 0;
-    let mut group_index: u32 = 0;
-
-    // New stack to track if a group is meaningful
-    let mut group_stack: Vec<Option<u32>> = vec![]; // Some(index) if real, None if ignored
-
+    let mut next_group_id: u32 = 1;
     let mut chars = regex.chars().peekable();
 
     while let Some(symbol) = chars.next() {
         if symbol_type == SymbolType::Escaped {
-            indexed_states.insert(idx, (symbol, symbol_type.clone(), group_index));
+            indexed_states.entry(idx).or_insert((
+                symbol,
+                symbol_type.clone(),
+                *group_stack.last().unwrap(),
+            ));
             idx += 1;
             symbol_type = SymbolType::Normal;
             continue;
@@ -82,33 +90,23 @@ fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
 
         match symbol {
             '|' => {
-                if let Some(last_union) = union_count.last_mut() {
-                    *last_union += 1;
-                }
-                if let Some(Some(_)) = group_stack.last_mut() {
-                    // still a real group, do nothing here
-                } else if let Some(group) = group_stack.last_mut() {
-                    // this group is now meaningful, assign it an index
-                    *group = Some(group_index);
-                    group_index += 1;
+                // Start a new group for the next alternative
+                let new_group_id = next_group_id;
+                next_group_id += 1;
+                // Replace the current group on the stack with the new one
+                if let Some(last) = group_stack.last_mut() {
+                    *last = new_group_id;
                 }
             }
             '(' => {
-                union_count.push(0);
-                group_stack.push(None); // not yet known if meaningful
+                // Push the next group ID onto the stack for this grouping level
+                let new_group_id = next_group_id;
+                next_group_id += 1;
+                group_stack.push(new_group_id);
             }
             ')' => {
-                union_count.pop();
-
-                match group_stack.pop() {
-                    Some(Some(_)) => {
-                        // it was meaningful, nothing to change
-                    }
-                    Some(None) => {
-                        // the group was never promoted to real => do nothing
-                    }
-                    None => panic!("Mismatched parentheses"),
-                }
+                // Pop the current group and return to parent group
+                group_stack.pop();
             }
             '*' => {
                 symbol_type = SymbolType::Normal;
@@ -116,179 +114,332 @@ fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
             }
             '\\' => symbol_type = SymbolType::Escaped,
             _ => {
-                if let Some(next) = chars.peek() {
-                    if *next == '*' {
-                        symbol_type = SymbolType::KleeneStar;
-                    }
-                }
-
-                // if we're inside a group that hasn't been assigned an index yet, assign now
-                if let Some(group) = group_stack.last_mut() {
-                    if group.is_none() {
-                        *group = Some(group_index);
-                        group_index += 1;
-                    }
+                if let Some(next_symbol) = chars.peek()
+                    && matches!(*next_symbol, '*')
+                {
+                    symbol_type = SymbolType::KleeneStar
                 }
-
-                // get current group idx for this symbol
-                let current_group = group_stack.last().and_then(|g| *g).unwrap_or(group_index);
-
-                indexed_states.insert(idx, (symbol, symbol_type.clone(), current_group));
+                indexed_states.entry(idx).or_insert((
+                    symbol,
+                    symbol_type.clone(),
+                    *group_stack.last().unwrap(),
+                ));
                 idx += 1;
             }
         }
     }
-
-    // let mut indexed_states: HashMap<u32, (char, SymbolType, u32)> = HashMap::new();
-    // let mut symbol_type: SymbolType = SymbolType::Normal;
-    // let mut union_count: Vec<u32> = vec![0];
-    // let mut idx: u32 = 0;
-    // let mut group_index: u32 = 0;
-    //
-    // let mut chars = regex.chars().peekable();
-    //
-    // while let Some(symbol) = chars.next() {
-    //     if symbol_type == SymbolType::Escaped {
-    //         indexed_states
-    //             .entry(idx)
-    //             .or_insert((symbol, symbol_type.clone(), group_index));
-    //
-    //         idx += 1;
-    //         symbol_type = SymbolType::Normal;
-    //         continue;
-    //     }
-    //
-    //     println!("{union_count:?}, {symbol:?}");
-    //     match symbol {
-    //         '|' => {
-    //             if let Some(last_element) = union_count.last_mut() {
-    //                 *last_element += 1;
-    //             }
-    //             group_index += 1;
-    //         }
-    //         // FIX: the paranthasis are not working correctly e.g. x|(x|y)|x <=> x|x|y|x
-    //         '(' => {
-    //             union_count.push(0);
-    //             group_index += 1;
-    //         }
-    //         ')' => {
-    //             let unions_last_grouping = union_count.pop().unwrap();
-    //             if unions_last_grouping == 0 {
-    //                 continue;
-    //             }
-    //             group_index -= unions_last_grouping + 1;
-    //         }
-    //         '*' => {
-    //             symbol_type = SymbolType::Normal;
-    //             continue;
-    //         }
-    //         '\\' => symbol_type = SymbolType::Escaped,
-    //         _ => {
-    //             if let Some(next_symbol) = chars.peek() {
-    //                 if *next_symbol == '*' {
-    //                     symbol_type = SymbolType::KleeneStar
-    //                 }
-    //             }
-    //
-    //             indexed_states
-    //                 .entry(idx)
-    //                 .or_insert((symbol, symbol_type.clone(), group_index));
-    //
-    //             idx += 1;
-    //         }
-    //     }
-    // }
-
     indexed_states
 }
 
 fn fill_sets(
     states: HashMap<u32, (char, SymbolType, u32)>,
-    start_states: &mut HashSet<u32>,
-    finite_states: &mut HashSet<u32>,
-    tranisitions: &mut HashMap<(u32, char), Vec<u32>>,
+    accepting_states: &mut HashSet<u32>,
+    transitions: &mut HashMap<(u32, char), Vec<u32>>,
 ) {
-    let mut idx: u32 = 1;
-    let amount_states: u32 = states.len() as u32;
+    let mut start_states = HashSet::new();
 
+    let amount_states = states.len() as u32;
     if amount_states == 0 {
         return;
     }
 
-    // tranisitions
-    //     .entry((amount_states, states[&0].0))
-    //     .or_insert(vec![0]);
-    start_states.insert(0);
-
-    let mut last_symbol_type: &SymbolType = &states[&0].1;
-    let mut last_group_idx: u32 = 0;
-    let mut check_next_group: bool = false; // NOTE: can also be thought of as group_is_exhausted
-
-    loop {
-        let (_symbol, symbol_type, group_idx) = &states[&idx];
-        // Skip forwards to next group
-        if check_next_group {
-            if *group_idx != last_group_idx {
-                start_states.insert(idx);
-                last_symbol_type = symbol_type;
-                last_group_idx = *group_idx;
-                check_next_group = false;
-                // continue;
-            }
+    // Group states by their group index
+    let mut groups: HashMap<u32, Vec<u32>> = HashMap::new();
+    for (state_id, (_, _, group_idx)) in &states {
+        groups.entry(*group_idx).or_default().push(*state_id);
+    }
 
-            if idx < amount_states - 1 {
-                idx += 1;
-                continue;
-            } else {
-                break;
-            }
+    // Sort states within each group
+    for group in groups.values_mut() {
+        group.sort();
+    }
+
+    // Determine start states (first state of each group)
+    for group in groups.values() {
+        if group.is_empty() {
+            continue;
         }
 
-        if *group_idx != last_group_idx {
-            start_states.insert(idx);
-            last_group_idx = *group_idx;
-            last_symbol_type = symbol_type;
-            check_next_group = true;
+        start_states.insert(group[0]);
 
-            if idx < amount_states - 1 {
-                idx += 1;
-                continue;
-            } else {
-                break;
+        for i in 0..group.len() {
+            let state = group[i];
+            if let Some((_, symbol_type, _)) = states.get(&state) {
+                if symbol_type == &SymbolType::KleeneStar && i + 1 < group.len() {
+                    start_states.insert(group[i + 1]);
+                }
             }
         }
+    }
 
-        match last_symbol_type {
+    // Build transitions and determine accepting states
+    for (state_id, (symbol, symbol_type, group_idx)) in &states {
+        let current_group = &groups[group_idx];
+        let pos_in_group = current_group.iter().position(|&x| x == *state_id).unwrap();
+
+        match symbol_type {
             SymbolType::Normal | SymbolType::Escaped => {
-                check_next_group = true;
+                if pos_in_group + 1 < current_group.len() {
+                    let next_state = current_group[pos_in_group + 1];
+                    transitions
+                        .entry((*state_id, *symbol))
+                        .or_default()
+                        .push(next_state);
+                } else {
+                    accepting_states.insert(*state_id);
+                }
             }
             SymbolType::KleeneStar => {
-                start_states.insert(idx);
-                check_next_group = false;
+                transitions
+                    .entry((*state_id, *symbol))
+                    .or_default()
+                    .push(*state_id);
+
+                if pos_in_group + 1 < current_group.len() {
+                    for next_state in current_group.iter().skip(pos_in_group + 1) {
+                        transitions
+                            .entry((*state_id, *symbol))
+                            .or_default()
+                            .push(*next_state);
+                    }
+                } else {
+                    accepting_states.insert(*state_id);
+                }
             }
         }
+    }
 
-        last_symbol_type = symbol_type;
+    // Setup virtual (start-)state
+    let virtual_start = states.keys().max().copied().unwrap_or(0) + 1;
 
-        if idx < amount_states - 1 {
-            idx += 1;
-        } else {
-            break;
-        }
+    let symbol_to_first_state: Vec<(u32, char)> = start_states
+        .iter()
+        .map(|&s| (s, states.get(&s).expect("Expected an entry").0))
+        .collect();
+
+    for (first_state, symbol) in symbol_to_first_state {
+        transitions
+            .entry((virtual_start, symbol))
+            .or_default()
+            .push(first_state);
     }
 }
 // END GLUSHKOV CONSTRUCTION
 
 fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa {
-    todo!()
+    let mut dfa_transitions = HashMap::new();
+    let mut dfa_accepting_states = HashSet::new();
+
+    // Map from sorted vector of NFA states to DFA state ID (for hashable key)
+    let mut nfa_states_to_dfa_state: HashMap<Vec<u32>, u32> = HashMap::new();
+    let mut next_dfa_state_id = 0u32;
+    let mut work_queue = VecDeque::new();
+
+    // Helper function to convert HashSet to sorted Vec for use as HashMap key
+    let set_to_sorted_vec = |set: &HashSet<u32>| -> Vec<u32> {
+        let mut vec: Vec<u32> = set.iter().cloned().collect();
+        vec.sort_unstable();
+        vec
+    };
+
+    // Get all possible input symbols from NFA transitions
+    let alphabet: HashSet<char> = nfa.transitions.keys().map(|(_, symbol)| *symbol).collect();
+
+    // Find all states that exist in the NFA
+    let mut all_nfa_states = HashSet::new();
+    for &(state, _) in nfa.transitions.keys() {
+        all_nfa_states.insert(state);
+    }
+    for target_states in nfa.transitions.values() {
+        for &state in target_states {
+            all_nfa_states.insert(state);
+        }
+    }
+    for &state in &nfa.accepting_states {
+        all_nfa_states.insert(state);
+    }
+
+    // In a Glushkov NFA, state 0 is always the start state
+    let start_state = 0;
+
+    // Verify that state 0 exists in the NFA
+    if !all_nfa_states.contains(&start_state) {
+        panic!("Expected start state 0 not found in NFA states: {all_nfa_states:?}");
+    }
+
+    let start_state_set = {
+        let mut set = HashSet::new();
+        set.insert(start_state);
+        set
+    };
+
+    // Create initial DFA state
+    let start_dfa_state = next_dfa_state_id;
+    next_dfa_state_id += 1;
+
+    let start_state_key = set_to_sorted_vec(&start_state_set);
+    nfa_states_to_dfa_state.insert(start_state_key, start_dfa_state);
+    work_queue.push_back(start_state_set);
+
+    // Process each DFA state
+    while let Some(current_nfa_states) = work_queue.pop_front() {
+        let current_state_key = set_to_sorted_vec(&current_nfa_states);
+        let current_dfa_state = nfa_states_to_dfa_state[&current_state_key];
+
+        // Check if this DFA state should be accepting
+        if current_nfa_states
+            .iter()
+            .any(|&state| nfa.accepting_states.contains(&state))
+        {
+            dfa_accepting_states.insert(current_dfa_state);
+        }
+
+        // For each symbol in the alphabet
+        for &symbol in &alphabet {
+            let mut next_nfa_states = HashSet::new();
+
+            // Collect all states reachable from current_nfa_states via symbol
+            for &nfa_state in &current_nfa_states {
+                if let Some(target_states) = nfa.transitions.get(&(nfa_state, symbol)) {
+                    for &target_state in target_states {
+                        next_nfa_states.insert(target_state);
+                    }
+                }
+            }
+
+            // Skip if no transitions exist for this symbol
+            if next_nfa_states.is_empty() {
+                continue;
+            }
+
+            // Get or create DFA state for this set of NFA states
+            let next_state_key = set_to_sorted_vec(&next_nfa_states);
+            let next_dfa_state =
+                if let Some(&existing_state) = nfa_states_to_dfa_state.get(&next_state_key) {
+                    existing_state
+                } else {
+                    let new_state = next_dfa_state_id;
+                    next_dfa_state_id += 1;
+
+                    nfa_states_to_dfa_state.insert(next_state_key.clone(), new_state);
+                    work_queue.push_back(next_nfa_states);
+
+                    new_state
+                };
+
+            // Add transition to DFA
+            dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state);
+        }
+    }
+
+    GlushkovDfa {
+        transitions: dfa_transitions,
+        accepting_states: dfa_accepting_states,
+    }
 }
 
+// fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa {
+//     let mut dfa_transitions = HashMap::new();
+//     let mut dfa_accepting_states = HashSet::new();
+//
+//     // Map from DFA state ID to the set of NFA states it represents
+//     let mut dfa_state_to_nfa_states: HashMap<u32, HashSet<u32>> = HashMap::new();
+//     // Map from sorted vector of NFA states to DFA state ID (for hashable key)
+//     let mut nfa_states_to_dfa_state: HashMap<Vec<u32>, u32> = HashMap::new();
+//
+//     let mut next_dfa_state_id = 0u32;
+//     let mut work_queue = VecDeque::new();
+//
+//     // Helper function to convert HashSet to sorted Vec for use as HashMap key
+//     let set_to_sorted_vec = |set: &HashSet<u32>| -> Vec<u32> {
+//         let mut vec: Vec<u32> = set.iter().cloned().collect();
+//         vec.sort_unstable();
+//         vec
+//     };
+//
+//     // Get all possible input symbols from NFA transitions
+//     let alphabet: HashSet<char> = nfa.transitions.keys().map(|(_, symbol)| *symbol).collect();
+//
+//     // Find the start state (assuming state 0 is the start state)
+//     let start_state_set = {
+//         let mut set = HashSet::new();
+//         set.insert(0u32);
+//         set
+//     };
+//
+//     // Create initial DFA state
+//     let start_dfa_state = next_dfa_state_id;
+//     next_dfa_state_id += 1;
+//
+//     let start_state_key = set_to_sorted_vec(&start_state_set);
+//     dfa_state_to_nfa_states.insert(start_dfa_state, start_state_set.clone());
+//     nfa_states_to_dfa_state.insert(start_state_key, start_dfa_state);
+//     work_queue.push_back(start_state_set);
+//
+//     // Process each DFA state
+//     while let Some(current_nfa_states) = work_queue.pop_front() {
+//         let current_state_key = set_to_sorted_vec(&current_nfa_states);
+//         let current_dfa_state = nfa_states_to_dfa_state[&current_state_key];
+//
+//         // Check if this DFA state should be accepting
+//         if current_nfa_states
+//             .iter()
+//             .any(|&state| nfa.accepting_states.contains(&state))
+//         {
+//             dfa_accepting_states.insert(current_dfa_state);
+//         }
+//
+//         // For each symbol in the alphabet
+//         for &symbol in &alphabet {
+//             let mut next_nfa_states = HashSet::new();
+//
+//             // Collect all states reachable from current_nfa_states via symbol
+//             for &nfa_state in &current_nfa_states {
+//                 if let Some(target_states) = nfa.transitions.get(&(nfa_state, symbol)) {
+//                     for &target_state in target_states {
+//                         next_nfa_states.insert(target_state);
+//                     }
+//                 }
+//             }
+//
+//             // Skip if no transitions exist for this symbol
+//             if next_nfa_states.is_empty() {
+//                 continue;
+//             }
+//
+//             // Get or create DFA state for this set of NFA states
+//             let next_state_key = set_to_sorted_vec(&next_nfa_states);
+//             let next_dfa_state =
+//                 if let Some(&existing_state) = nfa_states_to_dfa_state.get(&next_state_key) {
+//                     existing_state
+//                 } else {
+//                     let new_state = next_dfa_state_id;
+//                     next_dfa_state_id += 1;
+//
+//                     dfa_state_to_nfa_states.insert(new_state, next_nfa_states.clone());
+//                     nfa_states_to_dfa_state.insert(next_state_key, new_state);
+//                     work_queue.push_back(next_nfa_states);
+//
+//                     new_state
+//                 };
+//
+//             // Add transition to DFA
+//             dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state);
+//         }
+//     }
+//
+//     GlushkovDfa {
+//         transitions: dfa_transitions,
+//         accepting_states: dfa_accepting_states,
+//     }
+// }
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn test_single_character() {
+    fn test_index_single_character() {
         let expected = HashMap::from([(0, ('a', SymbolType::Normal, 0))]);
 
         let result = index_states("a");
@@ -296,7 +447,41 @@ mod tests {
     }
 
     #[test]
-    fn test_kleene_star() {
+    fn test_nfa_single_character() {
+        let expected_finite = HashSet::from([0]);
+        let expected_transitions: HashMap<(u32, char), Vec<u32>> =
+            HashMap::from([((1, 'a'), vec![0])]);
+
+        let result = glushkov_construction("a");
+        assert_eq!(
+            result.transitions, expected_transitions,
+            "Mismatch in single character test"
+        );
+        assert_eq!(
+            result.accepting_states, expected_finite,
+            "Mismatch in single character test"
+        );
+    }
+
+    #[test]
+    fn test_nfa_single_character_kleene_star() {
+        let expected_finite = HashSet::from([0]);
+        let expected_transitions: HashMap<(u32, char), Vec<u32>> =
+            HashMap::from([((0, 'a'), vec![0]), ((1, 'a'), vec![0])]);
+
+        let result = glushkov_construction("a*");
+        assert_eq!(
+            result.transitions, expected_transitions,
+            "Mismatch in single character test"
+        );
+        assert_eq!(
+            result.accepting_states, expected_finite,
+            "Mismatch in single character test"
+        );
+    }
+
+    #[test]
+    fn test_index_kleene_star() {
         let expected = HashMap::from([(0, ('a', SymbolType::KleeneStar, 0))]);
 
         let result = index_states("a*");
@@ -304,7 +489,7 @@ mod tests {
     }
 
     #[test]
-    fn test_union_and_groups() {
+    fn test_index_union_and_groups() {
         let expected = HashMap::from([
             (0, ('a', SymbolType::Normal, 1)),
             (1, ('b', SymbolType::Normal, 2)),
@@ -315,7 +500,7 @@ mod tests {
     }
 
     #[test]
-    fn test_escaped_character() {
+    fn test_index_escaped_character() {
         let expected = HashMap::from([(0, ('a', SymbolType::Escaped, 0))]);
 
         let result = index_states("\\a");
@@ -323,7 +508,7 @@ mod tests {
     }
 
     #[test]
-    fn test_mixed_regex() {
+    fn test_index_mixed_regex() {
         let expected = HashMap::from([
             (0, ('a', SymbolType::Normal, 0)),
             (1, ('*', SymbolType::Escaped, 0)),
@@ -342,7 +527,7 @@ mod tests {
     }
 
     #[test]
-    fn test_too_many_brackets() {
+    fn test_index_too_many_brackets() {
         let expected = HashMap::from([
             (0, ('a', SymbolType::KleeneStar, 0)),
             (1, ('b', SymbolType::Normal, 0)),
@@ -357,25 +542,185 @@ mod tests {
     }
 
     #[test]
-    fn test_fill_sets() {
+    fn test_fill_sets_too_many_brackets() {
         let states = index_states("a*b|(c|d)|ef");
-        let mut start_states: HashSet<u32> = HashSet::new();
         let mut finite_states: HashSet<u32> = HashSet::new();
         let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
 
-        let expected_start_set: HashSet<u32> = HashSet::from([0, 1, 2, 3, 4]);
-        let expected_finite_set: HashSet<u32> = HashSet::new();
-        let expected_transions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
+        let expected_finite_set: HashSet<u32> = HashSet::from([1, 2, 3, 5]);
+        let expected_transitions: HashMap<(u32, char), Vec<u32>> = HashMap::from([
+            ((6, 'a'), vec![0]),
+            ((6, 'b'), vec![1]),
+            ((6, 'c'), vec![2]),
+            ((6, 'd'), vec![3]),
+            ((6, 'e'), vec![4]),
+            ((0, 'a'), vec![0, 1]),
+            ((4, 'e'), vec![5]),
+        ]);
 
-        fill_sets(
-            states,
-            &mut start_states,
-            &mut finite_states,
-            &mut transitions,
-        );
+        fill_sets(states, &mut finite_states, &mut transitions);
 
-        assert_eq!(start_states, expected_start_set);
         assert_eq!(finite_states, expected_finite_set);
-        assert_eq!(transitions, expected_transions);
+        assert_eq!(transitions, expected_transitions);
+    }
+
+    #[test]
+    fn test_fill_sets_complex() {
+        let states = index_states("a*b*c|d*e");
+        let mut finite_states: HashSet<u32> = HashSet::new();
+        let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
+
+        let expected_finite_set: HashSet<u32> = HashSet::from([2, 4]);
+        let expected_transitions = HashMap::from([
+            ((5, 'a'), vec![0]),
+            ((5, 'b'), vec![1]),
+            ((5, 'c'), vec![2]),
+            ((5, 'd'), vec![3]),
+            ((5, 'e'), vec![4]),
+            ((0, 'a'), vec![0, 1, 2]),
+            ((1, 'b'), vec![1, 2]),
+            ((3, 'd'), vec![3, 4]),
+        ]);
+
+        fill_sets(states, &mut finite_states, &mut transitions);
+
+        assert_eq!(finite_states, expected_finite_set);
+        assert_eq!(transitions, expected_transitions);
+    }
+
+    #[test]
+    fn nfa_to_dfa_simple_test() {
+        // NFA that accepts exactly "a"
+        // State 0 --a--> State 1 (accepting)
+        let input_nfa = Nfa {
+            transitions: HashMap::from([((0, 'a'), vec![1])]),
+            accepting_states: HashSet::from([1]),
+        };
+
+        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
+
+        let expected_transitions = HashMap::from([((0, 'a'), 1)]);
+        let expected_accepting_states = HashSet::from([1]);
+
+        assert_eq!(expected_transitions, generated_dfa.transitions);
+        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
+    }
+
+    #[test]
+    fn nfa_to_dfa_sequence_test() {
+        // NFA that accepts exactly "ab"
+        // State 0 --a--> State 1 --b--> State 2 (accepting)
+        let input_nfa = Nfa {
+            transitions: HashMap::from([((0, 'a'), vec![1]), ((1, 'b'), vec![2])]),
+            accepting_states: HashSet::from([2]),
+        };
+
+        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
+
+        let expected_transitions = HashMap::from([((0, 'a'), 1), ((1, 'b'), 2)]);
+        let expected_accepting_states = HashSet::from([2]);
+
+        assert_eq!(expected_transitions, generated_dfa.transitions);
+        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
+    }
+
+    #[test]
+    fn nfa_to_dfa_alternation_test() {
+        // NFA that accepts "a" or "b"
+        // State 0 --a--> State 1 (accepting)
+        // State 0 --b--> State 2 (accepting)
+        let input_nfa = Nfa {
+            transitions: HashMap::from([((0, 'a'), vec![1]), ((0, 'b'), vec![2])]),
+            accepting_states: HashSet::from([1, 2]),
+        };
+
+        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
+        // Both numberings of the two target states are accepted (ids 1 and 2 may swap).
+        let expected_transitions = [
+            HashMap::from([((0, 'a'), 1), ((0, 'b'), 2)]),
+            HashMap::from([((0, 'a'), 2), ((0, 'b'), 1)]),
+        ];
+        let expected_accepting_states = HashSet::from([1, 2]);
+
+        assert!(
+            generated_dfa.transitions == expected_transitions[0]
+                || generated_dfa.transitions == expected_transitions[1],
+            "generated_dfa.transitions did not match either expected set"
+        );
+        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
+    }
+
+    #[test]
+    fn nfa_to_dfa_nondeterministic_test() {
+        // NFA with nondeterministic transition
+        // State 0 --a--> State 1, State 2
+        // State 1 --b--> State 3 (accepting)
+        // State 2 --c--> State 3 (accepting)
+        let input_nfa = Nfa {
+            transitions: HashMap::from([
+                ((0, 'a'), vec![1, 2]),
+                ((1, 'b'), vec![3]),
+                ((2, 'c'), vec![3]),
+            ]),
+            accepting_states: HashSet::from([3]),
+        };
+
+        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
+
+        // After 'a' from state 0, we should be in a state representing {1, 2}
+        // Let's call this combined state "1" in the DFA
+        let expected_transitions = HashMap::from([
+            ((0, 'a'), 1), // {0} --a--> {1,2} (DFA state 1)
+            ((1, 'b'), 2), // {1,2} --b--> {3} (DFA state 2)
+            ((1, 'c'), 2), // {1,2} --c--> {3} (DFA state 2)
+        ]);
+        let expected_accepting_states = HashSet::from([2]); // DFA state 2 represents {3}
+
+        assert_eq!(expected_transitions, generated_dfa.transitions);
+        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
+    }
+
+    #[test]
+    fn nfa_to_dfa_multiple_accepting_test() {
+        // NFA where multiple paths lead to accepting states
+        // State 0 --a--> State 1 (accepting)
+        // State 0 --a--> State 2 --b--> State 3 (accepting)
+        let input_nfa = Nfa {
+            transitions: HashMap::from([((0, 'a'), vec![1, 2]), ((2, 'b'), vec![3])]),
+            accepting_states: HashSet::from([1, 3]),
+        };
+
+        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
+
+        // After 'a' from state 0, we're in state representing {1, 2}
+        // This should be accepting because it contains state 1
+        let expected_transitions = HashMap::from([
+            ((0, 'a'), 1), // {0} --a--> {1,2} (DFA state 1)
+            ((1, 'b'), 2), // {1,2} --b--> {3} (DFA state 2)
+        ]);
+        let expected_accepting_states = HashSet::from([1, 2]); // Both DFA states are accepting
+
+        assert_eq!(expected_transitions, generated_dfa.transitions);
+        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
+    }
+
+    #[test]
+    fn nfa_to_dfa_self_loop_test() {
+        // NFA with self-loop: accepts a*
+        // State 0 (accepting) --a--> State 0
+        let input_nfa = Nfa {
+            transitions: HashMap::from([((0, 'a'), vec![0])]),
+            accepting_states: HashSet::from([0]),
+        };
+
+        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
+
+        let expected_transitions = HashMap::from([
+            ((0, 'a'), 0), // Self-loop
+        ]);
+        let expected_accepting_states = HashSet::from([0]);
+
+        assert_eq!(expected_transitions, generated_dfa.transitions);
+        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 7d17879..a57b628 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,5 @@
 use crate::{glushkov::GlushkovDfa, thompson::ThompsonDfa};
-use std::collections::{HashMap, HashSet};
+use std::collections::{HashMap, HashSet, VecDeque};
 
 mod glushkov;
 mod thompson;
@@ -8,6 +8,166 @@ trait Dfa {
     fn new(regex: &str) -> Self;
     fn get_transitions(&self) -> &HashMap<(u32, char), u32>;
     fn get_accepting_states(&self) -> &HashSet<u32>;
+    fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32>;
+    fn get_accepting_states_mut(&mut self) -> &mut HashSet<u32>;
+    /// Minimises the DFA in place by partition refinement (Hopcroft-style).
+    ///
+    /// States start in two blocks (accepting / non-accepting); blocks are split until
+    /// no symbol tells two states of a block apart. Each block then becomes a single
+    /// state, and the block holding state 0 is renumbered to 0, the state in which
+    /// matching starts.
+    fn optimise_dfa(&mut self) {
+        let accepting_states_set: HashSet<u32> = self.get_accepting_states().clone();
+        let mut non_accepting_states: HashSet<u32> = HashSet::new();
+
+        // Targets are collected as well: a non-accepting state without outgoing
+        // transitions would otherwise be missing from `partition` (lookup panic).
+        for (&(source_state, _), &target_state) in self.get_transitions() {
+            for state in [source_state, target_state] {
+                if !accepting_states_set.contains(&state) {
+                    non_accepting_states.insert(state);
+                }
+            }
+        }
+
+        // Block 0 holds the accepting states, block 1 every other state.
+        let mut partition: HashMap<u32, usize> = HashMap::new();
+        partition.extend(accepting_states_set.iter().map(|&state| (state, 0)));
+        partition.extend(non_accepting_states.iter().map(|&state| (state, 1)));
+
+        let mut partition_list: Vec<HashSet<u32>> =
+            vec![accepting_states_set, non_accepting_states];
+
+        // Blocks still to be used as splitters; empty blocks cannot split anything.
+        let mut worklist: VecDeque<usize> = VecDeque::new();
+        if !partition_list[0].is_empty() {
+            worklist.push_back(0);
+        }
+        if !partition_list[1].is_empty() {
+            worklist.push_back(1);
+        }
+
+        while let Some(current_partition_index) = worklist.pop_front() {
+            // Per symbol: the states whose transition leads into the splitter block.
+            let mut states_to_check: HashMap<char, HashSet<u32>> = HashMap::new();
+            for (&(source_state, symbol), &target_state) in self.get_transitions() {
+                if partition[&target_state] == current_partition_index {
+                    states_to_check
+                        .entry(symbol)
+                        .or_default()
+                        .insert(source_state);
+                }
+            }
+
+            for states_to_split in states_to_check.values() {
+                let mut partitions_to_split: HashSet<usize> = HashSet::new();
+
+                for &state in states_to_split.iter() {
+                    let partition_index = partition[&state];
+                    if partition_list[partition_index].len() > 1 {
+                        partitions_to_split.insert(partition_index);
+                    }
+                }
+
+                for &partition_index_to_split in partitions_to_split.iter() {
+                    let mut intersection: HashSet<u32> = HashSet::new();
+                    let mut difference: HashSet<u32> = HashSet::new();
+
+                    for &state in partition_list[partition_index_to_split].iter() {
+                        if states_to_split.contains(&state) {
+                            intersection.insert(state);
+                        } else {
+                            difference.insert(state);
+                        }
+                    }
+
+                    if !intersection.is_empty() && !difference.is_empty() {
+                        let new_partition_index = partition_list.len();
+
+                        for &state in intersection.iter() {
+                            partition.insert(state, new_partition_index);
+                        }
+
+                        partition_list.push(intersection);
+
+                        for &state in &difference {
+                            partition.insert(state, partition_index_to_split);
+                        }
+                        partition_list[partition_index_to_split] = difference;
+
+                        // A block still queued as a splitter needs both halves queued;
+                        // otherwise the smaller half is enough.
+                        if worklist.contains(&partition_index_to_split)
+                            || partition_list[new_partition_index].len()
+                                < partition_list[partition_index_to_split].len()
+                        {
+                            worklist.push_back(new_partition_index);
+                        } else {
+                            worklist.push_back(partition_index_to_split);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Build new transitions and accepting states
+        let mut minimal_transitions: HashMap<(u32, char), u32> = HashMap::new();
+        let mut minimal_accepting_states: HashSet<u32> = HashSet::new();
+        let mut new_state_map: HashMap<usize, u32> = HashMap::new();
+
+        // The block of state 0 is numbered first so that it becomes state 0 again.
+        let mut next_state_id: u32 = 0;
+        if let Some(partition_index) = partition.get(&0) {
+            new_state_map.insert(*partition_index, next_state_id);
+            next_state_id += 1;
+        }
+
+        // Remaining blocks take the next free ids, in map iteration order.
+        for &partition_index in partition.values() {
+            if let std::collections::hash_map::Entry::Vacant(e) =
+                new_state_map.entry(partition_index)
+            {
+                e.insert(next_state_id);
+                next_state_id += 1;
+            }
+        }
+
+        // A merged state is accepting when its block contains an accepting state.
+        for (original_state, &partition_index) in partition.iter() {
+            let new_state_id = new_state_map[&partition_index];
+            if self.get_accepting_states().contains(original_state) {
+                minimal_accepting_states.insert(new_state_id);
+            }
+        }
+
+        for (&(source_state, symbol), &target_state) in self.get_transitions() {
+            let new_source_state = new_state_map[&partition[&source_state]];
+            let new_target_state = new_state_map[&partition[&target_state]];
+            minimal_transitions.insert((new_source_state, symbol), new_target_state);
+        }
+
+        // Modify the existing DFA in-place
+        *self.get_transitions_mut() = minimal_transitions;
+        *self.get_accepting_states_mut() = minimal_accepting_states;
+    }
+
+    /// Determines if the given input string exactly matches the regex pattern.
+    ///
+    /// This function processes the input as though it is surrounded by start (`^`) and
+    /// end (`$`) position anchors, ensuring that the entire input must conform to the pattern.
+    ///
+    /// # Parameters
+    ///
+    /// - `input`: A string slice representing the text to be checked against the regex.
+    ///
+    /// # Returns
+    ///
+    /// Returns `true` if the entire input string matches the regex pattern exactly,
+    /// considering implicit start and end anchors.
+    ///
+    /// e.g., for the regex pattern "(a|b)*", the function checks if the input matches
+    /// the pattern from start to finish, equivalent to "^(a|b)*$".
+    ///
     fn process(&self, input: &str) -> bool {
         let mut current_state = 0;
         for c in input.chars() {
@@ -26,7 +186,6 @@ trait Dfa {
             let mut current_state = 0;
             let mut match_start = None;
             let mut match_end = None;
-            let mut found_match = false;
 
             for (i, c) in text.chars().enumerate().skip(start_pos) {
                 if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) {
@@ -34,13 +193,8 @@ trait Dfa {
                     match_start = match_start.or(Some(i));
 
                     if self.get_accepting_states().contains(&current_state) {
-                        found_match = true;
                         match_end = Some(i)
                     }
-
-                    if i == text.len() - 1 && found_match {
-                        break;
-                    }
                 } else {
                     break;
                 }
@@ -64,7 +218,6 @@ trait Dfa {
             let mut current_state = 0;
             let mut match_start: Option<usize> = None;
             let mut match_end: Option<usize> = None;
-            let mut found_match = false;
 
             for (i, c) in input.chars().enumerate().skip(start_pos) {
                 if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) {
@@ -73,11 +226,6 @@ trait Dfa {
 
                     if self.get_accepting_states().contains(&current_state) {
                         match_end = Some(i);
-                        found_match = true;
-                    }
-
-                    if i == input.len() - 1 && found_match {
-                        break;
                     }
                 } else {
                     break;
@@ -86,7 +234,7 @@ trait Dfa {
 
             if let (Some(start), Some(end)) = (match_start, match_end) {
                 matches.push(&input[start..=end]);
-                start_pos = end;
+                start_pos = end + 1;
             } else {
                 start_pos += 1;
             }
@@ -119,6 +267,29 @@ impl Regex {
         Regex { dfa: dfa_type }
     }
 
+    /// Determines if the provided `text` is an exact match for the regex pattern.
+    ///
+    /// This method interprets the regex pattern as though it is bracketed by start (`^`)
+    /// and end (`$`) anchors, requiring the entire `text` to conform to the pattern.
+    ///
+    /// # Parameters
+    ///
+    /// - `text`: A string slice that represents the text to be verified against the regex.
+    ///
+    /// # Returns
+    ///
+    /// Returns `true` if the `text` completely matches the regex pattern encompassed by implicit
+    /// anchors, otherwise returns `false`.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use regex_engine::{Regex, ConstructionType};
+    ///
+    /// let regex = Regex::new("(a|b)*", ConstructionType::Thompson);
+    /// assert!(regex.is_match("abba"));
+    /// assert!(!regex.is_match("abc"));
+    /// ```
     pub fn is_match(&self, text: &str) -> bool {
         match &self.dfa {
             DfaType::Thompson(dfa) => dfa.process(text),
@@ -126,6 +297,31 @@ impl Regex {
         }
     }
 
+    /// Searches for the first occurrence of a sequence in `text` that matches the regex pattern.
+    ///
+    /// This method locates and returns the first substring of `text` that matches the regex,
+    /// if such a substring exists.
+    ///
+    /// # Parameters
+    ///
+    /// - `text`: A string slice in which to search for the regex pattern.
+    ///
+    /// # Returns
+    ///
+    /// Returns an `Option<&str>` which contains the first matching substring if a match is found,
+    /// or `None` if no match occurs.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use regex_engine::{Regex, ConstructionType};
+    ///
+    /// let regex = Regex::new("ab+", ConstructionType::Thompson);
+    /// if let Some(matched) = regex.find("aabbcc") {
+    ///     println!("Found: {}", matched);
+    /// }
+    /// // Output: Found: abb
+    /// ```
     pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> {
         match &self.dfa {
             DfaType::Thompson(dfa) => dfa.find_first_match(text),
@@ -147,16 +343,15 @@ pub fn is_valid_regex(regex: &str) -> bool {
     }
 
     let mut open_paren_count = 0;
-    let mut last_was_quantifier = false;
+    let mut last_was_quantifier = true;
 
     let mut chars = regex.chars().peekable();
     while let Some(c) = chars.next() {
         match c {
             '(' => {
                 open_paren_count += 1;
-                last_was_quantifier = false;
+                last_was_quantifier = true;
             }
-
             ')' => {
                 if open_paren_count == 0 {
                     return false;
@@ -164,23 +359,13 @@ pub fn is_valid_regex(regex: &str) -> bool {
                 open_paren_count -= 1;
                 last_was_quantifier = false;
             }
-
             '*' | '+' => {
                 // Ensure quantifiers are not the first character and are not repeated
-                if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') {
+                if last_was_quantifier {
                     return false;
                 }
                 last_was_quantifier = true;
             }
-
-            '|' => {
-                // Ensure alternation isn't the first or last character
-                if regex.starts_with('|') || chars.peek().is_none() {
-                    return false;
-                }
-                last_was_quantifier = false;
-            }
-
             '\\' => {
                 // Handle escaped characters: ensure there's a character after the escape
                 if chars.peek().is_none() {
@@ -203,7 +388,6 @@ pub fn normalise_regex(regex: &str) -> String {
     let mut normalised = String::new();
     let mut escape_sequence = false;
     let mut prev_char = '\0';
-
     for curr_char in regex.chars() {
         if escape_sequence {
             // TODO: Implement further parsing features here (e.g. \w \d)
@@ -212,24 +396,45 @@ pub fn normalise_regex(regex: &str) -> String {
             prev_char = curr_char;
             continue;
         }
-
         if curr_char == '\\' {
             escape_sequence = true;
             normalised.push(curr_char);
             continue;
         }
-
         if curr_char == '+' {
-            normalised.push(prev_char);
+            match prev_char {
+                ')' => {
+                    let mut balance = 0;
+                    let mut group_start = 0;
+
+                    for j in (0..normalised.len()).rev() {
+                        let ch = normalised.chars().nth(j).unwrap();
+                        if ch == ')' {
+                            balance += 1;
+                        } else if ch == '(' {
+                            balance -= 1;
+                            if balance == 0 {
+                                group_start = j;
+                                break;
+                            }
+                        }
+                    }
+
+                    let group = String::from(&normalised[group_start..normalised.len()]);
+                    normalised.push_str(&group);
+                }
+                _ => {
+                    normalised.push(prev_char);
+                }
+            }
             normalised.push('*');
-            prev_char = curr_char;
+            prev_char = '*';
             continue;
         }
         if curr_char == '?' {
             match prev_char {
                 ')' => {
                     let mut balance = 0;
-
                     for j in (0..normalised.len()).rev() {
                         let ch = normalised.chars().nth(j).unwrap();
                         if ch == ')' {
@@ -247,20 +452,18 @@ pub fn normalise_regex(regex: &str) -> String {
                     normalised.insert(normalised.len() - 1, '(');
                 }
             }
-            normalised.push_str("|())");
-            prev_char = curr_char;
+            normalised.push_str("|)");
+            prev_char = ')';
             continue;
         }
         if curr_char == '.' {
             normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)");
-            prev_char = curr_char;
+            prev_char = ')';
             continue;
         }
-
         normalised.push(curr_char);
         prev_char = curr_char;
     }
-
     normalised
 }
 
@@ -297,7 +500,7 @@ mod tests {
     #[test]
     fn invalid_operator_placement_test() {
         let regex1 = "*a";
-        let regex2 = "|a|b";
+        let regex2 = "(+abc|x)";
         assert!(
             !is_valid_regex(regex1),
             "Expected invalid regex (invalid quantifier placement)."
@@ -340,9 +543,9 @@ mod tests {
         let cases = [
             (r"a+", r"aa*"),
             (r"a\+", r"a\+"),
-            (r"a?", r"(a|())"),
+            (r"a?", r"(a|)"),
             (r"a\?", r"a\?"),
-            (r"(ab)?", r"((ab)|())"),
+            (r"(ab)?", r"((ab)|)"),
             (
                 r".",
                 "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)",
@@ -357,9 +560,9 @@ mod tests {
 
     #[test]
     fn is_match_test() {
-        let regex_object = Regex::new("(a|b)*", ConstructionType::Thompson);
+        let regex_object = Regex::new("a(a|b)*", ConstructionType::Thompson);
 
-        let success_strings = vec!["abababaaaababa", ""];
+        let success_strings = vec!["abababaaaababa", "a"];
         for string in success_strings {
             assert!(regex_object.is_match(string));
         }
diff --git a/src/thompson.rs b/src/thompson.rs
index 5f0ede8..dcf54d7 100644
--- a/src/thompson.rs
+++ b/src/thompson.rs
@@ -1,5 +1,5 @@
 use crate::{Dfa, is_valid_regex, normalise_regex};
-use std::collections::{HashMap, HashSet, VecDeque};
+use std::collections::{HashMap, HashSet};
 
 struct Nfa {
     transitions: HashMap<(u32, Option<char>), Vec<u32>>,
@@ -19,8 +19,9 @@ impl Dfa for ThompsonDfa {
 
         let normalised_regex = normalise_regex(regex);
         let regex_nfa: Nfa = thompson_construction(&normalised_regex);
-        let regex_dfa = nfa_to_dfa(&regex_nfa);
-        optimise_dfa(&regex_dfa)
+        let mut regex_dfa = nfa_to_dfa(&regex_nfa);
+        <Self as Dfa>::optimise_dfa(&mut regex_dfa);
+        regex_dfa
     }
 
     fn get_transitions(&self) -> &HashMap<(u32, char), u32> {
@@ -30,6 +31,14 @@ impl Dfa for ThompsonDfa {
     fn get_accepting_states(&self) -> &HashSet<u32> {
         &self.accepting_states
     }
+
+    fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32> {
+        &mut self.transitions
+    }
+
+    fn get_accepting_states_mut(&mut self) -> &mut HashSet<u32> {
+        &mut self.accepting_states
+    }
 }
 
 // THOMPSON CONSTRUCTION ---
@@ -46,7 +55,7 @@ fn thompson_construction(normalised_regex: &str) -> Nfa {
                 let nfa_left = nfa_stack.pop().expect("Expected NFA for concatenation");
                 nfa_stack.push(concatenate(&nfa_left, &nfa_right));
             }
-            _ => panic!("Unknown operator {operator:?}"),
+            _ => unreachable!("Unknown operator {}", operator),
         }
     }
 
@@ -65,6 +74,7 @@ fn thompson_construction(normalised_regex: &str) -> Nfa {
             escape_sequence = false;
             continue;
         }
+
         match symbol {
             '(' => {
                 if concat_flag {
@@ -74,17 +84,24 @@ fn thompson_construction(normalised_regex: &str) -> Nfa {
                 concat_flag = false;
             }
             ')' => {
-                let mut is_epsilon = true;
+                // If concat_flag is false, we have an empty right operand for union
+                if !concat_flag {
+                    nfa_stack.push(create_basic_epsilon_nfa());
+                }
+
+                // Process all operators until we hit the matching '('
                 while let Some(op) = operators.pop() {
-                    if op == '(' && is_epsilon {
-                        nfa_stack.push(create_basic_epsilon_nfa());
-                        break;
-                    } else if op == '(' {
+                    if op == '(' {
                         break;
                     }
-                    is_epsilon = false;
                     apply_operator(&mut nfa_stack, op);
                 }
+
+                // If stack is empty after processing, we had completely empty parentheses
+                if nfa_stack.is_empty() {
+                    nfa_stack.push(create_basic_epsilon_nfa());
+                }
+
                 concat_flag = true;
             }
             '*' => {
@@ -93,6 +110,20 @@ fn thompson_construction(normalised_regex: &str) -> Nfa {
                 concat_flag = true;
             }
             '|' => {
+                // Process all concatenation operators (higher precedence than union)
+                while let Some(&op) = operators.last() {
+                    if op == '(' || op == '|' {
+                        break;
+                    }
+                    operators.pop();
+                    apply_operator(&mut nfa_stack, op);
+                }
+
+                // If we have no operand for the left side of union, create epsilon
+                if !concat_flag {
+                    nfa_stack.push(create_basic_epsilon_nfa());
+                }
+
                 operators.push('|');
                 concat_flag = false;
             }
@@ -109,12 +140,26 @@ fn thompson_construction(normalised_regex: &str) -> Nfa {
         }
     }
 
+    // Handle case where regex ends with '|' (empty right operand)
+    if let Some(&'|') = operators.last() {
+        if nfa_stack.len() < 2 {
+            nfa_stack.push(create_basic_epsilon_nfa());
+        }
+    }
+
+    // Process remaining operators
     while let Some(op) = operators.pop() {
+        if op == '(' {
+            panic!("Unmatched opening parenthesis");
+        }
         apply_operator(&mut nfa_stack, op);
     }
 
     if nfa_stack.len() != 1 {
-        panic!("Invalid Regex, unexpected final NFA stack size");
+        panic!(
+            "Invalid Regex, unexpected final NFA stack size: {}",
+            nfa_stack.len()
+        );
     }
 
     nfa_stack.pop().unwrap()
@@ -321,145 +366,6 @@ fn nfa_to_dfa(nfa: &Nfa) -> ThompsonDfa {
 }
 // END NFA to DFA functions ---
 
-fn optimise_dfa(dfa: &ThompsonDfa) -> ThompsonDfa {
-    let mut partition: HashMap<u32, usize> = HashMap::new();
-    let mut accepting_states_set: HashSet<u32> = dfa.accepting_states.clone();
-    let mut non_accepting_states: HashSet<u32> = HashSet::new();
-    let mut all_states: HashSet<u32> = HashSet::new();
-
-    for &(state, _) in dfa.transitions.keys() {
-        all_states.insert(state);
-        if dfa.accepting_states.contains(&state) {
-            accepting_states_set.insert(state);
-        } else {
-            non_accepting_states.insert(state);
-        }
-    }
-
-    for state in dfa.accepting_states.iter() {
-        all_states.insert(*state);
-    }
-
-    for state in all_states.iter() {
-        if dfa.accepting_states.contains(state) {
-            partition.insert(*state, 0);
-        } else {
-            partition.insert(*state, 1);
-        }
-    }
-
-    let mut partition_list: Vec<HashSet<u32>> = Vec::new();
-    partition_list.push(accepting_states_set);
-    partition_list.push(non_accepting_states);
-
-    let mut worklist: VecDeque<usize> = VecDeque::new();
-    if !partition_list[0].is_empty() {
-        worklist.push_back(0);
-    }
-    if partition_list.len() > 1 && !partition_list[1].is_empty() {
-        worklist.push_back(1);
-    }
-
-    while let Some(current_partition_index) = worklist.pop_front() {
-        let mut states_to_check: HashMap<char, HashSet<u32>> = HashMap::new();
-        for (&(source_state, symbol), &target_state) in &dfa.transitions {
-            if partition[&target_state] == current_partition_index {
-                states_to_check
-                    .entry(symbol)
-                    .or_default()
-                    .insert(source_state);
-            }
-        }
-
-        for (_, states_to_split) in states_to_check.iter() {
-            let mut partitions_to_split: HashSet<usize> = HashSet::new();
-
-            for &state in states_to_split.iter() {
-                let partition_index = partition[&state];
-                if partition_list[partition_index].len() > 1 {
-                    partitions_to_split.insert(partition_index);
-                }
-            }
-
-            for &partition_index_to_split in partitions_to_split.iter() {
-                let mut intersection: HashSet<u32> = HashSet::new();
-                let mut difference: HashSet<u32> = HashSet::new();
-
-                for &state in partition_list[partition_index_to_split].iter() {
-                    if states_to_split.contains(&state) {
-                        intersection.insert(state);
-                    } else {
-                        difference.insert(state);
-                    }
-                }
-
-                if !intersection.is_empty() && !difference.is_empty() {
-                    let new_partition_index = partition_list.len();
-
-                    for &state in intersection.iter() {
-                        partition.insert(state, new_partition_index);
-                    }
-
-                    partition_list.push(intersection);
-
-                    for &state in &difference {
-                        partition.insert(state, partition_index_to_split);
-                    }
-                    partition_list[partition_index_to_split] = difference;
-
-                    if partition_list[new_partition_index].len()
-                        < partition_list[partition_index_to_split].len()
-                    {
-                        worklist.push_back(new_partition_index);
-                    } else {
-                        worklist.push_back(partition_index_to_split);
-                    }
-                }
-            }
-        }
-    }
-
-    let mut minimal_transitions: HashMap<(u32, char), u32> = HashMap::new();
-    let mut minimal_accepting_states: HashSet<u32> = HashSet::new();
-    let mut new_state_map: HashMap<usize, u32> = HashMap::new();
-
-    let mut next_state_id: u32 = 0;
-
-    if let Some(partition_index) = partition.get(&0) {
-        new_state_map.insert(*partition_index, next_state_id);
-        next_state_id += 1;
-    }
-
-    for (_, &partition_index) in partition.iter() {
-        if let std::collections::hash_map::Entry::Vacant(e) = new_state_map.entry(partition_index) {
-            e.insert(next_state_id);
-            next_state_id += 1;
-        }
-    }
-
-    for (original_state, &partition_index) in partition.iter() {
-        let new_state_id = new_state_map[&partition_index];
-        if dfa.accepting_states.contains(original_state) {
-            minimal_accepting_states.insert(new_state_id);
-        }
-    }
-
-    for (&(source_state, symbol), &target_state) in &dfa.transitions {
-        let source_partition = partition[&source_state];
-        let target_partition = partition[&target_state];
-
-        let new_source_state = new_state_map[&source_partition];
-        let new_target_state = new_state_map[&target_partition];
-
-        minimal_transitions.insert((new_source_state, symbol), new_target_state);
-    }
-
-    ThompsonDfa {
-        transitions: minimal_transitions,
-        accepting_states: minimal_accepting_states,
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -482,6 +388,13 @@ mod tests {
             expected_accepting_states_2,
             generated_dfa_2.accepting_states
         );
+
+        let generated_dfa = ThompsonDfa::new("a*b");
+        let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 1)]);
+        let expected_accepting_states = HashSet::from([1]);
+
+        assert_eq!(expected_transitions, generated_dfa.transitions);
+        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
     }
 
     #[test]
diff --git a/tests/glushkov_test.rs b/tests/glushkov_test.rs
new file mode 100644
index 0000000..20c5306
--- /dev/null
+++ b/tests/glushkov_test.rs
@@ -0,0 +1,17 @@
+include!("../benches/bench_cases.rs");
+use regex_engine::{ConstructionType, Regex};
+
+#[test]
+fn test_all_bench_cases() {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        let regex = Regex::new(case.regex, ConstructionType::Glushkov);
+        assert_eq!(regex.is_match(&case.input), case.expected_is_match);
+        assert_eq!(
+            regex.find(&case.input),
+            case.expected_first_match.as_deref()
+        );
+        assert_eq!(regex.findall(&case.input), case.expected_all_matches);
+    }
+}
diff --git a/tests/rust_regex_test.rs b/tests/rust_regex_test.rs
new file mode 100644
index 0000000..9142f6a
--- /dev/null
+++ b/tests/rust_regex_test.rs
@@ -0,0 +1,31 @@
+include!("../benches/bench_cases.rs");
+use regex::Regex;
+
+/// Checks every case from `benches/bench_cases.rs` against the `regex` crate.
+#[test]
+fn test_all_bench_cases() {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        // Group the pattern before anchoring: in `^a|b$` the anchors bind to the
+        // individual alternatives, so it would not be a whole-input match.
+        let match_regex = Regex::new(format!("^(?:{})$", case.regex).as_str())
+            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
+        // Unanchored pattern for the search-style assertions below.
+        let regex = Regex::new(case.regex)
+            .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
+
+        assert_eq!(match_regex.is_match(&case.input), case.expected_is_match);
+        assert_eq!(
+            regex.find(&case.input).map(|s| s.as_str()),
+            case.expected_first_match.as_deref()
+        );
+        assert_eq!(
+            regex
+                .find_iter(&case.input)
+                .map(|s| s.as_str())
+                .collect::<Vec<_>>(),
+            case.expected_all_matches
+        );
+    }
+}
diff --git a/tests/test_one.rs b/tests/test_one.rs
deleted file mode 100644
index a166e8a..0000000
--- a/tests/test_one.rs
+++ /dev/null
@@ -1,49 +0,0 @@
-use regex_engine::regex_engine::{ConstructionType, Regex};
-
-#[test]
-fn test_escape_sequence_plus() {
-    let pattern = r"a*b\+";
-    let text = "aaab+b"; // should fail on match
-    let text_success = "aaab+";
-
-    let engine = Regex::new(pattern, ConstructionType::Thompson);
-
-    let expected_match = text_success;
-
-    assert!(!engine.is_match(text));
-    assert!(engine.is_match(text_success));
-    assert_eq!(engine.find(text), Some(expected_match));
-    assert_eq!(engine.findall(text), vec![expected_match]);
-}
-
-#[test]
-fn test_escape_sequence_slash() {
-    let pattern = r"a*b\\";
-    let text = "aaab\\b"; // should fail on match
-    let text_success = "aaab\\";
-
-    let engine = Regex::new(pattern, ConstructionType::Thompson);
-
-    let expected_match = text_success;
-
-    assert!(!engine.is_match(text));
-    assert!(engine.is_match(text_success));
-    assert_eq!(engine.find(text), Some(expected_match));
-    assert_eq!(engine.findall(text), vec![expected_match]);
-}
-
-#[test]
-fn test_dot_wildcard() {
-    let pattern = r"a.*";
-    let text = "cabbc"; // should fail on match
-    let text_success = "abbc";
-
-    let engine = Regex::new(pattern, ConstructionType::Thompson);
-
-    let expected_match = text_success;
-
-    assert!(!engine.is_match(text));
-    assert!(engine.is_match(text_success));
-    assert_eq!(engine.find(text), Some(expected_match));
-    assert_eq!(engine.findall(text), vec![expected_match]);
-}
diff --git a/tests/thompson_test.rs b/tests/thompson_test.rs
new file mode 100644
index 0000000..d2cce3a
--- /dev/null
+++ b/tests/thompson_test.rs
@@ -0,0 +1,18 @@
+include!("../benches/bench_cases.rs");
+use regex_engine::{ConstructionType, Regex};
+
+#[test]
+fn test_all_bench_cases() {
+    let cases = get_bench_cases();
+
+    for case in &cases {
+        let regex = Regex::new(case.regex, ConstructionType::Thompson);
+
+        assert_eq!(regex.is_match(&case.input), case.expected_is_match);
+        assert_eq!(
+            regex.find(&case.input),
+            case.expected_first_match.as_deref()
+        );
+        assert_eq!(regex.findall(&case.input), case.expected_all_matches);
+    }
+}

From 58fd71847e1928593fe2ba5445152b836548976c Mon Sep 17 00:00:00 2001
From: Pepe Hanisch <142326461+Testspieler09@users.noreply.github.com>
Date: Mon, 18 Aug 2025 20:34:57 +0200
Subject: [PATCH 6/8] refactor: make Regex::new fallible, rework Glushkov accepting states

---
 benches/regex_benchmark.rs |  22 ++++---
 src/glushkov.rs            | 117 +++++++++++++++++++++++++++++--------
 src/lib.rs                 |  18 +++---
 src/thompson.rs            |  22 +++----
 tests/glushkov_test.rs     |   2 +-
 tests/thompson_test.rs     |   2 +-
 6 files changed, 130 insertions(+), 53 deletions(-)

diff --git a/benches/regex_benchmark.rs b/benches/regex_benchmark.rs
index 8629579..7105fa5 100644
--- a/benches/regex_benchmark.rs
+++ b/benches/regex_benchmark.rs
@@ -13,7 +13,7 @@ fn benchmark_regex_compile_time(c: &mut Criterion) {
             &case.regex,
             |b, regex| {
                 b.iter(|| {
-                    Regex::new(regex, ConstructionType::Thompson);
+                    let _ = Regex::new(regex, ConstructionType::Thompson);
                 })
             },
         );
@@ -23,7 +23,7 @@ fn benchmark_regex_compile_time(c: &mut Criterion) {
             &case.regex,
             |b, regex| {
                 b.iter(|| {
-                    Regex::new(regex, ConstructionType::Glushkov);
+                    let _ = Regex::new(regex, ConstructionType::Glushkov);
                 })
             },
         );
@@ -47,8 +47,10 @@ fn benchmark_regex_is_match(c: &mut Criterion) {
     let mut group = c.benchmark_group("Regex Is Match");
 
     for case in &cases {
-        let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson);
-        let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov);
+        let thompson_regex =
+            Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex");
+        let glushkov_regex =
+            Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex");
         let rust_regex = rust_regex::Regex::new(&format!("^{}$", case.regex))
             .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
 
@@ -90,8 +92,10 @@ fn benchmark_regex_find_first(c: &mut Criterion) {
     let mut group = c.benchmark_group("Regex Find First");
 
     for case in &cases {
-        let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson);
-        let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov);
+        let thompson_regex =
+            Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex");
+        let glushkov_regex =
+            Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex");
         let rust_regex = rust_regex::Regex::new(case.regex)
             .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
 
@@ -133,8 +137,10 @@ fn benchmark_regex_find_all(c: &mut Criterion) {
     let mut group = c.benchmark_group("Regex Find All");
 
     for case in &cases {
-        let thompson_regex = Regex::new(case.regex, ConstructionType::Thompson);
-        let glushkov_regex = Regex::new(case.regex, ConstructionType::Glushkov);
+        let thompson_regex =
+            Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex");
+        let glushkov_regex =
+            Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex");
         let rust_regex = rust_regex::Regex::new(case.regex)
             .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex));
 
diff --git a/src/glushkov.rs b/src/glushkov.rs
index 1d75081..b0360dc 100644
--- a/src/glushkov.rs
+++ b/src/glushkov.rs
@@ -21,17 +21,19 @@ pub struct GlushkovDfa {
 }
 
 impl Dfa for GlushkovDfa {
-    fn new(regex: &str) -> Self {
+    fn new(regex: &str) -> Result<Self, String> {
         if !is_valid_regex(regex) {
-            panic!("{regex} is not a valid regular expression!");
+            return Err("{regex} is not a valid regular expression!".to_string());
         }
 
         let normalised_regex = normalise_regex(regex);
         let regex_nfa = glushkov_construction(&normalised_regex);
         dbg!(&regex_nfa);
         let mut regex_dfa = nfa_no_epsilon_to_dfa(&regex_nfa);
+        // dbg!(&regex_dfa);
         <Self as Dfa>::optimise_dfa(&mut regex_dfa);
-        regex_dfa
+        // dbg!(&regex_dfa);
+        Ok(regex_dfa)
     }
 
     fn get_transitions(&self) -> &HashMap<(u32, char), u32> {
@@ -53,14 +55,12 @@ impl Dfa for GlushkovDfa {
 
 // GLUSHKOV CONSTRUCTION
 fn glushkov_construction(regex: &str) -> Nfa {
-    dbg!(&regex);
     let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
-    let mut accepting_states: HashSet<u32> = HashSet::new();
+    let accepting_states: HashSet<u32> = compute_accepting_states(regex);
 
     let states: HashMap<u32, (char, SymbolType, u32)> = index_states(regex);
-    dbg!(&states);
 
-    fill_sets(states, &mut accepting_states, &mut transitions);
+    fill_sets(states, &mut transitions);
 
     Nfa {
         transitions,
@@ -68,6 +68,59 @@ fn glushkov_construction(regex: &str) -> Nfa {
     }
 }
 
+fn compute_accepting_states(regex: &str) -> HashSet<u32> {
+    // also need handling of escape sequence?!
+    let mut accepting_states = HashSet::new();
+    let mut number_of_accepting_states_in_group = 0;
+    let mut group_is_exhausted = false;
+    let mut last_element_was_seperator = false;
+    let num_unions = regex.chars().filter(|&c| matches!(c, '|'));
+    let mut position: u32 = regex
+        .chars()
+        .filter(|&c| !matches!(c, '(' | ')' | '|' | '*'))
+        .count() as u32;
+
+    dbg!(regex);
+
+    // check if after none ) a | is present
+    for ch in regex.chars().rev() {
+        match ch {
+            ')' => {
+                if !last_element_was_seperator {
+                    group_is_exhausted = number_of_accepting_states_in_group != 0;
+                }
+                last_element_was_seperator = true;
+            }
+            '|' => {
+                group_is_exhausted = false;
+                last_element_was_seperator = true;
+            }
+            '(' => group_is_exhausted = true,
+            '*' => {
+                // Should account for ba* -> b and a are accepting
+                last_element_was_seperator = false;
+            }
+            _ => {
+                if position != 0 {
+                    position -= 1;
+                } else {
+                    break;
+                }
+
+                if group_is_exhausted {
+                    continue;
+                }
+                dbg!(&ch, &position);
+                accepting_states.insert(position);
+                number_of_accepting_states_in_group += 1;
+                group_is_exhausted = true;
+            }
+        }
+    }
+
+    accepting_states
+}
+
 fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
     let mut indexed_states: HashMap<u32, (char, SymbolType, u32)> = HashMap::new();
     let mut symbol_type: SymbolType = SymbolType::Normal;
@@ -131,11 +184,12 @@ fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
     indexed_states
 }
 
+// TODO: remove the unused param later
 fn fill_sets(
     states: HashMap<u32, (char, SymbolType, u32)>,
-    accepting_states: &mut HashSet<u32>,
     transitions: &mut HashMap<(u32, char), Vec<u32>>,
 ) {
+    dbg!(&states);
     let mut start_states = HashSet::new();
 
     let amount_states = states.len() as u32;
@@ -164,10 +218,11 @@ fn fill_sets(
 
         for i in 0..group.len() {
             let state = group[i];
-            if let Some((_, symbol_type, _)) = states.get(&state) {
-                if symbol_type == &SymbolType::KleeneStar && i + 1 < group.len() {
-                    start_states.insert(group[i + 1]);
-                }
+            if let Some((_, symbol_type, _)) = states.get(&state)
+                && symbol_type == &SymbolType::KleeneStar
+                && i + 1 < group.len()
+            {
+                start_states.insert(group[i + 1]);
             }
         }
     }
@@ -185,8 +240,6 @@ fn fill_sets(
                         .entry((*state_id, *symbol))
                         .or_default()
                         .push(next_state);
-                } else {
-                    accepting_states.insert(*state_id);
                 }
             }
             SymbolType::KleeneStar => {
@@ -202,8 +255,6 @@ fn fill_sets(
                             .or_default()
                             .push(*next_state);
                     }
-                } else {
-                    accepting_states.insert(*state_id);
                 }
             }
         }
@@ -541,13 +592,35 @@ mod tests {
         assert_eq!(result, expected, "Mismatch in mixed regex test");
     }
 
+    #[test]
+    fn test_compute_accepting_states_too_many_brackets() {
+        let regex = "a*b|(c|d)|ef";
+        let accepting_states = compute_accepting_states(regex);
+
+        assert_eq!(accepting_states, HashSet::from([1, 2, 3, 5]))
+    }
+
+    #[test]
+    fn test_compute_accepting_states_escape_sequence() {
+        let regex = r"a\*b|cd\*|sdfe\|f";
+        let accepting_states = compute_accepting_states(regex);
+
+        assert_eq!(accepting_states, HashSet::from([3, 6, 12]))
+    }
+
+    #[test]
+    fn test_compute_accepting_states_complex() {
+        let regex = "a*b*c|d*e";
+        let accepting_states = compute_accepting_states(regex);
+
+        assert_eq!(accepting_states, HashSet::from([2, 4]))
+    }
+
     #[test]
     fn test_fill_sets_too_many_brackets() {
         let states = index_states("a*b|(c|d)|ef");
-        let mut finite_states: HashSet<u32> = HashSet::new();
         let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
 
-        let expected_finite_set: HashSet<u32> = HashSet::from([1, 2, 3, 5]);
         let expected_transitions: HashMap<(u32, char), Vec<u32>> = HashMap::from([
             ((6, 'a'), vec![0]),
             ((6, 'b'), vec![1]),
@@ -558,19 +631,16 @@ mod tests {
             ((4, 'e'), vec![5]),
         ]);
 
-        fill_sets(states, &mut finite_states, &mut transitions);
+        fill_sets(states, &mut transitions);
 
-        assert_eq!(finite_states, expected_finite_set);
         assert_eq!(transitions, expected_transitions);
     }
 
     #[test]
     fn test_fill_sets_complex() {
         let states = index_states("a*b*c|d*e");
-        let mut finite_states: HashSet<u32> = HashSet::new();
         let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
 
-        let expected_finite_set: HashSet<u32> = HashSet::from([2, 4]);
         let expected_transitions = HashMap::from([
             ((5, 'a'), vec![0]),
             ((5, 'b'), vec![1]),
@@ -582,9 +652,8 @@ mod tests {
             ((3, 'd'), vec![3, 4]),
         ]);
 
-        fill_sets(states, &mut finite_states, &mut transitions);
+        fill_sets(states, &mut transitions);
 
-        assert_eq!(finite_states, expected_finite_set);
         assert_eq!(transitions, expected_transitions);
     }
 
diff --git a/src/lib.rs b/src/lib.rs
index a57b628..a6adb31 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,7 +5,9 @@ mod glushkov;
 mod thompson;
 
 trait Dfa {
-    fn new(regex: &str) -> Self;
+    fn new(regex: &str) -> Result<Self, String>
+    where
+        Self: std::marker::Sized;
     fn get_transitions(&self) -> &HashMap<(u32, char), u32>;
     fn get_accepting_states(&self) -> &HashSet<u32>;
     fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32>;
@@ -259,12 +261,12 @@ pub struct Regex {
 }
 
 impl Regex {
-    pub fn new(pattern: &str, construction: ConstructionType) -> Self {
+    pub fn new(pattern: &str, construction: ConstructionType) -> Result<Self, String> {
         let dfa_type = match construction {
-            ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)),
-            ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)),
+            ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)?),
+            ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)?),
         };
-        Regex { dfa: dfa_type }
+        Ok(Regex { dfa: dfa_type })
     }
 
     /// Determines if the provided `text` is an exact match for the regex pattern.
@@ -560,7 +562,7 @@ mod tests {
 
     #[test]
     fn is_match_test() {
-        let regex_object = Regex::new("a(a|b)*", ConstructionType::Thompson);
+        let regex_object = Regex::new("a(a|b)*", ConstructionType::Thompson).expect("Valid regex");
 
         let success_strings = vec!["abababaaaababa", "a"];
         for string in success_strings {
@@ -575,7 +577,7 @@ mod tests {
 
     #[test]
     fn find_test() {
-        let regex_object = Regex::new("abc", ConstructionType::Thompson);
+        let regex_object = Regex::new("abc", ConstructionType::Thompson).expect("Valid regex");
         let test_cases = vec![
             ("abcd", Some("abc")),
             ("xyzabc", Some("abc")),
@@ -593,7 +595,7 @@ mod tests {
 
     #[test]
     fn find_all_test() {
-        let regex_object = Regex::new("abc*", ConstructionType::Thompson);
+        let regex_object = Regex::new("abc*", ConstructionType::Thompson).expect("Valid regex");
         let test_cases = vec![
             ("abcd", vec!["abc"]),
             ("ac", vec![]),
diff --git a/src/thompson.rs b/src/thompson.rs
index dcf54d7..a74b138 100644
--- a/src/thompson.rs
+++ b/src/thompson.rs
@@ -12,16 +12,16 @@ pub struct ThompsonDfa {
 }
 
 impl Dfa for ThompsonDfa {
-    fn new(regex: &str) -> Self {
+    fn new(regex: &str) -> Result<Self, String> {
         if !is_valid_regex(regex) {
-            panic!("{regex} is not a valid regular expression!");
+            return Err(format!("{regex} is not a valid regular expression!"));
         }
 
         let normalised_regex = normalise_regex(regex);
         let regex_nfa: Nfa = thompson_construction(&normalised_regex);
         let mut regex_dfa = nfa_to_dfa(&regex_nfa);
         <Self as Dfa>::optimise_dfa(&mut regex_dfa);
-        regex_dfa
+        Ok(regex_dfa)
     }
 
     fn get_transitions(&self) -> &HashMap<(u32, char), u32> {
@@ -141,10 +141,10 @@ fn thompson_construction(normalised_regex: &str) -> Nfa {
     }
 
     // Handle case where regex ends with '|' (empty right operand)
-    if let Some(&'|') = operators.last() {
-        if nfa_stack.len() < 2 {
-            nfa_stack.push(create_basic_epsilon_nfa());
-        }
+    if let Some(&'|') = operators.last()
+        && nfa_stack.len() < 2
+    {
+        nfa_stack.push(create_basic_epsilon_nfa());
     }
 
     // Process remaining operators
@@ -372,14 +372,14 @@ mod tests {
 
     #[test]
     fn create_dfa_test() {
-        let generated_dfa = ThompsonDfa::new("(a|b)*");
+        let generated_dfa = ThompsonDfa::new("(a|b)*").expect("Valid dfa");
         let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 0)]);
         let expected_accepting_states = HashSet::from([0]);
 
         assert_eq!(expected_transitions, generated_dfa.transitions);
         assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
 
-        let generated_dfa_2 = ThompsonDfa::new("a|()");
+        let generated_dfa_2 = ThompsonDfa::new("a|()").expect("Valid dfa");
         let expected_transitions_2 = HashMap::from([((0, 'a'), 1)]);
         let expected_accepting_states_2 = HashSet::from([0, 1]);
 
@@ -389,7 +389,7 @@ mod tests {
             generated_dfa_2.accepting_states
         );
 
-        let generated_dfa = ThompsonDfa::new("a*b");
+        let generated_dfa = ThompsonDfa::new("a*b").expect("Valid dfa");
         let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 1)]);
         let expected_accepting_states = HashSet::from([1]);
 
@@ -399,7 +399,7 @@ mod tests {
 
     #[test]
     fn prozess_regex_test() {
-        let generated_dfa = ThompsonDfa::new("(a|b)*");
+        let generated_dfa = ThompsonDfa::new("(a|b)*").expect("Valid dfa");
         let test_strings = vec!["abbbababaaaa", ""];
         for string in test_strings {
             assert!(generated_dfa.process(string));
diff --git a/tests/glushkov_test.rs b/tests/glushkov_test.rs
index 20c5306..545218e 100644
--- a/tests/glushkov_test.rs
+++ b/tests/glushkov_test.rs
@@ -6,7 +6,7 @@ fn test_all_bench_cases() {
     let cases = get_bench_cases();
 
     for case in &cases {
-        let regex = Regex::new(case.regex, ConstructionType::Glushkov);
+        let regex = Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex");
         assert_eq!(regex.is_match(&case.input), case.expected_is_match);
         assert_eq!(
             regex.find(&case.input),
diff --git a/tests/thompson_test.rs b/tests/thompson_test.rs
index d2cce3a..5532768 100644
--- a/tests/thompson_test.rs
+++ b/tests/thompson_test.rs
@@ -6,7 +6,7 @@ fn test_all_bench_cases() {
     let cases = get_bench_cases();
 
     for case in &cases {
-        let regex = Regex::new(case.regex, ConstructionType::Thompson);
+        let regex = Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex");
 
         assert_eq!(regex.is_match(&case.input), case.expected_is_match);
         assert_eq!(

From 7a8c7eb84daf924231c3410e23b0f8a95a3745c3 Mon Sep 17 00:00:00 2001
From: Pepe Hanisch <142326461+Testspieler09@users.noreply.github.com>
Date: Sun, 24 Aug 2025 20:44:58 +0200
Subject: [PATCH 7/8] feat: glushkov construction

---
 src/glushkov.rs | 1067 ++++++++++++++++++-----------------------------
 src/lib.rs      |    4 +-
 2 files changed, 415 insertions(+), 656 deletions(-)

diff --git a/src/glushkov.rs b/src/glushkov.rs
index b0360dc..85b3093 100644
--- a/src/glushkov.rs
+++ b/src/glushkov.rs
@@ -1,11 +1,12 @@
 use crate::{Dfa, is_valid_regex, normalise_regex};
-use std::collections::{HashMap, HashSet, VecDeque};
-
-#[derive(Clone, Debug, PartialEq)]
-enum SymbolType {
-    Normal,
-    KleeneStar,
-    Escaped,
+use std::collections::{BTreeSet, HashMap, HashSet, VecDeque};
+
+#[derive(Debug, Clone)]
+enum RegexAst {
+    Char(char),
+    Concat(Vec<RegexAst>),
+    Alternation(Vec<RegexAst>),
+    KleeneStar(Box<RegexAst>),
 }
 
 #[derive(Debug)]
@@ -23,16 +24,15 @@ pub struct GlushkovDfa {
 impl Dfa for GlushkovDfa {
     fn new(regex: &str) -> Result<Self, String> {
         if !is_valid_regex(regex) {
-            return Err("{regex} is not a valid regular expression!".to_string());
+            return Err(format!("{regex} is not a valid regular expression!"));
         }
 
         let normalised_regex = normalise_regex(regex);
-        let regex_nfa = glushkov_construction(&normalised_regex);
-        dbg!(&regex_nfa);
-        let mut regex_dfa = nfa_no_epsilon_to_dfa(&regex_nfa);
-        // dbg!(&regex_dfa);
+        let ast = parse_regex(&normalised_regex)?;
+        let nfa = glushkov_construction(ast)?;
+        let mut regex_dfa = nfa_to_dfa(nfa);
+
         <Self as Dfa>::optimise_dfa(&mut regex_dfa);
-        // dbg!(&regex_dfa);
         Ok(regex_dfa)
     }
 
@@ -53,743 +53,502 @@ impl Dfa for GlushkovDfa {
     }
 }
 
-// GLUSHKOV CONSTRUCTION
-fn glushkov_construction(regex: &str) -> Nfa {
-    let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
-    let accepting_states: HashSet<u32> = compute_accepting_states(regex);
+// Parses a whole regex string into a `RegexAst`; fails if parsing stops before the end.
+fn parse_regex(regex: &str) -> Result<RegexAst, String> {
+    let chars: Vec<char> = regex.chars().collect(); // lets the parser index by position
+    let (ast, pos) = parse_alternation(&chars, 0)?;
 
-    let states: HashMap<u32, (char, SymbolType, u32)> = index_states(regex);
+    if pos != chars.len() { // leftover input, e.g. a ')' that was never opened
+        return Err("Unexpected characters at end of regex".to_string());
+    }
 
-    fill_sets(states, &mut transitions);
+    Ok(ast)
+}
 
-    Nfa {
-        transitions,
-        accepting_states,
+fn parse_alternation(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> {
+    // An alternation always has a first branch (possibly the empty concatenation).
+    let (head, after_head) = parse_concatenation(chars, pos)?;
+    let mut branches = vec![head];
+    pos = after_head;
+
+    // Every further '|' contributes one more branch.
+    while chars.get(pos) == Some(&'|') {
+        let (branch, after_branch) = parse_concatenation(chars, pos + 1)?;
+        branches.push(branch);
+        pos = after_branch;
+    }
+
+    // A lone branch is handed back unwrapped rather than as a one-element alternation.
+    if branches.len() == 1 {
+        Ok((branches.into_iter().next().unwrap(), pos))
+    } else {
+        Ok((RegexAst::Alternation(branches), pos))
     }
 }
 
-fn compute_accepting_states(regex: &str) -> HashSet<u32> {
-    // also need handling of escape sequence?!
-    let mut accepting_states = HashSet::new();
-    let mut number_of_accepting_states_in_group = 0;
-    let mut group_is_exhausted = false;
-    let mut last_element_was_seperator = false;
-    let num_unions = regex.chars().filter(|&c| matches!(c, '|'));
-    let mut position: u32 = regex
-        .chars()
-        .filter(|&c| !matches!(c, '(' | ')' | '|' | '*'))
-        .count() as u32;
-
-    dbg!(regex);
-
-    // check if after none ) a | is present
-    for ch in regex.chars().rev() {
-        match ch {
-            ')' => {
-                if !last_element_was_seperator {
-                    group_is_exhausted = number_of_accepting_states_in_group != 0;
-                }
-                last_element_was_seperator = true;
-            }
-            '|' => {
-                group_is_exhausted = false;
-                last_element_was_seperator = true;
+fn parse_concatenation(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> {
+    // Collect factors until the input ends or a '|' / ')' closes this sequence.
+    let mut factors = Vec::new();
+    while let Some(&next) = chars.get(pos) {
+        if next == '|' || next == ')' {
+            break;
+        }
+        let (factor, after_factor) = parse_factor(chars, pos)?;
+        factors.push(factor);
+        pos = after_factor;
+    }
+
+    let ast = match factors.len() {
+        // No factor at all: epsilon, encoded as an empty concatenation.
+        0 => RegexAst::Concat(vec![]),
+        // A single factor is returned as-is rather than wrapped.
+        1 => factors.into_iter().next().unwrap(),
+        _ => RegexAst::Concat(factors),
+    };
+    Ok((ast, pos))
+}
+
+fn parse_factor(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> {
+    if pos >= chars.len() {
+        return Err("Unexpected end of regex".to_string());
+    }
+    // A factor is a group, an escaped character or a plain literal, optionally starred.
+    let (base, new_pos) = match chars[pos] {
+        '(' => {
+            pos += 1; // skip '('
+            let (inner, inner_pos) = parse_alternation(chars, pos)?;
+            if inner_pos >= chars.len() || chars[inner_pos] != ')' {
+                return Err("Unmatched opening parenthesis".to_string());
             }
-            '(' => group_is_exhausted = true,
-            '*' => {
-                // Should account for ba* -> b and a are accepting
-                last_element_was_seperator = false;
+            (inner, inner_pos + 1) // skip ')'
+        }
+        '\\' => {
+            if pos + 1 >= chars.len() { // a trailing backslash has nothing to escape
+                return Err("Invalid escape sequence".to_string());
             }
-            _ => {
-                if position != 0 {
-                    position -= 1;
-                } else {
-                    break;
-                }
+            pos += 1; // skip '\'
+            (RegexAst::Char(chars[pos]), pos + 1) // the escaped character is taken literally
+        }
+        c if c.is_ascii() && !"()|*+\\".contains(c) => (RegexAst::Char(c), pos + 1),
+        _ => { // non-ASCII input, or an unescaped metacharacter where a factor must start
+            return Err(format!("Unexpected character: {}", chars[pos]));
+        }
+    };
 
-                if group_is_exhausted {
-                    continue;
-                }
-                dbg!(&ch, &position);
-                accepting_states.insert(position);
-                number_of_accepting_states_in_group += 1;
-                group_is_exhausted = true;
+    pos = new_pos;
+
+    // At most one directly following '*' is consumed and wraps the base in a Kleene star
+    if pos < chars.len() && chars[pos] == '*' {
+        pos += 1;
+        Ok((RegexAst::KleeneStar(Box::new(base)), pos))
+    } else {
+        Ok((base, pos))
+    }
+}
+
+fn glushkov_construction(ast: RegexAst) -> Result<Nfa, String> {
+    let mut state_counter = 0u32;
+    let mut state_to_char: HashMap<u32, char> = HashMap::new();
+
+    // Give every `Char` node its own state number and remember which symbol it carries
+    assign_positions(&ast, &mut state_counter, &mut state_to_char);
+
+    let start_state = state_counter; // one past the last position, so it collides with none
+
+    // Compute First, Last, Follow sets; each helper renumbers the tree from 0 on its own
+    let first_set = first(&ast);
+    let last_set = last(&ast);
+    let follow_map = follow(&ast);
+
+    // Build NFA
+    let mut transitions = HashMap::new();
+    let mut accepting_states = HashSet::new();
+
+    // Transitions from start state: reading a first position's symbol enters that position
+    for &state in &first_set {
+        if let Some(&ch) = state_to_char.get(&state) {
+            transitions
+                .entry((start_state, ch))
+                .or_insert_with(Vec::new)
+                .push(state);
+        }
+    }
+
+    // Internal transitions: a position moves to a follower on the follower's own symbol
+    for (state, follow_states) in follow_map {
+        for &follow_state in &follow_states {
+            if let Some(&ch) = state_to_char.get(&follow_state) {
+                transitions
+                    .entry((state, ch))
+                    .or_insert_with(Vec::new)
+                    .push(follow_state);
             }
         }
     }
 
-    accepting_states
+    // Accepting states: every last position, plus the start state if the regex is nullable
+    if nullable(&ast) {
+        accepting_states.insert(start_state);
+    }
+    for &state in &last_set {
+        accepting_states.insert(state);
+    }
+
+    Ok(Nfa { // NOTE: this function currently has no path that returns `Err`
+        transitions,
+        accepting_states,
+    })
+}
+
+fn first(ast: &RegexAst) -> HashSet<u32> { // positions at which a match can begin
+    let mut positions = HashMap::new();
+    let mut counter = 0; // numbering restarts at 0 on every call
+    map_ast_to_positions(ast, &mut counter, &mut positions);
+    first_positions(ast, &positions)
 }
 
-fn index_states(regex: &str) -> HashMap<u32, (char, SymbolType, u32)> {
-    let mut indexed_states: HashMap<u32, (char, SymbolType, u32)> = HashMap::new();
-    let mut symbol_type: SymbolType = SymbolType::Normal;
-    let mut group_stack: Vec<u32> = vec![0];
-    let mut idx: u32 = 0;
-    let mut next_group_id: u32 = 1;
-    let mut chars = regex.chars().peekable();
-
-    while let Some(symbol) = chars.next() {
-        if symbol_type == SymbolType::Escaped {
-            indexed_states.entry(idx).or_insert((
-                symbol,
-                symbol_type.clone(),
-                *group_stack.last().unwrap(),
-            ));
-            idx += 1;
-            symbol_type = SymbolType::Normal;
-            continue;
-        }
+fn last(ast: &RegexAst) -> HashSet<u32> { // positions at which a match can end
+    let mut positions = HashMap::new();
+    let mut counter = 0; // numbering restarts at 0 on every call
+    map_ast_to_positions(ast, &mut counter, &mut positions);
+    last_positions(ast, &positions)
+}
 
-        match symbol {
-            '|' => {
-                // Start a new group for the next alternative
-                let new_group_id = next_group_id;
-                next_group_id += 1;
-                // Replace the current group on the stack with the new one
-                if let Some(last) = group_stack.last_mut() {
-                    *last = new_group_id;
-                }
+fn follow(ast: &RegexAst) -> HashMap<u32, HashSet<u32>> { // position -> possible next positions
+    let mut positions = HashMap::new();
+    let mut counter = 0; // numbering restarts at 0 on every call
+    map_ast_to_positions(ast, &mut counter, &mut positions);
+
+    let mut result = HashMap::new(); // positions without a successor never get an entry
+    follow_positions(ast, &positions, &mut result);
+    result
+}
+
+// Records for every node of `ast` the half-open range (start, end) of position numbers in it
+fn map_ast_to_positions(
+    ast: &RegexAst,
+    counter: &mut u32, // next free position number; advanced once per `Char` node
+    positions: &mut HashMap<*const RegexAst, (u32, u32)>, // keyed by the node's address
+) {
+    let start_pos = *counter;
+
+    match ast {
+        RegexAst::Char(_) => {
+            *counter += 1; // a literal occupies exactly one position
+        }
+        RegexAst::Concat(elements) => {
+            for element in elements {
+                map_ast_to_positions(element, counter, positions);
             }
-            '(' => {
-                // Push the next group ID onto the stack for this grouping level
-                let new_group_id = next_group_id;
-                next_group_id += 1;
-                group_stack.push(new_group_id);
+        }
+        RegexAst::Alternation(alternatives) => {
+            for alt in alternatives {
+                map_ast_to_positions(alt, counter, positions);
             }
-            ')' => {
-                // Pop the current group and return to parent group
-                group_stack.pop();
+        }
+        RegexAst::KleeneStar(inner) => {
+            map_ast_to_positions(inner, counter, positions);
+        }
+    }
+    // Inserted after the children, so `*counter` is already the end of this node's range
+    positions.insert(ast as *const RegexAst, (start_pos, *counter));
+}
+
+fn first_positions(
+    ast: &RegexAst,
+    positions: &HashMap<*const RegexAst, (u32, u32)>, // must contain every node of `ast`
+) -> HashSet<u32> {
+    match ast {
+        RegexAst::Char(_) => { // a literal starts with itself
+            let (start_pos, _) = positions[&(ast as *const RegexAst)];
+            let mut result = HashSet::new();
+            result.insert(start_pos);
+            result
+        }
+        RegexAst::Concat(elements) => {
+            let mut result = HashSet::new();
+            for element in elements { // left to right, up to the first non-nullable element
+                result.extend(first_positions(element, positions));
+                if !nullable(element) {
+                    break;
+                }
             }
-            '*' => {
-                symbol_type = SymbolType::Normal;
-                continue;
+            result
+        }
+        RegexAst::Alternation(alternatives) => { // union over all branches
+            let mut result = HashSet::new();
+            for alt in alternatives {
+                result.extend(first_positions(alt, positions));
             }
-            '\\' => symbol_type = SymbolType::Escaped,
-            _ => {
-                if let Some(next_symbol) = chars.peek()
-                    && matches!(*next_symbol, '*')
-                {
-                    symbol_type = SymbolType::KleeneStar
+            result
+        }
+        RegexAst::KleeneStar(inner) => first_positions(inner, positions),
+    }
+}
+
+fn last_positions(
+    ast: &RegexAst,
+    positions: &HashMap<*const RegexAst, (u32, u32)>, // must contain every node of `ast`
+) -> HashSet<u32> {
+    match ast {
+        RegexAst::Char(_) => { // a literal ends with itself
+            let (start_pos, _) = positions[&(ast as *const RegexAst)];
+            let mut result = HashSet::new();
+            result.insert(start_pos);
+            result
+        }
+        RegexAst::Concat(elements) => {
+            let mut result = HashSet::new();
+            for element in elements.iter().rev() { // right to left, up to a non-nullable one
+                result.extend(last_positions(element, positions));
+                if !nullable(element) {
+                    break;
                 }
-                indexed_states.entry(idx).or_insert((
-                    symbol,
-                    symbol_type.clone(),
-                    *group_stack.last().unwrap(),
-                ));
-                idx += 1;
             }
+            result
+        }
+        RegexAst::Alternation(alternatives) => { // union over all branches
+            let mut result = HashSet::new();
+            for alt in alternatives {
+                result.extend(last_positions(alt, positions));
+            }
+            result
         }
+        RegexAst::KleeneStar(inner) => last_positions(inner, positions),
     }
-    indexed_states
 }
 
-// TODO: remove the unused param later
-fn fill_sets(
-    states: HashMap<u32, (char, SymbolType, u32)>,
-    transitions: &mut HashMap<(u32, char), Vec<u32>>,
+fn follow_positions(
+    ast: &RegexAst,
+    positions: &HashMap<*const RegexAst, (u32, u32)>, // must contain every node of `ast`
+    result: &mut HashMap<u32, HashSet<u32>>, // accumulates position -> follower positions
 ) {
-    dbg!(&states);
-    let mut start_states = HashSet::new();
+    match ast {
+        RegexAst::Char(_) => {
+            // Base case - no follow computation needed
+        }
+        RegexAst::Concat(elements) => {
+            // Follow relations that live entirely inside one element
+            for element in elements {
+                follow_positions(element, positions, result);
+            }
 
-    let amount_states = states.len() as u32;
-    if amount_states == 0 {
-        return;
-    }
+            // Link last(i) to first(j) for each later j separated from i only by nullable parts
+            for i in 0..elements.len() {
+                let last_i = last_positions(&elements[i], positions);
 
-    // Group states by their group index
-    let mut groups: HashMap<u32, Vec<u32>> = HashMap::new();
-    for (state_id, (_, _, group_idx)) in &states {
-        groups.entry(*group_idx).or_default().push(*state_id);
-    }
+                // For each subsequent element j > i
+                for j in (i + 1)..elements.len() {
+                    // NOTE: the `break` below stops at the first non-nullable j, so this holds here
+                    let all_between_nullable = elements[(i + 1)..j].iter().all(nullable);
 
-    // Sort states within each group
-    for group in groups.values_mut() {
-        group.sort();
-    }
+                    if j == i + 1 || all_between_nullable {
+                        let first_j = first_positions(&elements[j], positions);
+
+                        // Add follow relationships from last(i) to first(j)
+                        for &last_state in &last_i {
+                            result.entry(last_state).or_default().extend(&first_j);
+                        }
+                    }
 
-    // Determine start states (first state of each group)
-    for group in groups.values() {
-        if group.is_empty() {
-            continue;
+                    // If element j is not nullable, we can't skip further
+                    if !nullable(&elements[j]) {
+                        break;
+                    }
+                }
+            }
+        }
+        RegexAst::Alternation(alternatives) => { // branches never follow one another
+            for alt in alternatives {
+                follow_positions(alt, positions, result);
+            }
         }
+        RegexAst::KleeneStar(inner) => {
+            follow_positions(inner, positions, result);
 
-        start_states.insert(group[0]);
+            // Kleene star: last positions can loop back to first positions
+            let inner_last = last_positions(inner, positions);
+            let inner_first = first_positions(inner, positions);
 
-        for i in 0..group.len() {
-            let state = group[i];
-            if let Some((_, symbol_type, _)) = states.get(&state)
-                && symbol_type == &SymbolType::KleeneStar
-                && i + 1 < group.len()
-            {
-                start_states.insert(group[i + 1]);
+            for &last_state in &inner_last {
+                result.entry(last_state).or_default().extend(&inner_first);
             }
         }
     }
+}
 
-    // Build transitions and determine accepting states
-    for (state_id, (symbol, symbol_type, group_idx)) in &states {
-        let current_group = &groups[group_idx];
-        let pos_in_group = current_group.iter().position(|&x| x == *state_id).unwrap();
+fn nullable(ast: &RegexAst) -> bool {
+    match ast {
+        // A literal always consumes a symbol; a star may repeat zero times.
+        RegexAst::Char(_) => false,
+        RegexAst::KleeneStar(_) => true,
+        // One nullable branch is enough.
+        RegexAst::Alternation(branches) => branches.iter().any(nullable),
+        // Every part must be nullable; `all` is true for the empty (epsilon) concatenation.
+        RegexAst::Concat(parts) => parts.iter().all(nullable),
+    }
+}
 
-        match symbol_type {
-            SymbolType::Normal | SymbolType::Escaped => {
-                if pos_in_group + 1 < current_group.len() {
-                    let next_state = current_group[pos_in_group + 1];
-                    transitions
-                        .entry((*state_id, *symbol))
-                        .or_default()
-                        .push(next_state);
-                }
+fn assign_positions(ast: &RegexAst, counter: &mut u32, state_to_char: &mut HashMap<u32, char>) {
+    match ast { // walks the tree left to right; only `Char` nodes consume a number
+        RegexAst::Char(ch) => {
+            let state = *counter;
+            *counter += 1;
+            state_to_char.insert(state, *ch); // remember the symbol read when entering `state`
+        }
+        RegexAst::Concat(elements) => {
+            for element in elements {
+                assign_positions(element, counter, state_to_char);
             }
-            SymbolType::KleeneStar => {
-                transitions
-                    .entry((*state_id, *symbol))
-                    .or_default()
-                    .push(*state_id);
-
-                if pos_in_group + 1 < current_group.len() {
-                    for next_state in current_group.iter().skip(pos_in_group + 1) {
-                        transitions
-                            .entry((*state_id, *symbol))
-                            .or_default()
-                            .push(*next_state);
-                    }
-                }
+        }
+        RegexAst::Alternation(alternatives) => {
+            for alt in alternatives {
+                assign_positions(alt, counter, state_to_char);
             }
         }
-    }
-
-    // Setup virtual (start-)state
-    let virtual_start = states.keys().max().copied().unwrap_or(0) + 1;
-
-    let symbol_to_first_state: Vec<(u32, char)> = start_states
-        .iter()
-        .map(|&s| (s, states.get(&s).expect("Expected an entry").0))
-        .collect();
-
-    for (first_state, symbol) in symbol_to_first_state {
-        transitions
-            .entry((virtual_start, symbol))
-            .or_default()
-            .push(first_state);
+        RegexAst::KleeneStar(inner) => { // the star itself adds no position
+            assign_positions(inner, counter, state_to_char);
+        }
     }
 }
-// END GLUSHKOV CONSTRUCTION
 
-fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa {
+fn nfa_to_dfa(nfa: Nfa) -> GlushkovDfa { // subset construction over sets of NFA states
     let mut dfa_transitions = HashMap::new();
     let mut dfa_accepting_states = HashSet::new();
+    let mut state_sets_to_dfa_state: HashMap<BTreeSet<u32>, u32> = HashMap::new();
+    let mut queue = VecDeque::new(); // state sets discovered but not yet expanded
+    let mut next_dfa_state = 0u32; // next unused DFA state id
 
-    // Map from sorted vector of NFA states to DFA state ID (for hashable key)
-    let mut nfa_states_to_dfa_state: HashMap<Vec<u32>, u32> = HashMap::new();
-    let mut next_dfa_state_id = 0u32;
-    let mut work_queue = VecDeque::new();
-
-    // Helper function to convert HashSet to sorted Vec for use as HashMap key
-    let set_to_sorted_vec = |set: &HashSet<u32>| -> Vec<u32> {
-        let mut vec: Vec<u32> = set.iter().cloned().collect();
-        vec.sort_unstable();
-        vec
-    };
-
-    // Get all possible input symbols from NFA transitions
-    let alphabet: HashSet<char> = nfa.transitions.keys().map(|(_, symbol)| *symbol).collect();
+    // The alphabet is every symbol that labels at least one NFA transition
+    let alphabet: HashSet<char> = nfa.transitions.keys().map(|(_, ch)| *ch).collect();
 
-    // Find all states that exist in the NFA
+    // Collect every state the NFA mentions; the highest-numbered one is taken as start state
     let mut all_nfa_states = HashSet::new();
-    for &(state, _) in nfa.transitions.keys() {
-        all_nfa_states.insert(state);
+
+    for &(from_state, _) in nfa.transitions.keys() {
+        all_nfa_states.insert(from_state);
     }
     for target_states in nfa.transitions.values() {
-        for &state in target_states {
-            all_nfa_states.insert(state);
+        for &to_state in target_states {
+            all_nfa_states.insert(to_state);
         }
     }
-    for &state in &nfa.accepting_states {
-        all_nfa_states.insert(state);
+    for &accepting_state in &nfa.accepting_states {
+        all_nfa_states.insert(accepting_state);
     }
 
-    // In a Glushkov NFA, state 0 is always the start state
-    let start_state = 0;
+    let start_state = all_nfa_states.iter().max().copied().unwrap_or(0); // 0 for an empty NFA
 
-    // Verify that state 0 exists in the NFA
-    if !all_nfa_states.contains(&start_state) {
-        panic!("Expected start state 0 not found in NFA states: {all_nfa_states:?}");
-    }
-
-    let start_state_set = {
-        let mut set = HashSet::new();
+    let start_set: BTreeSet<u32> = {
+        let mut set = BTreeSet::new();
         set.insert(start_state);
         set
     };
 
-    // Create initial DFA state
-    let start_dfa_state = next_dfa_state_id;
-    next_dfa_state_id += 1;
-
-    let start_state_key = set_to_sorted_vec(&start_state_set);
-    nfa_states_to_dfa_state.insert(start_state_key, start_dfa_state);
-    work_queue.push_back(start_state_set);
+    state_sets_to_dfa_state.insert(start_set.clone(), next_dfa_state); // start set gets id 0
+    queue.push_back(start_set);
+    next_dfa_state += 1;
 
-    // Process each DFA state
-    while let Some(current_nfa_states) = work_queue.pop_front() {
-        let current_state_key = set_to_sorted_vec(&current_nfa_states);
-        let current_dfa_state = nfa_states_to_dfa_state[&current_state_key];
+    while let Some(current_set) = queue.pop_front() {
+        let current_dfa_state = state_sets_to_dfa_state[&current_set]; // registered before queued
 
         // Check if this DFA state should be accepting
-        if current_nfa_states
+        if current_set
             .iter()
-            .any(|&state| nfa.accepting_states.contains(&state))
+            .any(|&s| nfa.accepting_states.contains(&s))
         {
             dfa_accepting_states.insert(current_dfa_state);
         }
 
-        // For each symbol in the alphabet
+        // For each symbol in alphabet
         for &symbol in &alphabet {
-            let mut next_nfa_states = HashSet::new();
+            let mut next_set = BTreeSet::new();
 
-            // Collect all states reachable from current_nfa_states via symbol
-            for &nfa_state in &current_nfa_states {
-                if let Some(target_states) = nfa.transitions.get(&(nfa_state, symbol)) {
-                    for &target_state in target_states {
-                        next_nfa_states.insert(target_state);
-                    }
+            // Collect all states reachable via this symbol
+            for &state in &current_set {
+                if let Some(targets) = nfa.transitions.get(&(state, symbol)) {
+                    next_set.extend(targets);
                 }
             }
 
-            // Skip if no transitions exist for this symbol
-            if next_nfa_states.is_empty() {
-                continue;
-            }
-
-            // Get or create DFA state for this set of NFA states
-            let next_state_key = set_to_sorted_vec(&next_nfa_states);
-            let next_dfa_state =
-                if let Some(&existing_state) = nfa_states_to_dfa_state.get(&next_state_key) {
-                    existing_state
+            if !next_set.is_empty() { // an empty set means no transition on this symbol
+                let next_dfa_state = if let Some(&existing) = state_sets_to_dfa_state.get(&next_set)
+                {
+                    existing
                 } else {
-                    let new_state = next_dfa_state_id;
-                    next_dfa_state_id += 1;
-
-                    nfa_states_to_dfa_state.insert(next_state_key.clone(), new_state);
-                    work_queue.push_back(next_nfa_states);
-
+                    let new_state = next_dfa_state;
+                    next_dfa_state += 1;
+                    state_sets_to_dfa_state.insert(next_set.clone(), new_state);
+                    queue.push_back(next_set); // last use of the set: move it instead of cloning
                     new_state
                 };
 
-            // Add transition to DFA
-            dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state);
+                dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state);
+            }
         }
     }
 
-    GlushkovDfa {
-        transitions: dfa_transitions,
-        accepting_states: dfa_accepting_states,
-    }
+    // Hand the raw tables to `normalize_dfa_states`, which builds the returned `GlushkovDfa`
+    normalize_dfa_states(dfa_transitions, dfa_accepting_states)
 }
 
-// fn nfa_no_epsilon_to_dfa(nfa: &Nfa) -> GlushkovDfa {
-//     let mut dfa_transitions = HashMap::new();
-//     let mut dfa_accepting_states = HashSet::new();
-//
-//     // Map from DFA state ID to the set of NFA states it represents
-//     let mut dfa_state_to_nfa_states: HashMap<u32, HashSet<u32>> = HashMap::new();
-//     // Map from sorted vector of NFA states to DFA state ID (for hashable key)
-//     let mut nfa_states_to_dfa_state: HashMap<Vec<u32>, u32> = HashMap::new();
-//
-//     let mut next_dfa_state_id = 0u32;
-//     let mut work_queue = VecDeque::new();
-//
-//     // Helper function to convert HashSet to sorted Vec for use as HashMap key
-//     let set_to_sorted_vec = |set: &HashSet<u32>| -> Vec<u32> {
-//         let mut vec: Vec<u32> = set.iter().cloned().collect();
-//         vec.sort_unstable();
-//         vec
-//     };
-//
-//     // Get all possible input symbols from NFA transitions
-//     let alphabet: HashSet<char> = nfa.transitions.keys().map(|(_, symbol)| *symbol).collect();
-//
-//     // Find the start state (assuming state 0 is the start state)
-//     let start_state_set = {
-//         let mut set = HashSet::new();
-//         set.insert(0u32);
-//         set
-//     };
-//
-//     // Create initial DFA state
-//     let start_dfa_state = next_dfa_state_id;
-//     next_dfa_state_id += 1;
-//
-//     let start_state_key = set_to_sorted_vec(&start_state_set);
-//     dfa_state_to_nfa_states.insert(start_dfa_state, start_state_set.clone());
-//     nfa_states_to_dfa_state.insert(start_state_key, start_dfa_state);
-//     work_queue.push_back(start_state_set);
-//
-//     // Process each DFA state
-//     while let Some(current_nfa_states) = work_queue.pop_front() {
-//         let current_state_key = set_to_sorted_vec(&current_nfa_states);
-//         let current_dfa_state = nfa_states_to_dfa_state[&current_state_key];
-//
-//         // Check if this DFA state should be accepting
-//         if current_nfa_states
-//             .iter()
-//             .any(|&state| nfa.accepting_states.contains(&state))
-//         {
-//             dfa_accepting_states.insert(current_dfa_state);
-//         }
-//
-//         // For each symbol in the alphabet
-//         for &symbol in &alphabet {
-//             let mut next_nfa_states = HashSet::new();
-//
-//             // Collect all states reachable from current_nfa_states via symbol
-//             for &nfa_state in &current_nfa_states {
-//                 if let Some(target_states) = nfa.transitions.get(&(nfa_state, symbol)) {
-//                     for &target_state in target_states {
-//                         next_nfa_states.insert(target_state);
-//                     }
-//                 }
-//             }
-//
-//             // Skip if no transitions exist for this symbol
-//             if next_nfa_states.is_empty() {
-//                 continue;
-//             }
-//
-//             // Get or create DFA state for this set of NFA states
-//             let next_state_key = set_to_sorted_vec(&next_nfa_states);
-//             let next_dfa_state =
-//                 if let Some(&existing_state) = nfa_states_to_dfa_state.get(&next_state_key) {
-//                     existing_state
-//                 } else {
-//                     let new_state = next_dfa_state_id;
-//                     next_dfa_state_id += 1;
-//
-//                     dfa_state_to_nfa_states.insert(new_state, next_nfa_states.clone());
-//                     nfa_states_to_dfa_state.insert(next_state_key, new_state);
-//                     work_queue.push_back(next_nfa_states);
-//
-//                     new_state
-//                 };
-//
-//             // Add transition to DFA
-//             dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state);
-//         }
-//     }
-//
-//     GlushkovDfa {
-//         transitions: dfa_transitions,
-//         accepting_states: dfa_accepting_states,
-//     }
-// }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_index_single_character() {
-        let expected = HashMap::from([(0, ('a', SymbolType::Normal, 0))]);
-
-        let result = index_states("a");
-        assert_eq!(result, expected, "Mismatch in single character test");
-    }
-
-    #[test]
-    fn test_nfa_single_character() {
-        let expected_finite = HashSet::from([0]);
-        let expected_transitions: HashMap<(u32, char), Vec<u32>> =
-            HashMap::from([((1, 'a'), vec![0])]);
-
-        let result = glushkov_construction("a");
-        assert_eq!(
-            result.transitions, expected_transitions,
-            "Mismatch in single character test"
-        );
-        assert_eq!(
-            result.accepting_states, expected_finite,
-            "Mismatch in single character test"
-        );
-    }
-
-    #[test]
-    fn test_nfa_single_character_kleene_star() {
-        let expected_finite = HashSet::from([0]);
-        let expected_transitions: HashMap<(u32, char), Vec<u32>> =
-            HashMap::from([((0, 'a'), vec![0]), ((1, 'a'), vec![0])]);
-
-        let result = glushkov_construction("a*");
-        assert_eq!(
-            result.transitions, expected_transitions,
-            "Mismatch in single character test"
-        );
-        assert_eq!(
-            result.accepting_states, expected_finite,
-            "Mismatch in single character test"
-        );
-    }
-
-    #[test]
-    fn test_index_kleene_star() {
-        let expected = HashMap::from([(0, ('a', SymbolType::KleeneStar, 0))]);
-
-        let result = index_states("a*");
-        assert_eq!(result, expected, "Mismatch in kleene star test");
-    }
-
-    #[test]
-    fn test_index_union_and_groups() {
-        let expected = HashMap::from([
-            (0, ('a', SymbolType::Normal, 1)),
-            (1, ('b', SymbolType::Normal, 2)),
-        ]);
-
-        let result = index_states("(a|b)");
-        assert_eq!(result, expected, "Mismatch in union and groups test");
-    }
-
-    #[test]
-    fn test_index_escaped_character() {
-        let expected = HashMap::from([(0, ('a', SymbolType::Escaped, 0))]);
-
-        let result = index_states("\\a");
-        assert_eq!(result, expected, "Mismatch in escaped character test");
-    }
-
-    #[test]
-    fn test_index_mixed_regex() {
-        let expected = HashMap::from([
-            (0, ('a', SymbolType::Normal, 0)),
-            (1, ('*', SymbolType::Escaped, 0)),
-            (2, ('b', SymbolType::Normal, 0)),
-            (3, ('c', SymbolType::KleeneStar, 0)),
-            (4, ('d', SymbolType::Normal, 0)),
-            (5, ('e', SymbolType::Normal, 1)),
-            (6, ('f', SymbolType::Normal, 2)),
-            (7, ('g', SymbolType::Normal, 4)),
-            (8, ('h', SymbolType::Normal, 5)),
-            (9, ('i', SymbolType::Normal, 0)),
-        ]);
-
-        let result = index_states("a\\*bc*d(e|f|(g|h))i");
-        assert_eq!(result, expected, "Mismatch in mixed regex test");
-    }
-
-    #[test]
-    fn test_index_too_many_brackets() {
-        let expected = HashMap::from([
-            (0, ('a', SymbolType::KleeneStar, 0)),
-            (1, ('b', SymbolType::Normal, 0)),
-            (2, ('c', SymbolType::Normal, 3)),
-            (3, ('d', SymbolType::Normal, 4)),
-            (4, ('e', SymbolType::Normal, 5)),
-            (5, ('f', SymbolType::Normal, 5)),
-        ]);
-
-        let result = index_states("a*b|((c|d))|ef");
-        assert_eq!(result, expected, "Mismatch in mixed regex test");
-    }
-
-    #[test]
-    fn test_compute_accepting_states_too_many_brackets() {
-        let regex = "a*b|(c|d)|ef";
-        let accepting_states = compute_accepting_states(regex);
-
-        assert_eq!(accepting_states, HashSet::from([1, 2, 3, 5]))
-    }
-
-    #[test]
-    fn test_compute_accepting_states_escape_sequence() {
-        let regex = r"a\*b|cd\*|sdfe\|f";
-        let accepting_states = compute_accepting_states(regex);
-
-        assert_eq!(accepting_states, HashSet::from([3, 6, 12]))
-    }
-
-    #[test]
-    fn test_compute_accepting_states_complex() {
-        let regex = "a*b*c|d*e";
-        let accepting_states = compute_accepting_states(regex);
-
-        assert_eq!(accepting_states, HashSet::from([2, 4]))
+fn normalize_dfa_states(
+    transitions: HashMap<(u32, char), u32>,
+    accepting_states: HashSet<u32>,
+) -> GlushkovDfa {
+    if transitions.is_empty() && accepting_states.is_empty() {
+        return GlushkovDfa {
+            transitions,
+            accepting_states,
+        };
     }
 
-    #[test]
-    fn test_fill_sets_too_many_brackets() {
-        let states = index_states("a*b|(c|d)|ef");
-        let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
-
-        let expected_transitions: HashMap<(u32, char), Vec<u32>> = HashMap::from([
-            ((6, 'a'), vec![0]),
-            ((6, 'b'), vec![1]),
-            ((6, 'c'), vec![2]),
-            ((6, 'd'), vec![3]),
-            ((6, 'e'), vec![4]),
-            ((0, 'a'), vec![0, 1]),
-            ((4, 'e'), vec![5]),
-        ]);
-
-        fill_sets(states, &mut transitions);
-
-        assert_eq!(transitions, expected_transitions);
+    // Find all states
+    let mut all_states = HashSet::new();
+    for &(from, _) in transitions.keys() {
+        all_states.insert(from);
     }
-
-    #[test]
-    fn test_fill_sets_complex() {
-        let states = index_states("a*b*c|d*e");
-        let mut transitions: HashMap<(u32, char), Vec<u32>> = HashMap::new();
-
-        let expected_transitions = HashMap::from([
-            ((5, 'a'), vec![0]),
-            ((5, 'b'), vec![1]),
-            ((5, 'c'), vec![2]),
-            ((5, 'd'), vec![3]),
-            ((5, 'e'), vec![4]),
-            ((0, 'a'), vec![0, 1, 2]),
-            ((1, 'b'), vec![1, 2]),
-            ((3, 'd'), vec![3, 4]),
-        ]);
-
-        fill_sets(states, &mut transitions);
-
-        assert_eq!(transitions, expected_transitions);
+    for &to in transitions.values() {
+        all_states.insert(to);
     }
+    all_states.extend(&accepting_states);
 
-    #[test]
-    fn nfa_to_dfa_simple_test() {
-        // NFA that accepts exactly "a"
-        // State 0 --a--> State 1 (accepting)
-        let input_nfa = Nfa {
-            transitions: HashMap::from([((0, 'a'), vec![1])]),
-            accepting_states: HashSet::from([1]),
+    if all_states.is_empty() {
+        return GlushkovDfa {
+            transitions,
+            accepting_states,
         };
-
-        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
-
-        let expected_transitions = HashMap::from([((0, 'a'), 1)]);
-        let expected_accepting_states = HashSet::from([1]);
-
-        assert_eq!(expected_transitions, generated_dfa.transitions);
-        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
     }
 
-    #[test]
-    fn nfa_to_dfa_sequence_test() {
-        // NFA that accepts exactly "ab"
-        // State 0 --a--> State 1 --b--> State 2 (accepting)
-        let input_nfa = Nfa {
-            transitions: HashMap::from([((0, 'a'), vec![1]), ((1, 'b'), vec![2])]),
-            accepting_states: HashSet::from([2]),
-        };
-
-        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
+    // Create mapping with 0 as start state
+    let start_state = *all_states.iter().min().unwrap();
+    let mut state_mapping = HashMap::new();
+    state_mapping.insert(start_state, 0);
 
-        let expected_transitions = HashMap::from([((0, 'a'), 1), ((1, 'b'), 2)]);
-        let expected_accepting_states = HashSet::from([2]);
-
-        assert_eq!(expected_transitions, generated_dfa.transitions);
-        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
+    let mut next_state = 1;
+    for &state in &all_states {
+        if state != start_state {
+            state_mapping.insert(state, next_state);
+            next_state += 1;
+        }
     }
 
-    #[test]
-    fn nfa_to_dfa_alternation_test() {
-        // NFA that accepts "a" or "b"
-        // State 0 --a--> State 1 (accepting)
-        // State 0 --b--> State 2 (accepting)
-        let input_nfa = Nfa {
-            transitions: HashMap::from([((0, 'a'), vec![1]), ((0, 'b'), vec![2])]),
-            accepting_states: HashSet::from([1, 2]),
-        };
-
-        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
-
-        let expected_transitions = [
-            HashMap::from([((0, 'a'), 1), ((0, 'b'), 2)]),
-            HashMap::from([((0, 'a'), 2), ((0, 'b'), 1)]),
-        ];
-        let expected_accepting_states = HashSet::from([1, 2]);
-
-        assert!(
-            generated_dfa.transitions == expected_transitions[0]
-                || generated_dfa.transitions == expected_transitions[1],
-            "generated_dfa.transitions did not match either expected set"
-        );
-        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
-    }
-
-    #[test]
-    fn nfa_to_dfa_nondeterministic_test() {
-        // NFA with nondeterministic transition
-        // State 0 --a--> State 1, State 2
-        // State 1 --b--> State 3 (accepting)
-        // State 2 --c--> State 3 (accepting)
-        let input_nfa = Nfa {
-            transitions: HashMap::from([
-                ((0, 'a'), vec![1, 2]),
-                ((1, 'b'), vec![3]),
-                ((2, 'c'), vec![3]),
-            ]),
-            accepting_states: HashSet::from([3]),
-        };
-
-        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
-
-        // After 'a' from state 0, we should be in a state representing {1, 2}
-        // Let's call this combined state "1" in the DFA
-        let expected_transitions = HashMap::from([
-            ((0, 'a'), 1), // {0} --a--> {1,2} (DFA state 1)
-            ((1, 'b'), 2), // {1,2} --b--> {3} (DFA state 2)
-            ((1, 'c'), 2), // {1,2} --c--> {3} (DFA state 2)
-        ]);
-        let expected_accepting_states = HashSet::from([2]); // DFA state 2 represents {3}
-
-        assert_eq!(expected_transitions, generated_dfa.transitions);
-        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
-    }
-
-    #[test]
-    fn nfa_to_dfa_multiple_accepting_test() {
-        // NFA where multiple paths lead to accepting states
-        // State 0 --a--> State 1 (accepting)
-        // State 0 --a--> State 2 --b--> State 3 (accepting)
-        let input_nfa = Nfa {
-            transitions: HashMap::from([((0, 'a'), vec![1, 2]), ((2, 'b'), vec![3])]),
-            accepting_states: HashSet::from([1, 3]),
-        };
-
-        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
-
-        // After 'a' from state 0, we're in state representing {1, 2}
-        // This should be accepting because it contains state 1
-        let expected_transitions = HashMap::from([
-            ((0, 'a'), 1), // {0} --a--> {1,2} (DFA state 1)
-            ((1, 'b'), 2), // {1,2} --b--> {3} (DFA state 2)
-        ]);
-        let expected_accepting_states = HashSet::from([1, 2]); // Both DFA states are accepting
-
-        assert_eq!(expected_transitions, generated_dfa.transitions);
-        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
+    // Remap transitions
+    let mut new_transitions = HashMap::new();
+    for ((from, symbol), to) in transitions {
+        let new_from = state_mapping[&from];
+        let new_to = state_mapping[&to];
+        new_transitions.insert((new_from, symbol), new_to);
     }
 
-    #[test]
-    fn nfa_to_dfa_self_loop_test() {
-        // NFA with self-loop: accepts a*
-        // State 0 (accepting) --a--> State 0
-        let input_nfa = Nfa {
-            transitions: HashMap::from([((0, 'a'), vec![0])]),
-            accepting_states: HashSet::from([0]),
-        };
-
-        let generated_dfa = nfa_no_epsilon_to_dfa(&input_nfa);
-
-        let expected_transitions = HashMap::from([
-            ((0, 'a'), 0), // Self-loop
-        ]);
-        let expected_accepting_states = HashSet::from([0]);
+    // Remap accepting states
+    let mut new_accepting_states = HashSet::new();
+    for state in accepting_states {
+        new_accepting_states.insert(state_mapping[&state]);
+    }
 
-        assert_eq!(expected_transitions, generated_dfa.transitions);
-        assert_eq!(expected_accepting_states, generated_dfa.accepting_states);
+    GlushkovDfa {
+        transitions: new_transitions,
+        accepting_states: new_accepting_states,
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index a6adb31..482ee9a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -288,7 +288,7 @@ impl Regex {
     /// ```rust
     /// use regex_engine::{Regex, ConstructionType};
     ///
-    /// let regex = Regex::new("(a|b)*", ConstructionType::Thompson);
+    /// let regex = Regex::new("(a|b)*", ConstructionType::Thompson).expect("Valid regex");
     /// assert!(regex.is_match("abba"));
     /// assert!(!regex.is_match("abc"));
     /// ```
@@ -318,7 +318,7 @@ impl Regex {
     /// ```rust
     /// use regex_engine::{Regex, ConstructionType};
     ///
-    /// let regex = Regex::new("ab+", ConstructionType::Thompson);
+    /// let regex = Regex::new("ab+", ConstructionType::Thompson).expect("Valid regex");
     /// if let Some(matched) = regex.find("aabbcc") {
     ///     println!("Found: {}", matched);
     /// }

From 13e48c31228b5273ec04ad49d020caaf28e1ca7c Mon Sep 17 00:00:00 2001
From: Pepe Hanisch <142326461+Testspieler09@users.noreply.github.com>
Date: Sun, 24 Aug 2025 20:59:22 +0200
Subject: [PATCH 8/8] feat: github pages for html report

---
 .github/workflows/benchmark.yml | 37 +++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 .github/workflows/benchmark.yml

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..7a9335e
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,41 @@
+name: Benchmark
+
+# Run the Criterion benchmarks on every push to main and publish the
+# generated HTML report to GitHub Pages.
+on:
+  push:
+    branches: [main]
+
+# Add permissions for Pages
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    # deploy-pages requires the job to target the github-pages environment.
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Run benchmarks
+        run: cargo bench
+
+      - name: Setup Pages
+        uses: actions/configure-pages@v5
+
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: './target/criterion'
+
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4