diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..7a9335e --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,37 @@ +name: Benchmark + +on: + push: + branches: [main] + +# Add permissions for Pages +permissions: + contents: read + pages: write + id-token: write + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + + - name: Run benchmarks + run: cargo bench + + - name: Setup Pages + uses: actions/configure-pages@v3 + + - name: Upload artifact + uses: actions/upload-pages-artifact@v2 + with: + path: './target/criterion' + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2 diff --git a/Cargo.lock b/Cargo.lock index ab61766..4f0180a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,606 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be92d32e80243a54711e5d7ce823c35c41c9d929dc4ab58e1276f625841aadf9" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707eab41e9622f9139419d573eca0900137718000c517d47da73045f54331c3d" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "half" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "is-terminal" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.174" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + [[package]] name = "regex_engine" version = "0.1.0" +dependencies = [ + "criterion", + "regex", +] + +[[package]] +name = "rustversion" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.141" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b9eff21ebe718216c6ec64e1d9ac57087aad11efc64e32002bce4a0d4c03d3" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml index e3ddfb0..4cdf3fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,12 @@ [package] name = "regex_engine" version = "0.1.0" -edition = "2021" +edition = "2024" -[dependencies] +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } +regex = "1.11.1" + +[[bench]] +name = "regex_benchmark" +harness = false diff --git a/benches/bench_cases.rs b/benches/bench_cases.rs new file mode 100644 index 0000000..e76cd8a --- /dev/null +++ b/benches/bench_cases.rs @@ -0,0 +1,113 @@ +use std::hint::black_box; + +#[allow(dead_code)] +struct BenchCase<'a> { + pub regex: &'a str, + pub input: String, + pub expected_is_match: bool, + pub expected_first_match: Option, + pub expected_all_matches: Vec, +} + +// This function is used in the `benchmark` files +#[allow(dead_code)] +fn get_bench_cases() -> Vec> { + black_box(vec![ + BenchCase { + regex: r"a.b", + input: "abcd abef abgh ijk".to_string(), + expected_is_match: false, + expected_first_match: None, + expected_all_matches: vec![], + }, + BenchCase { + regex: r"a*b", + input: "aaaaaaaaab".to_string(), + expected_is_match: true, + expected_first_match: Some("aaaaaaaaab".to_string()), + expected_all_matches: vec!["aaaaaaaaab".to_string()], + }, + BenchCase { + regex: r"a+b", + input: "aabab".to_string(), + expected_is_match: false, + expected_first_match: Some("aab".to_string()), + expected_all_matches: vec!["aab".to_string(), "ab".to_string()], + }, + BenchCase { + regex: r"a?b", + input: "b aaab ab".to_string(), + expected_is_match: false, + expected_first_match: Some("b".to_string()), + expected_all_matches: vec!["b".to_string(), "ab".to_string(), "ab".to_string()], + }, + BenchCase { + regex: r"a|b", + input: "xxaxybxx".to_string(), + expected_is_match: false, + expected_first_match: Some("a".to_string()), + expected_all_matches: vec!["a".to_string(), "b".to_string()], + }, + BenchCase { + regex: r"(a|b)c", + input: "abc ac bc bbcc".to_string(), + expected_is_match: false, + expected_first_match: Some("bc".to_string()), + expected_all_matches: vec![ + "bc".to_string(), + "ac".to_string(), + "bc".to_string(), + "bc".to_string(), + ], + }, + BenchCase { + regex: r"\.", + input: "Find . within this !?. sentence.".to_string(), + expected_is_match: false, + expected_first_match: Some(".".to_string()), + expected_all_matches: vec![".".to_string(), ".".to_string(), ".".to_string()], + }, + BenchCase { + regex: r"(hel+o|wor?ld)", + input: "hello helolllo world worlld helloworld".to_string(), + expected_is_match: false, + expected_first_match: Some("hello".to_string()), + expected_all_matches: vec![ + "hello".to_string(), + "helo".to_string(), + "world".to_string(), + "hello".to_string(), + "world".to_string(), + ], + }, + BenchCase { + regex: r"ab*c+", + input: "abbc abbbbbbbcc bccaaabbabc".to_string(), + expected_is_match: false, + expected_first_match: Some("abbc".to_string()), + expected_all_matches: vec![ + "abbc".to_string(), + "abbbbbbbcc".to_string(), + "abc".to_string(), + ], + }, + BenchCase { + regex: r"(a(bc|de)+)", + input: "abc abcbc abcdedef".to_string(), + expected_is_match: false, + expected_first_match: Some("abc".to_string()), + expected_all_matches: vec![ + "abc".to_string(), + "abcbc".to_string(), + "abcdede".to_string(), + ], + }, + BenchCase { + regex: r"(a|b)*c", + input: format!("{}{}", "a".repeat(1000), "bc"), + expected_is_match: true, + expected_first_match: Some(format!("{}{}", "a".repeat(1000), "bc")), + expected_all_matches: vec![format!("{}{}", "a".repeat(1000), "bc")], + }, + ]) +} diff --git a/benches/regex_benchmark.rs b/benches/regex_benchmark.rs new file mode 100644 index 0000000..7105fa5 --- /dev/null +++ b/benches/regex_benchmark.rs @@ -0,0 +1,187 @@ +include!("bench_cases.rs"); +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use regex as rust_regex; +use regex_engine::{ConstructionType, Regex}; + +fn benchmark_regex_compile_time(c: &mut Criterion) { + let cases = get_bench_cases(); + let mut group = c.benchmark_group("Regex Compile Time"); + + for case in cases { + group.bench_with_input( + BenchmarkId::new("Thompson", case.regex), + &case.regex, + |b, regex| { + b.iter(|| { + let _ = Regex::new(regex, ConstructionType::Thompson); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Glushkov", case.regex), + &case.regex, + |b, regex| { + b.iter(|| { + let _ = Regex::new(regex, ConstructionType::Glushkov); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Rust", case.regex), + &case.regex, + |b, regex| { + b.iter(|| { + rust_regex::Regex::new(regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {regex}")); + }) + }, + ); + } + group.finish(); +} + +fn benchmark_regex_is_match(c: &mut Criterion) { + let cases = get_bench_cases(); + let mut group = c.benchmark_group("Regex Is Match"); + + for case in &cases { + let thompson_regex = + Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex"); + let glushkov_regex = + Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex"); + let rust_regex = rust_regex::Regex::new(&format!("^{}$", case.regex)) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + group.bench_with_input( + BenchmarkId::new("Thompson", case.regex), + &case.input, + |b, input| { + b.iter(|| { + thompson_regex.is_match(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Glushkov", case.regex), + &case.input, + |b, input| { + b.iter(|| { + glushkov_regex.is_match(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Rust", case.regex), + &case.input, + |b, input| { + b.iter(|| { + rust_regex.is_match(input); + }) + }, + ); + } + group.finish(); +} + +fn benchmark_regex_find_first(c: &mut Criterion) { + let cases = get_bench_cases(); + let mut group = c.benchmark_group("Regex Find First"); + + for case in &cases { + let thompson_regex = + Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex"); + let glushkov_regex = + Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex"); + let rust_regex = rust_regex::Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + group.bench_with_input( + BenchmarkId::new("Thompson", case.regex), + &case.input, + |b, input| { + b.iter(|| { + thompson_regex.find(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Glushkov", case.regex), + &case.input, + |b, input| { + b.iter(|| { + glushkov_regex.find(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Rust", case.regex), + &case.input, + |b, input| { + b.iter(|| { + rust_regex.find(input).map(|m| m.as_str()); + }) + }, + ); + } + group.finish(); +} + +fn benchmark_regex_find_all(c: &mut Criterion) { + let cases = get_bench_cases(); + let mut group = c.benchmark_group("Regex Find All"); + + for case in &cases { + let thompson_regex = + Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex"); + let glushkov_regex = + Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex"); + let rust_regex = rust_regex::Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + group.bench_with_input( + BenchmarkId::new("Thompson", case.regex), + &case.input, + |b, input| { + b.iter(|| { + thompson_regex.findall(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Glushkov", case.regex), + &case.input, + |b, input| { + b.iter(|| { + glushkov_regex.findall(input); + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("Rust", case.regex), + &case.input, + |b, input| { + b.iter(|| { + rust_regex.find_iter(input); + }) + }, + ); + } + group.finish(); +} + +criterion_group!( + benches, + benchmark_regex_compile_time, + benchmark_regex_is_match, + benchmark_regex_find_first, + benchmark_regex_find_all +); +criterion_main!(benches); diff --git a/src/dfa.rs b/src/dfa.rs deleted file mode 100644 index 17019c6..0000000 --- a/src/dfa.rs +++ /dev/null @@ -1,924 +0,0 @@ -use core::panic; -use std::collections::{HashMap, HashSet, VecDeque}; - -struct NFA { - transitions: HashMap<(u32, Option), Vec>, - accepting_state: u32, // the thompson construction always has one accepting_state -} - -pub struct DFA { - transitions: HashMap<(u32, Option), u32>, - accepting_states: HashSet, -} - -fn is_valid_regex(regex: &str) -> bool { - if regex.is_empty() { - return false; - } - - let mut open_paren_count = 0; - let mut last_was_quantifier = false; - - let mut chars = regex.chars().peekable(); - while let Some(c) = chars.next() { - match c { - '(' => { - open_paren_count += 1; - last_was_quantifier = false; - } - - ')' => { - if open_paren_count == 0 { - return false; - } - open_paren_count -= 1; - last_was_quantifier = false; - } - - '*' | '+' => { - // Ensure quantifiers are not the first character and are not repeated - if last_was_quantifier || regex.starts_with('*') || regex.starts_with('+') { - return false; - } - last_was_quantifier = true; - } - - '|' => { - // Ensure alternation isn't the first or last character - if regex.starts_with('|') || chars.peek().is_none() { - return false; - } - last_was_quantifier = false; - } - - '\\' => { - // Handle escaped characters: ensure there's a character after the escape - if chars.peek().is_none() { - return false; - } - chars.next(); // Skip the escaped character - last_was_quantifier = false; - } - - _ => { - last_was_quantifier = false; - } - } - } - - open_paren_count == 0 -} - -fn normalise_regex(regex: &str) -> String { - let mut normalised = String::new(); - let mut escape_sequence = false; - let mut prev_char = '\0'; - - for curr_char in regex.chars() { - if escape_sequence { - // TODO: Implement further parsing features here (e.g. \w \d) - normalised.push(curr_char); - escape_sequence = false; - prev_char = curr_char; - continue; - } - - if curr_char == '\\' { - escape_sequence = true; - normalised.push(curr_char); - continue; - } - - if curr_char == '+' { - normalised.push(prev_char); - normalised.push('*'); - prev_char = curr_char; - continue; - } - if curr_char == '?' { - match prev_char { - ')' => { - let mut balance = 0; - - for j in (0..normalised.len()).rev() { - let ch = normalised.chars().nth(j).unwrap(); - if ch == ')' { - balance += 1; - } else if ch == '(' { - balance -= 1; - if balance == 0 { - normalised.insert(j, '('); - break; - } - } - } - } - _ => { - normalised.insert(normalised.len() - 1, '('); - } - } - normalised.push_str("|())"); - prev_char = curr_char; - continue; - } - if curr_char == '.' { - normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"); - prev_char = curr_char; - continue; - } - - normalised.push(curr_char); - prev_char = curr_char; - } - - normalised -} - -// GLUSHKOV CONSTRUCTION -fn glushkov_construction(regex: &str) -> NFA { - // TODO: Step 1 (rename letters / index them) - // TODO: Step 2a () - // TODO: Step 2b () - // TODO: Step 3 () - // TODO: Step 4 () - todo!() -} - -fn nfa_no_epsilon_to_dfa() { - todo!() -} -// END GLUSHKOV CONSTRUCTION - -// THOMPSON CONSTRUCTION --- -fn thompson_construction(normalised_regex: &str) -> NFA { - fn apply_operator(nfa_stack: &mut Vec, operator: char) { - match operator { - '|' => { - let nfa_right = nfa_stack.pop().expect("Expected NFA for union"); - let nfa_left = nfa_stack.pop().expect("Expected NFA for union"); - nfa_stack.push(union(&nfa_left, &nfa_right)); - } - '.' => { - let nfa_right = nfa_stack.pop().expect("Expected NFA for concatenation"); - let nfa_left = nfa_stack.pop().expect("Expected NFA for concatenation"); - nfa_stack.push(concatenate(&nfa_left, &nfa_right)); - } - _ => panic!("Unknown operator {:?}", operator), - } - } - - let mut operators: Vec = Vec::new(); - let mut nfa_stack: Vec = Vec::new(); - let mut concat_flag = false; - let mut escape_sequence = false; - - for symbol in normalised_regex.chars() { - if escape_sequence { - if concat_flag { - operators.push('.'); - } - nfa_stack.push(create_basic_nfa(&symbol)); - concat_flag = true; - escape_sequence = false; - continue; - } - match symbol { - '(' => { - if concat_flag { - operators.push('.'); - } - operators.push('('); - concat_flag = false; - } - ')' => { - let mut is_epsilon = true; - while let Some(op) = operators.pop() { - if op == '(' && is_epsilon { - nfa_stack.push(create_basic_epsilon_nfa()); - break; - } else if op == '(' { - break; - } - is_epsilon = false; - apply_operator(&mut nfa_stack, op); - } - concat_flag = true; - } - '*' => { - let last_nfa = nfa_stack.pop().expect("Expected NFA for Kleene Star"); - nfa_stack.push(apply_kleene_star(&last_nfa)); - concat_flag = true; - } - '|' => { - operators.push('|'); - concat_flag = false; - } - '\\' => { - escape_sequence = true; - } - _ => { - if concat_flag { - operators.push('.'); - } - nfa_stack.push(create_basic_nfa(&symbol)); - concat_flag = true; - } - } - } - - while let Some(op) = operators.pop() { - apply_operator(&mut nfa_stack, op); - } - - if nfa_stack.len() != 1 { - panic!("Invalid Regex, unexpected final NFA stack size"); - } - - nfa_stack.pop().unwrap() -} - -fn apply_kleene_star(last_nfa: &NFA) -> NFA { - let mut transitions = HashMap::new(); - - let new_accepting = last_nfa.accepting_state + 2; - - // Epsilon transition from new start to original start - transitions.insert((0, None), vec![1]); - - // Copy existing transitions, shifting state numbers to make room for new start - for ((state, input), targets) in &last_nfa.transitions { - // Shift each transition to new indices - transitions.insert((state + 1, *input), targets.iter().map(|s| s + 1).collect()); - } - - // Epsilon transitions returning to original start for loops, and new accepting state - transitions - .entry((&last_nfa.accepting_state + 1, None)) - .or_insert_with(Vec::new) - .push(1); - - transitions - .entry((&last_nfa.accepting_state + 1, None)) - .or_insert_with(Vec::new) - .push(new_accepting); - - // Final acceptance state is accepting with epsilon transition from start for empty string - transitions - .entry((0, None)) - .or_insert_with(Vec::new) - .push(new_accepting); - - NFA { - transitions, - accepting_state: new_accepting, - } -} - -fn union(left: &NFA, right: &NFA) -> NFA { - let mut transitions = HashMap::new(); - - let num_states_left_nfa = left.accepting_state; - let num_states_right_nfa = right.accepting_state; - - // Shift the NFA states - for ((state, input), targets) in &left.transitions { - transitions.insert((state + 1, *input), targets.iter().map(|s| s + 1).collect()); - } - - for ((state, input), targets) in &right.transitions { - transitions.insert( - (state + num_states_left_nfa + 2, *input), - targets - .iter() - .map(|s| s + num_states_left_nfa + 2) - .collect(), - ); - } - - // Add new start and end state - let new_accepting_state = num_states_left_nfa + num_states_right_nfa + 3; - - transitions.insert((0, None), vec![1, num_states_left_nfa + 2]); - transitions - .entry((&left.accepting_state + 1, None)) - .or_insert_with(Vec::new) - .push(new_accepting_state); - transitions - .entry((&right.accepting_state + num_states_left_nfa + 2, None)) - .or_insert_with(Vec::new) - .push(new_accepting_state); - - NFA { - transitions, - accepting_state: new_accepting_state, - } -} - -fn concatenate(left: &NFA, right: &NFA) -> NFA { - let mut transitions: HashMap<(u32, Option), Vec> = left.transitions.clone(); - - // HACK: The accepting states are (based on the implementation) the last ones of the NFA - // thus it is possible to get the num of states in the first NFA like this - let num_states_left_nfa = left.accepting_state; - - for ((state, input), targets) in &right.transitions { - transitions.insert( - (state + num_states_left_nfa, *input), - targets.iter().map(|s| s + num_states_left_nfa).collect(), - ); - } - - NFA { - transitions, - accepting_state: right.accepting_state + num_states_left_nfa, - } -} - -fn create_basic_nfa(letter: &char) -> NFA { - NFA { - transitions: HashMap::from([((0, Some(*letter)), vec![1])]), - accepting_state: 1, - } -} - -fn create_basic_epsilon_nfa() -> NFA { - NFA { - transitions: HashMap::from([((0, None), vec![1])]), - accepting_state: 1, - } -} -// END THOMPSON CONSTRUCTION --- - -// NFA to DFA functions --- -fn epsilon_closure(nfa: &NFA, states: &mut HashSet) { - let mut stack = states.clone(); - - while let Some(&state_id) = stack.iter().next() { - stack.remove(&state_id); - if let Some(epsilon_states) = nfa.transitions.get(&(state_id, None)) { - for &next_state in epsilon_states { - if states.insert(next_state) { - stack.insert(next_state); - } - } - } - } -} - -fn move_nfa(nfa: &NFA, states: &HashSet, symbol: char) -> HashSet { - let mut move_states = HashSet::new(); - - for &state in states { - if let Some(next_states) = nfa.transitions.get(&(state, Some(symbol))) { - move_states.extend(next_states); - } - } - - move_states -} - -fn hash_set_to_sorted_vec(set: &HashSet) -> Vec { - let mut vec: Vec = set.iter().cloned().collect(); - vec.sort_unstable(); - vec -} - -fn nfa_to_dfa(nfa: &NFA) -> DFA { - // Start from the initial state of the NFA, assuming it's state 0 - let mut start_closure = HashSet::from([0]); - epsilon_closure(nfa, &mut start_closure); - let mut state_map = HashMap::new(); - let mut dfa_accepting_states = HashSet::new(); - let mut transitions = HashMap::new(); - - // Map the initial DFA state from the initial NFA state closure - state_map.insert(hash_set_to_sorted_vec(&start_closure), 0); - - let mut unmarked_states = vec![start_closure]; - - while let Some(current_closure) = unmarked_states.pop() { - let current_dfa_state_id = state_map[&hash_set_to_sorted_vec(¤t_closure)]; - - if current_closure.contains(&nfa.accepting_state) { - dfa_accepting_states.insert(current_dfa_state_id); - } - - // Collect symbols from transitions - let symbols: HashSet<_> = nfa - .transitions - .keys() - .filter_map(|(_, symbol)| *symbol) - .collect(); - - for symbol in symbols { - let mut move_closure = move_nfa(nfa, ¤t_closure, symbol); - epsilon_closure(nfa, &mut move_closure); - - if move_closure.is_empty() { - continue; - } - - let sorted_vec = hash_set_to_sorted_vec(&move_closure); - let next_dfa_state_id = state_map.len() as u32; - - // Insert new DFA state if isn't already mapped - if !state_map.contains_key(&sorted_vec) { - state_map.insert(sorted_vec.clone(), next_dfa_state_id); - unmarked_states.push(move_closure); - } - - transitions.insert((current_dfa_state_id, Some(symbol)), state_map[&sorted_vec]); - } - } - - DFA { - transitions, - accepting_states: dfa_accepting_states, - } -} -// END NFA to DFA functions --- - -fn optimise_dfa(dfa: &DFA) -> DFA { - let mut partition: HashMap = HashMap::new(); - let mut accepting_states_set: HashSet = dfa.accepting_states.clone(); - let mut non_accepting_states: HashSet = HashSet::new(); - let mut all_states: HashSet = HashSet::new(); - - for (&(state, _), _) in &dfa.transitions { - all_states.insert(state); - if dfa.accepting_states.contains(&state) { - accepting_states_set.insert(state); - } else { - non_accepting_states.insert(state); - } - } - - for state in dfa.accepting_states.iter() { - all_states.insert(*state); - } - - for state in all_states.iter() { - if dfa.accepting_states.contains(state) { - partition.insert(*state, 0); - } else { - partition.insert(*state, 1); - } - } - - let mut partition_list: Vec> = Vec::new(); - partition_list.push(accepting_states_set); - partition_list.push(non_accepting_states); - - let mut worklist: VecDeque = VecDeque::new(); - if partition_list[0].len() > 0 { - worklist.push_back(0); - } - if partition_list.len() > 1 && partition_list[1].len() > 0 { - worklist.push_back(1); - } - - while let Some(current_partition_index) = worklist.pop_front() { - let mut states_to_check: HashMap, HashSet> = HashMap::new(); - for (&(source_state, symbol), &target_state) in &dfa.transitions { - if partition[&target_state] == current_partition_index { - states_to_check - .entry(symbol) - .or_insert_with(HashSet::new) - .insert(source_state); - } - } - - for (_, states_to_split) in states_to_check.iter() { - let mut partitions_to_split: HashSet = HashSet::new(); - - for &state in states_to_split.iter() { - let partition_index = partition[&state]; - if partition_list[partition_index].len() > 1 { - partitions_to_split.insert(partition_index); - } - } - - for &partition_index_to_split in partitions_to_split.iter() { - let mut intersection: HashSet = HashSet::new(); - let mut difference: HashSet = HashSet::new(); - - for &state in partition_list[partition_index_to_split].iter() { - if states_to_split.contains(&state) { - intersection.insert(state); - } else { - difference.insert(state); - } - } - - if !intersection.is_empty() && !difference.is_empty() { - let new_partition_index = partition_list.len(); - - for &state in intersection.iter() { - partition.insert(state, new_partition_index); - } - - partition_list.push(intersection); - - for &state in &difference { - partition.insert(state, partition_index_to_split); - } - partition_list[partition_index_to_split] = difference; - - if partition_list[new_partition_index].len() - < partition_list[partition_index_to_split].len() - { - worklist.push_back(new_partition_index); - } else { - worklist.push_back(partition_index_to_split); - } - } - } - } - } - - let mut minimal_transitions: HashMap<(u32, Option), u32> = HashMap::new(); - let mut minimal_accepting_states: HashSet = HashSet::new(); - let mut new_state_map: HashMap = HashMap::new(); - - let mut next_state_id: u32 = 0; - - if let Some(partition_index) = partition.get(&0) { - new_state_map.insert(*partition_index, next_state_id); - next_state_id += 1; - } - - for (_, &partition_index) in partition.iter() { - if !new_state_map.contains_key(&partition_index) { - new_state_map.insert(partition_index, next_state_id); - next_state_id += 1; - } - } - - for (original_state, &partition_index) in partition.iter() { - let new_state_id = new_state_map[&partition_index]; - if dfa.accepting_states.contains(original_state) { - minimal_accepting_states.insert(new_state_id); - } - } - - for (&(source_state, symbol), &target_state) in &dfa.transitions { - let source_partition = partition[&source_state]; - let target_partition = partition[&target_state]; - - let new_source_state = new_state_map[&source_partition]; - let new_target_state = new_state_map[&target_partition]; - - minimal_transitions.insert((new_source_state, symbol), new_target_state); - } - - DFA { - transitions: minimal_transitions, - accepting_states: minimal_accepting_states, - } -} - -impl DFA { - pub fn new(regex: &str) -> Self { - if !is_valid_regex(regex) { - panic!("{} is not a valid regular expression!", regex); - } - - let normalised_regex = normalise_regex(®ex); - let regex_nfa: NFA = thompson_construction(&normalised_regex); - let regex_dfa = nfa_to_dfa(®ex_nfa); - optimise_dfa(®ex_dfa) - } - - pub fn process(&self, input: &str) -> bool { - let mut current_state = 0; - for c in input.chars() { - if let Some(&next_state) = self.transitions.get(&(current_state, Some(c))) { - current_state = next_state; - } else { - return false; - } - } - self.accepting_states.contains(¤t_state) - } - - pub fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> { - let mut start_pos = 0; - while start_pos < text.len() { - let mut current_state = 0; - let mut match_start = None; - let mut match_end = None; - let mut found_match = false; - - for (i, c) in text.chars().enumerate().skip(start_pos) { - if let Some(&next_state) = self.transitions.get(&(current_state, Some(c))) { - current_state = next_state; - match_start = match_start.or(Some(i)); - - if self.accepting_states.contains(¤t_state) { - found_match = true; - match_end = Some(i) - } - - if i == text.len() - 1 && found_match { - break; - } - } else { - break; - } - } - - if let (Some(start), Some(end)) = (match_start, match_end) { - return Some(&text[start..=end]); - } else { - start_pos += 1; - } - } - - None - } - - pub fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> { - let mut matches: Vec<&str> = Vec::new(); - - let mut start_pos = 0; - while start_pos < input.len() { - let mut current_state = 0; - let mut match_start: Option = None; - let mut match_end: Option = None; - let mut found_match = false; - - for (i, c) in input.chars().enumerate().skip(start_pos) { - if let Some(&next_state) = self.transitions.get(&(current_state, Some(c))) { - current_state = next_state; - match_start = match_start.or(Some(start_pos)); - - if self.accepting_states.contains(¤t_state) { - match_end = Some(i); - found_match = true; - } - - if i == input.len() - 1 && found_match { - break; - } - } else { - break; - } - } - - if let (Some(start), Some(end)) = (match_start, match_end) { - matches.push(&input[start..=end]); - start_pos = end; - } else { - start_pos += 1; - } - } - - matches - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn valid_regex_basic_test() { - let regex = "(a|b)*"; - assert!(is_valid_regex(regex), "Expected valid regex."); - } - - #[test] - fn invalid_empty_regex_test() { - let regex = ""; - assert!(!is_valid_regex(regex), "Expected invalid regex (empty)."); - } - - #[test] - fn invalid_unbalanced_parentheses_test() { - let regex1 = "(a|b"; - let regex2 = "a|b)"; - assert!( - !is_valid_regex(regex1), - "Expected invalid regex (unbalanced parentheses)." - ); - assert!( - !is_valid_regex(regex2), - "Expected invalid regex (unbalanced parentheses)." - ); - } - - #[test] - fn invalid_operator_placement_test() { - let regex1 = "*a"; - let regex2 = "|a|b"; - assert!( - !is_valid_regex(regex1), - "Expected invalid regex (invalid quantifier placement)." - ); - assert!( - !is_valid_regex(regex2), - "Expected invalid regex (invalid alternation placement)." - ); - } - - #[test] - fn valid_nested_parentheses_test() { - let regex = "((a|b)*c)"; - assert!( - is_valid_regex(regex), - "Expected valid regex with nested parentheses." - ); - } - - #[test] - fn valid_escape_sequence_test() { - let regex = "a\\*b"; - assert!( - is_valid_regex(regex), - "Expected valid regex with escape sequence." - ); - } - - #[test] - fn invalid_escape_sequence_test() { - let regex = "a\\"; - assert!( - !is_valid_regex(regex), - "Expected invalid regex with unpaired escape." - ); - } - - #[test] - fn normalise_regex_test() { - let cases = [ - (r"a+", r"aa*"), - (r"a\+", r"a\+"), - (r"a?", r"(a|())"), - (r"a\?", r"a\?"), - (r"(ab)?", r"((ab)|())"), - (r".", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"), - ]; - - for (input, expected) in cases { - let result = normalise_regex(input); - assert_eq!( - result, expected, - "Normalisation failed for input '{}'", - input - ); - } - } - - #[test] - fn create_dfa_test() { - let generated_dfa = DFA::new("(a|b)*"); - let expected_transitions = HashMap::from([((0, Some('a')), 0), ((0, Some('b')), 0)]); - let expected_accepting_states = HashSet::from([0]); - - assert_eq!(expected_transitions, generated_dfa.transitions); - assert_eq!(expected_accepting_states, generated_dfa.accepting_states); - - let generated_dfa_2 = DFA::new("a|()"); - let expected_transitions_2 = HashMap::from([((0, Some('a')), 1)]); - let expected_accepting_states_2 = HashSet::from([0, 1]); - - assert_eq!(expected_transitions_2, generated_dfa_2.transitions); - assert_eq!( - expected_accepting_states_2, - generated_dfa_2.accepting_states - ); - } - - #[test] - fn prozess_regex_test() { - let generated_dfa = DFA::new("(a|b)*"); - let test_strings = vec!["abbbababaaaa", ""]; - for string in test_strings { - assert!(generated_dfa.process(string)); - } - } - - #[test] - fn create_basic_nfa_test() { - let nfa_a = create_basic_nfa(&'a'); - let expected_transitions = HashMap::from([((0, Some('a')), vec![1])]); - let expected_accepting_state: u32 = 1; - - assert_eq!(nfa_a.transitions, expected_transitions); - assert_eq!(nfa_a.accepting_state, expected_accepting_state); - } - - #[test] - fn concatenate_test() { - let nfa_a = create_basic_nfa(&'a'); - let nfa_b = create_basic_nfa(&'b'); - let concatenated_nfa = concatenate(&nfa_a, &nfa_b); - - let expected_transitions = - HashMap::from([((0, Some('a')), vec![1]), ((1, Some('b')), vec![2])]); - let expected_accepting_state: u32 = 2; - - assert_eq!(concatenated_nfa.transitions, expected_transitions); - assert_eq!(concatenated_nfa.accepting_state, expected_accepting_state); - } - - #[test] - fn apply_kleene_star_test() { - let basic_nfa = create_basic_nfa(&'a'); - let starred_nfa = apply_kleene_star(&basic_nfa); - - let expected_transitions = HashMap::from([ - ((0, None), vec![1, 3]), // Epsilon to start and new accepting - ((1, Some('a')), vec![2]), // Original transition - ((2, None), vec![1, 3]), // Loop back and transition to new accepting - ]); - - let expected_accepting_state: u32 = 3; - - assert_eq!(starred_nfa.transitions, expected_transitions); - assert_eq!(starred_nfa.accepting_state, expected_accepting_state); - } - - #[test] - fn union_test() { - let nfa_a = create_basic_nfa(&'a'); - let nfa_b = create_basic_nfa(&'b'); - let union_nfa = union(&nfa_a, &nfa_b); - - let expected_transitions = HashMap::from([ - ((0, None), vec![1, 3]), // Combined initial state transitions - ((1, Some('a')), vec![2]), // Offset transitions for NFA a - ((3, Some('b')), vec![4]), // Offset transitions for NFA b - ((2, None), vec![5]), // Accepting state transition for a - ((4, None), vec![5]), // Accepting state transition for b - ]); - - let expected_accepting_state: u32 = 5; - - assert_eq!(union_nfa.transitions, expected_transitions); - assert_eq!(union_nfa.accepting_state, expected_accepting_state); - } - - #[test] - fn thompson_construction_test() { - let regex_nfa = thompson_construction("(a|b)*"); - - let expected_transitions = HashMap::from([ - ((0, None), vec![1, 7]), - ((1, None), vec![2, 4]), - ((2, Some('a')), vec![3]), - ((3, None), vec![6]), - ((4, Some('b')), vec![5]), - ((5, None), vec![6]), - ((6, None), vec![1, 7]), - ]); - let expected_accepting_state = 7; - - assert_eq!(regex_nfa.transitions, expected_transitions); - assert_eq!(regex_nfa.accepting_state, expected_accepting_state); - } - - #[test] - fn nfa_to_dfa_test() { - let input_nfa = NFA { - transitions: HashMap::from([ - ((0, None), vec![1, 7]), - ((1, None), vec![2, 4]), - ((2, Some('a')), vec![3]), - ((3, None), vec![6]), - ((4, Some('b')), vec![5]), - ((5, None), vec![6]), - ((6, None), vec![1, 7]), - ]), - accepting_state: 7, - }; - - let generated_dfa = nfa_to_dfa(&input_nfa); - - let expected_options = vec![ - HashMap::from([ - ((0, Some('a')), 1), - ((0, Some('b')), 2), - ((1, Some('a')), 1), - ((1, Some('b')), 2), - ((2, Some('a')), 1), - ((2, Some('b')), 2), - ]), - HashMap::from([ - ((0, Some('a')), 2), - ((0, Some('b')), 1), - ((1, Some('a')), 2), - ((1, Some('b')), 1), - ((2, Some('a')), 2), - ((2, Some('b')), 1), - ]), - ]; - let expected_accepting_states = HashSet::from([0, 1, 2]); - - assert!( - expected_options.contains(&generated_dfa.transitions), - "Transitions did not match any of the expected options." - ); - assert_eq!(expected_accepting_states, generated_dfa.accepting_states); - } -} diff --git a/src/glushkov.rs b/src/glushkov.rs new file mode 100644 index 0000000..85b3093 --- /dev/null +++ b/src/glushkov.rs @@ -0,0 +1,554 @@ +use crate::{Dfa, is_valid_regex, normalise_regex}; +use std::collections::{BTreeSet, HashMap, HashSet, VecDeque}; + +#[derive(Debug, Clone)] +enum RegexAst { + Char(char), + Concat(Vec), + Alternation(Vec), + KleeneStar(Box), +} + +#[derive(Debug)] +struct Nfa { + transitions: HashMap<(u32, char), Vec>, + accepting_states: HashSet, +} + +#[derive(Debug)] +pub struct GlushkovDfa { + transitions: HashMap<(u32, char), u32>, + accepting_states: HashSet, +} + +impl Dfa for GlushkovDfa { + fn new(regex: &str) -> Result { + if !is_valid_regex(regex) { + return Err(format!("{regex} is not a valid regular expression!")); + } + + let normalised_regex = normalise_regex(regex); + let ast = parse_regex(&normalised_regex)?; + let nfa = glushkov_construction(ast)?; + let mut regex_dfa = nfa_to_dfa(nfa); + + ::optimise_dfa(&mut regex_dfa); + Ok(regex_dfa) + } + + fn get_transitions(&self) -> &HashMap<(u32, char), u32> { + &self.transitions + } + + fn get_accepting_states(&self) -> &HashSet { + &self.accepting_states + } + + fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32> { + &mut self.transitions + } + + fn get_accepting_states_mut(&mut self) -> &mut HashSet { + &mut self.accepting_states + } +} + +// Parser for regex string to AST +fn parse_regex(regex: &str) -> Result { + let chars: Vec = regex.chars().collect(); + let (ast, pos) = parse_alternation(&chars, 0)?; + + if pos != chars.len() { + return Err("Unexpected characters at end of regex".to_string()); + } + + Ok(ast) +} + +fn parse_alternation(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> { + let mut alternatives = Vec::new(); + + let (first_alt, new_pos) = parse_concatenation(chars, pos)?; + alternatives.push(first_alt); + pos = new_pos; + + while pos < chars.len() && chars[pos] == '|' { + pos += 1; // skip '|' + let (alt, new_pos) = parse_concatenation(chars, pos)?; + alternatives.push(alt); + pos = new_pos; + } + + if alternatives.len() == 1 { + Ok((alternatives.into_iter().next().unwrap(), pos)) + } else { + Ok((RegexAst::Alternation(alternatives), pos)) + } +} + +fn parse_concatenation(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> { + let mut elements = Vec::new(); + + while pos < chars.len() && chars[pos] != '|' && chars[pos] != ')' { + let (element, new_pos) = parse_factor(chars, pos)?; + elements.push(element); + pos = new_pos; + } + + // Handle empty concatenation (empty alternative) + if elements.is_empty() { + // Return an epsilon (empty string) represented as an empty concatenation + return Ok((RegexAst::Concat(vec![]), pos)); + } + + if elements.len() == 1 { + Ok((elements.into_iter().next().unwrap(), pos)) + } else { + Ok((RegexAst::Concat(elements), pos)) + } +} + +fn parse_factor(chars: &[char], mut pos: usize) -> Result<(RegexAst, usize), String> { + if pos >= chars.len() { + return Err("Unexpected end of regex".to_string()); + } + + let (base, new_pos) = match chars[pos] { + '(' => { + pos += 1; // skip '(' + let (inner, inner_pos) = parse_alternation(chars, pos)?; + if inner_pos >= chars.len() || chars[inner_pos] != ')' { + return Err("Unmatched opening parenthesis".to_string()); + } + (inner, inner_pos + 1) // skip ')' + } + '\\' => { + if pos + 1 >= chars.len() { + return Err("Invalid escape sequence".to_string()); + } + pos += 1; // skip '\' + (RegexAst::Char(chars[pos]), pos + 1) + } + c if c.is_ascii() && !"()|*+\\".contains(c) => (RegexAst::Char(c), pos + 1), + _ => { + return Err(format!("Unexpected character: {}", chars[pos])); + } + }; + + pos = new_pos; + + // Check for Kleene star + if pos < chars.len() && chars[pos] == '*' { + pos += 1; + Ok((RegexAst::KleeneStar(Box::new(base)), pos)) + } else { + Ok((base, pos)) + } +} + +fn glushkov_construction(ast: RegexAst) -> Result { + let mut state_counter = 0u32; + let mut state_to_char: HashMap = HashMap::new(); + + // Assign unique state numbers to each character occurrence + assign_positions(&ast, &mut state_counter, &mut state_to_char); + + let start_state = state_counter; + + // Compute First, Last, Follow sets - each with fresh position counter + let first_set = first(&ast); + let last_set = last(&ast); + let follow_map = follow(&ast); + + // Build NFA + let mut transitions = HashMap::new(); + let mut accepting_states = HashSet::new(); + + // Transitions from start state + for &state in &first_set { + if let Some(&ch) = state_to_char.get(&state) { + transitions + .entry((start_state, ch)) + .or_insert_with(Vec::new) + .push(state); + } + } + + // Internal transitions based on follow sets + for (state, follow_states) in follow_map { + for &follow_state in &follow_states { + if let Some(&ch) = state_to_char.get(&follow_state) { + transitions + .entry((state, ch)) + .or_insert_with(Vec::new) + .push(follow_state); + } + } + } + + // Accepting states + if nullable(&ast) { + accepting_states.insert(start_state); + } + for &state in &last_set { + accepting_states.insert(state); + } + + Ok(Nfa { + transitions, + accepting_states, + }) +} + +fn first(ast: &RegexAst) -> HashSet { + let mut positions = HashMap::new(); + let mut counter = 0; + map_ast_to_positions(ast, &mut counter, &mut positions); + first_positions(ast, &positions) +} + +fn last(ast: &RegexAst) -> HashSet { + let mut positions = HashMap::new(); + let mut counter = 0; + map_ast_to_positions(ast, &mut counter, &mut positions); + last_positions(ast, &positions) +} + +fn follow(ast: &RegexAst) -> HashMap> { + let mut positions = HashMap::new(); + let mut counter = 0; + map_ast_to_positions(ast, &mut counter, &mut positions); + + let mut result = HashMap::new(); + follow_positions(ast, &positions, &mut result); + result +} + +// Helper function to create a mapping from AST nodes to their position ranges +fn map_ast_to_positions( + ast: &RegexAst, + counter: &mut u32, + positions: &mut HashMap<*const RegexAst, (u32, u32)>, +) { + let start_pos = *counter; + + match ast { + RegexAst::Char(_) => { + *counter += 1; + } + RegexAst::Concat(elements) => { + for element in elements { + map_ast_to_positions(element, counter, positions); + } + } + RegexAst::Alternation(alternatives) => { + for alt in alternatives { + map_ast_to_positions(alt, counter, positions); + } + } + RegexAst::KleeneStar(inner) => { + map_ast_to_positions(inner, counter, positions); + } + } + + positions.insert(ast as *const RegexAst, (start_pos, *counter)); +} + +fn first_positions( + ast: &RegexAst, + positions: &HashMap<*const RegexAst, (u32, u32)>, +) -> HashSet { + match ast { + RegexAst::Char(_) => { + let (start_pos, _) = positions[&(ast as *const RegexAst)]; + let mut result = HashSet::new(); + result.insert(start_pos); + result + } + RegexAst::Concat(elements) => { + let mut result = HashSet::new(); + for element in elements { + result.extend(first_positions(element, positions)); + if !nullable(element) { + break; + } + } + result + } + RegexAst::Alternation(alternatives) => { + let mut result = HashSet::new(); + for alt in alternatives { + result.extend(first_positions(alt, positions)); + } + result + } + RegexAst::KleeneStar(inner) => first_positions(inner, positions), + } +} + +fn last_positions( + ast: &RegexAst, + positions: &HashMap<*const RegexAst, (u32, u32)>, +) -> HashSet { + match ast { + RegexAst::Char(_) => { + let (start_pos, _) = positions[&(ast as *const RegexAst)]; + let mut result = HashSet::new(); + result.insert(start_pos); + result + } + RegexAst::Concat(elements) => { + let mut result = HashSet::new(); + for element in elements.iter().rev() { + result.extend(last_positions(element, positions)); + if !nullable(element) { + break; + } + } + result + } + RegexAst::Alternation(alternatives) => { + let mut result = HashSet::new(); + for alt in alternatives { + result.extend(last_positions(alt, positions)); + } + result + } + RegexAst::KleeneStar(inner) => last_positions(inner, positions), + } +} + +fn follow_positions( + ast: &RegexAst, + positions: &HashMap<*const RegexAst, (u32, u32)>, + result: &mut HashMap>, +) { + match ast { + RegexAst::Char(_) => { + // Base case - no follow computation needed + } + RegexAst::Concat(elements) => { + // Process each element recursively + for element in elements { + follow_positions(element, positions, result); + } + + // Add follow relationships between consecutive elements + for i in 0..elements.len() { + let last_i = last_positions(&elements[i], positions); + + // For each subsequent element j > i + for j in (i + 1)..elements.len() { + // Check if all elements between i and j are nullable + let all_between_nullable = elements[(i + 1)..j].iter().all(nullable); + + if j == i + 1 || all_between_nullable { + let first_j = first_positions(&elements[j], positions); + + // Add follow relationships from last(i) to first(j) + for &last_state in &last_i { + result.entry(last_state).or_default().extend(&first_j); + } + } + + // If element j is not nullable, we can't skip further + if !nullable(&elements[j]) { + break; + } + } + } + } + RegexAst::Alternation(alternatives) => { + for alt in alternatives { + follow_positions(alt, positions, result); + } + } + RegexAst::KleeneStar(inner) => { + follow_positions(inner, positions, result); + + // Kleene star: last positions can loop back to first positions + let inner_last = last_positions(inner, positions); + let inner_first = first_positions(inner, positions); + + for &last_state in &inner_last { + result.entry(last_state).or_default().extend(&inner_first); + } + } + } +} + +fn nullable(ast: &RegexAst) -> bool { + match ast { + RegexAst::Char(_) => false, + RegexAst::Concat(elements) => { + // Empty concat is nullable (represents epsilon) + elements.is_empty() || elements.iter().all(nullable) + } + RegexAst::Alternation(alternatives) => alternatives.iter().any(nullable), + RegexAst::KleeneStar(_) => true, + } +} + +fn assign_positions(ast: &RegexAst, counter: &mut u32, state_to_char: &mut HashMap) { + match ast { + RegexAst::Char(ch) => { + let state = *counter; + *counter += 1; + state_to_char.insert(state, *ch); + } + RegexAst::Concat(elements) => { + for element in elements { + assign_positions(element, counter, state_to_char); + } + } + RegexAst::Alternation(alternatives) => { + for alt in alternatives { + assign_positions(alt, counter, state_to_char); + } + } + RegexAst::KleeneStar(inner) => { + assign_positions(inner, counter, state_to_char); + } + } +} + +fn nfa_to_dfa(nfa: Nfa) -> GlushkovDfa { + let mut dfa_transitions = HashMap::new(); + let mut dfa_accepting_states = HashSet::new(); + let mut state_sets_to_dfa_state: HashMap, u32> = HashMap::new(); + let mut queue = VecDeque::new(); + let mut next_dfa_state = 0u32; + + // Get alphabet from NFA + let alphabet: HashSet = nfa.transitions.keys().map(|(_, ch)| *ch).collect(); + + // Find start state (highest numbered state in NFA) + let mut all_nfa_states = HashSet::new(); + + for &(from_state, _) in nfa.transitions.keys() { + all_nfa_states.insert(from_state); + } + for target_states in nfa.transitions.values() { + for &to_state in target_states { + all_nfa_states.insert(to_state); + } + } + for &accepting_state in &nfa.accepting_states { + all_nfa_states.insert(accepting_state); + } + + let start_state = all_nfa_states.iter().max().copied().unwrap_or(0); + + let start_set: BTreeSet = { + let mut set = BTreeSet::new(); + set.insert(start_state); + set + }; + + state_sets_to_dfa_state.insert(start_set.clone(), next_dfa_state); + queue.push_back(start_set); + next_dfa_state += 1; + + while let Some(current_set) = queue.pop_front() { + let current_dfa_state = state_sets_to_dfa_state[¤t_set]; + + // Check if this DFA state should be accepting + if current_set + .iter() + .any(|&s| nfa.accepting_states.contains(&s)) + { + dfa_accepting_states.insert(current_dfa_state); + } + + // For each symbol in alphabet + for &symbol in &alphabet { + let mut next_set = BTreeSet::new(); + + // Collect all states reachable via this symbol + for &state in ¤t_set { + if let Some(targets) = nfa.transitions.get(&(state, symbol)) { + next_set.extend(targets); + } + } + + if !next_set.is_empty() { + let next_dfa_state = if let Some(&existing) = state_sets_to_dfa_state.get(&next_set) + { + existing + } else { + let new_state = next_dfa_state; + next_dfa_state += 1; + state_sets_to_dfa_state.insert(next_set.clone(), new_state); + queue.push_back(next_set.clone()); + new_state + }; + + dfa_transitions.insert((current_dfa_state, symbol), next_dfa_state); + } + } + } + + // Normalize to start from state 0 + normalize_dfa_states(dfa_transitions, dfa_accepting_states) +} + +fn normalize_dfa_states( + transitions: HashMap<(u32, char), u32>, + accepting_states: HashSet, +) -> GlushkovDfa { + if transitions.is_empty() && accepting_states.is_empty() { + return GlushkovDfa { + transitions, + accepting_states, + }; + } + + // Find all states + let mut all_states = HashSet::new(); + for &(from, _) in transitions.keys() { + all_states.insert(from); + } + for &to in transitions.values() { + all_states.insert(to); + } + all_states.extend(&accepting_states); + + if all_states.is_empty() { + return GlushkovDfa { + transitions, + accepting_states, + }; + } + + // Create mapping with 0 as start state + let start_state = *all_states.iter().min().unwrap(); + let mut state_mapping = HashMap::new(); + state_mapping.insert(start_state, 0); + + let mut next_state = 1; + for &state in &all_states { + if state != start_state { + state_mapping.insert(state, next_state); + next_state += 1; + } + } + + // Remap transitions + let mut new_transitions = HashMap::new(); + for ((from, symbol), to) in transitions { + let new_from = state_mapping[&from]; + let new_to = state_mapping[&to]; + new_transitions.insert((new_from, symbol), new_to); + } + + // Remap accepting states + let mut new_accepting_states = HashSet::new(); + for state in accepting_states { + new_accepting_states.insert(state_mapping[&state]); + } + + GlushkovDfa { + transitions: new_transitions, + accepting_states: new_accepting_states, + } +} diff --git a/src/lib.rs b/src/lib.rs index 244eee8..482ee9a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,610 @@ -mod dfa; -pub mod regex_engine; +use crate::{glushkov::GlushkovDfa, thompson::ThompsonDfa}; +use std::collections::{HashMap, HashSet, VecDeque}; + +mod glushkov; +mod thompson; + +trait Dfa { + fn new(regex: &str) -> Result + where + Self: std::marker::Sized; + fn get_transitions(&self) -> &HashMap<(u32, char), u32>; + fn get_accepting_states(&self) -> &HashSet; + fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32>; + fn get_accepting_states_mut(&mut self) -> &mut HashSet; + fn optimise_dfa(&mut self) { + let mut partition: HashMap = HashMap::new(); + let mut accepting_states_set: HashSet = self.get_accepting_states().clone(); + let mut non_accepting_states: HashSet = HashSet::new(); + let mut all_states: HashSet = HashSet::new(); + + for &(state, _) in self.get_transitions().keys() { + all_states.insert(state); + if self.get_accepting_states().contains(&state) { + accepting_states_set.insert(state); + } else { + non_accepting_states.insert(state); + } + } + + for state in self.get_accepting_states().iter() { + all_states.insert(*state); + } + + for state in all_states.iter() { + if self.get_accepting_states().contains(state) { + partition.insert(*state, 0); + } else { + partition.insert(*state, 1); + } + } + + let mut partition_list: Vec> = Vec::new(); + partition_list.push(accepting_states_set); + partition_list.push(non_accepting_states); + + let mut worklist: VecDeque = VecDeque::new(); + if !partition_list[0].is_empty() { + worklist.push_back(0); + } + if partition_list.len() > 1 && !partition_list[1].is_empty() { + worklist.push_back(1); + } + + while let Some(current_partition_index) = worklist.pop_front() { + let mut states_to_check: HashMap> = HashMap::new(); + for (&(source_state, symbol), &target_state) in self.get_transitions() { + if partition[&target_state] == current_partition_index { + states_to_check + .entry(symbol) + .or_default() + .insert(source_state); + } + } + + for (_, states_to_split) in states_to_check.iter() { + let mut partitions_to_split: HashSet = HashSet::new(); + + for &state in states_to_split.iter() { + let partition_index = partition[&state]; + if partition_list[partition_index].len() > 1 { + partitions_to_split.insert(partition_index); + } + } + + for &partition_index_to_split in partitions_to_split.iter() { + let mut intersection: HashSet = HashSet::new(); + let mut difference: HashSet = HashSet::new(); + + for &state in partition_list[partition_index_to_split].iter() { + if states_to_split.contains(&state) { + intersection.insert(state); + } else { + difference.insert(state); + } + } + + if !intersection.is_empty() && !difference.is_empty() { + let new_partition_index = partition_list.len(); + + for &state in intersection.iter() { + partition.insert(state, new_partition_index); + } + + partition_list.push(intersection); + + for &state in &difference { + partition.insert(state, partition_index_to_split); + } + partition_list[partition_index_to_split] = difference; + + if partition_list[new_partition_index].len() + < partition_list[partition_index_to_split].len() + { + worklist.push_back(new_partition_index); + } else { + worklist.push_back(partition_index_to_split); + } + } + } + } + } + + // Build new transitions and accepting states + let mut minimal_transitions: HashMap<(u32, char), u32> = HashMap::new(); + let mut minimal_accepting_states: HashSet = HashSet::new(); + let mut new_state_map: HashMap = HashMap::new(); + + let mut next_state_id: u32 = 0; + + if let Some(partition_index) = partition.get(&0) { + new_state_map.insert(*partition_index, next_state_id); + next_state_id += 1; + } + + for (_, &partition_index) in partition.iter() { + if let std::collections::hash_map::Entry::Vacant(e) = + new_state_map.entry(partition_index) + { + e.insert(next_state_id); + next_state_id += 1; + } + } + + for (original_state, &partition_index) in partition.iter() { + let new_state_id = new_state_map[&partition_index]; + if self.get_accepting_states().contains(original_state) { + minimal_accepting_states.insert(new_state_id); + } + } + + for (&(source_state, symbol), &target_state) in self.get_transitions() { + let source_partition = partition[&source_state]; + let target_partition = partition[&target_state]; + + let new_source_state = new_state_map[&source_partition]; + let new_target_state = new_state_map[&target_partition]; + + minimal_transitions.insert((new_source_state, symbol), new_target_state); + } + + // Modify the existing DFA in-place + *self.get_transitions_mut() = minimal_transitions; + *self.get_accepting_states_mut() = minimal_accepting_states; + } + + /// Determines if the given input string exactly matches the regex pattern. + /// + /// This function processes the input as though it is surrounded by start (`^`) and + /// end (`$`) position anchors, ensuring that the entire input must conform to the pattern. + /// + /// # Parameters + /// + /// - `input`: A string slice representing the text to be checked against the regex. + /// + /// # Returns + /// + /// Returns `true` if the entire input string matches the regex pattern exactly, + /// considering implicit start and end anchors. + /// + /// e.g., for the regex pattern "(a|b)*", the function checks if the input matches + /// the pattern from start to finish, equivalent to "^(a|b)*$". + /// + fn process(&self, input: &str) -> bool { + let mut current_state = 0; + for c in input.chars() { + if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) { + current_state = next_state; + } else { + return false; + } + } + self.get_accepting_states().contains(¤t_state) + } + + fn find_first_match<'a>(&self, text: &'a str) -> Option<&'a str> { + let mut start_pos = 0; + while start_pos < text.len() { + let mut current_state = 0; + let mut match_start = None; + let mut match_end = None; + + for (i, c) in text.chars().enumerate().skip(start_pos) { + if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) { + current_state = next_state; + match_start = match_start.or(Some(i)); + + if self.get_accepting_states().contains(¤t_state) { + match_end = Some(i) + } + } else { + break; + } + } + + if let (Some(start), Some(end)) = (match_start, match_end) { + return Some(&text[start..=end]); + } else { + start_pos += 1; + } + } + + None + } + + fn find_all_matches<'a>(&self, input: &'a str) -> Vec<&'a str> { + let mut matches: Vec<&str> = Vec::new(); + + let mut start_pos = 0; + while start_pos < input.len() { + let mut current_state = 0; + let mut match_start: Option = None; + let mut match_end: Option = None; + + for (i, c) in input.chars().enumerate().skip(start_pos) { + if let Some(&next_state) = self.get_transitions().get(&(current_state, c)) { + current_state = next_state; + match_start = match_start.or(Some(start_pos)); + + if self.get_accepting_states().contains(¤t_state) { + match_end = Some(i); + } + } else { + break; + } + } + + if let (Some(start), Some(end)) = (match_start, match_end) { + matches.push(&input[start..=end]); + start_pos = end + 1; + } else { + start_pos += 1; + } + } + + matches + } +} + +pub enum ConstructionType { + Thompson, + Glushkov, +} + +enum DfaType { + Thompson(ThompsonDfa), + Glushkov(GlushkovDfa), +} + +pub struct Regex { + dfa: DfaType, +} + +impl Regex { + pub fn new(pattern: &str, construction: ConstructionType) -> Result { + let dfa_type = match construction { + ConstructionType::Thompson => DfaType::Thompson(ThompsonDfa::new(pattern)?), + ConstructionType::Glushkov => DfaType::Glushkov(GlushkovDfa::new(pattern)?), + }; + Ok(Regex { dfa: dfa_type }) + } + + /// Determines if the provided `text` is an exact match for the regex pattern. + /// + /// This method interprets the regex pattern as though it is bracketed by start (`^`) + /// and end (`$`) anchors, requiring the entire `text` to conform to the pattern. + /// + /// # Parameters + /// + /// - `text`: A string slice that represents the text to be verified against the regex. + /// + /// # Returns + /// + /// Returns `true` if the `text` completely matches the regex pattern encompassed by implicit + /// anchors, otherwise returns `false`. + /// + /// # Example + /// + /// ```rust + /// use regex_engine::{Regex, ConstructionType}; + /// + /// let regex = Regex::new("(a|b)*", ConstructionType::Thompson).expect("Valied regex"); + /// assert!(regex.is_match("abba")); + /// assert!(!regex.is_match("abc")); + /// ``` + pub fn is_match(&self, text: &str) -> bool { + match &self.dfa { + DfaType::Thompson(dfa) => dfa.process(text), + DfaType::Glushkov(dfa) => dfa.process(text), + } + } + + /// Searches for the first occurrence of a sequence in `text` that matches the regex pattern. + /// + /// This method locates and returns the first substring of `text` that matches the regex, + /// if such a substring exists. + /// + /// # Parameters + /// + /// - `text`: A string slice in which to search for the regex pattern. + /// + /// # Returns + /// + /// Returns an `Option<&str>` which contains the first matching substring if a match is found, + /// or `None` if no match occurs. + /// + /// # Example + /// + /// ```rust + /// use regex_engine::{Regex, ConstructionType}; + /// + /// let regex = Regex::new("ab+", ConstructionType::Thompson).expect("Valied regex"); + /// if let Some(matched) = regex.find("aabbcc") { + /// println!("Found: {}", matched); + /// } + /// // Output: Found: abb + /// ``` + pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> { + match &self.dfa { + DfaType::Thompson(dfa) => dfa.find_first_match(text), + DfaType::Glushkov(dfa) => dfa.find_first_match(text), + } + } + + pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> { + match &self.dfa { + DfaType::Thompson(dfa) => dfa.find_all_matches(text), + DfaType::Glushkov(dfa) => dfa.find_all_matches(text), + } + } +} + +pub fn is_valid_regex(regex: &str) -> bool { + if regex.is_empty() { + return false; + } + + let mut open_paren_count = 0; + let mut last_was_quantifier = true; + + let mut chars = regex.chars().peekable(); + while let Some(c) = chars.next() { + match c { + '(' => { + open_paren_count += 1; + last_was_quantifier = true; + } + ')' => { + if open_paren_count == 0 { + return false; + } + open_paren_count -= 1; + last_was_quantifier = false; + } + '*' | '+' => { + // Ensure quantifiers are not the first character and are not repeated + if last_was_quantifier { + return false; + } + last_was_quantifier = true; + } + '\\' => { + // Handle escaped characters: ensure there's a character after the escape + if chars.peek().is_none() { + return false; + } + chars.next(); // Skip the escaped character + last_was_quantifier = false; + } + + _ => { + last_was_quantifier = false; + } + } + } + + open_paren_count == 0 +} + +pub fn normalise_regex(regex: &str) -> String { + let mut normalised = String::new(); + let mut escape_sequence = false; + let mut prev_char = '\0'; + for curr_char in regex.chars() { + if escape_sequence { + // TODO: Implement further parsing features here (e.g. \w \d) + normalised.push(curr_char); + escape_sequence = false; + prev_char = curr_char; + continue; + } + if curr_char == '\\' { + escape_sequence = true; + normalised.push(curr_char); + continue; + } + if curr_char == '+' { + match prev_char { + ')' => { + let mut balance = 0; + let mut group_start = 0; + + for j in (0..normalised.len()).rev() { + let ch = normalised.chars().nth(j).unwrap(); + if ch == ')' { + balance += 1; + } else if ch == '(' { + balance -= 1; + if balance == 0 { + group_start = j; + break; + } + } + } + + let group = String::from(&normalised[group_start..normalised.len()]); + normalised.push_str(&group); + } + _ => { + normalised.push(prev_char); + } + } + normalised.push('*'); + prev_char = '*'; + continue; + } + if curr_char == '?' { + match prev_char { + ')' => { + let mut balance = 0; + for j in (0..normalised.len()).rev() { + let ch = normalised.chars().nth(j).unwrap(); + if ch == ')' { + balance += 1; + } else if ch == '(' { + balance -= 1; + if balance == 0 { + normalised.insert(j, '('); + break; + } + } + } + } + _ => { + normalised.insert(normalised.len() - 1, '('); + } + } + normalised.push_str("|)"); + prev_char = ')'; + continue; + } + if curr_char == '.' { + normalised.push_str("(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)"); + prev_char = ')'; + continue; + } + normalised.push(curr_char); + prev_char = curr_char; + } + normalised +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn valid_regex_basic_test() { + let regex = "(a|b)*"; + assert!(is_valid_regex(regex), "Expected valid regex."); + } + + #[test] + fn invalid_empty_regex_test() { + let regex = ""; + assert!(!is_valid_regex(regex), "Expected invalid regex (empty)."); + } + + #[test] + fn invalid_unbalanced_parentheses_test() { + let regex1 = "(a|b"; + let regex2 = "a|b)"; + assert!( + !is_valid_regex(regex1), + "Expected invalid regex (unbalanced parentheses)." + ); + assert!( + !is_valid_regex(regex2), + "Expected invalid regex (unbalanced parentheses)." + ); + } + + #[test] + fn invalid_operator_placement_test() { + let regex1 = "*a"; + let regex2 = "(+abc|x)"; + assert!( + !is_valid_regex(regex1), + "Expected invalid regex (invalid quantifier placement)." + ); + assert!( + !is_valid_regex(regex2), + "Expected invalid regex (invalid alternation placement)." + ); + } + + #[test] + fn valid_nested_parentheses_test() { + let regex = "((a|b)*c)"; + assert!( + is_valid_regex(regex), + "Expected valid regex with nested parentheses." + ); + } + + #[test] + fn valid_escape_sequence_test() { + let regex = "a\\*b"; + assert!( + is_valid_regex(regex), + "Expected valid regex with escape sequence." + ); + } + + #[test] + fn invalid_escape_sequence_test() { + let regex = "a\\"; + assert!( + !is_valid_regex(regex), + "Expected invalid regex with unpaired escape." + ); + } + + #[test] + fn normalise_regex_test() { + let cases = [ + (r"a+", r"aa*"), + (r"a\+", r"a\+"), + (r"a?", r"(a|)"), + (r"a\?", r"a\?"), + (r"(ab)?", r"((ab)|)"), + ( + r".", + "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|?|@|[|\\\\|]|^|_|`|{|}|~)", + ), + ]; + + for (input, expected) in cases { + let result = normalise_regex(input); + assert_eq!(result, expected, "Normalisation failed for input '{input}'"); + } + } + + #[test] + fn is_match_test() { + let regex_object = Regex::new("a(a|b)*", ConstructionType::Thompson).expect("Valid regex"); + + let success_strings = vec!["abababaaaababa", "a"]; + for string in success_strings { + assert!(regex_object.is_match(string)); + } + + let failing_strings = vec!["abc", "x"]; + for string in failing_strings { + assert!(!regex_object.is_match(string)); + } + } + + #[test] + fn find_test() { + let regex_object = Regex::new("abc", ConstructionType::Thompson).expect("Valid regex"); + let test_cases = vec![ + ("abcd", Some("abc")), + ("xyzabc", Some("abc")), + ("abc", Some("abc")), + ("ac", None), + ("def", None), + ("aabc", Some("abc")), + ]; + + for (text, expected) in test_cases { + let result = regex_object.find(text); + assert_eq!(result, expected, "Failed for input: {text}"); + } + } + + #[test] + fn find_all_test() { + let regex_object = Regex::new("abc*", ConstructionType::Thompson).expect("Valid regex"); + let test_cases = vec![ + ("abcd", vec!["abc"]), + ("ac", vec![]), + ("abcab", vec!["abc", "ab"]), + ]; + + for (text, expected) in test_cases { + let result = regex_object.findall(text); + assert_eq!(result, expected, "Failed for input: {text}"); + } + } +} diff --git a/src/regex_engine.rs b/src/regex_engine.rs deleted file mode 100644 index 2d528cf..0000000 --- a/src/regex_engine.rs +++ /dev/null @@ -1,78 +0,0 @@ -use crate::dfa::DFA; - -pub struct Regex { - dfa: DFA, -} - -impl Regex { - pub fn new(pattern: &str) -> Self { - Regex { - dfa: DFA::new(pattern), - } - } - - pub fn is_match(&self, text: &str) -> bool { - self.dfa.process(text) - } - - pub fn find<'a>(&self, text: &'a str) -> Option<&'a str> { - self.dfa.find_first_match(text) - } - - pub fn findall<'a>(&self, text: &'a str) -> Vec<&'a str> { - self.dfa.find_all_matches(text) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn is_match_test() { - let regex_object = Regex::new("(a|b)*"); - - let success_strings = vec!["abababaaaababa", ""]; - for string in success_strings { - assert!(regex_object.is_match(string)); - } - - let failing_strings = vec!["abc", "x"]; - for string in failing_strings { - assert!(!regex_object.is_match(string)); - } - } - - #[test] - fn find_test() { - let regex_object = Regex::new("abc"); - let test_cases = vec![ - ("abcd", Some("abc")), - ("xyzabc", Some("abc")), - ("abc", Some("abc")), - ("ac", None), - ("def", None), - ("aabc", Some("abc")), - ]; - - for (text, expected) in test_cases { - let result = regex_object.find(text); - assert_eq!(result, expected, "Failed for input: {}", text); - } - } - - #[test] - fn find_all_test() { - let regex_object = Regex::new("abc*"); - let test_cases = vec![ - ("abcd", vec!["abc"]), - ("ac", vec![]), - ("abcab", vec!["abc", "ab"]), - ]; - - for (text, expected) in test_cases { - let result = regex_object.findall(text); - assert_eq!(result, expected, "Failed for input: {}", text); - } - } -} diff --git a/src/thompson.rs b/src/thompson.rs new file mode 100644 index 0000000..a74b138 --- /dev/null +++ b/src/thompson.rs @@ -0,0 +1,532 @@ +use crate::{Dfa, is_valid_regex, normalise_regex}; +use std::collections::{HashMap, HashSet}; + +struct Nfa { + transitions: HashMap<(u32, Option), Vec>, + accepting_state: u32, // the thompson construction always has one accepting_state +} + +pub struct ThompsonDfa { + transitions: HashMap<(u32, char), u32>, + accepting_states: HashSet, +} + +impl Dfa for ThompsonDfa { + fn new(regex: &str) -> Result { + if !is_valid_regex(regex) { + return Err("{regex} is not a valid regular expression!".to_string()); + } + + let normalised_regex = normalise_regex(regex); + let regex_nfa: Nfa = thompson_construction(&normalised_regex); + let mut regex_dfa = nfa_to_dfa(®ex_nfa); + ::optimise_dfa(&mut regex_dfa); + Ok(regex_dfa) + } + + fn get_transitions(&self) -> &HashMap<(u32, char), u32> { + &self.transitions + } + + fn get_accepting_states(&self) -> &HashSet { + &self.accepting_states + } + + fn get_transitions_mut(&mut self) -> &mut HashMap<(u32, char), u32> { + &mut self.transitions + } + + fn get_accepting_states_mut(&mut self) -> &mut HashSet { + &mut self.accepting_states + } +} + +// THOMPSON CONSTRUCTION --- +fn thompson_construction(normalised_regex: &str) -> Nfa { + fn apply_operator(nfa_stack: &mut Vec, operator: char) { + match operator { + '|' => { + let nfa_right = nfa_stack.pop().expect("Expected NFA for union"); + let nfa_left = nfa_stack.pop().expect("Expected NFA for union"); + nfa_stack.push(union(&nfa_left, &nfa_right)); + } + '.' => { + let nfa_right = nfa_stack.pop().expect("Expected NFA for concatenation"); + let nfa_left = nfa_stack.pop().expect("Expected NFA for concatenation"); + nfa_stack.push(concatenate(&nfa_left, &nfa_right)); + } + _ => unreachable!("Unknown operator {}", operator), + } + } + + let mut operators: Vec = Vec::new(); + let mut nfa_stack: Vec = Vec::new(); + let mut concat_flag = false; + let mut escape_sequence = false; + + for symbol in normalised_regex.chars() { + if escape_sequence { + if concat_flag { + operators.push('.'); + } + nfa_stack.push(create_basic_nfa(&symbol)); + concat_flag = true; + escape_sequence = false; + continue; + } + + match symbol { + '(' => { + if concat_flag { + operators.push('.'); + } + operators.push('('); + concat_flag = false; + } + ')' => { + // If concat_flag is false, we have an empty right operand for union + if !concat_flag { + nfa_stack.push(create_basic_epsilon_nfa()); + } + + // Process all operators until we hit the matching '(' + while let Some(op) = operators.pop() { + if op == '(' { + break; + } + apply_operator(&mut nfa_stack, op); + } + + // If stack is empty after processing, we had completely empty parentheses + if nfa_stack.is_empty() { + nfa_stack.push(create_basic_epsilon_nfa()); + } + + concat_flag = true; + } + '*' => { + let last_nfa = nfa_stack.pop().expect("Expected NFA for Kleene Star"); + nfa_stack.push(apply_kleene_star(&last_nfa)); + concat_flag = true; + } + '|' => { + // Process all concatenation operators (higher precedence than union) + while let Some(&op) = operators.last() { + if op == '(' || op == '|' { + break; + } + operators.pop(); + apply_operator(&mut nfa_stack, op); + } + + // If we have no operand for the left side of union, create epsilon + if !concat_flag { + nfa_stack.push(create_basic_epsilon_nfa()); + } + + operators.push('|'); + concat_flag = false; + } + '\\' => { + escape_sequence = true; + } + _ => { + if concat_flag { + operators.push('.'); + } + nfa_stack.push(create_basic_nfa(&symbol)); + concat_flag = true; + } + } + } + + // Handle case where regex ends with '|' (empty right operand) + if let Some(&'|') = operators.last() + && nfa_stack.len() < 2 + { + nfa_stack.push(create_basic_epsilon_nfa()); + } + + // Process remaining operators + while let Some(op) = operators.pop() { + if op == '(' { + panic!("Unmatched opening parenthesis"); + } + apply_operator(&mut nfa_stack, op); + } + + if nfa_stack.len() != 1 { + panic!( + "Invalid Regex, unexpected final NFA stack size: {}", + nfa_stack.len() + ); + } + + nfa_stack.pop().unwrap() +} + +fn apply_kleene_star(last_nfa: &Nfa) -> Nfa { + let mut transitions = HashMap::new(); + + let new_accepting = last_nfa.accepting_state + 2; + + // Epsilon transition from new start to original start + transitions.insert((0, None), vec![1]); + + // Copy existing transitions, shifting state numbers to make room for new start + for ((state, input), targets) in &last_nfa.transitions { + // Shift each transition to new indices + transitions.insert((state + 1, *input), targets.iter().map(|s| s + 1).collect()); + } + + // Epsilon transitions returning to original start for loops, and new accepting state + transitions + .entry((&last_nfa.accepting_state + 1, None)) + .or_insert_with(Vec::new) + .push(1); + + transitions + .entry((&last_nfa.accepting_state + 1, None)) + .or_insert_with(Vec::new) + .push(new_accepting); + + // Final acceptance state is accepting with epsilon transition from start for empty string + transitions + .entry((0, None)) + .or_insert_with(Vec::new) + .push(new_accepting); + + Nfa { + transitions, + accepting_state: new_accepting, + } +} + +fn union(left: &Nfa, right: &Nfa) -> Nfa { + let mut transitions = HashMap::new(); + + let num_states_left_nfa = left.accepting_state; + let num_states_right_nfa = right.accepting_state; + + // Shift the NFA states + for ((state, input), targets) in &left.transitions { + transitions.insert((state + 1, *input), targets.iter().map(|s| s + 1).collect()); + } + + for ((state, input), targets) in &right.transitions { + transitions.insert( + (state + num_states_left_nfa + 2, *input), + targets + .iter() + .map(|s| s + num_states_left_nfa + 2) + .collect(), + ); + } + + // Add new start and end state + let new_accepting_state = num_states_left_nfa + num_states_right_nfa + 3; + + transitions.insert((0, None), vec![1, num_states_left_nfa + 2]); + transitions + .entry((left.accepting_state + 1, None)) + .or_insert_with(Vec::new) + .push(new_accepting_state); + transitions + .entry((right.accepting_state + num_states_left_nfa + 2, None)) + .or_insert_with(Vec::new) + .push(new_accepting_state); + + Nfa { + transitions, + accepting_state: new_accepting_state, + } +} + +fn concatenate(left: &Nfa, right: &Nfa) -> Nfa { + let mut transitions: HashMap<(u32, Option), Vec> = left.transitions.clone(); + + // HACK: The accepting states are (based on the implementation) the last ones of the NFA + // thus it is possible to get the num of states in the first NFA like this + let num_states_left_nfa = left.accepting_state; + + for ((state, input), targets) in &right.transitions { + transitions.insert( + (state + num_states_left_nfa, *input), + targets.iter().map(|s| s + num_states_left_nfa).collect(), + ); + } + + Nfa { + transitions, + accepting_state: right.accepting_state + num_states_left_nfa, + } +} + +fn create_basic_nfa(letter: &char) -> Nfa { + Nfa { + transitions: HashMap::from([((0, Some(*letter)), vec![1])]), + accepting_state: 1, + } +} + +fn create_basic_epsilon_nfa() -> Nfa { + Nfa { + transitions: HashMap::from([((0, None), vec![1])]), + accepting_state: 1, + } +} +// END THOMPSON CONSTRUCTION --- + +// NFA to DFA functions --- +fn epsilon_closure(nfa: &Nfa, states: &mut HashSet) { + let mut stack = states.clone(); + + while let Some(&state_id) = stack.iter().next() { + stack.remove(&state_id); + if let Some(epsilon_states) = nfa.transitions.get(&(state_id, None)) { + for &next_state in epsilon_states { + if states.insert(next_state) { + stack.insert(next_state); + } + } + } + } +} + +fn move_nfa(nfa: &Nfa, states: &HashSet, symbol: char) -> HashSet { + let mut move_states = HashSet::new(); + + for &state in states { + if let Some(next_states) = nfa.transitions.get(&(state, Some(symbol))) { + move_states.extend(next_states); + } + } + + move_states +} + +fn hash_set_to_sorted_vec(set: &HashSet) -> Vec { + let mut vec: Vec = set.iter().cloned().collect(); + vec.sort_unstable(); + vec +} + +fn nfa_to_dfa(nfa: &Nfa) -> ThompsonDfa { + // Start from the initial state of the NFA, assuming it's state 0 + let mut start_closure = HashSet::from([0]); + epsilon_closure(nfa, &mut start_closure); + let mut state_map = HashMap::new(); + let mut dfa_accepting_states = HashSet::new(); + let mut transitions = HashMap::new(); + + // Map the initial DFA state from the initial NFA state closure + state_map.insert(hash_set_to_sorted_vec(&start_closure), 0); + + let mut unmarked_states = vec![start_closure]; + + while let Some(current_closure) = unmarked_states.pop() { + let current_dfa_state_id = state_map[&hash_set_to_sorted_vec(¤t_closure)]; + + if current_closure.contains(&nfa.accepting_state) { + dfa_accepting_states.insert(current_dfa_state_id); + } + + // Collect symbols from transitions + let symbols: HashSet<_> = nfa + .transitions + .keys() + .filter_map(|(_, symbol)| *symbol) + .collect(); + + for symbol in symbols { + let mut move_closure = move_nfa(nfa, ¤t_closure, symbol); + epsilon_closure(nfa, &mut move_closure); + + if move_closure.is_empty() { + continue; + } + + let sorted_vec = hash_set_to_sorted_vec(&move_closure); + let next_dfa_state_id = state_map.len() as u32; + + // Insert new DFA state if isn't already mapped + if !state_map.contains_key(&sorted_vec) { + state_map.insert(sorted_vec.clone(), next_dfa_state_id); + unmarked_states.push(move_closure); + } + + transitions.insert((current_dfa_state_id, symbol), state_map[&sorted_vec]); + } + } + + ThompsonDfa { + transitions, + accepting_states: dfa_accepting_states, + } +} +// END NFA to DFA functions --- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn create_dfa_test() { + let generated_dfa = ThompsonDfa::new("(a|b)*").expect("Valid dfa"); + let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 0)]); + let expected_accepting_states = HashSet::from([0]); + + assert_eq!(expected_transitions, generated_dfa.transitions); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + + let generated_dfa_2 = ThompsonDfa::new("a|()").expect("Valid dfa"); + let expected_transitions_2 = HashMap::from([((0, 'a'), 1)]); + let expected_accepting_states_2 = HashSet::from([0, 1]); + + assert_eq!(expected_transitions_2, generated_dfa_2.transitions); + assert_eq!( + expected_accepting_states_2, + generated_dfa_2.accepting_states + ); + + let generated_dfa = ThompsonDfa::new("a*b").expect("Valid dfa"); + let expected_transitions = HashMap::from([((0, 'a'), 0), ((0, 'b'), 1)]); + let expected_accepting_states = HashSet::from([1]); + + assert_eq!(expected_transitions, generated_dfa.transitions); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + } + + #[test] + fn prozess_regex_test() { + let generated_dfa = ThompsonDfa::new("(a|b)*").expect("Valid dfa"); + let test_strings = vec!["abbbababaaaa", ""]; + for string in test_strings { + assert!(generated_dfa.process(string)); + } + } + + #[test] + fn create_basic_nfa_test() { + let nfa_a = create_basic_nfa(&'a'); + let expected_transitions = HashMap::from([((0, Some('a')), vec![1])]); + let expected_accepting_state: u32 = 1; + + assert_eq!(nfa_a.transitions, expected_transitions); + assert_eq!(nfa_a.accepting_state, expected_accepting_state); + } + + #[test] + fn concatenate_test() { + let nfa_a = create_basic_nfa(&'a'); + let nfa_b = create_basic_nfa(&'b'); + let concatenated_nfa = concatenate(&nfa_a, &nfa_b); + + let expected_transitions = + HashMap::from([((0, Some('a')), vec![1]), ((1, Some('b')), vec![2])]); + let expected_accepting_state: u32 = 2; + + assert_eq!(concatenated_nfa.transitions, expected_transitions); + assert_eq!(concatenated_nfa.accepting_state, expected_accepting_state); + } + + #[test] + fn apply_kleene_star_test() { + let basic_nfa = create_basic_nfa(&'a'); + let starred_nfa = apply_kleene_star(&basic_nfa); + + let expected_transitions = HashMap::from([ + ((0, None), vec![1, 3]), // Epsilon to start and new accepting + ((1, Some('a')), vec![2]), // Original transition + ((2, None), vec![1, 3]), // Loop back and transition to new accepting + ]); + + let expected_accepting_state: u32 = 3; + + assert_eq!(starred_nfa.transitions, expected_transitions); + assert_eq!(starred_nfa.accepting_state, expected_accepting_state); + } + + #[test] + fn union_test() { + let nfa_a = create_basic_nfa(&'a'); + let nfa_b = create_basic_nfa(&'b'); + let union_nfa = union(&nfa_a, &nfa_b); + + let expected_transitions = HashMap::from([ + ((0, None), vec![1, 3]), // Combined initial state transitions + ((1, Some('a')), vec![2]), // Offset transitions for NFA a + ((3, Some('b')), vec![4]), // Offset transitions for NFA b + ((2, None), vec![5]), // Accepting state transition for a + ((4, None), vec![5]), // Accepting state transition for b + ]); + + let expected_accepting_state: u32 = 5; + + assert_eq!(union_nfa.transitions, expected_transitions); + assert_eq!(union_nfa.accepting_state, expected_accepting_state); + } + + #[test] + fn thompson_construction_test() { + let regex_nfa = thompson_construction("(a|b)*"); + + let expected_transitions = HashMap::from([ + ((0, None), vec![1, 7]), + ((1, None), vec![2, 4]), + ((2, Some('a')), vec![3]), + ((3, None), vec![6]), + ((4, Some('b')), vec![5]), + ((5, None), vec![6]), + ((6, None), vec![1, 7]), + ]); + let expected_accepting_state = 7; + + assert_eq!(regex_nfa.transitions, expected_transitions); + assert_eq!(regex_nfa.accepting_state, expected_accepting_state); + } + + #[test] + fn nfa_to_dfa_test() { + let input_nfa = Nfa { + transitions: HashMap::from([ + ((0, None), vec![1, 7]), + ((1, None), vec![2, 4]), + ((2, Some('a')), vec![3]), + ((3, None), vec![6]), + ((4, Some('b')), vec![5]), + ((5, None), vec![6]), + ((6, None), vec![1, 7]), + ]), + accepting_state: 7, + }; + + let generated_dfa = nfa_to_dfa(&input_nfa); + + let expected_options = [ + HashMap::from([ + ((0, 'a'), 1), + ((0, 'b'), 2), + ((1, 'a'), 1), + ((1, 'b'), 2), + ((2, 'a'), 1), + ((2, 'b'), 2), + ]), + HashMap::from([ + ((0, 'a'), 2), + ((0, 'b'), 1), + ((1, 'a'), 2), + ((1, 'b'), 1), + ((2, 'a'), 2), + ((2, 'b'), 1), + ]), + ]; + let expected_accepting_states = HashSet::from([0, 1, 2]); + + assert!( + expected_options.contains(&generated_dfa.transitions), + "Transitions did not match any of the expected options." + ); + assert_eq!(expected_accepting_states, generated_dfa.accepting_states); + } +} diff --git a/tests/glushkov_test.rs b/tests/glushkov_test.rs new file mode 100644 index 0000000..545218e --- /dev/null +++ b/tests/glushkov_test.rs @@ -0,0 +1,17 @@ +include!("../benches/bench_cases.rs"); +use regex_engine::{ConstructionType, Regex}; + +#[test] +fn test_all_bench_cases() { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Glushkov).expect("Valid regex"); + assert_eq!(regex.is_match(&case.input), case.expected_is_match); + assert_eq!( + regex.find(&case.input), + case.expected_first_match.as_deref() + ); + assert_eq!(regex.findall(&case.input), case.expected_all_matches); + } +} diff --git a/tests/rust_regex_test.rs b/tests/rust_regex_test.rs new file mode 100644 index 0000000..9142f6a --- /dev/null +++ b/tests/rust_regex_test.rs @@ -0,0 +1,27 @@ +include!("../benches/bench_cases.rs"); +use regex::Regex; + +#[test] +fn test_all_bench_cases() { + let cases = get_bench_cases(); + + for case in &cases { + let match_regex = Regex::new(format!("^{}$", case.regex).as_str()) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + let regex = Regex::new(case.regex) + .unwrap_or_else(|_| panic!("Failed to create pattern: {}", case.regex)); + + assert_eq!(match_regex.is_match(&case.input), case.expected_is_match); + assert_eq!( + regex.find(&case.input).map(|s| s.as_str()), + case.expected_first_match.as_deref() + ); + assert_eq!( + regex + .find_iter(&case.input) + .map(|s| s.as_str()) + .collect::>(), + case.expected_all_matches + ); + } +} diff --git a/tests/test_one.rs b/tests/test_one.rs deleted file mode 100644 index 47ec5fe..0000000 --- a/tests/test_one.rs +++ /dev/null @@ -1,49 +0,0 @@ -use regex_engine::regex_engine::Regex; - -#[test] -fn test_escape_sequence_plus() { - let pattern = r"a*b\+"; - let text = "aaab+b"; // should fail on match - let text_success = "aaab+"; - - let engine = Regex::new(pattern); - - let expected_match = text_success; - - assert!(!engine.is_match(text)); - assert!(engine.is_match(text_success)); - assert_eq!(engine.find(text), Some(expected_match)); - assert_eq!(engine.findall(text), vec![expected_match]); -} - -#[test] -fn test_escape_sequence_slash() { - let pattern = r"a*b\\"; - let text = "aaab\\b"; // should fail on match - let text_success = "aaab\\"; - - let engine = Regex::new(pattern); - - let expected_match = text_success; - - assert!(!engine.is_match(text)); - assert!(engine.is_match(text_success)); - assert_eq!(engine.find(text), Some(expected_match)); - assert_eq!(engine.findall(text), vec![expected_match]); -} - -#[test] -fn test_dot_wildcard() { - let pattern = r"a.*"; - let text = "cabbc"; // should fail on match - let text_success = "abbc"; - - let engine = Regex::new(pattern); - - let expected_match = text_success; - - assert!(!engine.is_match(text)); - assert!(engine.is_match(text_success)); - assert_eq!(engine.find(text), Some(expected_match)); - assert_eq!(engine.findall(text), vec![expected_match]); -} diff --git a/tests/thompson_test.rs b/tests/thompson_test.rs new file mode 100644 index 0000000..5532768 --- /dev/null +++ b/tests/thompson_test.rs @@ -0,0 +1,18 @@ +include!("../benches/bench_cases.rs"); +use regex_engine::{ConstructionType, Regex}; + +#[test] +fn test_all_bench_cases() { + let cases = get_bench_cases(); + + for case in &cases { + let regex = Regex::new(case.regex, ConstructionType::Thompson).expect("Valid regex"); + + assert_eq!(regex.is_match(&case.input), case.expected_is_match); + assert_eq!( + regex.find(&case.input), + case.expected_first_match.as_deref() + ); + assert_eq!(regex.findall(&case.input), case.expected_all_matches); + } +}