Skip to content

Commit fcd2558

Browse files
Add NFA builder; other minor changes: rename cursor index to cursor_pos, use std::move in add_concat_tokens()
std::move in add_concat_tokens() in RegexTokenizer.cpp
1 parent 96692dd commit fcd2558

5 files changed

Lines changed: 489 additions & 17 deletions

File tree

libpz/include/Nfa.hpp

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#ifndef NFA_HPP
2+
#define NFA_HPP
3+
4+
#include <utility>

#include <RegexTokenizer.hpp>
#include <pz_cxx_std.hpp>
#include <pz_types.hpp>
7+
8+
/**
 * @brief Kinds of state that can appear in the regex NFA.
 */
enum class StateType {
  /** Matches exactly one literal character. */
  CHAR,

  /** Matches any single character (the `.` metacharacter). */
  DOT,

  /** Matches one character against a `[...]` character class. */
  CHAR_CLASS,

  /** Accepting (final) state. */
  MATCH,

  /** ε-transition that forks into two outgoing branches. */
  SPLIT,

  /** Records the current input position (used for capture groups). */
  SAVE,

  /** `^` anchor: start of input. */
  ANCHOR_START,

  /** `$` anchor: end of input. */
  ANCHOR_END,
};
36+
37+
/**
38+
* @brief Represents a single state in the NFA.
39+
*/
40+
struct State {
41+
StateType type;
42+
43+
/** Literal character to match (valid only for CHAR states, unspecified
44+
* otherwise). */
45+
ut8 c;
46+
47+
/** Capture group identifier (used by SAVE states to store input positions).
48+
*/
49+
st32 save_id = -1;
50+
// Even IDs represent group start, odd IDs represent group end.
51+
52+
/** Character ranges for CHAR_CLASS states. */
53+
std::vector<CharRange> ranges;
54+
bool negated = false;
55+
56+
/** Primary outgoing transition. */
57+
State *out = nullptr;
58+
59+
/** Secondary outgoing transition (used only by SPLIT states). */
60+
State *out1 = nullptr;
61+
62+
/**
63+
* @brief Marker used during NFA simulation.
64+
*
65+
* Prevents revisiting the same state multiple times in a single step,
66+
* avoiding duplicate work and infinite ε-transition loops.
67+
*/
68+
st32 last_list = -1;
69+
// Marks whether this state has already been added to the current
70+
// active-states list, preventing duplicate entries and infinite ε-transition
71+
// loops
72+
73+
State(StateType t) : type(t) {}
74+
};
75+
76+
/**
77+
* @brief Represents a partially constructed NFA fragment.
78+
*
79+
* A fragment consists of:
80+
* - a start state
81+
* - a list of dangling outgoing transitions that must be patched later
82+
*/
83+
struct Frag {
84+
State *start;
85+
86+
/** Addresses of state pointers that need to be connected later. */
87+
std::vector<State **> out_ptrs;
88+
89+
/**
90+
* @brief Construct a fragment with a single dangling exit.
91+
*/
92+
Frag(State *s) : start(s) { out_ptrs.push_back(&s->out); }
93+
94+
/**
95+
* @brief Construct a fragment with multiple dangling exits.
96+
*/
97+
Frag(State *s, std::vector<State **> out) : start(s), out_ptrs(out) {}
98+
99+
/**
100+
* @brief Patch all dangling exits to point to the given state.
101+
*/
102+
void patch(State *s) {
103+
for (auto &ptr : out_ptrs) {
104+
if (ptr &&
105+
!*ptr) { // Only patch if the pointer exists and is currently null
106+
*ptr = s;
107+
}
108+
}
109+
}
110+
};
111+
112+
#endif // NFA_HPP

libpz/include/NfaBuilder.hpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#ifndef NFA_BUILDER_HPP
2+
#define NFA_BUILDER_HPP
3+
4+
#include <Nfa.hpp>
5+
6+
/**
7+
* @brief Builds an ε-NFA from a postfix regex token sequence.
8+
*
9+
* Implements Thompson-style construction to convert postfix regex tokens
10+
* into an NFA graph. All states created during construction are owned
11+
* internally and cleaned up automatically.
12+
*/
13+
class NfaBuilder {
14+
public:
15+
/**
16+
* @brief Build an NFA from a postfix regex.
17+
*
18+
* The resulting NFA has a single accepting state of type
19+
* StateType::MATCH. The returned pointer refers to the start state.
20+
*
21+
* @param postfix Regex tokens in postfix (RPN) form.
22+
* @return Pointer to the start state of the constructed NFA.
23+
*/
24+
State *build(const std::vector<Token> &postfix);
25+
26+
/**
27+
* @brief Create a deep copy of an NFA fragment.
28+
*
29+
* Used for handling quantifiers that require duplication of subgraphs
30+
* (e.g. {m,n}, *, +).
31+
*/
32+
Frag copy_fragment(Frag);
33+
34+
/**
35+
* @brief Deep copy an NFA subgraph starting from a given state.
36+
*
37+
* Keeps a lookup map to avoid duplicating already-copied states.
38+
*
39+
* @param s Original state to copy.
40+
* @param lookup Map from original states to their copies.
41+
* @return Pointer to the copied state.
42+
*/
43+
State *copy_state(State *, std::unordered_map<State *, State *> &);
44+
45+
private:
46+
/**
47+
* @brief Allocate a new NFA state and store it in the internal pool.
48+
*
49+
* Ownership is retained by the builder to ensure correct lifetime.
50+
*/
51+
State *create_state(StateType type);
52+
53+
/**
54+
* @brief Owns all NFA states created during construction.
55+
*
56+
* Ensures that all State objects remain valid for the lifetime
57+
* of the NfaBuilder and are automatically destroyed via RAII.
58+
*/
59+
std::vector<std::unique_ptr<State>> state_pool;
60+
};
61+
62+
#endif // NFA_BUILDER_HPP

libpz/include/RegexTokenizer.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ class Tokenizer {
108108
/** Input regex pattern */
109109
std::string_view pattern;
110110
/** Current cursor position */
111-
size_t i = 0;
111+
size_t cursor_pos = 0;
112112
/** Counter for assigning group IDs */
113113
st32 group_counter = 0;
114114
/** Stack for nested group tracking */

0 commit comments

Comments
 (0)