Skip to content

Commit 76224ed

Browse files
add postfix conversion for tokens
1 parent a63a235 commit 76224ed

2 files changed

Lines changed: 152 additions & 152 deletions

File tree

libpz/include/RegexPostfix.hpp

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,28 @@
1-
#ifndef REGEX_POSTFIX_HPP
2-
#define REGEX_POSTFIX_HPP
3-
4-
#include <RegexTokenizer.hpp>
5-
#include <pz_cxx_std.hpp>
6-
#include <pz_types.hpp>
7-
8-
/**
9-
* @brief Converts regex tokens from infix to postfix (RPN) form.
10-
*
11-
* This conversion is used as a preprocessing step before NFA construction.
12-
* The class is stateless and intended to be used via its static methods.
13-
*/
14-
class Postfix {
15-
public:
16-
/**
17-
* @brief Convert an infix token sequence into postfix order.
18-
*/
19-
static std::vector<Token> convert(const std::vector<Token> &infix);
20-
21-
private:
22-
/**
23-
* @brief Returns precedence of a regex operator token.
24-
*/
25-
static st32 get_precedence(TokenType type);
26-
};
27-
1+
#ifndef REGEX_POSTFIX_HPP
2+
#define REGEX_POSTFIX_HPP
3+
4+
#include <RegexTokenizer.hpp>
5+
#include <pz_cxx_std.hpp>
6+
#include <pz_types.hpp>
7+
8+
/**
9+
* @brief Converts regex tokens from infix to postfix (RPN) form.
10+
*
11+
* This conversion is used as a preprocessing step before NFA construction.
12+
* The class is stateless and intended to be used via its static methods.
13+
*/
14+
class Postfix {
15+
public:
16+
/**
17+
* @brief Convert an infix token sequence into postfix order.
18+
*/
19+
static std::vector<Token> convert(const std::vector<Token> &infix);
20+
21+
private:
22+
/**
23+
* @brief Returns precedence of a regex operator token.
24+
*/
25+
static st32 get_precedence(TokenType type);
26+
};
27+
2828
#endif // REGEX_POSTFIX_HPP

libpz/regex/RegexPostfix.cpp

Lines changed: 125 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -1,126 +1,126 @@
1-
#include "RegexPostfix.hpp"
2-
#include "pz_error.hpp"
3-
4-
/**
 * @brief Rank a token type for operator-precedence comparison.
 *
 * @param type Token type to rank.
 * @return 3 for the unary postfix quantifiers (STAR, PLUS, QUESTION,
 *         QUANTIFIER_RANGE), 2 for CONCAT, 1 for ALTERNATION, and 0 for
 *         every other token type.
 */
st32 Postfix::get_precedence(TokenType type) {
  // Binary operators: alternation binds loosest, concatenation next.
  if (type == TokenType::ALTERNATION)
    return 1;
  if (type == TokenType::CONCAT)
    return 2;

  // Quantifiers bind tightest; anything else is not an operator.
  const bool is_quantifier =
      type == TokenType::STAR || type == TokenType::PLUS ||
      type == TokenType::QUESTION || type == TokenType::QUANTIFIER_RANGE;
  return is_quantifier ? 3 : 0;
}
19-
20-
std::vector<Token> Postfix::convert(const std::vector<Token> &infix) {
21-
std::vector<Token> postfix;
22-
std::stack<Token> operators;
23-
TokenType last_type = TokenType::END; // Tracks previous token for validation
24-
25-
for (const auto &t : infix) {
26-
switch (t.type) {
27-
// Operands go directly to output
28-
case TokenType::LITERAL:
29-
case TokenType::DOT:
30-
case TokenType::CHAR_CLASS:
31-
case TokenType::CARET:
32-
case TokenType::DOLLAR:
33-
postfix.push_back(t);
34-
break;
35-
36-
// '(' is pushed to operator stack and output (for NFA grouping)
37-
case TokenType::LPAREN: {
38-
postfix.push_back(t);
39-
operators.push(t);
40-
break;
41-
}
42-
43-
// Pop operators until matching '(' is found
44-
case TokenType::RPAREN: {
45-
if (last_type == TokenType::LPAREN)
46-
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
47-
"Empty Parentheses at position " +
48-
std::to_string(t.pos));
49-
while (!operators.empty() && operators.top().type != TokenType::LPAREN) {
50-
postfix.push_back(operators.top());
51-
operators.pop();
52-
}
53-
if (operators.empty())
54-
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
55-
"Mismatched ')' at position " +
56-
std::to_string(t.pos));
57-
operators.pop(); // Discard '('
58-
postfix.push_back(t);
59-
break;
60-
}
61-
// Unary postfix operators must follow a valid expression
62-
case TokenType::STAR:
63-
case TokenType::PLUS:
64-
case TokenType::QUESTION:
65-
case TokenType::QUANTIFIER_RANGE:
66-
if (last_type != TokenType::LITERAL && last_type != TokenType::DOT &&
67-
last_type != TokenType::CHAR_CLASS &&
68-
last_type != TokenType::RPAREN) {
69-
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
70-
"Quantifier used without a valid preceding "
71-
"expression at position " +
72-
std::to_string(t.pos));
73-
}
74-
postfix.push_back(t);
75-
break;
76-
77-
case TokenType::ALTERNATION:
78-
// '|' must separate two valid expressions
79-
if (last_type == TokenType::END || last_type == TokenType::LPAREN ||
80-
last_type == TokenType::ALTERNATION) {
81-
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
82-
"Invalid '|' at position " +
83-
std::to_string(t.pos) +
84-
". It must separate two expressions.");
85-
}
86-
goto push_operator;
87-
88-
// Binary operators handled via precedence rules
89-
case TokenType::CONCAT:
90-
push_operator:
91-
while (!operators.empty() && operators.top().type != TokenType::LPAREN &&
92-
get_precedence(operators.top().type) >= get_precedence(t.type)) {
93-
postfix.push_back(operators.top());
94-
operators.pop();
95-
}
96-
operators.push(t);
97-
break;
98-
99-
default:
100-
break;
101-
}
102-
103-
if (t.type != TokenType::END)
104-
last_type = t.type;
105-
}
106-
107-
// Pattern must not end with a binary operator
108-
if (last_type == TokenType::ALTERNATION || last_type == TokenType::CONCAT) {
109-
PzError::report_error(
110-
PzError::PzErrorType::PZ_INVALID_INPUT,
111-
"Trailing binary operator at end of pattern at position " +
112-
std::to_string(infix.back().pos));
113-
}
114-
115-
// Drain remaining operators
116-
while (!operators.empty()) {
117-
if (operators.top().type == TokenType::LPAREN)
118-
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
119-
"Unmatched '(' at position " +
120-
std::to_string(operators.top().pos));
121-
postfix.push_back(operators.top());
122-
operators.pop();
123-
}
124-
125-
return postfix;
1+
#include "RegexPostfix.hpp"
2+
#include "pz_error.hpp"
3+
4+
/**
 * @brief Rank a token type for operator-precedence comparison.
 *
 * @param type Token type to rank.
 * @return 3 for the unary postfix quantifiers (STAR, PLUS, QUESTION,
 *         QUANTIFIER_RANGE), 2 for CONCAT, 1 for ALTERNATION, and 0 for
 *         every other token type.
 */
st32 Postfix::get_precedence(TokenType type) {
  // Binary operators: alternation binds loosest, concatenation next.
  if (type == TokenType::ALTERNATION)
    return 1;
  if (type == TokenType::CONCAT)
    return 2;

  // Quantifiers bind tightest; anything else is not an operator.
  const bool is_quantifier =
      type == TokenType::STAR || type == TokenType::PLUS ||
      type == TokenType::QUESTION || type == TokenType::QUANTIFIER_RANGE;
  return is_quantifier ? 3 : 0;
}
19+
20+
std::vector<Token> Postfix::convert(const std::vector<Token> &infix) {
21+
std::vector<Token> postfix;
22+
std::stack<Token> operators;
23+
TokenType last_type = TokenType::END; // Tracks previous token for validation
24+
25+
for (const auto &t : infix) {
26+
switch (t.type) {
27+
// Operands go directly to output
28+
case TokenType::LITERAL:
29+
case TokenType::DOT:
30+
case TokenType::CHAR_CLASS:
31+
case TokenType::CARET:
32+
case TokenType::DOLLAR:
33+
postfix.push_back(t);
34+
break;
35+
36+
// '(' is pushed to operator stack and output (for NFA grouping)
37+
case TokenType::LPAREN: {
38+
postfix.push_back(t);
39+
operators.push(t);
40+
break;
41+
}
42+
43+
// Pop operators until matching '(' is found
44+
case TokenType::RPAREN: {
45+
if (last_type == TokenType::LPAREN)
46+
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
47+
"Empty Parentheses at position " +
48+
std::to_string(t.pos));
49+
while (!operators.empty() && operators.top().type != TokenType::LPAREN) {
50+
postfix.push_back(operators.top());
51+
operators.pop();
52+
}
53+
if (operators.empty())
54+
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
55+
"Mismatched ')' at position " +
56+
std::to_string(t.pos));
57+
operators.pop(); // Discard '('
58+
postfix.push_back(t);
59+
break;
60+
}
61+
// Unary postfix operators must follow a valid expression
62+
case TokenType::STAR:
63+
case TokenType::PLUS:
64+
case TokenType::QUESTION:
65+
case TokenType::QUANTIFIER_RANGE:
66+
if (last_type != TokenType::LITERAL && last_type != TokenType::DOT &&
67+
last_type != TokenType::CHAR_CLASS &&
68+
last_type != TokenType::RPAREN) {
69+
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
70+
"Quantifier used without a valid preceding "
71+
"expression at position " +
72+
std::to_string(t.pos));
73+
}
74+
postfix.push_back(t);
75+
break;
76+
77+
case TokenType::ALTERNATION:
78+
// '|' must separate two valid expressions
79+
if (last_type == TokenType::END || last_type == TokenType::LPAREN ||
80+
last_type == TokenType::ALTERNATION) {
81+
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
82+
"Invalid '|' at position " +
83+
std::to_string(t.pos) +
84+
". It must separate two expressions.");
85+
}
86+
goto push_operator;
87+
88+
// Binary operators handled via precedence rules
89+
case TokenType::CONCAT:
90+
push_operator:
91+
while (!operators.empty() && operators.top().type != TokenType::LPAREN &&
92+
get_precedence(operators.top().type) >= get_precedence(t.type)) {
93+
postfix.push_back(operators.top());
94+
operators.pop();
95+
}
96+
operators.push(t);
97+
break;
98+
99+
default:
100+
break;
101+
}
102+
103+
if (t.type != TokenType::END)
104+
last_type = t.type;
105+
}
106+
107+
// Pattern must not end with a binary operator
108+
if (last_type == TokenType::ALTERNATION || last_type == TokenType::CONCAT) {
109+
PzError::report_error(
110+
PzError::PzErrorType::PZ_INVALID_INPUT,
111+
"Trailing binary operator at end of pattern at position " +
112+
std::to_string(infix.back().pos));
113+
}
114+
115+
// Drain remaining operators
116+
while (!operators.empty()) {
117+
if (operators.top().type == TokenType::LPAREN)
118+
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
119+
"Unmatched '(' at position " +
120+
std::to_string(operators.top().pos));
121+
postfix.push_back(operators.top());
122+
operators.pop();
123+
}
124+
125+
return postfix;
126126
}

0 commit comments

Comments
 (0)