Skip to content

Commit c1d2b48

Browse files
Handle the `{,num}` quantifier form (treat a missing minimum as 0) and replace raw `char`/`int` with the `pz_types` typedefs (`ut8`, `st32`)
1 parent a40f05d commit c1d2b48

2 files changed

Lines changed: 50 additions & 53 deletions

File tree

libpz/include/RegexTokenizer.hpp

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define REGEX_TOKENIZER_HPP
33

44
#include <pz_cxx_std.hpp>
5+
#include <pz_types.hpp>
56

67
/**
78
* @brief Types of tokens produced by the regex tokenizer.
@@ -55,10 +56,10 @@ enum class TokenType {
5556
*/
5657
struct CharRange {
5758
/** Lower bound */
58-
char lo;
59+
ut8 lo;
5960

6061
/** Upper bound */
61-
char hi;
62+
ut8 hi;
6263
};
6364

6465
/**
@@ -70,20 +71,20 @@ struct Token {
7071
/** Position in pattern (for error reporting) */
7172
size_t pos;
7273
/** Group ID for parentheses */
73-
int group_id = -1;
74+
st32 group_id = -1;
7475

7576
/** Literal character value */
76-
char literal = '\0';
77+
ut8 literal = '\0';
7778

7879
/** Whether character class is negated */
7980
bool negated = false;
8081
/** Character ranges for character class */
8182
std::vector<CharRange> ranges{};
8283

8384
/** Minimum repetitions for quantifier */
84-
int min = 0;
85+
st32 min = 0;
8586
/** Maximum repetitions (-1 means unbounded) */
86-
int max = 0;
87+
st32 max = 0;
8788
};
8889

8990
/**
@@ -109,21 +110,21 @@ class Tokenizer {
109110
/** Current cursor position */
110111
size_t i = 0;
111112
/** Counter for assigning group IDs */
112-
int group_counter = 0;
113+
st32 group_counter = 0;
113114
/** Stack for nested group tracking */
114-
std::stack<int> group_stack;
115+
std::stack<st32> group_stack;
115116

116117
/** Peek next character without consuming */
117-
char peek() const;
118+
ut8 peek() const;
118119
/** Consume next character */
119-
char get();
120+
ut8 get();
120121
/** Check for end of input */
121122
bool eof() const;
122123

123124
/** Read next token */
124125
Token next_token();
125126
/** Read literal character */
126-
Token read_literal(char);
127+
Token read_literal(ut8);
127128
/** Read escape sequence */
128129
Token read_escape();
129130
/** Read character class */
@@ -132,7 +133,7 @@ class Tokenizer {
132133
Token read_quantifier();
133134

134135
/** @brief Populates a token with ranges for \d, \w, \s, etc. */
135-
void add_shorthand_ranges(char, Token &);
136+
void add_shorthand_ranges(ut8, Token &);
136137

137138
/** @brief Inserts implicit CONCAT tokens where concatenation occurs. */
138139
void add_concat_tokens(std::vector<Token> &);

libpz/regex/RegexTokenizer.cpp

Lines changed: 37 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33

44
Tokenizer::Tokenizer(std::string_view pat) : pattern(pat) {}
55

6-
char Tokenizer::peek() const { return eof() ? '\0' : pattern[i]; }
6+
ut8 Tokenizer::peek() const { return eof() ? '\0' : pattern[i]; }
77

8-
char Tokenizer::get() { return eof() ? '\0' : pattern[i++]; }
8+
ut8 Tokenizer::get() { return eof() ? '\0' : pattern[i++]; }
99

1010
bool Tokenizer::eof() const { return i >= pattern.size(); }
1111

@@ -62,7 +62,7 @@ void Tokenizer::add_concat_tokens(std::vector<Token> &tokens) {
6262
}
6363

6464
Token Tokenizer::next_token() {
65-
char c = get();
65+
ut8 c = get();
6666

6767
// Position of the character that produced this token
6868
size_t pos = i - 1;
@@ -79,7 +79,7 @@ Token Tokenizer::next_token() {
7979
case '|':
8080
return {TokenType::ALTERNATION, pos};
8181
case '(': {
82-
int id = ++group_counter;
82+
st32 id = ++group_counter;
8383
group_stack.push(id);
8484
Token t{TokenType::LPAREN, pos};
8585
t.group_id = id;
@@ -90,7 +90,7 @@ Token Tokenizer::next_token() {
9090
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
9191
"Mismatched ')' at position " +
9292
std::to_string(pos));
93-
int id = group_stack.top();
93+
st32 id = group_stack.top();
9494
group_stack.pop();
9595
Token t{TokenType::RPAREN, pos};
9696
t.group_id = id;
@@ -111,7 +111,7 @@ Token Tokenizer::next_token() {
111111
}
112112
}
113113

114-
Token Tokenizer::read_literal(char c) {
114+
Token Tokenizer::read_literal(ut8 c) {
115115
Token t{TokenType::LITERAL, i - 1};
116116
t.literal = c;
117117
return t;
@@ -124,7 +124,7 @@ Token Tokenizer::read_escape() {
124124

125125
Token t;
126126
t.pos = i - 1;
127-
char c = get();
127+
ut8 c = get();
128128

129129
if (c == 'd' || c == 'D' || c == 'w' || c == 'W' || c == 's' || c == 'S') {
130130
t.type = TokenType::CHAR_CLASS;
@@ -156,49 +156,45 @@ Token Tokenizer::read_escape() {
156156
return t;
157157
}
158158

159-
void Tokenizer::add_shorthand_ranges(char c, Token &t) {
160-
const char MIN_CHAR = '\0'; // ascii index 0
161-
const char MAX_CHAR = '\x7F'; // ascii index 127
159+
void Tokenizer::add_shorthand_ranges(ut8 c, Token &t) {
160+
static constexpr ut8 MIN_CHAR = 0; // ascii index 0
161+
static constexpr ut8 MAX_CHAR = ASCII_MAX; // ascii index 127
162162
switch (c) {
163163
case 'd':
164-
t.ranges.push_back({'0', '9'});
164+
t.ranges.push_back({48, 57}); // '0' - '9'
165165
break;
166166
case 'D':
167-
t.ranges.insert(t.ranges.end(),
168-
{
169-
{MIN_CHAR, '/'}, // Everything before '0'
170-
{':', MAX_CHAR} // Everything after '9'
171-
});
167+
t.ranges.insert(t.ranges.end(), {
168+
{MIN_CHAR, 47}, // Everything before '0'
169+
{58, MAX_CHAR} // Everything after '9'
170+
});
172171
break;
173172
case 'w':
174-
t.ranges.insert(t.ranges.end(),
175-
{{'a', 'z'}, {'A', 'Z'}, {'0', '9'}, {'_', '_'}});
173+
t.ranges.insert(
174+
t.ranges.end(),
175+
{{97, 122}, {65, 90}, {48, 57}, {95, 95}}); // a-z, A-Z, 0-9, _
176176
break;
177177
case 'W':
178178
t.ranges.insert(t.ranges.end(), {
179-
{MIN_CHAR, '/'}, // Before '0'
180-
{':', '@'}, // Between '9' and 'A'
181-
{'[', '^'}, // Between 'Z' and '_'
182-
{'`', '`'}, // Between '_' and 'a'
183-
{'{', MAX_CHAR} // After 'z'
179+
{MIN_CHAR, 47}, // Before '0'
180+
{58, 64}, // Between '9' and 'A'
181+
{91, 94}, // Between 'Z' and '_'
182+
{96, 96}, // Between '_' and 'a'
183+
{123, MAX_CHAR} // After 'z'
184184
});
185185
break;
186186
case 's':
187-
t.ranges.insert(t.ranges.end(), {{' ', ' '},
188-
{'\t', '\t'},
189-
{'\n', '\n'},
190-
{'\r', '\r'},
191-
{'\f', '\f'},
192-
{'\v', '\v'}});
187+
t.ranges.insert(t.ranges.end(), {{32, 32}, // Space
188+
{9, 13}} // \t, \n, \v, \f, \r
189+
);
193190
break;
194191

195192
case 'S':
196-
t.ranges.insert(t.ranges.end(),
197-
{
198-
{MIN_CHAR, '\x08'}, // Before \t (0-8)
199-
{'\x0E', '\x1F'}, // Between \r and Space (14-31)
200-
{'!', MAX_CHAR} // After Space (33-127)
201-
});
193+
t.ranges.insert(t.ranges.end(), {
194+
{MIN_CHAR, 8}, // Before \t
195+
{14, 31}, // Between \r and Space
196+
{33, MAX_CHAR} // After Space
197+
});
202198
break;
203199
}
204200
}
@@ -242,11 +238,11 @@ Token Tokenizer::read_char_class() {
242238

243239
bool have_prev = false; // pending character for range
244240
bool last_was_shorthand = false; // whether last token was \d, \w, etc.
245-
char prev;
241+
ut8 prev;
246242

247243
// Read until closing ']'
248244
while (!eof() && peek() != ']') {
249-
char c = get();
245+
ut8 c = get();
250246
if (c == '\\') // Handle escape sequences
251247
{
252248
if (eof())
@@ -313,7 +309,7 @@ Token Tokenizer::read_char_class() {
313309
// Handle range syntax:
314310
if (have_prev && c == '-' &&
315311
peek() != ']') { // when '-' acts as a range specifier
316-
char ub = get();
312+
ut8 ub = get();
317313
if (ub == '\\') // Handle escaped upper bound
318314
{
319315
if (eof())
@@ -385,15 +381,15 @@ Token Tokenizer::read_quantifier() {
385381
}
386382
};
387383

388-
auto read_int = [&]() -> int {
384+
auto read_int = [&]() -> st32 {
389385
skip_spaces();
390-
int val = 0;
386+
st32 val = 0;
391387
bool found = false;
392388
while (!eof() && std::isdigit(peek())) {
393389
found = true;
394390
val = val * 10 + (get() - '0');
395391
}
396-
if (!found)
392+
if (!found && peek() != ',')
397393
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
398394
"Expected number in quantifier at position " +
399395
std::to_string(t.pos));

0 commit comments

Comments (0)