33
// Constructs a Tokenizer over the regex pattern text `pat` by copying the
// view into the `pattern` member; the constructor body itself is empty.
// NOTE(review): the declaration of `pattern` is not visible here. If it is a
// std::string_view rather than an owning std::string, the caller must keep
// the underlying buffer alive for the lifetime of the Tokenizer — confirm.
44Tokenizer::Tokenizer (std::string_view pat) : pattern(pat) {}
55
// Tokenizer::peek — diff record: '-' is the previous version, '+' the new one.
// Returns the character at the current index `i` without advancing (the
// method is const), or the NUL character when eof() is true. Because NUL is
// also the end-of-input sentinel, callers that need to tell a literal NUL in
// the pattern from end-of-input must check eof() themselves.
// The only change in this hunk is the return type, char -> ut8.
// NOTE(review): ut8 is declared outside this view; presumably an unsigned
// 8-bit integer alias — confirm.
6- char Tokenizer::peek () const { return eof () ? ' \0 ' : pattern[i]; }
6+ ut8 Tokenizer::peek () const { return eof () ? ' \0 ' : pattern[i]; }
77
// Tokenizer::get — diff record: '-' is the previous version, '+' the new one.
// Returns the character at the current index `i` and then advances `i` by one
// (post-increment). When eof() is true it returns the NUL character and
// leaves `i` unchanged, so repeated calls at end-of-input keep returning NUL.
// The only change in this hunk is the return type, char -> ut8.
8- char Tokenizer::get () { return eof () ? ' \0 ' : pattern[i++]; }
8+ ut8 Tokenizer::get () { return eof () ? ' \0 ' : pattern[i++]; }
99
// Returns true once the read index `i` has reached or passed the end of
// `pattern`, i.e. there are no more characters for peek()/get() to yield.
1010bool Tokenizer::eof () const { return i >= pattern.size (); }
1111
@@ -62,7 +62,7 @@ void Tokenizer::add_concat_tokens(std::vector<Token> &tokens) {
6262}
6363
6464Token Tokenizer::next_token () {
65- char c = get ();
65+ ut8 c = get ();
6666
6767 // Position of the character that produced this token
6868 size_t pos = i - 1 ;
@@ -79,7 +79,7 @@ Token Tokenizer::next_token() {
7979 case ' |' :
8080 return {TokenType::ALTERNATION, pos};
8181 case ' (' : {
82- int id = ++group_counter;
82+ st32 id = ++group_counter;
8383 group_stack.push (id);
8484 Token t{TokenType::LPAREN, pos};
8585 t.group_id = id;
@@ -90,7 +90,7 @@ Token Tokenizer::next_token() {
9090 PzError::report_error (PzError::PzErrorType::PZ_INVALID_INPUT,
9191 " Mismatched ')' at position " +
9292 std::to_string (pos));
93- int id = group_stack.top ();
93+ st32 id = group_stack.top ();
9494 group_stack.pop ();
9595 Token t{TokenType::RPAREN, pos};
9696 t.group_id = id;
@@ -111,7 +111,7 @@ Token Tokenizer::next_token() {
111111 }
112112}
113113
114- Token Tokenizer::read_literal (char c) {
114+ Token Tokenizer::read_literal (ut8 c) {
115115 Token t{TokenType::LITERAL, i - 1 };
116116 t.literal = c;
117117 return t;
@@ -124,7 +124,7 @@ Token Tokenizer::read_escape() {
124124
125125 Token t;
126126 t.pos = i - 1 ;
127- char c = get ();
127+ ut8 c = get ();
128128
129129 if (c == ' d' || c == ' D' || c == ' w' || c == ' W' || c == ' s' || c == ' S' ) {
130130 t.type = TokenType::CHAR_CLASS;
@@ -156,49 +156,45 @@ Token Tokenizer::read_escape() {
156156 return t;
157157}
158158
// Tokenizer::add_shorthand_ranges — diff record (old lines 159-204, new lines
// 159-200). Lines marked '-' are the previous version, '+' the new version,
// and lines carrying two fused line numbers are unchanged context.
//
// Appends to t.ranges the inclusive character-code ranges for the shorthand
// class letter `c`:
//   d  digits 0-9                      D  everything else in [MIN_CHAR, MAX_CHAR]
//   w  a-z, A-Z, 0-9 and underscore    W  everything else in [MIN_CHAR, MAX_CHAR]
//   s  space plus codes 9..13          S  everything else in [MIN_CHAR, MAX_CHAR]
// Existing entries in t.ranges are kept; new ranges are added at the end.
// The switch has no default case, so any other value of `c` leaves t.ranges
// untouched.
//
// What the new version changes: the parameter and bound types go from char to
// ut8, character literals are replaced by their numeric ASCII codes, and
// MAX_CHAR is taken from ASCII_MAX instead of a hex literal.
// NOTE(review): ASCII_MAX and ut8 are defined outside this view; the trailing
// comment says ASCII_MAX is 127 — confirm.
159- void Tokenizer::add_shorthand_ranges (char c, Token &t) {
160- const char MIN_CHAR = ' \0 ' ; // ascii index 0
161- const char MAX_CHAR = ' \x7F ' ; // ascii index 127
159+ void Tokenizer::add_shorthand_ranges (ut8 c, Token &t) {
160+ static constexpr ut8 MIN_CHAR = 0 ; // ascii index 0
161+ static constexpr ut8 MAX_CHAR = ASCII_MAX ; // ascii index 127
162162 switch (c) {
163163 case ' d' :
164- t.ranges .push_back ({' 0' , ' 9' });
164+ t.ranges .push_back ({48 , 57 }); // '0' - '9'
165165 break ;
166166 case ' D' :
167- t.ranges .insert (t.ranges .end (),
168- {
169- {MIN_CHAR, ' /' }, // Everything before '0'
170- {' :' , MAX_CHAR} // Everything after '9'
171- });
167+ t.ranges .insert (t.ranges .end (), {
168+ {MIN_CHAR, 47 }, // Everything before '0'
169+ {58 , MAX_CHAR} // Everything after '9'
170+ });
172171 break ;
173172 case ' w' :
174- t.ranges .insert (t.ranges .end (),
175- {{' a' , ' z' }, {' A' , ' Z' }, {' 0' , ' 9' }, {' _' , ' _' }});
173+ t.ranges .insert (
174+ t.ranges .end (),
175+ {{97 , 122 }, {65 , 90 }, {48 , 57 }, {95 , 95 }}); // a-z, A-Z, 0-9, _
176176 break ;
177177 case ' W' :
// Complement of the 'w' set: the five gaps between and around 0-9, A-Z, _
// and a-z, bounded by MIN_CHAR and MAX_CHAR.
178178 t.ranges .insert (t.ranges .end (), {
179- {MIN_CHAR, ' / ' }, // Before '0'
180- {' : ' , ' @ ' }, // Between '9' and 'A'
181- {' [ ' , ' ^ ' }, // Between 'Z' and '_'
182- {' ` ' , ' ` ' }, // Between '_' and 'a'
183- {' { ' , MAX_CHAR} // After 'z'
179+ {MIN_CHAR, 47 }, // Before '0'
180+ {58 , 64 }, // Between '9' and 'A'
181+ {91 , 94 }, // Between 'Z' and '_'
182+ {96 , 96 }, // Between '_' and 'a'
183+ {123 , MAX_CHAR} // After 'z'
184184 });
185185 break ;
186186 case ' s' :
// The old version appended six single-character entries (space, \t, \n, \r,
// \f, \v). The new version appends two entries: space, then the contiguous
// run 9..13. The matched character set is the same, but the number and order
// of entries added to t.ranges differ.
187- t.ranges .insert (t.ranges .end (), {{' ' , ' ' },
188- {' \t ' , ' \t ' },
189- {' \n ' , ' \n ' },
190- {' \r ' , ' \r ' },
191- {' \f ' , ' \f ' },
192- {' \v ' , ' \v ' }});
187+ t.ranges .insert (t.ranges .end (), {{32 , 32 }, // Space
188+ {9 , 13 }} // \t, \n, \v, \f, \r
189+ );
193190 break ;
194191
195192 case ' S' :
196- t.ranges .insert (t.ranges .end (),
197- {
198- {MIN_CHAR, ' \x08 ' }, // Before \t (0-8)
199- {' \x0E ' , ' \x1F ' }, // Between \r and Space (14-31)
200- {' !' , MAX_CHAR} // After Space (33-127)
201- });
193+ t.ranges .insert (t.ranges .end (), {
194+ {MIN_CHAR, 8 }, // Before \t
195+ {14 , 31 }, // Between \r and Space
196+ {33 , MAX_CHAR} // After Space
197+ });
202198 break ;
203199 }
204200}
@@ -242,11 +238,11 @@ Token Tokenizer::read_char_class() {
242238
243239 bool have_prev = false ; // pending character for range
244240 bool last_was_shorthand = false ; // whether last token was \d, \w, etc.
245- char prev;
241+ ut8 prev;
246242
247243 // Read until closing ']'
248244 while (!eof () && peek () != ' ]' ) {
249- char c = get ();
245+ ut8 c = get ();
250246 if (c == ' \\ ' ) // Handle escape sequences
251247 {
252248 if (eof ())
@@ -313,7 +309,7 @@ Token Tokenizer::read_char_class() {
313309 // Handle range syntax:
314310 if (have_prev && c == ' -' &&
315311 peek () != ' ]' ) { // when '-' acts as a range specifier
316- char ub = get ();
312+ ut8 ub = get ();
317313 if (ub == ' \\ ' ) // Handle escaped upper bound
318314 {
319315 if (eof ())
@@ -385,15 +381,15 @@ Token Tokenizer::read_quantifier() {
385381 }
386382 };
387383
388- auto read_int = [&]() -> int {
384+ auto read_int = [&]() -> st32 {
389385 skip_spaces ();
390- int val = 0 ;
386+ st32 val = 0 ;
391387 bool found = false ;
392388 while (!eof () && std::isdigit (peek ())) {
393389 found = true ;
394390 val = val * 10 + (get () - ' 0' );
395391 }
396- if (!found)
392+ if (!found && peek () != ' , ' )
397393 PzError::report_error (PzError::PzErrorType::PZ_INVALID_INPUT,
398394 " Expected number in quantifier at position " +
399395 std::to_string (t.pos ));
0 commit comments