-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCFGScanner.cpp
More file actions
125 lines (111 loc) · 4.74 KB
/
CFGScanner.cpp
File metadata and controls
125 lines (111 loc) · 4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#include "CFGScanner.h"
#include "Utilities/Unicode.h"
#include "StrUtils/StrUtils.h"
#include <unordered_map>
#include <sstream>
using namespace std;
namespace CFG {
namespace {
/* Map from each special token's literal spelling to its token type.
 * Several spellings map to the same type so grammars can be written in
 * plain ASCII (->, =>, ::=), LaTeX (\to, \rightarrow, \Rightarrow), or
 * with the actual Unicode symbols (→, ⇒, ε/ϵ). The underscore is an
 * ASCII-friendly stand-in for epsilon. */
const unordered_map<string, TokenType> kTokens = {
    { "->", TokenType::ARROW },
    { "=>", TokenType::ARROW },
    { "\\to", TokenType::ARROW },
    { "\\rightarrow", TokenType::ARROW },
    { "\\Rightarrow", TokenType::ARROW },
    { "→", TokenType::ARROW },
    { "⇒", TokenType::ARROW },
    { "::=", TokenType::ARROW },
    { "|", TokenType::BAR },
    { "ϵ", TokenType::EPSILON },
    { "ε", TokenType::EPSILON },
    { "_", TokenType::EPSILON },
};
/* Replacements for <cctype>, given that we're working with
* Unicode characters.
*/
/* True if ch is one of the ASCII capital letters 'A' through 'Z'. */
bool isUpper(char32_t ch) {
    return U'A' <= ch && ch <= U'Z';
}
/* True if ch is in the ASCII range [0, 127].
 *
 * char32_t is unsigned, so no lower-bound check is needed. Comparing
 * directly also avoids the implementation-defined narrowing that the
 * previous static_cast<int> comparison performed on code points above
 * INT_MAX; the result is identical for every possible char32_t value.
 */
bool isASCII(char32_t ch) {
    return ch <= 127;
}
/* True if ch is a whitespace character within the ASCII range. The
 * ASCII guard keeps the argument inside the domain that isspace()
 * accepts; non-ASCII whitespace (e.g. U+3000) is deliberately not
 * treated as whitespace.
 */
bool isSpace(char32_t ch) {
    if (ch > 127) return false;  // outside ASCII — same test isASCII performs
    return isspace(static_cast<int>(ch)) != 0;
}
/* Whether a character denotes a nonterminal symbol. By this grammar
 * format's convention, nonterminals are single uppercase ASCII letters;
 * everything else is a terminal.
 */
bool isNonterminal(char32_t ch) {
    return ch >= U'A' && ch <= U'Z';
}
/* Returns whether any token begins with the specified pattern. */
bool someTokenStartsWith(const string& soFar) {
for (const auto& token: kTokens) {
if (Utilities::startsWith(token.first, soFar)) return true;
}
return false;
}
/* Scans until either (1) a complete symbol is found or (2) the input is
 * consumed. In the first case, great! We return what token we got. In the
 * second case, we report an error, because we don't know what to do.
 *
 * Appends exactly one Token to result. Throws runtime_error if the stream
 * refuses to rewind after we over-read while probing for a multicharacter
 * token.
 *
 * NOTE(review): token accumulates UTF-8 *bytes*, and each rewind step
 * pairs one input.unget() with one token.pop_back() — this assumes
 * readChar/toUTF8 and unget() operate at the same byte granularity;
 * verify against the Unicode helpers.
 */
void scanSymbol(deque<Token>& result, istream& input) {
    /* Grab the first character, which we know exists. */
    string token = toUTF8(readChar(input));
    /* Keep extending this while one of our special characters starts with it. Stop
     * once we hit the end or when the next character is a space.
     */
    bool match = kTokens.count(token);
    while (someTokenStartsWith(token) && input.peek() != EOF && !isSpace(peekChar(input))) {
        token += toUTF8(readChar(input));
        match |= kTokens.count(token);  // remember whether any prefix was a complete token
    }
    /* One of two things is true at this point:
     *
     * 1. We have read something that matched at some point. In that case, use
     *    maximal munch to figure out what that something is, then return it.
     * 2. Nothing matched. That's okay! Because in CFG land we don't have multiletter
     *    expressions, we just want the first character that we read. Put the rest
     *    back into the stream and figure out what those tokens mean later.
     */
    if (match) {
        /* Maximal munch: shed trailing bytes until what's left is itself a token. */
        while (!kTokens.count(token)) {
            if (!input.unget()) throw runtime_error("Can't rewind stream.");
            token.pop_back();
        }
        /* The data field holds the token's type cast to a character code —
         * presumably a placeholder, since special tokens have no single
         * natural character; confirm against how to_string renders these. */
        result.push_back({ kTokens.at(token), static_cast<char32_t>(kTokens.at(token)) });
    } else {
        /* Rewind all the way. */
        while (!token.empty()) {
            if (!input.unget()) throw runtime_error("Can't rewind stream.");
            token.pop_back();
        }
        /* Just read one character; uppercase ASCII means nonterminal. */
        char32_t ch = readChar(input);
        result.push_back({ isNonterminal(ch)? TokenType::NONTERMINAL : TokenType::TERMINAL, ch });
    }
}
}
/* Tokenizes the entire input stream into a deque of tokens. Whitespace
 * separates tokens and is otherwise discarded. Two SCAN_EOF markers are
 * appended at the end because the downstream parser uses two tokens of
 * lookahead.
 */
deque<Token> scan(istream& input) {
    deque<Token> tokens;
    while (input.peek() != EOF) {
        if (isSpace(peekChar(input))) {
            (void) readChar(input);  // consume and discard the whitespace
            continue;
        }
        scanSymbol(tokens, input);
    }
    /* Two EOF markers, since our scanner needs two tokens of lookahead. */
    for (int i = 0; i < 2; i++) {
        tokens.push_back({ TokenType::SCAN_EOF, '$' });
    }
    return tokens;
}
/* Convenience overload: tokenizes a string by wrapping it in a stream
 * and delegating to the stream-based scanner.
 */
deque<Token> scan(const string& sourceText) {
    istringstream sourceStream(sourceText);
    return scan(sourceStream);
}
/* Renders a token as the UTF-8 encoding of its character data.
 *
 * NOTE(review): for special tokens (ARROW, BAR, etc.) scanSymbol stores
 * the token type cast to char32_t in the data field, so this will encode
 * that numeric code rather than the original spelling — confirm that is
 * the intended rendering for those tokens.
 */
string to_string(const Token& t) {
return toUTF8(t.data);
}
}