-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlexer.py
More file actions
126 lines (98 loc) · 3.56 KB
/
lexer.py
File metadata and controls
126 lines (98 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
from constants import SYMBOLS_DICT, RESERVED_WORDS_LIST
# Anchored: the WHOLE candidate word must be a number (e.g. 12, 12.5; note a
# trailing dot such as "12." is also accepted by this pattern).
NUMBER_REGEX_PATTERN = r"^\d+\.?\d*$"
# Boolean literals appear quoted in the source text; matched case-insensitively.
BOOLEANS_REGEX_PATTERN = r'"(true|false)"'
# Shortest double-quoted run (non-greedy so adjacent strings do not merge).
TXT_REGEX_PATTERN = r'".*?"'
# Two-character operators; tried before single-character symbols.
SPECIAL_SYMBOLS_REGEX_PATTERN = r"(?:<=|>=|<>)"
# Character class of the first character of every symbol in SYMBOLS_DICT.
# Fix: join with "" instead of "|" — inside [...] the "|" is a literal, so the
# previous pattern also (wrongly) accepted "|" as a symbol.
# NOTE(review): assumes every symbol is a single character — confirm against
# constants.SYMBOLS_DICT.
SYMBOLS_REGEX_PATTERN = r"[" + "".join(re.escape(symbol[0]) for symbol in SYMBOLS_DICT) + r"]"
# Anchored: identifier = leading letter, then word characters.
ID_REGEX_PATTERN = r"^[^\W\d_]\w*$"
# Word-bounded alternation of all reserved words.
RESERVED_WORDS_REGEX_PATTERN = r"\b(?:" + "|".join(RESERVED_WORDS_LIST) + r")\b"
# Match order matters: earlier entries win (e.g. reserved_word before id).
TOKEN_LIST = [
    ("num", re.compile(NUMBER_REGEX_PATTERN)),
    ("boolean", re.compile(BOOLEANS_REGEX_PATTERN, re.IGNORECASE)),
    ("str", re.compile(TXT_REGEX_PATTERN)),
    ("special_symbol", re.compile(SPECIAL_SYMBOLS_REGEX_PATTERN)),
    ("symbol", re.compile(SYMBOLS_REGEX_PATTERN)),
    ("reserved_word", re.compile(RESERVED_WORDS_REGEX_PATTERN)),
    ("id", re.compile(ID_REGEX_PATTERN)),
]
def classify_token(token, lex):
    """Return the display name for a lexeme.

    Symbols and reserved words are identified by the lexeme itself; booleans
    are reduced to a capitalized bare value ("true" -> "True"); everything
    else is identified by its token-type name.
    """
    verbatim_kinds = ("special_symbol", "symbol", "reserved_word")
    if token in verbatim_kinds:
        return lex
    if token == "boolean":
        # Drop the surrounding quotes and normalize case.
        return lex[1:-1].capitalize()
    return token
def aggregate_lex(token, lex):
    """Return the lexeme to store: string literals lose their surrounding
    quotes, every other kind passes through unchanged."""
    return lex[1:-1] if token == "str" else lex
class Token:
    """A lexical token: classified name, normalized lexeme, and position."""

    # Kinds whose printed form omits the lexeme (the name already carries it).
    _LEXLESS_KINDS = ("special_symbol", "symbol", "reserved_word", "boolean", "$")

    def __init__(self, token, lex, row, column):
        self.token = classify_token(token, lex)  # display name
        self.lex = aggregate_lex(token, lex)     # normalized lexeme
        self.row = row                           # 1-based line number
        self.column = column                     # 1-based column number
        self.token_type = token                  # raw token kind

    def __str__(self):
        if self.token_type in self._LEXLESS_KINDS:
            return "<{}, {}, {}>".format(self.token, self.row, self.column)
        return "<{}, {}, {}, {}>".format(self.token, self.lex, self.row, self.column)
def lexical(user_input):
    """Tokenize *user_input* into a list of Token objects.

    Scans line by line using a longest-match strategy: at each position the
    candidate word is shrunk from the right end of the line until some
    pattern in TOKEN_LIST accepts it.  A single quote starts a comment that
    runs to end of line.  On the first unmatchable character a
    "lexical_error" token is appended and scanning aborts; otherwise a
    terminating "$" token is appended at the end.
    """
    tokens = []
    lines = user_input.split("\n")
    abort_analysis = False
    for i in range(len(lines)):
        row = lines[i]
        j = 0  # current column (0-based) within this line
        while j < len(row):
            match = None
            # Ignore spaces.  NOTE(review): tabs are NOT skipped and would
            # trigger a lexical error — confirm that is intended.
            if row[j] == " ":
                j += 1
                continue
            # A single quote starts a comment: skip the rest of the line
            if row[j] == "'":
                break
            line_end = len(row)
            break_loop = False
            # Longest-match loop: shrink the candidate word from the right
            # until a pattern matches.  Anchored patterns (num, id) must
            # match the whole word; the others match a prefix of it.
            while j <= line_end:
                word = row[j:line_end]
                for token_type, compiled_regex in TOKEN_LIST:
                    match = compiled_regex.match(word)
                    if match:
                        # Capture the matched lexeme and where it ends
                        token_value = match.group()
                        token_end = match.end()
                        current_token = Token(token_type, token_value, i + 1, j + 1)
                        # Advance past the consumed lexeme
                        j += token_end
                        # Record the token
                        tokens.append(current_token)
                        break_loop = True
                        break
                if break_loop:
                    break
                line_end -= 1
            # No pattern matched any prefix at position j: report and abort
            if not match:
                error = Token(
                    "lexical_error",
                    ">>> Lexical Error (Line: {}, Pos: {})".format(i + 1, j + 1),
                    i + 1,
                    j + 1,
                )
                tokens.append(error)
                abort_analysis = True
                break
        if abort_analysis:
            break
    # End-of-input marker; reuses the last scanned line/column for position
    if not abort_analysis:
        tokens.append(Token("$", "$", i + 1, j + 1))
    return tokens