diff --git a/pyauparser/grammar.py b/pyauparser/grammar.py
index cafacb3..aaa727b 100644
--- a/pyauparser/grammar.py
+++ b/pyauparser/grammar.py
@@ -267,8 +267,7 @@ def load_file(file_or_path):
         """Load grammar information from file.
            http://goldparser.org/doc/egt/index.htm
         """
-        if (isinstance(file_or_path, str) or
-            isinstance(file_or_path, str)):
+        if isinstance(file_or_path, str):
             with open(file_or_path, "rb") as file:
                 return Grammar._load(file)
         else:
@@ -282,36 +281,36 @@ def read_empty():
 
         def read_byte():
             c = f.read(1)
-            return c[0] if len(c) == 1 else None
+            return c if len(c) == 1 else None
 
         def read_bool():
             c = f.read(1)
-            return ord(c[0]) == 1 if len(c) == 1 else None
+            return c[0] == 1 if len(c) == 1 else None
 
         def read_short():
             c = f.read(2)
-            return ord(c[0]) + ord(c[1]) * 256 if len(c) == 2 else None
+            return c[0] + c[1] * 256 if len(c) == 2 else None
 
         def read_string():
-            s = ""
+            s = bytes()
             while True:
                 c = f.read(2)
-                if len(c) < 2 or (ord(c[0]) == 0 and ord(c[1]) == 0):
+                if len(c) < 2 or (c[0] == 0 and c[1] == 0):
                     break
                 s += c
             return s.decode("utf-16le")
 
         def read_value():
             t = f.read(1)
-            if t == 'E':
+            if t == b'E':
                 return read_empty()
-            elif t == 'b':
+            elif t == b'b':
                 return read_byte()
-            elif t == 'B':
+            elif t == b'B':
                 return read_bool()
-            elif t == 'I':
+            elif t == b'I':
                 return read_short()
-            elif t == 'S':
+            elif t == b'S':
                 return read_string()
             else:
                 return None
@@ -323,44 +322,44 @@ def read_value():
             raise Exception("Unknown Header: " + header)
 
         # read records
-        while read_byte() == 'M':
+        while read_byte() == b'M':
             v = []
             for x in range(read_short()):
                 v.append(read_value())
 
             t = v[0] if len(v) > 0 else None
-            if t == 'p':    # Property
+            if t == b'p':   # Property
                 grm.properties[v[1]] = Property(v[1], v[2], v[3])
-            elif t == 't':  # Table Counts
+            elif t == b't': # Table Counts
                 tablecounts = tuple(v[1:])
-            elif t == 'c':  # Character Set Table
+            elif t == b'c': # Character Set Table
                 grm.charsets[v[1]] = CharacterSet(
                     v[1], v[2],
                     tuple([(v[i * 2 + 5], v[i * 2 + 6])
                            for i in range(v[3])]))
-            elif t == 'S':  # Symbol Record
+            elif t == b'S': # Symbol Record
                 grm.symbols[v[1]] = Symbol(v[1], v[2], v[3])
-            elif t == 'g':  # Group Record
+            elif t == b'g': # Group Record
                 grm.symbolgroups[v[1]] = SymbolGroup(
                     v[1], v[2], v[3], v[4], v[5], v[6], v[7],
                     tuple(v[10:]))
-            elif t == 'R':  # Production Record
+            elif t == b'R': # Production Record
                 grm.productions[v[1]] = Production(v[1], v[2], tuple(v[4:]))
-            elif t == 'I':  # Initial States Record
+            elif t == b'I': # Initial States Record
                 grm.dfainit, grm.lalrinit = v[1:]
-            elif t == 'D':  # DFA State Record
+            elif t == b'D': # DFA State Record
                 grm.dfastates[v[1]] = DFAState(
                     v[1], v[3] if v[2] else None,
                     tuple([DFAEdge(v[i * 3 + 5], v[i * 3 + 6])
-                           for i in range((len(v) - 5) / 3)]))
-            elif t == 'L':  # LALR State Record
+                           for i in range(int((len(v) - 5) / 3))]))
+            elif t == b'L': # LALR State Record
                 grm.lalrstates[v[1]] = LALRState(
                     v[1],
                     dict([(v[i * 4 + 3],
                            LALRAction(v[i * 4 + 3], v[i * 4 + 4], v[i * 4 + 5]))
-                          for i in range((len(v) - 3) / 4)]))
+                          for i in range(int((len(v) - 3) / 4))]))
             else:
-                raise Exception("Unknown type: " + t)
+                raise Exception("Unknown type: " + str(t))
 
         # check read counts
         readcounts = (len(grm.symbols),
diff --git a/pyauparser/lexer.py b/pyauparser/lexer.py
index 018f695..3d0b685 100644
--- a/pyauparser/lexer.py
+++ b/pyauparser/lexer.py
@@ -2,6 +2,51 @@ import sys
 from . import grammar
 
 
+class Buffer(object):
+    """Encapsulation of the data buffer
+    """
+
+    def __init__(self, file, is_unicode):
+        self.is_unicode = is_unicode
+        self.file = file
+        self.reset()
+
+    def reset(self):
+        self.buf = str() if self.is_unicode else bytes()
+        self.buf_cur = 0
+        self.buf_remain = 0
+
+    def fill(self):
+        if self.buf_cur >= 4096:
+            self.buf = self.buf[self.buf_cur:]
+            self.buf_cur = 0
+        self.buf += self.file.read(4096)
+        self.buf_remain = len(self.buf) - self.buf_cur
+
+    def peek_char(self, incr):
+        if incr < self.buf_remain:
+            return self.buf[self.buf_cur + incr]
+        else:
+            self.fill()
+            if incr < self.buf_remain:
+                return self.buf[self.buf_cur + incr]
+            else:
+                return None
+
+    def code(self, char):
+        return ord(char) if self.is_unicode else char
+
+    def get_data(self, data_size):
+        return self.buf[self.buf_cur:self.buf_cur + data_size]
+
+    def find_eol(self, start, size):
+        eol = '\n' if self.is_unicode else b'\n'
+        return self.buf.find(eol, start, self.buf_cur + size)
+
+    def seek_forward(self, value):
+        self.buf_cur += value
+        self.buf_remain -= value
+
+
 class Token(object):
     """Token which is a result from Lexer
@@ -31,8 +76,7 @@ def load_file(self, file_or_path, encoding=None):
         """ Load a file to lexer.
            File_or_path could be file object or file name.
         """
-        if (isinstance(file_or_path, str) or
-            isinstance(file_or_path, str)):
+        if isinstance(file_or_path, str):
             import codecs
             if encoding:
                 self._load(codecs.open(file_or_path, encoding=encoding), True)
@@ -45,33 +89,20 @@ def load_string(self, s):
         """ Load a string to lexer.
         """
         import io
-        self._load(io.StringIO(s), s is str)
+        self._load(io.StringIO(s), True)  # TODO: add load_bytes or similar
 
     def _load(self, file, is_unicode):
-        self.file = file
-        self.is_unicode = is_unicode
-        self.buf = "" if is_unicode else str()
-        self.buf_cur = 0
-        self.buf_remain = 0
+        self.buffer = Buffer(file, is_unicode)
         self.line = 1
         self.column = 1
         self.group_stack = []
 
-    def _load_buffer(self):
-        # shrink buffer
-        if self.buf_cur >= 4096:
-            self.buf = self.buf[self.buf_cur:]
-            self.buf_cur = 0
-        # read into buffer
-        self.buf += self.file.read(4096)
-        self.buf_remain = len(self.buf) - self.buf_cur
-
     def _consume_buffer(self, n):
         # update line, column position
-        start = self.buf_cur
+        start = self.buffer.buf_cur
         new_line_i = -1
         while True:
-            i = self.buf.find("\n", start, self.buf_cur + n)
+            i = self.buffer.find_eol(start, n)
             if i != -1:
                 start = new_line_i = i + 1
                 self.line += 1
@@ -79,16 +110,13 @@ def _consume_buffer(self, n):
                 if new_line_i == -1:
                     self.column += n
                 else:
-                    self.column = 1 + self.buf_cur + n - new_line_i
+                    self.column = 1 + self.buffer.buf_cur + n - new_line_i
                 break
         # manipulate buffer
-        if n < self.buf_remain:
-            self.buf_cur += n
-            self.buf_remain -= n
+        if n < self.buffer.buf_remain:
+            self.buffer.seek_forward(n)
         else:
-            self.buf = "" if self.is_unicode else str()
-            self.buf_cur = 0
-            self.buf_remain = 0
+            self.buffer.reset()
 
     @property
     def position(self):
@@ -102,18 +130,13 @@ def peek_token(self):
         cur = 0
         hit_symbol = None
         while True:
-            if cur < self.buf_remain:           # peek 1 char
-                c = self.buf[self.buf_cur + cur]
-            else:
-                self._load_buffer()
-                if cur < self.buf_remain:
-                    c = self.buf[self.buf_cur + cur]
-                else:
-                    break                       # if EOF
+            c = self.buffer.peek_char(cur)
+            if c is None:
+                break
             cur += 1
             next_index = -1
             # find next state
-            c_ord = ord(c)
+            c_ord = self.buffer.code(c)
             for (r_min, r_max), target_index, target in state.edges_lookup:
                 if c_ord >= r_min and c_ord <= r_max:
                     next_index = target_index
@@ -134,14 +157,11 @@ def peek_token(self):
                     hit_cur = cur
 
         if hit_symbol:
-            lexeme = self.buf[self.buf_cur:self.buf_cur + hit_cur]
-            return Token(hit_symbol, lexeme, self.position)
+            return Token(hit_symbol, self.buffer.get_data(hit_cur), self.position)
+        elif cur == 0:
+            return Token(self.grammar.symbol_EOF, "", self.position)
         else:
-            if cur == 0:
-                return Token(self.grammar.symbol_EOF, "", self.position)
-            else:
-                lexeme = self.buf[self.buf_cur:self.buf_cur + cur]
-                return Token(self.grammar.symbol_Error, lexeme, self.position)
+            return Token(self.grammar.symbol_Error, self.buffer.get_data(cur), self.position)
 
     def read_token(self):
         """ Read next token and return it.
@@ -193,7 +213,7 @@ def read_token(self):
                 top[1] = top[1] + token.lexeme
                 self._consume_buffer(len(token.lexeme))
             else:
-                top[1] = top[1] + token.lexeme[0]
+                top[1] = top[1] + token.lexeme[0:1]
                 self._consume_buffer(1)
 
     def read_token_all(self):
diff --git a/pyauparser/test/test_grammar.py b/pyauparser/test/test_grammar.py
index 40e72dd..2913804 100644
--- a/pyauparser/test/test_grammar.py
+++ b/pyauparser/test/test_grammar.py
@@ -19,7 +19,7 @@ def test_load(self):
         self.assertEqual(len(self.grammar.lalrstates), 19)
 
     def test_export(self):
-        with open("temp_operator_grammar.py", "wb") as f:
+        with open("temp_operator_grammar.py", "w") as f:
             self.grammar.export_to_py(f)
         import temp_operator_grammar
         grammar2 = temp_operator_grammar.load()