From 57bbd73a07286c59aef4e6e63be2a341e327d1c4 Mon Sep 17 00:00:00 2001
From: Guillaume Tassery
Date: Tue, 2 May 2023 10:26:00 +0200
Subject: [PATCH] Allow passing a custom lexer

---
 AUTHORS                         |  1 +
 sqlparse/__init__.py            | 19 +++++++++++--------
 sqlparse/engine/filter_stack.py |  4 ++--
 sqlparse/lexer.py               |  8 ++++++--
 tests/test_parse.py             | 32 ++++++++++++++++++++++++++++++++
 5 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 1717adff..9f971082 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -31,6 +31,7 @@ Alphabetical list of contributors:
 * Florian Bauer
 * Fredy Wijaya
 * Gavin Wahl
+* Guillaume Tassery
 * hurcy
 * Ian Robertson
 * JacekPliszka
diff --git a/sqlparse/__init__.py b/sqlparse/__init__.py
index db0d2fc9..3c79f95d 100644
--- a/sqlparse/__init__.py
+++ b/sqlparse/__init__.py
@@ -20,29 +20,31 @@
 __all__ = ['engine', 'filters', 'formatter', 'sql', 'tokens', 'cli']
 
 
-def parse(sql, encoding=None):
+def parse(sql, encoding=None, lexer=None):
     """Parse sql and return a list of statements.
 
     :param sql: A string containing one or more SQL statements.
     :param encoding: The encoding of the statement (optional).
+    :param lexer: A custom SQL query lexer (optional).
     :returns: A tuple of :class:`~sqlparse.sql.Statement` instances.
     """
-    return tuple(parsestream(sql, encoding))
+    return tuple(parsestream(sql, encoding=encoding, lexer=lexer))
 
 
-def parsestream(stream, encoding=None):
+def parsestream(stream, encoding=None, lexer=None):
     """Parses sql statements from file-like object.
 
     :param stream: A file-like object.
     :param encoding: The encoding of the stream contents (optional).
+    :param lexer: A custom SQL query lexer (optional).
     :returns: A generator of :class:`~sqlparse.sql.Statement` instances.
     """
     stack = engine.FilterStack()
     stack.enable_grouping()
-    return stack.run(stream, encoding)
+    return stack.run(stream, encoding=encoding, custom_lexer=lexer)
 
 
-def format(sql, encoding=None, **options):
+def format(sql, encoding=None, lexer=None, **options):
     """Format *sql* according to *options*.
 
     Available options are documented in :ref:`formatting`.
@@ -56,15 +58,16 @@ def format(sql, encoding=None, **options):
     options = formatter.validate_options(options)
     stack = formatter.build_filter_stack(stack, options)
     stack.postprocess.append(filters.SerializerUnicode())
-    return ''.join(stack.run(sql, encoding))
+    return ''.join(stack.run(sql, encoding=encoding, custom_lexer=lexer))
 
 
-def split(sql, encoding=None):
+def split(sql, encoding=None, lexer=None):
     """Split *sql* into single statements.
 
     :param sql: A string containing one or more SQL statements.
     :param encoding: The encoding of the statement (optional).
+    :param lexer: A custom SQL query lexer (optional).
     :returns: A list of strings.
""" stack = engine.FilterStack() - return [str(stmt).strip() for stmt in stack.run(sql, encoding)] + return [str(stmt).strip() for stmt in stack.run(sql, encoding=encoding, custom_lexer=lexer)] diff --git a/sqlparse/engine/filter_stack.py b/sqlparse/engine/filter_stack.py index 9665a224..6b41a64c 100644 --- a/sqlparse/engine/filter_stack.py +++ b/sqlparse/engine/filter_stack.py @@ -22,8 +22,8 @@ def __init__(self): def enable_grouping(self): self._grouping = True - def run(self, sql, encoding=None): - stream = lexer.tokenize(sql, encoding) + def run(self, sql, encoding=None, custom_lexer=None): + stream = lexer.tokenize(sql, encoding=encoding, custom_lexer=custom_lexer) # Process token stream for filter_ in self.preprocess: stream = filter_.process(stream) diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 9d25c9e6..dadf3b51 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -25,6 +25,9 @@ class Lexer: _default_intance = None + def __init__(self): + self.clear() + # Development notes: # - This class is prepared to be able to support additional SQL dialects # in the future by adding additional functions that take the place of @@ -146,10 +149,11 @@ def get_tokens(self, text, encoding=None): yield tokens.Error, char -def tokenize(sql, encoding=None): +def tokenize(sql, encoding=None, custom_lexer=None): """Tokenize sql. Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream of ``(token type, value)`` items. """ - return Lexer.get_default_instance().get_tokens(sql, encoding) + lexer = Lexer.get_default_instance() if custom_lexer is None else custom_lexer + return lexer.get_tokens(sql, encoding) diff --git a/tests/test_parse.py b/tests/test_parse.py index 5feef5a7..346e8319 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -566,3 +566,35 @@ def test_configurable_regex(): for t in tokens if t.ttype not in sqlparse.tokens.Whitespace )[4] == (sqlparse.tokens.Keyword, "zorder by") + + +def test_custom_lexer(): + lex = Lexer() + my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword) + + lex.set_SQL_REGEX( + keywords.SQL_REGEX[:38] + + [my_regex] + + keywords.SQL_REGEX[38:] + ) + lex.add_keywords(keywords.KEYWORDS_COMMON) + lex.add_keywords(keywords.KEYWORDS_ORACLE) + lex.add_keywords(keywords.KEYWORDS_PLPGSQL) + lex.add_keywords(keywords.KEYWORDS_HQL) + lex.add_keywords(keywords.KEYWORDS_MSACCESS) + lex.add_keywords(keywords.KEYWORDS) + + tokens = sqlparse.parse("select * from foo zorder by bar;", lexer=lex)[0] + assert list( + (t.ttype, t.value) + for t in tokens + if t.ttype not in sqlparse.tokens.Whitespace + )[4] == (sqlparse.tokens.Keyword, "zorder by") + + # Should not impact the parse who has a default configuration + tokens = sqlparse.parse("select * from foo forder by bar;")[0] + assert list( + (t.ttype, t.value) + for t in tokens + if t.ttype not in sqlparse.tokens.Whitespace + )[4] != (sqlparse.tokens.Keyword, "zorder by")