From 57bbd73a07286c59aef4e6e63be2a341e327d1c4 Mon Sep 17 00:00:00 2001
From: Guillaume Tassery
Date: Tue, 2 May 2023 10:26:00 +0200
Subject: [PATCH] Allow passing a custom lexer

---
 AUTHORS                         |  1 +
 sqlparse/__init__.py            | 19 +++++++++++--------
 sqlparse/engine/filter_stack.py |  4 ++--
 sqlparse/lexer.py               |  8 ++++++--
 tests/test_parse.py             | 32 ++++++++++++++++++++++++++++++++
 5 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 1717adff..9f971082 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -31,6 +31,7 @@ Alphabetical list of contributors:
 * Florian Bauer
 * Fredy Wijaya
 * Gavin Wahl
+* Guillaume Tassery
 * hurcy
 * Ian Robertson
 * JacekPliszka
diff --git a/sqlparse/__init__.py b/sqlparse/__init__.py
index db0d2fc9..3c79f95d 100644
--- a/sqlparse/__init__.py
+++ b/sqlparse/__init__.py
@@ -20,29 +20,31 @@
 __all__ = ['engine', 'filters', 'formatter', 'sql', 'tokens', 'cli']
 
 
-def parse(sql, encoding=None):
+def parse(sql, encoding=None, lexer=None):
     """Parse sql and return a list of statements.
 
     :param sql: A string containing one or more SQL statements.
     :param encoding: The encoding of the statement (optional).
+    :param lexer: A custom SQL query lexer (optional).
     :returns: A tuple of :class:`~sqlparse.sql.Statement` instances.
     """
-    return tuple(parsestream(sql, encoding))
+    return tuple(parsestream(sql, encoding=encoding, lexer=lexer))
 
 
-def parsestream(stream, encoding=None):
+def parsestream(stream, encoding=None, lexer=None):
     """Parses sql statements from file-like object.
 
     :param stream: A file-like object.
     :param encoding: The encoding of the stream contents (optional).
+    :param lexer: A custom SQL query lexer (optional).
     :returns: A generator of :class:`~sqlparse.sql.Statement` instances.
     """
     stack = engine.FilterStack()
     stack.enable_grouping()
-    return stack.run(stream, encoding)
+    return stack.run(stream, encoding=encoding, custom_lexer=lexer)
 
 
-def format(sql, encoding=None, **options):
+def format(sql, encoding=None, lexer=None, **options):
     """Format *sql* according to *options*.
 
     Available options are documented in :ref:`formatting`.
@@ -56,15 +58,16 @@ def format(sql, encoding=None, **options):
     options = formatter.validate_options(options)
     stack = formatter.build_filter_stack(stack, options)
     stack.postprocess.append(filters.SerializerUnicode())
-    return ''.join(stack.run(sql, encoding))
+    return ''.join(stack.run(sql, encoding=encoding, custom_lexer=lexer))
 
 
-def split(sql, encoding=None):
+def split(sql, encoding=None, lexer=None):
     """Split *sql* into single statements.
 
     :param sql: A string containing one or more SQL statements.
     :param encoding: The encoding of the statement (optional).
+    :param lexer: A custom SQL query lexer (optional).
     :returns: A list of strings.
""" stack = engine.FilterStack() - return [str(stmt).strip() for stmt in stack.run(sql, encoding)] + return [str(stmt).strip() for stmt in stack.run(sql, encoding=encoding, custom_lexer=lexer)] diff --git a/sqlparse/engine/filter_stack.py b/sqlparse/engine/filter_stack.py index 9665a224..6b41a64c 100644 --- a/sqlparse/engine/filter_stack.py +++ b/sqlparse/engine/filter_stack.py @@ -22,8 +22,8 @@ def __init__(self): def enable_grouping(self): self._grouping = True - def run(self, sql, encoding=None): - stream = lexer.tokenize(sql, encoding) + def run(self, sql, encoding=None, custom_lexer=None): + stream = lexer.tokenize(sql, encoding=encoding, custom_lexer=custom_lexer) # Process token stream for filter_ in self.preprocess: stream = filter_.process(stream) diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 9d25c9e6..dadf3b51 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -25,6 +25,9 @@ class Lexer: _default_intance = None + def __init__(self): + self.clear() + # Development notes: # - This class is prepared to be able to support additional SQL dialects # in the future by adding additional functions that take the place of @@ -146,10 +149,11 @@ def get_tokens(self, text, encoding=None): yield tokens.Error, char -def tokenize(sql, encoding=None): +def tokenize(sql, encoding=None, custom_lexer=None): """Tokenize sql. Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream of ``(token type, value)`` items. """ - return Lexer.get_default_instance().get_tokens(sql, encoding) + lexer = Lexer.get_default_instance() if custom_lexer is None else custom_lexer + return lexer.get_tokens(sql, encoding) diff --git a/tests/test_parse.py b/tests/test_parse.py index 5feef5a7..346e8319 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -566,3 +566,35 @@ def test_configurable_regex(): for t in tokens if t.ttype not in sqlparse.tokens.Whitespace )[4] == (sqlparse.tokens.Keyword, "zorder by") + + +def test_custom_lexer(): + lex = Lexer() + my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword) + + lex.set_SQL_REGEX( + keywords.SQL_REGEX[:38] + + [my_regex] + + keywords.SQL_REGEX[38:] + ) + lex.add_keywords(keywords.KEYWORDS_COMMON) + lex.add_keywords(keywords.KEYWORDS_ORACLE) + lex.add_keywords(keywords.KEYWORDS_PLPGSQL) + lex.add_keywords(keywords.KEYWORDS_HQL) + lex.add_keywords(keywords.KEYWORDS_MSACCESS) + lex.add_keywords(keywords.KEYWORDS) + + tokens = sqlparse.parse("select * from foo zorder by bar;", lexer=lex)[0] + assert list( + (t.ttype, t.value) + for t in tokens + if t.ttype not in sqlparse.tokens.Whitespace + )[4] == (sqlparse.tokens.Keyword, "zorder by") + + # Should not impact the parse who has a default configuration + tokens = sqlparse.parse("select * from foo forder by bar;")[0] + assert list( + (t.ttype, t.value) + for t in tokens + if t.ttype not in sqlparse.tokens.Whitespace + )[4] != (sqlparse.tokens.Keyword, "zorder by")