edge-cases in Oracle/pgsql parsing, notes on Rust

richarah · richarah · commit 0347a8e8d493 · 2026-03-22T22:08:47.000+01:00
diff --git a/README.md b/README.md
@@ -8,11 +8,11 @@ Supports stored procedures (PL/pgSQL, T-SQL, MySQL, PL/SQL): where sqlglot falls
 
 #### What it's for
 
-Anything that parses/transpiles SQL on the hot path: proxies, sidecars, migration tools, linters, CI/CD, Git pre-commit hooks where Python would be too slow. Also a replacement for every regex that's pretending to be a parser.
+Anywhere SQL parsing meets the hot path: proxies, sidecars, migration, linters, Git pre-commit hooks, CI/CD pipelines. Also a replacement for every regex that's pretending to be a parser.
 
 #### Why this exists
 
-libsqlglot was born out of a gap in the C++ ecosystem: the lack of native tooling for efficient, high-volume and hassle-free parsing & transpilation between dozens of SQL dialects.
+Because life is too short to parse SQL in Python.
 
 Inspired by the original [sqlglot](https://github.com/tobymao/sqlglot), which did the decade-long work of mapping 31+ SQL dialects into an elegant, universal AST. libsqlglot does the comparatively trivial work of compiling it: the algorithm was already O(n), the runtime wasn't.
 
@@ -28,6 +28,7 @@ Inspired by the original [sqlglot](https://github.com/tobymao/sqlglot), which di
 - [Benchmarks](#benchmarks)
 - [Examples](#examples)
 - [Supported SQL dialects](#supported-sql-dialects)
+- [Contributing](#contributing)
 - [Licence](#licence)
 
 ## Functionality
@@ -239,7 +240,7 @@ cmake --build build
 
 ## Architecture
 
-Header-only design: you only pay for what you use. 19 header files, no `.cpp`. See `include/libsqlglot/` for the full layout. Core files: `parser.h` (4171 lines), `generator.h` (2149), `expression.h` (1385, 115 expression types). Entry point is `transpiler.h` (86 lines).
+Header-only design: you only pay for what you use. 19 header files, no `.cpp`. See `include/libsqlglot/` for the full layout. Core files: `parser.h` (4191 lines), `generator.h` (2149), `expression.h` (1385, 115 expression types). Entry point is `transpiler.h` (86 lines).
 
 ### Memory management
 
@@ -610,14 +611,16 @@ These dialects inherit features from a compatible base dialect and add specific
 
 ## Contributing
 
-# Contributing
-
-libsqlglot is a solo project. Bug reports, test cases, and dialect edge cases are welcome via GitHub issues. If you have a dialect you wish to see added, please open an issue or PR.
+libsqlglot is currently a solo project. Bug reports, test cases, and dialect edge cases are welcome via GitHub issues. If you have a dialect you wish to see added, please open an issue or PR.
 
 If a query parses incorrectly, or a dialect transformation produces wrong output, please open an issue with the input SQL, source dialect, target dialect, expected output and any other pertinent details.
 
 Pull requests are considered but there is no guarantee of merge. The codebase is intentionally small and opinionated.
 
+### How about a Rust rewrite?
+
+No.
+
 ## Licence
 
 Apache 2.0
diff --git a/include/libsqlglot/parser.h b/include/libsqlglot/parser.h
@@ -1499,7 +1499,8 @@ class Parser {
 
         // Database/Schema
         if (priv_upper == "CONNECT") return GrantStmt::PrivilegeType::CONNECT;
-        if (priv_upper == "TEMPORARY" || priv_upper == "TEMP") return GrantStmt::PrivilegeType::TEMPORARY;
+        if (priv_upper == "TEMPORARY") return GrantStmt::PrivilegeType::TEMPORARY;
+        if (priv_upper == "TEMP") return GrantStmt::PrivilegeType::TEMP;
         if (priv_upper == "USAGE") return GrantStmt::PrivilegeType::USAGE;
 
         // Procedures
@@ -1604,7 +1605,7 @@ class Parser {
                             if (std::string(current().text) == "PUBLIC" || std::string(current().text) == "public") {
                                 stmt->to_public = true;
                             } else {
-                                stmt->grantees.push_back(std::string(current().text));
+                                stmt->grantees.push_back(std::string(current().view(source_)));
                             }
                             advance();
                         }
@@ -1738,6 +1739,12 @@ class Parser {
                 } while (match(TokenType::COMMA));
 
                 expect(TokenType::RPAREN);
+
+                // Validate: column list cannot be empty
+                if (col_priv.columns.empty()) {
+                    error("Column list cannot be empty for column-level privilege");
+                }
+
                 stmt->column_privileges.push_back(std::move(col_priv));
                 handled_column_priv = true;
             }
@@ -1817,18 +1824,26 @@ class Parser {
 
                             // Check for third segment: catalog.schema.table
                             if (match(TokenType::DOT)) {
-                                stmt->schema_name = second_segment;  // catalog.schema becomes schema
+                                // Three-segment name: store as full qualified name
                                 if (current().type == TokenType::IDENTIFIER) {
                                     obj_name = obj_name + "." + second_segment + "." + std::string(current().text);
                                     advance();
                                 } else if (current().type == TokenType::STAR) {
                                     obj_name = obj_name + "." + second_segment + ".*";
                                     advance();
                                 }
+                                // Don't set schema_name for three-segment names - keep full name
                             } else {
                                 // Two-segment name: schema.table or *.*
-                                stmt->schema_name = obj_name;
-                                obj_name = second_segment;
+                                // For the first object, split into schema and name
+                                // For subsequent objects, store full qualified name
+                                if (stmt->object_name.empty()) {
+                                    stmt->schema_name = obj_name;
+                                    obj_name = second_segment;
+                                } else {
+                                    // Keep full qualified name for list items
+                                    obj_name = obj_name + "." + second_segment;
+                                }
                             }
                         }
 
@@ -1983,7 +1998,7 @@ class Parser {
                             if (std::string(current().text) == "PUBLIC" || std::string(current().text) == "public") {
                                 stmt->from_public = true;
                             } else {
-                                stmt->grantees.push_back(std::string(current().text));
+                                stmt->grantees.push_back(std::string(current().view(source_)));
                             }
                             advance();
                         }
@@ -2108,6 +2123,12 @@ class Parser {
                 } while (match(TokenType::COMMA));
 
                 expect(TokenType::RPAREN);
+
+                // Validate: column list cannot be empty
+                if (col_priv.columns.empty()) {
+                    error("Column list cannot be empty for column-level privilege");
+                }
+
                 stmt->column_privileges.push_back(std::move(col_priv));
                 handled_column_priv = true;
             }
@@ -2213,6 +2234,11 @@ class Parser {
                     advance();
                 }
             } while (match(TokenType::COMMA));
+
+            // Validate: must have at least one grantee
+            if (!stmt->from_public && stmt->grantees.empty()) {
+                error("REVOKE statement must specify at least one grantee");
+            }
         }
 
         // CASCADE / RESTRICT
@@ -2629,26 +2655,20 @@ class Parser {
 
         // Read up to 3 consecutive tokens that could form a delimiter
         // Examples: $ + $ = $$, / + / = //, | = |, ; = ;
-        // Note: $ may tokenize as ERROR, so we accept ERROR tokens here
+        // Be permissive: accept any non-keyword, non-identifier token
         for (int i = 0; i < 3 && !is_at_end(); ++i) {
             TokenType t = current().type;
 
-            // Stop if we hit keywords, identifiers, numbers, or strings
-            // But allow ERROR tokens (for $), and operator tokens (/, ;, |, etc.)
+            // Stop if we hit SQL keywords or identifiers (likely start of next statement)
             if (t == TokenType::IDENTIFIER ||
-                t == TokenType::NUMBER ||
-                t == TokenType::STRING ||
-                (t >= TokenType::SELECT && t < TokenType::EOF_TOKEN)) {  // Keywords range
+                (t >= TokenType::SELECT && t < TokenType::SEMICOLON)) {  // Keywords before punctuation
                 break;
             }
 
-            // Accumulate the token text
-            const char* text = get_token_text(current());
-            if (text) {
-                delim += std::string(text);
-            } else {
-                // Fallback to source view for tokens without text (e.g., ERROR tokens like $)
-                delim += std::string(current().view(source_));
+            // Accumulate the token text using source view (most reliable)
+            std::string_view token_view = current().view(source_);
+            if (!token_view.empty()) {
+                delim += std::string(token_view);
             }
             advance();