metadata constants

varun-edachali-dbx · varun-edachali-dbx · commit 1defef36794b · 2025-07-30T12:04:38.000Z
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
diff --git a/src/databricks/sql/backend/sea/metadata_constants.py b/src/databricks/sql/backend/sea/metadata_constants.py
@@ -0,0 +1,76 @@
+"""
+Metadata column mappings for normalizing SEA results to match Thrift backend.
+
+This module defines the column mappings needed to transform SEA metadata
+results to use the same column names as the Thrift backend, ensuring
+consistency between the two backends.
+"""
+
+from typing import Dict, List, NamedTuple, Optional
+
+
+class ColumnMapping(NamedTuple):
+    """Pairs a SEA column name with the Thrift column name it is exposed as."""
+
+    sea_name: Optional[str]  # None when the column has no SEA source column
+    thrift_name: str  # Column name used in the normalized (Thrift-style) result
+
+
+# Column mappings per metadata operation; sea_name=None marks a Thrift-only column
+CATALOG_COLUMN_MAPPINGS: List[ColumnMapping] = [
+    ColumnMapping("catalog", "TABLE_CAT"),
+]
+
+SCHEMA_COLUMN_MAPPINGS: List[ColumnMapping] = [
+    ColumnMapping("databaseName", "TABLE_SCHEM"),
+    ColumnMapping(None, "TABLE_CATALOG"),  # Not in SEA; filled from context if given
+]
+
+TABLE_COLUMN_MAPPINGS: List[ColumnMapping] = [
+    ColumnMapping("catalogName", "TABLE_CAT"),
+    ColumnMapping("namespace", "TABLE_SCHEM"),
+    ColumnMapping("tableName", "TABLE_NAME"),
+    ColumnMapping("tableType", "TABLE_TYPE"),
+    ColumnMapping("remarks", "REMARKS"),
+    # Thrift-only columns, added as NULL for Thrift compatibility
+    ColumnMapping(None, "TYPE_CAT"),  # Always NULL
+    ColumnMapping(None, "TYPE_SCHEM"),  # Always NULL
+    ColumnMapping(None, "TYPE_NAME"),  # Always NULL
+    ColumnMapping(None, "SELF_REFERENCING_COL_NAME"),  # Always NULL
+    ColumnMapping(None, "REF_GENERATION"),  # Always NULL
+]
+
+COLUMN_COLUMN_MAPPINGS: List[ColumnMapping] = [
+    ColumnMapping("catalogName", "TABLE_CAT"),
+    ColumnMapping("namespace", "TABLE_SCHEM"),
+    ColumnMapping("tableName", "TABLE_NAME"),
+    ColumnMapping("col_name", "COLUMN_NAME"),
+    ColumnMapping(None, "DATA_TYPE"),  # Requires conversion from columnType
+    ColumnMapping("columnType", "TYPE_NAME"),
+    ColumnMapping("columnSize", "COLUMN_SIZE"),
+    ColumnMapping(None, "BUFFER_LENGTH"),  # Always NULL
+    ColumnMapping("decimalDigits", "DECIMAL_DIGITS"),
+    ColumnMapping("radix", "NUM_PREC_RADIX"),
+    ColumnMapping(None, "NULLABLE"),  # Derived from isNullable
+    ColumnMapping("remarks", "REMARKS"),
+    ColumnMapping(None, "COLUMN_DEF"),  # Always NULL
+    ColumnMapping(None, "SQL_DATA_TYPE"),  # Always NULL
+    ColumnMapping(None, "SQL_DATETIME_SUB"),  # Always NULL
+    ColumnMapping(None, "CHAR_OCTET_LENGTH"),  # Always NULL
+    ColumnMapping("ordinalPosition", "ORDINAL_POSITION"),
+    ColumnMapping("isNullable", "IS_NULLABLE"),
+    ColumnMapping(None, "SCOPE_CATALOG"),  # Always NULL
+    ColumnMapping(None, "SCOPE_SCHEMA"),  # Always NULL
+    ColumnMapping(None, "SCOPE_TABLE"),  # Always NULL
+    ColumnMapping(None, "SOURCE_DATA_TYPE"),  # Always NULL
+    ColumnMapping("isAutoIncrement", "IS_AUTOINCREMENT"),
+    ColumnMapping("isGenerated", "IS_GENERATEDCOLUMN"),
+]
+
+# Lookup from metadata operation name to its column mappings
+OPERATION_MAPPINGS: Dict[str, List[ColumnMapping]] = {
+    "catalogs": CATALOG_COLUMN_MAPPINGS,
+    "schemas": SCHEMA_COLUMN_MAPPINGS,
+    "tables": TABLE_COLUMN_MAPPINGS,
+    "columns": COLUMN_COLUMN_MAPPINGS,
+}
diff --git a/src/databricks/sql/backend/sea/metadata_normalizer.py b/src/databricks/sql/backend/sea/metadata_normalizer.py
@@ -0,0 +1,263 @@
+"""
+Metadata normalization for SEA backend results.
+
+This module provides functionality to normalize SEA metadata results
+to match the column names and data format expected from the Thrift backend.
+"""
+
+import logging
+from typing import List, Dict, Any, Optional, Tuple
+from databricks.sql.backend.sea.metadata_constants import (
+    OPERATION_MAPPINGS,
+    ColumnMapping,
+)
+from databricks.sql.backend.sea.utils.conversion import SqlType
+
+logger = logging.getLogger(__name__)
+
+
+# SQL type codes for metadata compatibility with the Thrift backend
+class TypeCodes:
+    """SQL type code constants for DATA_TYPE column values.
+
+    The values equal the java.sql.Types constants of the same names; OTHER is
+    the fallback that _convert_type_name_to_data_type uses for unknown types.
+    """
+
+    TINYINT = -6
+    SMALLINT = 5
+    INTEGER = 4
+    BIGINT = -5
+    FLOAT = 6
+    DOUBLE = 8
+    DECIMAL = 3
+    BINARY = -2
+    BOOLEAN = 16
+    CHAR = 1
+    VARCHAR = 12
+    TIMESTAMP = 93
+    DATE = 91
+    STRUCT = 2002
+    ARRAY = 2003
+    OTHER = 1111
+
+
+class MetadataNormalizer:
+    """Normalizes SEA metadata results to match Thrift backend column names."""
+
+    @staticmethod
+    def normalize_description(description: List[Tuple], operation: str) -> List[Tuple]:
+        """
+        Normalize column description to use Thrift standard names.
+
+        Args:
+            description: Original description from SEA
+            operation: The metadata operation (catalogs, schemas, tables, columns)
+
+        Returns:
+            Normalized description with Thrift column names
+        """
+        logger.debug("normalize_description called with operation: %s", operation)
+        logger.debug("Original description: %s", description)
+
+        mappings = OPERATION_MAPPINGS.get(operation, [])
+        if not mappings:
+            logger.debug("No mappings found for operation: %s", operation)
+            return description
+
+        # Create lookup from SEA names to Thrift names
+        sea_to_thrift = {
+            mapping.sea_name: mapping.thrift_name
+            for mapping in mappings
+            if mapping.sea_name
+        }
+        logger.debug("SEA to Thrift mapping: %s", sea_to_thrift)
+
+        # Create new description with normalized names
+        normalized_description = []
+        for col_desc in description:
+            (
+                name,
+                type_code,
+                display_size,
+                internal_size,
+                precision,
+                scale,
+                null_ok,
+            ) = col_desc
+
+            # Skip columns that don't exist in Thrift for tables operation
+            if operation == "tables" and name in ["isTemporary", "information"]:
+                continue
+
+            # Map SEA name to Thrift name
+            thrift_name = sea_to_thrift.get(name, name)
+
+            normalized_description.append(
+                (
+                    thrift_name,
+                    type_code,
+                    display_size,
+                    internal_size,
+                    precision,
+                    scale,
+                    null_ok,
+                )
+            )
+
+        # Add any missing Thrift-only columns required by Thrift spec
+        existing_names = {desc[0] for desc in normalized_description}
+        for mapping in mappings:
+            if mapping.sea_name is None and mapping.thrift_name not in existing_names:
+                # DATA_TYPE and NULLABLE hold the integer codes computed in
+                # normalize_row_data; other added columns keep the "string" type
+                if mapping.thrift_name in ("DATA_TYPE", "NULLABLE"):
+                    added_type = SqlType.INT
+                else:
+                    added_type = "string"
+                normalized_description.append(
+                    (mapping.thrift_name, added_type, None, None, None, None, None)
+                )
+
+        # For tables operation, ensure the columns are in Thrift order
+        if operation == "tables":
+            # Define the expected Thrift column order
+            thrift_order = [
+                "TABLE_CAT",
+                "TABLE_SCHEM",
+                "TABLE_NAME",
+                "TABLE_TYPE",
+                "REMARKS",
+                "TYPE_CAT",
+                "TYPE_SCHEM",
+                "TYPE_NAME",
+                "SELF_REFERENCING_COL_NAME",
+                "REF_GENERATION",
+            ]
+
+            # Index the descriptions by name, then rebuild them in Thrift order;
+            # any column that is not part of thrift_order is dropped
+            desc_map = {desc[0]: desc for desc in normalized_description}
+            normalized_description = [
+                desc_map[col_name] for col_name in thrift_order if col_name in desc_map
+            ]
+
+        logger.debug("Normalized description: %s", normalized_description)
+        return normalized_description
+
+    @staticmethod
+    def normalize_row_data(
+        rows: List[Dict[str, Any]],
+        operation: str,
+        context: Optional[Dict[str, Any]] = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        Normalize row data to use Thrift standard column names.
+
+        Args:
+            rows: Original row data from SEA
+            operation: The metadata operation (catalogs, schemas, tables, columns)
+            context: Optional extras; "catalog_name" fills TABLE_CATALOG for schemas
+
+        Returns:
+            Normalized row data with Thrift column names
+        """
+        logger.debug("normalize_row_data called with operation: %s", operation)
+        logger.debug("Number of rows to normalize: %d", len(rows))
+        if rows:
+            logger.debug("First row before normalization: %s", rows[0])
+
+        mappings = OPERATION_MAPPINGS.get(operation, [])
+        if not mappings:
+            logger.debug("No mappings found for operation: %s", operation)
+            return rows
+
+        # Create lookup from SEA names to Thrift names
+        sea_to_thrift = {
+            mapping.sea_name: mapping.thrift_name
+            for mapping in mappings
+            if mapping.sea_name
+        }
+        logger.debug("SEA to Thrift mapping: %s", sea_to_thrift)
+
+        normalized_rows = []
+        for row in rows:
+            normalized_row = {}
+
+            # Rename the SEA columns, skipping the tables-only ones Thrift lacks
+            for sea_name, value in row.items():
+                if operation == "tables" and sea_name in ["isTemporary", "information"]:
+                    # Skip these columns that exist in SEA but not in Thrift
+                    continue
+                thrift_name = sea_to_thrift.get(sea_name, sea_name)
+                normalized_row[thrift_name] = value
+
+            # Fill in the Thrift-only columns (mappings without a SEA source).
+            # Mappings that have a SEA source were already renamed above.
+            for mapping in mappings:
+                if mapping.sea_name is not None:
+                    continue
+
+                column = mapping.thrift_name
+                if column == "TABLE_CATALOG" and operation == "schemas" and context:
+                    # For schemas, populate TABLE_CATALOG with the catalog name
+                    # from context
+                    normalized_row[column] = context.get("catalog_name")
+                elif column == "DATA_TYPE":
+                    # Convert the SEA type name to a DATA_TYPE code; a missing or
+                    # NULL columnType is passed on as "" so it maps to OTHER
+                    normalized_row[column] = _convert_type_name_to_data_type(
+                        row.get("columnType") or ""
+                    )
+                elif column == "NULLABLE":
+                    # Convert isNullable to a NULLABLE code: 1 only for "YES"
+                    # NOTE(review): confirm SEA reports "YES"/"NO" in isNullable
+                    is_nullable = row.get("isNullable", "")
+                    normalized_row[column] = 1 if is_nullable == "YES" else 0
+                else:
+                    # No SEA equivalent: always NULL
+                    normalized_row[column] = None
+
+            normalized_rows.append(normalized_row)
+
+        if normalized_rows:
+            logger.debug("First row after normalization: %s", normalized_rows[0])
+
+        return normalized_rows
+
+
+def _convert_type_name_to_data_type(type_name: Optional[str]) -> int:
+    """
+    Convert a SEA column type name to its SQL DATA_TYPE code.
+
+    Matching is case-insensitive and ignores type parameters, so
+    "DECIMAL(10,2)" is looked up as "decimal" and "array<int>" as "array".
+    None, empty and unrecognised names map to TypeCodes.OTHER.
+    """
+    type_mapping = {
+        SqlType.BYTE: TypeCodes.TINYINT,
+        "tinyint": TypeCodes.TINYINT,
+        SqlType.SHORT: TypeCodes.SMALLINT,
+        "smallint": TypeCodes.SMALLINT,
+        SqlType.INT: TypeCodes.INTEGER,
+        "integer": TypeCodes.INTEGER,
+        SqlType.LONG: TypeCodes.BIGINT,
+        "bigint": TypeCodes.BIGINT,
+        SqlType.FLOAT: TypeCodes.FLOAT,
+        SqlType.DOUBLE: TypeCodes.DOUBLE,
+        SqlType.DECIMAL: TypeCodes.DECIMAL,
+        SqlType.BINARY: TypeCodes.BINARY,
+        SqlType.BOOLEAN: TypeCodes.BOOLEAN,
+        SqlType.CHAR: TypeCodes.CHAR,
+        SqlType.STRING: TypeCodes.VARCHAR,
+        "varchar": TypeCodes.VARCHAR,
+        SqlType.TIMESTAMP: TypeCodes.TIMESTAMP,
+        SqlType.DATE: TypeCodes.DATE,
+        SqlType.STRUCT: TypeCodes.STRUCT,
+        SqlType.ARRAY: TypeCodes.ARRAY,
+        SqlType.MAP: TypeCodes.VARCHAR,  # Maps are represented as VARCHAR in Thrift
+        SqlType.NULL: TypeCodes.VARCHAR,
+    }
+    # Keep only the base name: drop "(precision, scale)" / "<element type>" suffixes
+    base_name = (type_name or "").lower().split("(", 1)[0].split("<", 1)[0].strip()
+    return type_mapping.get(base_name, TypeCodes.OTHER)