Merge pull request #185 from datafold/wildcard_columns

erezsh · web-flow · commit e10e59dacf67 · 2022-08-02T12:30:39.000+02:00
Added support for auto-detecting mutual columns, and using patterns in -c
diff --git a/README.md b/README.md
@@ -144,7 +144,7 @@ If a database is not on the list, we'd still love to support it. Open an issue
 to discuss it.
 
 Note: Because URLs allow many special characters, and may collide with the syntax of your command-line,
-it's recommended to surround them with quotes. Alternatively, you may provide them in a TOML file via the `--config` option. 
+it's recommended to surround them with quotes. Alternatively, you may provide them in a TOML file via the `--config` option.
 
 
 # How to install
@@ -195,7 +195,7 @@ Options:
   - `--help` - Show help message and exit.
   - `-k` or `--key-column` - Name of the primary key column
   - `-t` or `--update-column` - Name of updated_at/last_updated column
-  - `-c` or `--columns` - List of names of extra columns to compare
+  - `-c` or `--columns` - Name or pattern of extra columns to compare. Pattern syntax is like SQL, e.g. `%foob.r%`.
   - `-l` or `--limit` - Maximum number of differences to find (limits maximum bandwidth and runtime)
   - `-s` or `--stats` - Print stats instead of a detailed diff
   - `-d` or `--debug` - Print debug info
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -5,13 +5,14 @@
 import logging
 from itertools import islice
 
-from .utils import remove_password_from_url
+from .utils import remove_password_from_url, safezip, match_like
 
 from .diff_tables import (
     TableSegment,
     TableDiffer,
     DEFAULT_BISECTION_THRESHOLD,
     DEFAULT_BISECTION_FACTOR,
+    create_schema,
 )
 from .databases.connect import connect
 from .parse_time import parse_time_before_now, UNITS_STR, ParseError
@@ -39,6 +40,11 @@ def _remove_passwords_in_dict(d: dict):
             d[k] = remove_password_from_url(v)
 
 
+def _get_schema(pair):
+    db, table_path = pair
+    return db.query_table_schema(table_path)
+
+
 @click.command()
 @click.argument("database1", required=False)
 @click.argument("table1", required=False)
@@ -67,7 +73,11 @@ def _remove_passwords_in_dict(d: dict):
 @click.option("--json", "json_output", is_flag=True, help="Print JSONL output for machine readability")
 @click.option("-v", "--verbose", is_flag=True, help="Print extra info")
 @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
-@click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
+@click.option(
+    "--case-sensitive",
+    is_flag=True,
+    help="Column names are treated as case-sensitive. Otherwise, data-diff corrects their case according to schema.",
+)
 @click.option(
     "-j",
     "--threads",
@@ -111,7 +121,7 @@ def _main(
     verbose,
     interactive,
     threads,
-    keep_column_case,
+    case_sensitive,
     json_output,
     where,
     threads1=None,
@@ -158,35 +168,66 @@ def _main(
 
     db1 = connect(database1, threads1 or threads)
     db2 = connect(database2, threads2 or threads)
+    dbs = db1, db2
 
     if interactive:
-        db1.enable_interactive()
-        db2.enable_interactive()
+        for db in dbs:
+            db.enable_interactive()
 
     start = time.time()
 
     try:
         options = dict(
             min_update=max_age and parse_time_before_now(max_age),
             max_update=min_age and parse_time_before_now(min_age),
-            case_sensitive=keep_column_case,
+            case_sensitive=case_sensitive,
             where=where,
         )
     except ParseError as e:
         logging.error("Error while parsing age expression: %s" % e)
         return
 
-    table1_seg = TableSegment(db1, db1.parse_table_name(table1), key_column, update_column, columns, **options)
-    table2_seg = TableSegment(db2, db2.parse_table_name(table2), key_column, update_column, columns, **options)
-
     differ = TableDiffer(
         bisection_factor=bisection_factor,
         bisection_threshold=bisection_threshold,
         threaded=threaded,
         max_threadpool_size=threads and threads * 2,
         debug=debug,
     )
-    diff_iter = differ.diff_tables(table1_seg, table2_seg)
+
+    table_names = table1, table2
+    table_paths = [db.parse_table_name(t) for db, t in safezip(dbs, table_names)]
+
+    schemas = list(differ._thread_map(_get_schema, safezip(dbs, table_paths)))
+    schema1, schema2 = schemas = [
+        create_schema(db, table_path, schema, case_sensitive)
+        for db, table_path, schema in safezip(dbs, table_paths, schemas)
+    ]
+
+    mutual = schema1.keys() & schema2.keys()  # Case-aware, according to case_sensitive
+    logging.debug(f"Available mutual columns: {mutual}")
+
+    expanded_columns = set()
+    for c in columns:
+        match = set(match_like(c, mutual))
+        if not match:
+            m1 = None if any(match_like(c, schema1.keys())) else f"{db1}/{table1}"
+            m2 = None if any(match_like(c, schema2.keys())) else f"{db2}/{table2}"
+            not_matched = ", ".join(m for m in [m1, m2] if m)
+            raise ValueError(f"Column {c} not found in: {not_matched}")
+
+        expanded_columns |= match
+
+    columns = tuple(expanded_columns - {key_column, update_column})
+
+    logging.info(f"Diffing columns: key={key_column} update={update_column} extra={columns}")
+
+    segments = [
+        TableSegment(db, table_path, key_column, update_column, columns, **options)._with_raw_schema(raw_schema)
+        for db, table_path, raw_schema in safezip(dbs, table_paths, schemas)
+    ]
+
+    diff_iter = differ.diff_tables(*segments)
 
     if limit:
         diff_iter = islice(diff_iter, int(limit))
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -7,7 +7,7 @@
 import threading
 from abc import abstractmethod
 
-from data_diff.utils import is_uuid, safezip
+from data_diff.utils import CaseAwareMapping, is_uuid, safezip
 from .database_types import (
     AbstractDatabase,
     ColType,
@@ -180,16 +180,19 @@ def select_table_schema(self, path: DbPath) -> str:
             f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
         )
 
-    def query_table_schema(self, path: DbPath, filter_columns: Optional[Sequence[str]] = None) -> Dict[str, ColType]:
+    def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
         rows = self.query(self.select_table_schema(path), list)
         if not rows:
             raise RuntimeError(f"{self.name}: Table '{'.'.join(path)}' does not exist, or has no columns")
 
-        if filter_columns is not None:
-            accept = {i.lower() for i in filter_columns}
-            rows = [r for r in rows if r[0].lower() in accept]
+        d = {r[0]: r for r in rows}
+        assert len(d) == len(rows)
+        return d
 
-        col_dict: Dict[str, ColType] = {row[0]: self._parse_type(path, *row) for row in rows}
+    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+        accept = {i.lower() for i in filter_columns}
+
+        col_dict = {name: self._parse_type(path, *row) for name, row in raw_schema.items() if name.lower() in accept}
 
         self._refine_coltypes(path, col_dict)
 
diff --git a/data_diff/databases/database_types.py b/data_diff/databases/database_types.py
@@ -5,7 +5,7 @@
 
 from runtype import dataclass
 
-from data_diff.utils import ArithAlphanumeric, ArithUUID, ArithString
+from data_diff.utils import ArithAlphanumeric, ArithUUID, CaseAwareMapping
 
 
 DbPath = Tuple[str, ...]
@@ -171,10 +171,23 @@ def select_table_schema(self, path: DbPath) -> str:
         ...
 
     @abstractmethod
-    def query_table_schema(self, path: DbPath, filter_columns: Optional[Sequence[str]] = None) -> Dict[str, ColType]:
-        "Query the table for its schema for table in 'path', and return {column: type}"
+    def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
+        """Query the table for its schema for table in 'path', and return {column: tuple}
+        where the tuple is (col_name, type_repr, datetime_precision?, numeric_precision?, numeric_scale?)
+        """
         ...
 
+    @abstractmethod
+    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+        """Process the result of query_table_schema().
+
+        Done in a separate step, to minimize the amount of processed columns.
+        Needed because processing each column may:
+        * throw errors and warnings
+        * query the database to sample values
+
+        """
+
     @abstractmethod
     def parse_table_name(self, name: str) -> DbPath:
         "Parse the given table name into a DbPath"
@@ -254,44 +267,4 @@ def _normalize_table_path(self, path: DbPath) -> DbPath:
         ...
 
 
-class Schema(ABC):
-    @abstractmethod
-    def get_key(self, key: str) -> str:
-        ...
-
-    @abstractmethod
-    def __getitem__(self, key: str) -> ColType:
-        ...
-
-    @abstractmethod
-    def __setitem__(self, key: str, value):
-        ...
-
-    @abstractmethod
-    def __contains__(self, key: str) -> bool:
-        ...
-
-
-class Schema_CaseSensitive(dict, Schema):
-    def get_key(self, key):
-        return key
-
-
-class Schema_CaseInsensitive(Schema):
-    def __init__(self, initial):
-        self._dict = {k.lower(): (k, v) for k, v in dict(initial).items()}
-
-    def get_key(self, key: str) -> str:
-        return self._dict[key.lower()][0]
-
-    def __getitem__(self, key: str) -> ColType:
-        return self._dict[key.lower()][1]
-
-    def __setitem__(self, key: str, value):
-        k = key.lower()
-        if k in self._dict:
-            key = self._dict[k][0]
-        self._dict[k] = key, value
-
-    def __contains__(self, key):
-        return key.lower() in self._dict
+Schema = CaseAwareMapping
diff --git a/data_diff/databases/databricks.py b/data_diff/databases/databricks.py
@@ -68,7 +68,7 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
         # Subtracting 1 due to weird precision issues
         return max(super()._convert_db_precision_to_digits(p) - 1, 0)
 
-    def query_table_schema(self, path: DbPath, filter_columns: Optional[Sequence[str]] = None) -> Dict[str, ColType]:
+    def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
         # Databricks has INFORMATION_SCHEMA only for Databricks Runtime, not for Databricks SQL.
         # https://docs.databricks.com/spark/latest/spark-sql/language-manual/information-schema/columns.html
         # So, to obtain information about schema, we should use another approach.
@@ -80,35 +80,40 @@ def query_table_schema(self, path: DbPath, filter_columns: Optional[Sequence[str
             if not rows:
                 raise RuntimeError(f"{self.name}: Table '{'.'.join(path)}' does not exist, or has no columns")
 
-            if filter_columns is not None:
-                accept = {i.lower() for i in filter_columns}
-                rows = [r for r in rows if r.COLUMN_NAME.lower() in accept]
+            d = {r.COLUMN_NAME: r for r in rows}
+            assert len(d) == len(rows)
+            return d
 
-            resulted_rows = []
-            for row in rows:
-                row_type = "DECIMAL" if row.DATA_TYPE == 3 else row.TYPE_NAME
-                type_cls = self.TYPE_CLASSES.get(row_type, UnknownColType)
+    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+        accept = {i.lower() for i in filter_columns}
+        rows = [row for name, row in raw_schema.items() if name.lower() in accept]
 
-                if issubclass(type_cls, Integer):
-                    row = (row.COLUMN_NAME, row_type, None, None, 0)
+        resulted_rows = []
+        for row in rows:
+            row_type = "DECIMAL" if row.DATA_TYPE == 3 else row.TYPE_NAME
+            type_cls = self.TYPE_CLASSES.get(row_type, UnknownColType)
 
-                elif issubclass(type_cls, Float):
-                    numeric_precision = self._convert_db_precision_to_digits(row.DECIMAL_DIGITS)
-                    row = (row.COLUMN_NAME, row_type, None, numeric_precision, None)
+            if issubclass(type_cls, Integer):
+                row = (row.COLUMN_NAME, row_type, None, None, 0)
 
-                elif issubclass(type_cls, Decimal):
-                    # TYPE_NAME has a format DECIMAL(x,y)
-                    items = row.TYPE_NAME[8:].rstrip(")").split(",")
-                    numeric_precision, numeric_scale = int(items[0]), int(items[1])
-                    row = (row.COLUMN_NAME, row_type, None, numeric_precision, numeric_scale)
+            elif issubclass(type_cls, Float):
+                numeric_precision = self._convert_db_precision_to_digits(row.DECIMAL_DIGITS)
+                row = (row.COLUMN_NAME, row_type, None, numeric_precision, None)
 
-                elif issubclass(type_cls, Timestamp):
-                    row = (row.COLUMN_NAME, row_type, row.DECIMAL_DIGITS, None, None)
+            elif issubclass(type_cls, Decimal):
+                # TYPE_NAME has a format DECIMAL(x,y)
+                items = row.TYPE_NAME[8:].rstrip(")").split(",")
+                numeric_precision, numeric_scale = int(items[0]), int(items[1])
+                row = (row.COLUMN_NAME, row_type, None, numeric_precision, numeric_scale)
 
-                else:
-                    row = (row.COLUMN_NAME, row_type, None, None, None)
+            elif issubclass(type_cls, Timestamp):
+                row = (row.COLUMN_NAME, row_type, row.DECIMAL_DIGITS, None, None)
+
+            else:
+                row = (row.COLUMN_NAME, row_type, None, None, None)
+
+            resulted_rows.append(row)
 
-                resulted_rows.append(row)
         col_dict: Dict[str, ColType] = {row[0]: self._parse_type(path, *row) for row in resulted_rows}
 
         self._refine_coltypes(path, col_dict)
diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py
@@ -47,7 +47,7 @@ def create_connection(self):
             elif e.errno == mysql.errorcode.ER_BAD_DB_ERROR:
                 raise ConnectError("Database does not exist") from e
             else:
-                raise ConnectError(*e._args) from e
+                raise ConnectError(*e) from e
 
     def quote(self, s: str):
         return f"`{s}`"
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
diff --git a/data_diff/sql.py b/data_diff/sql.py
diff --git a/data_diff/utils.py b/data_diff/utils.py