Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 7444053

Browse files
authored
Merge branch 'master' into version-tracking
2 parents a9e3555 + 92c6274 commit 7444053

File tree

10 files changed

+64
-37
lines changed

10 files changed

+64
-37
lines changed

data_diff/__init__.py

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
from .sqeleton.abcs import DbKey, DbTime, DbPath
66
from .diff_tables import Algorithm
77
from .hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
8-
from .joindiff_tables import JoinDiffer
8+
from .joindiff_tables import JoinDiffer, TABLE_WRITE_LIMIT
99
from .table_segment import TableSegment
10+
from .utils import eval_name_template
1011

1112
def connect_to_table(
1213
db_info: Union[str, dict],
@@ -53,17 +54,27 @@ def diff_tables(
5354
# Start/end update_column values, used to restrict the segment
5455
min_update: DbTime = None,
5556
max_update: DbTime = None,
56-
# Algorithm
57-
algorithm: Algorithm = Algorithm.HASHDIFF,
58-
# Into how many segments to bisect per iteration (hashdiff only)
59-
bisection_factor: int = DEFAULT_BISECTION_FACTOR,
60-
# When should we stop bisecting and compare locally (in row count; hashdiff only)
61-
bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD,
6257
# Enable/disable threaded diffing. Needed to take advantage of database threads.
6358
threaded: bool = True,
6459
# Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
6560
# There may be many pools, so number of actual threads can be a lot higher.
6661
max_threadpool_size: Optional[int] = 1,
62+
# Algorithm
63+
algorithm: Algorithm = Algorithm.AUTO,
64+
# Into how many segments to bisect per iteration (hashdiff only)
65+
bisection_factor: int = DEFAULT_BISECTION_FACTOR,
66+
# When should we stop bisecting and compare locally (in row count; hashdiff only)
67+
bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD,
68+
# Enable/disable validating that the key columns are unique. (joindiff only)
69+
validate_unique_key: bool = True,
70+
# Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only)
71+
sample_exclusive_rows: bool = False,
72+
# Path of new table to write diff results to. Disabled if not provided. (joindiff only)
73+
materialize_to_table: Union[str, DbPath] = None,
74+
# Materialize every row, not just those that are different. (joindiff only)
75+
materialize_all_rows: bool = False,
76+
# Maximum number of rows to write when materializing, per thread. (joindiff only)
77+
table_write_limit: int = TABLE_WRITE_LIMIT,
6778
) -> Iterator:
6879
"""Finds the diff between table1 and table2.
6980
@@ -76,14 +87,21 @@ def diff_tables(
7687
max_key (:data:`DbKey`, optional): Highest key value, used to restrict the segment
7788
min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
7889
max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
79-
algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
80-
bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
81-
bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
82-
and compare locally. (Used when algorithm is `HASHDIFF`).
8390
threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
8491
max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
8592
Only relevant when `threaded` is ``True``.
8693
There may be many pools, so number of actual threads can be a lot higher.
94+
algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`. Default=`AUTO`)
95+
bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
96+
bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
97+
and compare locally. (Used when algorithm is `HASHDIFF`).
98+
validate_unique_key (bool): Enable/disable validating that the key columns are unique. (used for `JOINDIFF`. default: True)
99+
Single query, and can't be threaded, so it's very slow on non-cloud dbs.
100+
Future versions will detect UNIQUE constraints in the schema.
101+
sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False)
102+
materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`.
103+
materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
104+
table_write_limit (int): Maximum number of rows to write when materializing, per thread.
87105
88106
Note:
89107
The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
@@ -123,6 +141,9 @@ def diff_tables(
123141
segments = [t.new(**override_attrs) for t in tables] if override_attrs else tables
124142

125143
algorithm = Algorithm(algorithm)
144+
if algorithm == Algorithm.AUTO:
145+
algorithm = Algorithm.JOINDIFF if table1.database is table2.database else Algorithm.HASHDIFF
146+
126147
if algorithm == Algorithm.HASHDIFF:
127148
differ = HashDiffer(
128149
bisection_factor=bisection_factor,
@@ -131,9 +152,16 @@ def diff_tables(
131152
max_threadpool_size=max_threadpool_size,
132153
)
133154
elif algorithm == Algorithm.JOINDIFF:
155+
if isinstance(materialize_to_table, str):
156+
materialize_to_table = table1.database.parse_table_name(eval_name_template(materialize_to_table))
134157
differ = JoinDiffer(
135158
threaded=threaded,
136159
max_threadpool_size=max_threadpool_size,
160+
validate_unique_key=validate_unique_key,
161+
sample_exclusive_rows=sample_exclusive_rows,
162+
materialize_to_table=materialize_to_table,
163+
materialize_all_rows=materialize_all_rows,
164+
table_write_limit=table_write_limit,
137165
)
138166
else:
139167
raise ValueError(f"Unknown algorithm: {algorithm}")

data_diff/__main__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -
123123
)
124124
@click.option(
125125
"-m",
126-
"--materialize",
126+
"--materialize-to-table",
127127
default=None,
128128
metavar="TABLE_NAME",
129129
help="(joindiff only) Materialize the diff results into a new table in the database. If a table exists by that name, it will be replaced.",
@@ -248,7 +248,7 @@ def _main(
248248
sample_exclusive_rows,
249249
materialize_all_rows,
250250
table_write_limit,
251-
materialize,
251+
materialize_to_table,
252252
threads1=None,
253253
threads2=None,
254254
__conf__=None,
@@ -340,7 +340,7 @@ def _main(
340340
sample_exclusive_rows=sample_exclusive_rows,
341341
materialize_all_rows=materialize_all_rows,
342342
table_write_limit=table_write_limit,
343-
materialize_to_table=materialize and db1.parse_table_name(eval_name_template(materialize)),
343+
materialize_to_table=materialize_to_table and db1.parse_table_name(eval_name_template(materialize_to_table)),
344344
)
345345
else:
346346
assert algorithm == Algorithm.HASHDIFF

data_diff/joindiff_tables.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,16 +123,18 @@ class JoinDiffer(TableDiffer):
123123
max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
124124
Only relevant when `threaded` is ``True``.
125125
There may be many pools, so number of actual threads can be a lot higher.
126-
validate_unique_key (bool): Enable/disable validating that the key columns are unique.
127-
Single query, and can't be threaded, so it's very slow on non-cloud dbs.
128-
Future versions will detect UNIQUE constraints in the schema.
129-
sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table.
126+
validate_unique_key (bool): Enable/disable validating that the key columns are unique. (default: True)
127+
If there are no UNIQUE constraints in the schema, it is done in a single query,
128+
and can't be threaded, so it's very slow on non-cloud dbs.
129+
sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. (default: False)
130+
Creates a temporary table.
130131
materialize_to_table (DbPath, optional): Path of new table to write diff results to. Disabled if not provided.
132+
materialize_all_rows (bool): Materialize every row, not just those that are different. (default: False)
131133
table_write_limit (int): Maximum number of rows to write when materializing, per thread.
132134
"""
133135

134136
validate_unique_key: bool = True
135-
sample_exclusive_rows: bool = True
137+
sample_exclusive_rows: bool = False
136138
materialize_to_table: DbPath = None
137139
materialize_all_rows: bool = False
138140
table_write_limit: int = TABLE_WRITE_LIMIT

data_diff/sqeleton/databases/connect.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from .duckdb import DuckDB
2222

2323

24-
2524
@dataclass
2625
class MatchUriPath:
2726
database_cls: Type[Database]

data_diff/sqeleton/queries/ast_classes.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,7 @@ class CurrentTimestamp(ExprNode):
786786
def compile(self, c: Compiler) -> str:
787787
return c.dialect.current_timestamp()
788788

789+
789790
# DDL
790791

791792

data_diff/sqeleton/queries/compiler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
cv_params = contextvars.ContextVar("params")
1313

14+
1415
class Root:
1516
"Nodes inheriting from Root can be used as root statements in SQL (e.g. SELECT yes, RANDOM() no)"
1617

@@ -38,6 +39,7 @@ def compile(self, elem, params=None) -> str:
3839

3940
if self.root and isinstance(elem, Compilable) and not isinstance(elem, Root):
4041
from .ast_classes import Select
42+
4143
elem = Select(columns=[elem])
4244

4345
res = self._compile(elem)

tests/common.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def get_git_revision_short_hash() -> str:
8787
_database_instances = {}
8888

8989

90-
def get_conn(cls: type, shared: bool =True) -> Database:
90+
def get_conn(cls: type, shared: bool = True) -> Database:
9191
if shared:
9292
if cls not in _database_instances:
9393
_database_instances[cls] = get_conn(cls, shared=False)
@@ -181,6 +181,7 @@ def _test_per_database(cls):
181181

182182
return _test_per_database
183183

184+
184185
def table_segment(database, table_path, key_columns, *args, **kw):
185186
if isinstance(key_columns, str):
186187
key_columns = (key_columns,)

tests/sqeleton/test_sql.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ def test_compile_int(self):
1818
self.assertEqual("1", self.compiler.compile(1))
1919

2020
def test_compile_table_name(self):
21-
self.assertEqual("`marine_mammals`.`walrus`", self.compiler.replace(root=False).compile(table("marine_mammals", "walrus")))
21+
self.assertEqual(
22+
"`marine_mammals`.`walrus`", self.compiler.replace(root=False).compile(table("marine_mammals", "walrus"))
23+
)
2224

2325
def test_compile_select(self):
2426
expected_sql = "SELECT name FROM `marine_mammals`.`walrus`"

tests/test_api.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,14 @@ def test_api_get_stats_dict(self):
6767
"updated": 0,
6868
"unchanged": 4,
6969
"total": 1,
70-
"stats": {"rows_downloaded": 5},
70+
# "stats": {"rows_downloaded": 5},
7171
}
7272
t1 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_src_name)
7373
t2 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_dst_name)
7474
diff = diff_tables(t1, t2)
75-
output = diff.get_stats_dict()
7675

76+
output = diff.get_stats_dict()
77+
output.pop('stats')
7778
self.assertEqual(expected_dict, output)
7879
self.assertIsNotNone(diff)
7980
assert len(list(diff)) == 1

tests/test_diff_tables.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
test_each_database: Callable = test_each_database_in_list(TEST_DATABASES)
3232

3333

34-
3534
class TestUtils(unittest.TestCase):
3635
def test_split_space(self):
3736
for i in range(0, 10):
@@ -90,24 +89,16 @@ def test_basic(self):
9089
def test_offset(self):
9190
differ = HashDiffer(bisection_factor=2, bisection_threshold=10)
9291
sec1 = self.now.shift(seconds=-3).datetime
93-
a = table_segment(
94-
self.connection, self.table_src_path, "id", "datetime", max_update=sec1, case_sensitive=False
95-
)
96-
b = table_segment(
97-
self.connection, self.table_dst_path, "id", "datetime", max_update=sec1, case_sensitive=False
98-
)
92+
a = table_segment(self.connection, self.table_src_path, "id", "datetime", max_update=sec1, case_sensitive=False)
93+
b = table_segment(self.connection, self.table_dst_path, "id", "datetime", max_update=sec1, case_sensitive=False)
9994
assert a.count() == 4, a.count()
10095
assert b.count() == 3
10196

10297
assert not list(differ.diff_tables(a, a))
10398
self.assertEqual(len(list(differ.diff_tables(a, b))), 1)
10499

105-
a = table_segment(
106-
self.connection, self.table_src_path, "id", "datetime", min_update=sec1, case_sensitive=False
107-
)
108-
b = table_segment(
109-
self.connection, self.table_dst_path, "id", "datetime", min_update=sec1, case_sensitive=False
110-
)
100+
a = table_segment(self.connection, self.table_src_path, "id", "datetime", min_update=sec1, case_sensitive=False)
101+
b = table_segment(self.connection, self.table_dst_path, "id", "datetime", min_update=sec1, case_sensitive=False)
111102
assert a.count() == 2
112103
assert b.count() == 2
113104
assert not list(differ.diff_tables(a, b))

0 commit comments

Comments (0)