55from .sqeleton .abcs import DbKey , DbTime , DbPath
66from .diff_tables import Algorithm
77from .hashdiff_tables import HashDiffer , DEFAULT_BISECTION_THRESHOLD , DEFAULT_BISECTION_FACTOR
8- from .joindiff_tables import JoinDiffer
8+ from .joindiff_tables import JoinDiffer , TABLE_WRITE_LIMIT
99from .table_segment import TableSegment
10+ from .utils import eval_name_template
1011
1112def connect_to_table (
1213 db_info : Union [str , dict ],
@@ -53,17 +54,27 @@ def diff_tables(
5354 # Start/end update_column values, used to restrict the segment
5455 min_update : DbTime = None ,
5556 max_update : DbTime = None ,
56- # Algorithm
57- algorithm : Algorithm = Algorithm .HASHDIFF ,
58- # Into how many segments to bisect per iteration (hashdiff only)
59- bisection_factor : int = DEFAULT_BISECTION_FACTOR ,
60- # When should we stop bisecting and compare locally (in row count; hashdiff only)
61- bisection_threshold : int = DEFAULT_BISECTION_THRESHOLD ,
6257 # Enable/disable threaded diffing. Needed to take advantage of database threads.
6358 threaded : bool = True ,
6459 # Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
6560 # There may be many pools, so number of actual threads can be a lot higher.
6661 max_threadpool_size : Optional [int ] = 1 ,
62+ # Algorithm
63+ algorithm : Algorithm = Algorithm .AUTO ,
64+ # Into how many segments to bisect per iteration (hashdiff only)
65+ bisection_factor : int = DEFAULT_BISECTION_FACTOR ,
66+ # When should we stop bisecting and compare locally (in row count; hashdiff only)
67+ bisection_threshold : int = DEFAULT_BISECTION_THRESHOLD ,
68+ # Enable/disable validating that the key columns are unique. (joindiff only)
69+ validate_unique_key : bool = True ,
70+ # Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only)
71+ sample_exclusive_rows : bool = False ,
72+ # Path of new table to write diff results to. Disabled if not provided. (joindiff only)
73+ materialize_to_table : Union [str , DbPath ] = None ,
74+ # Materialize every row, not just those that are different. (joindiff only)
75+ materialize_all_rows : bool = False ,
76+ # Maximum number of rows to write when materializing, per thread. (joindiff only)
77+ table_write_limit : int = TABLE_WRITE_LIMIT ,
6778) -> Iterator :
6879 """Finds the diff between table1 and table2.
6980
@@ -76,14 +87,21 @@ def diff_tables(
7687 max_key (:data:`DbKey`, optional): Highest key value, used to restrict the segment
7788 min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
7889 max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
79- algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
80- bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
81- bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
82- and compare locally. (Used when algorithm is `HASHDIFF`).
8390 threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
8491 max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
8592 Only relevant when `threaded` is ``True``.
8693 There may be many pools, so number of actual threads can be a lot higher.
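94+ algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF`, `JOINDIFF` or `AUTO`. Default: `AUTO`, which selects `JOINDIFF` when both tables share the same database object, and `HASHDIFF` otherwise)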
95+ bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
96+ bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
97+ and compare locally. (Used when algorithm is `HASHDIFF`).
98+ validate_unique_key (bool): Enable/disable validating that the key columns are unique. (used for `JOINDIFF`. default: True)
99+ The validation runs as a single query and can't be threaded, so it's very slow on non-cloud dbs.
100+ Future versions will detect UNIQUE constraints in the schema.
101+ sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False)
102+ materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. (used for `JOINDIFF`)
103+ materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
104+ table_write_limit (int): Maximum number of rows to write when materializing, per thread. (used for `JOINDIFF`. default: `TABLE_WRITE_LIMIT`)
87105
88106 Note:
89107 The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
@@ -123,6 +141,9 @@ def diff_tables(
123141 segments = [t .new (** override_attrs ) for t in tables ] if override_attrs else tables
124142
125143 algorithm = Algorithm (algorithm )
144+ if algorithm == Algorithm .AUTO :
145+ algorithm = Algorithm .JOINDIFF if table1 .database is table2 .database else Algorithm .HASHDIFF
146+
126147 if algorithm == Algorithm .HASHDIFF :
127148 differ = HashDiffer (
128149 bisection_factor = bisection_factor ,
@@ -131,9 +152,16 @@ def diff_tables(
131152 max_threadpool_size = max_threadpool_size ,
132153 )
133154 elif algorithm == Algorithm .JOINDIFF :
155+ if isinstance (materialize_to_table , str ):
156+ materialize_to_table = table1 .database .parse_table_name (eval_name_template (materialize_to_table ))
134157 differ = JoinDiffer (
135158 threaded = threaded ,
136159 max_threadpool_size = max_threadpool_size ,
160+ validate_unique_key = validate_unique_key ,
161+ sample_exclusive_rows = sample_exclusive_rows ,
162+ materialize_to_table = materialize_to_table ,
163+ materialize_all_rows = materialize_all_rows ,
164+ table_write_limit = table_write_limit ,
137165 )
138166 else :
139167 raise ValueError (f"Unknown algorithm: { algorithm } " )
0 commit comments