55from .sqeleton .abcs import DbKey , DbTime , DbPath
66from .diff_tables import Algorithm
77from .hashdiff_tables import HashDiffer , DEFAULT_BISECTION_THRESHOLD , DEFAULT_BISECTION_FACTOR
8- from .joindiff_tables import JoinDiffer
8+ from .joindiff_tables import JoinDiffer , TABLE_WRITE_LIMIT
99from .table_segment import TableSegment
1010
1111__version__ = "0.3.0rc4"
@@ -55,17 +55,28 @@ def diff_tables(
5555 # Start/end update_column values, used to restrict the segment
5656 min_update : DbTime = None ,
5757 max_update : DbTime = None ,
58+ # Enable/disable threaded diffing. Needed to take advantage of database threads.
59+ threaded : bool = True ,
60+ # Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
61+ # There may be many pools, so number of actual threads can be a lot higher.
62+ max_threadpool_size : Optional [int ] = 1 ,
5863 # Algorithm
5964 algorithm : Algorithm = Algorithm .HASHDIFF ,
6065 # Into how many segments to bisect per iteration (hashdiff only)
6166 bisection_factor : int = DEFAULT_BISECTION_FACTOR ,
6267 # When should we stop bisecting and compare locally (in row count; hashdiff only)
6368 bisection_threshold : int = DEFAULT_BISECTION_THRESHOLD ,
64- # Enable/disable threaded diffing. Needed to take advantage of database threads.
65- threaded : bool = True ,
66- # Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
67- # There may be many pools, so number of actual threads can be a lot higher.
68- max_threadpool_size : Optional [int ] = 1 ,
69+ # Enable/disable validating that the key columns are unique. (joindiff only)
70+ validate_unique_key : bool = True ,
71+ # Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only)
72+ sample_exclusive_rows : bool = False ,
73+ # Path of new table to write diff results to. Disabled if not provided. (joindiff only)
74+ materialize_to_table : Union [str , DbPath ] = None ,
75+ # Materialize every row, not just those that are different. (joindiff only)
76+ materialize_all_rows : bool = False ,
77+ # Maximum number of rows to write when materializing, per thread. (joindiff only)
78+ table_write_limit : int = TABLE_WRITE_LIMIT ,
79+
6980) -> Iterator :
7081 """Finds the diff between table1 and table2.
7182
@@ -78,14 +89,21 @@ def diff_tables(
7889 max_key (:data:`DbKey`, optional): Highest key value, used to restrict the segment
7990 min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
8091 max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
81- algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
82- bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
83- bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
84- and compare locally. (Used when algorithm is `HASHDIFF`).
8592 threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
8693 max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
8794 Only relevant when `threaded` is ``True``.
8895 There may be many pools, so number of actual threads can be a lot higher.
96+ algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
97+ bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
98+ bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
99+ and compare locally. (Used when algorithm is `HASHDIFF`).
100+ validate_unique_key (bool): Enable/disable validating that the key columns are unique. (used for `JOINDIFF`. default: True)
101+ The check runs as a single query that can't be threaded, so it's very slow on non-cloud dbs.
102+ Future versions will detect UNIQUE constraints in the schema.
103+ sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False)
104+ materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`.
105+ materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
106+ table_write_limit (int): Maximum number of rows to write when materializing, per thread. (used for `JOINDIFF`. default: TABLE_WRITE_LIMIT)
89107
90108 Note:
91109 The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
@@ -133,9 +151,16 @@ def diff_tables(
133151 max_threadpool_size = max_threadpool_size ,
134152 )
135153 elif algorithm == Algorithm .JOINDIFF :
154+ if isinstance (materialize_to_table , str ):
155+ materialize_to_table = table1 .database .parse_table_name (materialize_to_table )
136156 differ = JoinDiffer (
137157 threaded = threaded ,
138158 max_threadpool_size = max_threadpool_size ,
159+ validate_unique_key = validate_unique_key ,
160+ sample_exclusive_rows = sample_exclusive_rows ,
161+ materialize_to_table = materialize_to_table ,
162+ materialize_all_rows = materialize_all_rows ,
163+ table_write_limit = table_write_limit
139164 )
140165 else :
141166 raise ValueError (f"Unknown algorithm: { algorithm } " )
0 commit comments