Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 02e3da0

Browse files
committed
diff_tables() now accepts all JoinDiffer params
1 parent e367da4 commit 02e3da0

File tree

2 files changed

+42
-15
lines changed

2 files changed

+42
-15
lines changed

data_diff/__init__.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .sqeleton.abcs import DbKey, DbTime, DbPath
66
from .diff_tables import Algorithm
77
from .hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
8-
from .joindiff_tables import JoinDiffer
8+
from .joindiff_tables import JoinDiffer, TABLE_WRITE_LIMIT
99
from .table_segment import TableSegment
1010

1111
__version__ = "0.3.0rc4"
@@ -55,17 +55,28 @@ def diff_tables(
5555
# Start/end update_column values, used to restrict the segment
5656
min_update: DbTime = None,
5757
max_update: DbTime = None,
58+
# Enable/disable threaded diffing. Needed to take advantage of database threads.
59+
threaded: bool = True,
60+
# Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
61+
# There may be many pools, so number of actual threads can be a lot higher.
62+
max_threadpool_size: Optional[int] = 1,
5863
# Algorithm
5964
algorithm: Algorithm = Algorithm.HASHDIFF,
6065
# Into how many segments to bisect per iteration (hashdiff only)
6166
bisection_factor: int = DEFAULT_BISECTION_FACTOR,
6267
# When should we stop bisecting and compare locally (in row count; hashdiff only)
6368
bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD,
64-
# Enable/disable threaded diffing. Needed to take advantage of database threads.
65-
threaded: bool = True,
66-
# Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
67-
# There may be many pools, so number of actual threads can be a lot higher.
68-
max_threadpool_size: Optional[int] = 1,
69+
# Enable/disable validating that the key columns are unique. (joindiff only)
70+
validate_unique_key: bool = True,
71+
# Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only)
72+
sample_exclusive_rows: bool = False,
73+
# Path of new table to write diff results to. Disabled if not provided. (joindiff only)
74+
materialize_to_table: Union[str, DbPath] = None,
75+
# Materialize every row, not just those that are different. (joindiff only)
76+
materialize_all_rows: bool = False,
77+
# Maximum number of rows to write when materializing, per thread. (joindiff only)
78+
table_write_limit: int = TABLE_WRITE_LIMIT,
79+
6980
) -> Iterator:
7081
"""Finds the diff between table1 and table2.
7182
@@ -78,14 +89,21 @@ def diff_tables(
7889
max_key (:data:`DbKey`, optional): Highest key value, used to restrict the segment
7990
min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
8091
max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
81-
algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
82-
bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
83-
bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
84-
and compare locally. (Used when algorithm is `HASHDIFF`).
8592
threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
8693
max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
8794
Only relevant when `threaded` is ``True``.
8895
There may be many pools, so number of actual threads can be a lot higher.
96+
algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
97+
bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
98+
bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
99+
and compare locally. (Used when algorithm is `HASHDIFF`).
100+
validate_unique_key (bool): Enable/disable validating that the key columns are unique. (used for `JOINDIFF`. default: True)
101+
The validation runs as a single query and can't be threaded, so it's very slow on non-cloud dbs.
102+
Future versions will detect UNIQUE constraints in the schema.
103+
sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False)
104+
materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. A ``str`` is parsed into a path using the database of `table1`. Disabled if not provided. (used for `JOINDIFF`)
105+
materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
106+
table_write_limit (int): Maximum number of rows to write when materializing, per thread. (used for `JOINDIFF`. default: ``TABLE_WRITE_LIMIT``)
89107
90108
Note:
91109
The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
@@ -133,9 +151,16 @@ def diff_tables(
133151
max_threadpool_size=max_threadpool_size,
134152
)
135153
elif algorithm == Algorithm.JOINDIFF:
154+
if isinstance(materialize_to_table, str):
155+
materialize_to_table = table1.database.parse_table_name(materialize_to_table)
136156
differ = JoinDiffer(
137157
threaded=threaded,
138158
max_threadpool_size=max_threadpool_size,
159+
validate_unique_key=validate_unique_key,
160+
sample_exclusive_rows=sample_exclusive_rows,
161+
materialize_to_table=materialize_to_table,
162+
materialize_all_rows=materialize_all_rows,
163+
table_write_limit=table_write_limit
139164
)
140165
else:
141166
raise ValueError(f"Unknown algorithm: {algorithm}")

data_diff/joindiff_tables.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,16 +123,18 @@ class JoinDiffer(TableDiffer):
123123
max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
124124
Only relevant when `threaded` is ``True``.
125125
There may be many pools, so number of actual threads can be a lot higher.
126-
validate_unique_key (bool): Enable/disable validating that the key columns are unique.
127-
Single query, and can't be threaded, so it's very slow on non-cloud dbs.
128-
Future versions will detect UNIQUE constraints in the schema.
129-
sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table.
126+
validate_unique_key (bool): Enable/disable validating that the key columns are unique. (default: True)
127+
If there are no UNIQUE constraints in the schema, it is done in a single query,
128+
and can't be threaded, so it's very slow on non-cloud dbs.
129+
sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. (default: False)
130+
Creates a temporary table.
130131
materialize_to_table (DbPath, optional): Path of new table to write diff results to. Disabled if not provided.
132+
materialize_all_rows (bool): Materialize every row, not just those that are different. (default: False)
131133
table_write_limit (int): Maximum number of rows to write when materializing, per thread.
132134
"""
133135

134136
validate_unique_key: bool = True
135-
sample_exclusive_rows: bool = True
137+
sample_exclusive_rows: bool = False
136138
materialize_to_table: DbPath = None
137139
materialize_all_rows: bool = False
138140
table_write_limit: int = TABLE_WRITE_LIMIT

0 commit comments

Comments
 (0)