Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 81325b8

Browse files
committed
Added Database.normalize_uuid(); more tests (WIP)
1 parent 07ebc04 commit 81325b8

File tree

5 files changed: +63 additions, -2 deletions.

data_diff/databases/base.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,11 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType]):
197197
"Refine the types in the column dict, by querying the database for a sample of their values"
198198

199199
text_columns = [k for k, v in col_dict.items() if isinstance(v, Text)]
200+
if not text_columns:
201+
return
200202

201-
samples_by_row = self.query(Select(text_columns, TableName(table_path), limit=16), list)
203+
fields = [self.normalize_uuid(c, ColType_UUID()) for c in text_columns]
204+
samples_by_row = self.query(Select(fields, TableName(table_path), limit=16), list)
202205
samples_by_col = list(zip(*samples_by_row))
203206
for col_name, samples in safezip(text_columns, samples_by_col):
204207
uuid_samples = list(filter(is_uuid, samples))
@@ -270,6 +273,9 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
270273

271274
return f"LIMIT {limit}"
272275

276+
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
    """Return an SQL expression that normalizes the UUID expression 'value'.

    Default implementation: wraps 'value' in SQL ``TRIM(...)``.

    Args:
        value: SQL expression to normalize (e.g. a column name).
        coltype: The UUID column type. Not used by this implementation; it is
            part of the shared ``normalize_*`` signature.

    Returns:
        The SQL expression ``TRIM(<value>)`` as a string.

    NOTE(review): presumably the trim exists to drop the padding of
    fixed-width CHAR columns -- confirm for each dialect inheriting this.
    """
    return f"TRIM({value})"
278+
273279

274280
CHECKSUM_HEXDIGITS = 15  # Must be 15 or lower
MD5_HEXDIGITS = 32

data_diff/databases/database_types.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,15 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
169169
"""
170170
...
171171

172+
@abstractmethod
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
    """Creates an SQL expression, that converts 'value' to a normalized uuid.

    i.e. just makes sure there is no trailing whitespace.

    Args:
        value: SQL expression to normalize (e.g. a column name).
        coltype: The UUID column type of 'value'.

    Returns:
        An SQL expression, as a string, that evaluates to the normalized uuid.
    """
    ...
179+
180+
172181
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
173182
"""Creates an SQL expression, that converts 'value' to a normalized representation.
174183
@@ -189,6 +198,8 @@ def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
189198
return self.normalize_timestamp(value, coltype)
190199
elif isinstance(coltype, FractionalType):
191200
return self.normalize_number(value, coltype)
201+
elif isinstance(coltype, ColType_UUID):
202+
return self.normalize_uuid(value, coltype)
192203
return self.to_string(value)
193204

194205
def _normalize_table_path(self, path: DbPath) -> DbPath:

data_diff/databases/postgresql.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class PostgreSQL(ThreadedDatabase):
2626
"numeric": Decimal,
2727
"bigint": Integer,
2828
# Text
29+
"character": Text,
30+
"character varying": Text,
2931
"varchar": Text,
3032
"text": Text,
3133
}

tests/test_database_types.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import time
44
import re
55
import math
6+
import uuid
67
from datetime import datetime, timedelta
78
from decimal import Decimal
89
from parameterized import parameterized
@@ -157,11 +158,23 @@ def __next__(self) -> float:
157158
else:
158159
raise StopIteration
159160

161+
class UUID_Faker:
    """Sized, re-iterable source of 'max' version-1 UUID samples.

    Each iteration yields ``max`` new ``uuid.UUID`` objects. The item index is
    used as the UUID's node field; the remaining fields are time-based, so the
    values are not reproducible between iterations or runs.
    """

    def __init__(self, max):
        # 'max' shadows the builtin, but it is the constructor's public
        # parameter name, so it is kept for keyword-argument callers.
        self.max = max

    def __len__(self):
        return self.max

    def __iter__(self):
        # Build a fresh generator on every call so the faker can be iterated
        # more than once.
        return (uuid.uuid1(node=i) for i in range(self.max))
170+
171+
160172

161173
# Sample generators, keyed by the same type-category names that
# DATABASE_TYPES uses for each database ("int", "float", "uuid", ...).
TYPE_SAMPLES = {
    "int": IntFaker(N_SAMPLES),
    "datetime_no_timezone": DateTimeFaker(N_SAMPLES),
    "float": FloatFaker(N_SAMPLES),
    "uuid": UUID_Faker(N_SAMPLES),
}
166179

167180
DATABASE_TYPES = {
@@ -185,6 +198,11 @@ def __next__(self) -> float:
185198
"double precision",
186199
"numeric(6,3)",
187200
],
201+
"uuid": [
202+
"text",
203+
"varchar(100)",
204+
"char(100)",
205+
],
188206
},
189207
db.MySQL: {
190208
# https://dev.mysql.com/doc/refman/8.0/en/integer-types.html
@@ -210,6 +228,10 @@ def __next__(self) -> float:
210228
"numeric",
211229
"numeric(65, 10)",
212230
],
231+
"uuid": [
232+
"varchar(100)",
233+
"char(100)",
234+
],
213235
},
214236
db.BigQuery: {
215237
"int": ["int"],
@@ -222,6 +244,9 @@ def __next__(self) -> float:
222244
"float64",
223245
"bignumeric",
224246
],
247+
"uuid": [
248+
"$uuid"
249+
],
225250
},
226251
db.Snowflake: {
227252
# https://docs.snowflake.com/en/sql-reference/data-types-numeric.html#int-integer-bigint-smallint-tinyint-byteint
@@ -246,6 +271,10 @@ def __next__(self) -> float:
246271
"float",
247272
"numeric",
248273
],
274+
"uuid": [
275+
"$uuid"
276+
],
277+
249278
},
250279
db.Redshift: {
251280
"int": [
@@ -260,6 +289,9 @@ def __next__(self) -> float:
260289
"float8",
261290
"numeric",
262291
],
292+
"uuid": [
293+
"$uuid"
294+
],
263295
},
264296
db.Oracle: {
265297
"int": [
@@ -276,6 +308,9 @@ def __next__(self) -> float:
276308
"real",
277309
"double precision",
278310
],
311+
"uuid": [
312+
"$uuid"
313+
],
279314
},
280315
db.Presto: {
281316
"int": [
@@ -295,6 +330,10 @@ def __next__(self) -> float:
295330
"decimal(10,2)",
296331
"decimal(30,6)",
297332
],
333+
"uuid": [
334+
"$uuid"
335+
],
336+
298337
},
299338
}
300339

tests/test_diff_tables.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,6 @@ def setUp(self):
263263
queries.append(f"INSERT INTO a VALUES ('{self.new_uuid}', 'This one is different')")
264264

265265
# TODO test unexpected values?
266-
# queries.append(f"INSERT INTO a VALUES ('unexpected', '<-- this bad value should not break us')")
267266

268267
for query in queries:
269268
self.connection.query(query, None)
@@ -276,6 +275,10 @@ def test_string_keys(self):
276275
diff = list(differ.diff_tables(self.a, self.b))
277276
self.assertEqual(diff, [("-", (str(self.new_uuid), "This one is different"))])
278277

278+
self.connection.query(f"INSERT INTO a VALUES ('unexpected', '<-- this bad value should not break us')", None)
279+
280+
self.assertRaises(ValueError, differ.diff_tables, self.a, self.b)
281+
279282

280283
class TestTableSegment(TestWithConnection):
281284
def setUp(self) -> None:

0 commit comments

Comments
 (0)