Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit f37d903

Browse files
committed
Merge remote-tracking branch 'upstream/master' into feat/stats-for-dbt
* upstream/master: v0.7.14 add debugging detail for VSCode format fix tests add both logical & raw type to schema add dataset schemas to --json output
2 parents 72b1ea1 + eb22d3e commit f37d903

File tree

7 files changed

+234
-21
lines changed

7 files changed

+234
-21
lines changed

CONTRIBUTING.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,43 @@ You can adjust how many rows we benchmark with by passing `N_SAMPLES` to `dev/be
162162
```shell-session
163163
$ N_SAMPLES=100000000 dev/benchmark.sh # 100m which is our canonical target
164164
```
165+
166+
## VSCode Setup
167+
To debug the unit tests in VSCode, add the following files to a `.vscode` directory in the root of the repo:
168+
169+
`launch.json`
170+
```
171+
{
172+
"version": "0.1.0",
173+
"configurations": [
174+
{
175+
"name": "Debug Unit Test",
176+
"type": "python",
177+
"request": "test",
178+
"justMyCode": true,
179+
}
180+
]
181+
}
182+
```
183+
184+
`settings.json`
185+
```
186+
{
187+
"python.testing.unittestArgs": [
188+
"-v",
189+
"-s",
190+
"",
191+
"-p",
192+
"test_*.py"
193+
],
194+
"python.testing.pytestEnabled": false,
195+
"python.testing.unittestEnabled": true,
196+
}
197+
```
198+
You should now see the tests in the Test Explorer view:
199+
200+
![Unit tests listed in the VSCode Test Explorer view](/docs/debug_example.png)
201+
202+
This will allow you to run tests in the IDE, debug them, and hit breakpoints.
203+
204+
Note that some tests only pass when the Docker containers mentioned above are running.

data_diff/dbt.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,13 +312,23 @@ def _local_diff(diff_vars: TDiffVars, json_output: bool = False) -> None:
312312
)
313313
return
314314

315+
dataset1_columns = [
316+
(name, type_, table1.database.dialect.parse_type(table1.table_path, name, type_, *other))
317+
for (name, type_, *other) in table1_columns.values()
318+
]
319+
dataset2_columns = [
320+
(name, type_, table2.database.dialect.parse_type(table2.table_path, name, type_, *other))
321+
for (name, type_, *other) in table2_columns.values()
322+
]
315323
print(
316324
json.dumps(
317325
jsonify(
318326
diff,
319327
dbt_model=diff_vars.dbt_model,
328+
dataset1_columns=dataset1_columns,
329+
dataset2_columns=dataset2_columns,
320330
with_summary=True,
321-
with_columns={
331+
columns_diff={
322332
"added": columns_added,
323333
"removed": columns_removed,
324334
"changed": columns_type_changed,

data_diff/format.py

Lines changed: 77 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,23 @@
11
import collections
2-
from typing import Any, Optional, List, Dict, Tuple
2+
from enum import Enum
3+
from typing import Any, Optional, List, Dict, Tuple, Type
34

45
from runtype import dataclass
56
from data_diff.diff_tables import DiffResultWrapper
7+
from data_diff.sqeleton.abcs.database_types import (
8+
JSON,
9+
Boolean,
10+
ColType,
11+
Array,
12+
ColType_UUID,
13+
Date,
14+
FractionalType,
15+
NumericType,
16+
Struct,
17+
TemporalType,
18+
ColType_Alphanum,
19+
String_Alphanum,
20+
)
621

722

823
def jsonify_error(table1: List[str], table2: List[str], dbt_model: str, error: str) -> "FailedDiff":
@@ -15,11 +30,16 @@ def jsonify_error(table1: List[str], table2: List[str], dbt_model: str, error: s
1530
).json()
1631

1732

33+
Columns = List[Tuple[str, str, ColType]]
34+
35+
1836
def jsonify(
1937
diff: DiffResultWrapper,
2038
dbt_model: str,
39+
dataset1_columns: Columns,
40+
dataset2_columns: Columns,
41+
columns_diff: Dict[str, List[str]],
2142
with_summary: bool = False,
22-
with_columns: Optional[Dict[str, List[str]]] = None,
2343
stats_only: bool = False,
2444
) -> "JsonDiff":
2545
"""
@@ -64,16 +84,13 @@ def jsonify(
6484
if with_summary:
6585
summary = _jsonify_diff_summary(diff.get_stats_dict(is_dbt=True))
6686

67-
columns = None
68-
if with_columns:
69-
columns = _jsonify_columns_diff(with_columns, list(key_columns))
87+
columns = _jsonify_columns_diff(dataset1_columns, dataset2_columns, columns_diff, list(key_columns))
7088

7189
is_different = bool(
7290
t1_exclusive_rows
7391
or t2_exclusive_rows
7492
or diff_rows
75-
or with_columns
76-
and (with_columns["added"] or with_columns["removed"] or with_columns["changed"])
93+
or (columns_diff["added"] or columns_diff["removed"] or columns_diff["changed"])
7794
)
7895
return JsonDiff(
7996
status="success",
@@ -146,8 +163,44 @@ class ExclusiveColumns:
146163
dataset2: List[str]
147164

148165

166+
class ColumnKind(Enum):
    """Coarse category reported for a column in the JSON output.

    The value of each member is the string written as a column's ``kind``.
    """

    INTEGER = "integer"
    FLOAT = "float"
    STRING = "string"
    DATE = "date"
    TIME = "time"
    DATETIME = "datetime"
    BOOL = "boolean"
    # Fallback used when a column type matches no more specific category.
    UNSUPPORTED = "unsupported"
175+
176+
177+
# Ordered lookup table from raw column-type classes to the JSON column kind.
# `_map_kind` returns the first entry whose class matches via `isinstance`,
# so the order is significant and the generic `ColType` catch-all must stay last.
# NOTE(review): the specific-before-general ordering (e.g. `Date` ahead of
# `TemporalType`, `FractionalType` ahead of `NumericType`) presumably mirrors
# the subclass hierarchy in `database_types` - confirm before reordering.
KIND_MAPPING: List[Tuple[Type[ColType], ColumnKind]] = [
    (Boolean, ColumnKind.BOOL),
    (Date, ColumnKind.DATE),
    (TemporalType, ColumnKind.DATETIME),
    (FractionalType, ColumnKind.FLOAT),
    (NumericType, ColumnKind.INTEGER),
    (ColType_UUID, ColumnKind.STRING),
    (ColType_Alphanum, ColumnKind.STRING),
    (String_Alphanum, ColumnKind.STRING),
    (JSON, ColumnKind.STRING),
    (Array, ColumnKind.STRING),
    (Struct, ColumnKind.STRING),
    (ColType, ColumnKind.UNSUPPORTED),
]
191+
192+
193+
@dataclass
class Column:
    """One column of a dataset as reported in the JSON output."""

    name: str
    # Type string as supplied by the caller (the tests pass e.g. "NUMBER").
    type: str
    # String value of a `ColumnKind` member, e.g. "integer".
    kind: str
198+
199+
149200
@dataclass
150201
class JsonColumnsSummary:
202+
dataset1: List[Column]
203+
dataset2: List[Column]
151204
primaryKey: List[str]
152205
exclusive: ExclusiveColumns
153206
typeChanged: List[str]
@@ -187,7 +240,7 @@ class JsonDiff:
187240
summary: Optional[JsonDiffSummary]
188241
columns: Optional[JsonColumnsSummary]
189242

190-
version: str = "1.0.0"
243+
version: str = "1.1.0"
191244

192245

193246
def _group_rows(
@@ -270,12 +323,27 @@ def _jsonify_diff_summary(stats_dict: dict) -> JsonDiffSummary:
270323
)
271324

272325

273-
def _jsonify_columns_diff(columns_diff: Dict[str, List[str]], key_columns: List[str]) -> JsonColumnsSummary:
326+
def _jsonify_columns_diff(
    dataset1_columns: Columns, dataset2_columns: Columns, columns_diff: Dict[str, List[str]], key_columns: List[str]
) -> JsonColumnsSummary:
    """Build the ``columns`` section of the JSON diff output.

    Args:
        dataset1_columns: ``(name, raw type, parsed column type)`` triples for dataset 1.
        dataset2_columns: The same triples for dataset 2.
        columns_diff: Column names under the keys ``"added"``, ``"removed"`` and
            ``"changed"``; a missing key is treated as an empty list.
        key_columns: Names of the primary key columns.

    Returns:
        A ``JsonColumnsSummary`` listing both datasets' columns, the primary key,
        the columns exclusive to each dataset and the columns whose type changed.
    """

    def _describe(columns: Columns) -> List[Column]:
        # Collapse the parsed column type into its JSON kind string.
        return [Column(name=name, type=type_, kind=_map_kind(col_type).value) for (name, type_, col_type) in columns]

    return JsonColumnsSummary(
        dataset1=_describe(dataset1_columns),
        dataset2=_describe(dataset2_columns),
        primaryKey=key_columns,
        exclusive=ExclusiveColumns(
            # "added" is reported as exclusive to dataset 2, "removed" to dataset 1.
            dataset2=list(columns_diff.get("added", [])),
            dataset1=list(columns_diff.get("removed", [])),
        ),
        typeChanged=list(columns_diff.get("changed", [])),
    )
343+
344+
345+
def _map_kind(kind: ColType) -> ColumnKind:
    """Return the JSON column kind for a parsed column type.

    ``KIND_MAPPING`` is scanned in order and the first entry whose class matches
    ``kind`` via ``isinstance`` wins. ``ColumnKind.UNSUPPORTED`` is returned when
    no entry matches.
    """
    for raw_kind, json_kind in KIND_MAPPING:
        if isinstance(kind, raw_kind):
            return json_kind
    return ColumnKind.UNSUPPORTED

data_diff/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.13"
1+
__version__ = "0.7.14"

docs/debug_example.png

225 KB
Loading

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "data-diff"
3-
version = "0.7.13"
3+
version = "0.7.14"
44
description = "Command-line tool and Python library to efficiently diff rows across two different databases."
55
authors = ["Datafold <data-diff@datafold.com>"]
66
license = "MIT"

tests/test_format.py

Lines changed: 104 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import unittest
22
from data_diff.diff_tables import DiffResultWrapper, InfoTree, SegmentInfo, TableSegment
33
from data_diff.format import jsonify
4+
from data_diff.sqeleton.abcs.database_types import Integer
45
from data_diff.sqeleton.databases import Database
56

67

@@ -35,11 +36,28 @@ def test_jsonify_diff(self):
3536
diff=[],
3637
stats={},
3738
)
38-
json_diff = jsonify(diff, dbt_model="my_model")
39+
json_diff = jsonify(
40+
diff,
41+
dbt_model="my_model",
42+
dataset1_columns=[
43+
("id", "NUMBER", Integer()),
44+
("value", "NUMBER", Integer()),
45+
],
46+
dataset2_columns=[
47+
("id", "NUMBER", Integer()),
48+
("value", "NUMBER", Integer()),
49+
],
50+
columns_diff={
51+
"added": [],
52+
"removed": [],
53+
"typeChanged": [],
54+
},
55+
)
56+
3957
self.assertEqual(
4058
json_diff,
4159
{
42-
"version": "1.0.0",
60+
"version": "1.1.0",
4361
"status": "success",
4462
"result": "different",
4563
"model": "my_model",
@@ -57,8 +75,23 @@ def test_jsonify_diff(self):
5775
},
5876
],
5977
},
78+
"columns": {
79+
"dataset1": [
80+
{"name": "id", "type": "NUMBER", "kind": "integer"},
81+
{"name": "value", "type": "NUMBER", "kind": "integer"},
82+
],
83+
"dataset2": [
84+
{"name": "id", "type": "NUMBER", "kind": "integer"},
85+
{"name": "value", "type": "NUMBER", "kind": "integer"},
86+
],
87+
"primaryKey": ["id"],
88+
"exclusive": {
89+
"dataset1": [],
90+
"dataset2": [],
91+
},
92+
"typeChanged": [],
93+
},
6094
"summary": None,
61-
"columns": None,
6295
},
6396
)
6497

@@ -86,11 +119,27 @@ def test_jsonify_diff_no_difeference(self):
86119
diff=[],
87120
stats={},
88121
)
89-
json_diff = jsonify(diff, dbt_model="model")
122+
json_diff = jsonify(
123+
diff,
124+
dbt_model="model",
125+
dataset1_columns=[
126+
("id", "NUMBER", Integer()),
127+
("value", "NUMBER", Integer()),
128+
],
129+
dataset2_columns=[
130+
("id", "NUMBER", Integer()),
131+
("value", "NUMBER", Integer()),
132+
],
133+
columns_diff={
134+
"added": [],
135+
"removed": [],
136+
"changed": [],
137+
},
138+
)
90139
self.assertEqual(
91140
json_diff,
92141
{
93-
"version": "1.0.0",
142+
"version": "1.1.0",
94143
"status": "success",
95144
"result": "identical",
96145
"model": "model",
@@ -100,8 +149,23 @@ def test_jsonify_diff_no_difeference(self):
100149
"exclusive": {"dataset1": [], "dataset2": []},
101150
"diff": [],
102151
},
152+
"columns": {
153+
"primaryKey": ["id"],
154+
"dataset1": [
155+
{"name": "id", "type": "NUMBER", "kind": "integer"},
156+
{"name": "value", "type": "NUMBER", "kind": "integer"},
157+
],
158+
"dataset2": [
159+
{"name": "id", "type": "NUMBER", "kind": "integer"},
160+
{"name": "value", "type": "NUMBER", "kind": "integer"},
161+
],
162+
"exclusive": {
163+
"dataset1": [],
164+
"dataset2": [],
165+
},
166+
"typeChanged": [],
167+
},
103168
"summary": None,
104-
"columns": None,
105169
},
106170
)
107171

@@ -133,11 +197,27 @@ def test_jsonify_column_suffix_fix(self):
133197
diff=[],
134198
stats={},
135199
)
136-
json_diff = jsonify(diff, dbt_model="my_model")
200+
json_diff = jsonify(
201+
diff,
202+
dbt_model="my_model",
203+
dataset1_columns=[
204+
("id_a", "NUMBER", Integer()),
205+
("value_b", "NUMBER", Integer()),
206+
],
207+
dataset2_columns=[
208+
("id_a", "NUMBER", Integer()),
209+
("value_b", "NUMBER", Integer()),
210+
],
211+
columns_diff={
212+
"added": [],
213+
"removed": [],
214+
"typeChanged": [],
215+
},
216+
)
137217
self.assertEqual(
138218
json_diff,
139219
{
140-
"version": "1.0.0",
220+
"version": "1.1.0",
141221
"status": "success",
142222
"result": "different",
143223
"model": "my_model",
@@ -158,6 +238,21 @@ def test_jsonify_column_suffix_fix(self):
158238
],
159239
},
160240
"summary": None,
161-
"columns": None,
241+
"columns": {
242+
"dataset1": [
243+
{"name": "id_a", "type": "NUMBER", "kind": "integer"},
244+
{"name": "value_b", "type": "NUMBER", "kind": "integer"},
245+
],
246+
"dataset2": [
247+
{"name": "id_a", "type": "NUMBER", "kind": "integer"},
248+
{"name": "value_b", "type": "NUMBER", "kind": "integer"},
249+
],
250+
"primaryKey": ["id_a"],
251+
"exclusive": {
252+
"dataset1": [],
253+
"dataset2": [],
254+
},
255+
"typeChanged": [],
256+
},
162257
},
163258
)

0 commit comments

Comments
 (0)