Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 33438b4

Browse files
committed
Added tracking
1 parent 97c5e6a commit 33438b4

File tree

5 files changed

+193
-32
lines changed

5 files changed

+193
-32
lines changed

data_diff/__main__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import logging
66
from itertools import islice
77

8+
from data_diff.tracking import disable_tracking
9+
810
from .utils import remove_password_from_url, safezip, match_like
911

1012
from .diff_tables import (
@@ -81,6 +83,7 @@ def _get_schema(pair):
8183
@click.option("--json", "json_output", is_flag=True, help="Print JSONL output for machine readability")
8284
@click.option("-v", "--verbose", is_flag=True, help="Print extra info")
8385
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
86+
@click.option("--no-tracking", is_flag=True, help="Disable anonymous tracking.")
8487
@click.option(
8588
"--case-sensitive",
8689
is_flag=True,
@@ -128,6 +131,7 @@ def _main(
128131
debug,
129132
verbose,
130133
interactive,
134+
no_tracking,
131135
threads,
132136
case_sensitive,
133137
json_output,
@@ -137,6 +141,9 @@ def _main(
137141
__conf__=None,
138142
):
139143

144+
if no_tracking:
145+
disable_tracking()
146+
140147
if interactive:
141148
debug = True
142149

data_diff/diff_tables.py

Lines changed: 68 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212

1313
from runtype import dataclass
1414

15+
from .tracking import create_end_event_json, create_start_event_json, send_event_json, is_tracking_enabled
16+
17+
1518
from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, TableName, Time, Value
1619
from .utils import CaseAwareMapping, CaseInsensitiveDict, safezip, split_space, CaseSensitiveDict, ArithString
1720
from .databases.base import Database
@@ -257,6 +260,11 @@ def query_key_range(self) -> Tuple[int, int]:
257260
def is_bounded(self):
258261
return self.min_key is not None and self.max_key is not None
259262

263+
def approximate_size(self):
    """Return the width of this segment's key range, i.e. max_key - min_key.

    Raises:
        RuntimeError: if the segment reports that it is not bounded
            (``self.is_bounded`` is falsy).
    """
    if self.is_bounded:
        return self.max_key - self.min_key
    raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.")
267+
260268

261269
def diff_sets(a: set, b: set) -> Iterator:
262270
s1 = set(a)
@@ -322,45 +330,72 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
322330
if self.bisection_factor < 2:
323331
raise ValueError("Must have at least two segments per iteration (i.e. bisection_factor >= 2)")
324332

325-
# Query and validate schema
326-
table1, table2 = self._threaded_call("with_schema", [table1, table2])
327-
self._validate_and_adjust_columns(table1, table2)
333+
if is_tracking_enabled():
334+
options = dict(self)
335+
event_json = create_start_event_json(options)
336+
send_event_json(event_json)
328337

329-
key_type = table1._schema[table1.key_column]
330-
key_type2 = table2._schema[table2.key_column]
331-
if not isinstance(key_type, IKey):
332-
raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
333-
if not isinstance(key_type2, IKey):
334-
raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
335-
assert key_type.python_type is key_type2.python_type
336-
337-
# Query min/max values
338-
key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
338+
self.stats["diff_count"] = 0
339+
start = time.time()
340+
try:
339341

340-
# Start with the first completed value, so we don't waste time waiting
341-
min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
342+
# Query and validate schema
343+
table1, table2 = self._threaded_call("with_schema", [table1, table2])
344+
self._validate_and_adjust_columns(table1, table2)
342345

343-
table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
346+
key_type = table1._schema[table1.key_column]
347+
key_type2 = table2._schema[table2.key_column]
348+
if not isinstance(key_type, IKey):
349+
raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
350+
if not isinstance(key_type2, IKey):
351+
raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
352+
assert key_type.python_type is key_type2.python_type
344353

345-
logger.info(
346-
f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
347-
f"key-range: {table1.min_key}..{table2.max_key}, "
348-
f"size: {table2.max_key-table1.min_key}"
349-
)
354+
# Query min/max values
355+
key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
350356

351-
# Bisect (split) the table into segments, and diff them recursively.
352-
yield from self._bisect_and_diff_tables(table1, table2)
357+
# Start with the first completed value, so we don't waste time waiting
358+
min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
353359

354-
# Now we check for the second min-max, to diff the portions we "missed".
355-
min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
360+
table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
356361

357-
if min_key2 < min_key1:
358-
pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
359-
yield from self._bisect_and_diff_tables(*pre_tables)
362+
logger.info(
363+
f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
364+
f"key-range: {table1.min_key}..{table2.max_key}, "
365+
f"size: {table1.approximate_size()}"
366+
)
360367

361-
if max_key2 > max_key1:
362-
post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
363-
yield from self._bisect_and_diff_tables(*post_tables)
368+
# Bisect (split) the table into segments, and diff them recursively.
369+
yield from self._bisect_and_diff_tables(table1, table2)
370+
371+
# Now we check for the second min-max, to diff the portions we "missed".
372+
min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
373+
374+
if min_key2 < min_key1:
375+
pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
376+
yield from self._bisect_and_diff_tables(*pre_tables)
377+
378+
if max_key2 > max_key1:
379+
post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
380+
yield from self._bisect_and_diff_tables(*post_tables)
381+
382+
error = None
383+
except BaseException as e: # Catch KeyboardInterrupt too
384+
error = e
385+
finally:
386+
if is_tracking_enabled():
387+
runtime = time.time() - start
388+
table1_count = self.stats.get("table1_count")
389+
table2_count = self.stats.get("table2_count")
390+
diff_count = self.stats.get("diff_count")
391+
err_message = str(error)[:20] # Truncate possibly sensitive information.
392+
event_json = create_end_event_json(
393+
error is None, runtime, table1.database.name, table2.database.name, table1_count, table2_count, diff_count, err_message
394+
)
395+
send_event_json(event_json)
396+
397+
if error:
398+
raise error
364399

365400
def _parse_key_range_result(self, key_type, key_range):
366401
mn, mx = key_range
@@ -438,6 +473,8 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
438473
self.stats["table1_count"] = len(rows1)
439474
self.stats["table2_count"] = len(rows2)
440475

476+
self.stats["diff_count"] += len(diff)
477+
441478
logger.info(". " * level + f"Diff found {len(diff)} different rows.")
442479
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
443480
yield from diff

data_diff/tracking.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#
2+
# This module contains all the functionality related to the anonymous tracking of data-diff use.
3+
#
4+
5+
import toml
6+
import logging
7+
import os
8+
import json
9+
import platform
10+
from time import time
11+
from typing import Any, Dict, Optional
12+
import urllib.request
13+
from uuid import uuid4
14+
15+
# Endpoint that send_event_json() posts tracking events to.
TRACK_URL = "https://api.perfalytics.com/track"
16+
# Event names for the start and end of a diff run.
START_EVENT = "os_diff_run_start"
17+
END_EVENT = "os_diff_run_end"
18+
# Token included in the "properties" of every event payload.
TOKEN = "ccb8c3a6-3b6f-445c-ad67-994efa7bd020"
19+
20+
# Per-user TOML profile that _load_profile() reads and, when needed, writes.
DEFAULT_PROFILE = os.path.expanduser("~/.datadiff.toml")
21+
22+
23+
def _load_profile():
    """Load the user's profile, making sure it contains an ``anonymous_id``.

    Reads the TOML file at ``DEFAULT_PROFILE``; a missing file is treated as
    an empty profile. If the profile has no ``anonymous_id``, a fresh UUID4
    string is generated and the profile is written back.

    Returns:
        The profile mapping, always containing the key ``"anonymous_id"``.
    """
    try:
        with open(DEFAULT_PROFILE) as f:
            conf = toml.load(f)
    except FileNotFoundError:
        conf = {}

    if "anonymous_id" not in conf:
        conf["anonymous_id"] = str(uuid4())
        try:
            with open(DEFAULT_PROFILE, "w") as f:
                toml.dump(conf, f)
        except OSError as e:
            # Tracking is best-effort: an unwritable profile (read-only home,
            # bad permissions) must not abort the caller. The generated id is
            # still returned and used for this process.
            logging.debug(f"Failed to save profile to {DEFAULT_PROFILE}: {e}")
    return conf
35+
36+
37+
# Module-wide tracking switch: cleared by disable_tracking(), read by
# is_tracking_enabled() and checked by send_event_json().
g_tracking_enabled = True
38+
# Cached anonymous id; stays None until get_anonymous_id() loads it.
g_anonymous_id = None
39+
40+
41+
def disable_tracking():
    """Switch off tracking by clearing the module-level ``g_tracking_enabled`` flag."""
    global g_tracking_enabled
    g_tracking_enabled = False
44+
45+
46+
def is_tracking_enabled():
    """Return the current value of the module-level ``g_tracking_enabled`` flag."""
    return g_tracking_enabled
48+
49+
50+
def get_anonymous_id():
    """Return the anonymous id, loading it from the profile on first use.

    The value is cached in the module-level ``g_anonymous_id``, so
    ``_load_profile()`` is only called while that cache is still ``None``.
    """
    global g_anonymous_id
    if g_anonymous_id is not None:
        return g_anonymous_id
    g_anonymous_id = _load_profile()["anonymous_id"]
    return g_anonymous_id
56+
57+
58+
def create_start_event_json(diff_options: Dict[str, Any]):
    """Build the payload for the "diff run started" tracking event.

    Args:
        diff_options: options of the diff run, embedded as-is under
            ``properties["diff_options"]``.

    Returns:
        A dict with an ``"event"`` name and a ``"properties"`` mapping holding
        the anonymous id, token, current time, OS and Python details, and the
        given options.
    """
    python_version = f"{platform.python_version()}/{platform.python_implementation()}"
    properties = {
        "distinct_id": get_anonymous_id(),
        "token": TOKEN,
        "time": time(),
        "os_type": os.name,
        "os_version": platform.platform(),
        "python_version": python_version,
        "diff_options": diff_options,
    }
    return {"event": "os_diff_run_start", "properties": properties}
71+
72+
73+
def create_end_event_json(
    is_success: bool,
    runtime_seconds: float,
    db1: str,
    db2: str,
    table1_count: int,
    table2_count: int,
    diff_count: int,
    error: Optional[str],
):
    """Build the payload for the "diff run ended" tracking event.

    Args:
        is_success: stored under ``"is_success"``.
        runtime_seconds: stored under ``"runtime_seconds"``.
        db1: stored under ``"data_source_1_type"``.
        db2: stored under ``"data_source_2_type"``.
        table1_count: stored under ``"table_1_rows_cnt"``.
        table2_count: stored under ``"table_2_rows_cnt"``.
        diff_count: stored under ``"diff_rows_cnt"``.
        error: stored under ``"error_message"``.

    Returns:
        A dict with an ``"event"`` name and a ``"properties"`` mapping that
        also carries the anonymous id, token and current time.
    """
    properties = {
        "distinct_id": get_anonymous_id(),
        "token": TOKEN,
        "time": time(),
    }
    properties["is_success"] = is_success
    properties["runtime_seconds"] = runtime_seconds
    properties["data_source_1_type"] = db1
    properties["data_source_2_type"] = db2
    properties["table_1_rows_cnt"] = table1_count
    properties["table_2_rows_cnt"] = table2_count
    properties["diff_rows_cnt"] = diff_count
    properties["error_message"] = error
    return {"event": "os_diff_run_end", "properties": properties}
99+
100+
101+
def send_event_json(event_json, timeout=8.0):
    """POST a tracking event to ``TRACK_URL`` as JSON, on a best-effort basis.

    Any failure while serializing or sending the event is logged at debug
    level and swallowed, so tracking cannot break or stall the caller.

    Args:
        event_json: the event payload; it is serialized with ``json.dumps``.
        timeout: seconds to wait on the network before giving up, so an
            unresponsive endpoint cannot hang the program.

    Raises:
        RuntimeError: if tracking has been disabled. Callers are expected to
            check ``is_tracking_enabled()`` first.
    """
    if not g_tracking_enabled:
        raise RuntimeError("Won't send; tracking is disabled!")

    headers = {"Content-Type": "application/json"}
    try:
        # Serialize inside the try: a payload that is not JSON-serializable
        # is a tracking failure, not a reason to abort the caller.
        data = json.dumps(event_json).encode()
        req = urllib.request.Request(TRACK_URL, data=data, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as f:
            res = f.read()
            if f.code != 200:
                raise RuntimeError(res)
    except Exception as e:
        logging.debug(f"Failed to post to freshpaint: {e}")

tests/common.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55
import random
66

77
from data_diff import databases as db
8+
from data_diff import tracking
89
import logging
910
import subprocess
1011

12+
tracking.disable_tracking()
13+
1114
# We write 'or None' because Github sometimes creates empty env vars for secrets
1215
TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
1316
TEST_POSTGRESQL_CONN_STRING: str = "postgresql://postgres:Password1@localhost/postgres"

tests/test_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
def run_datadiff_cli(*args):
1414
try:
15-
stdout = subprocess.check_output([sys.executable, "-m", "data_diff"] + list(args), stderr=subprocess.PIPE)
15+
stdout = subprocess.check_output([sys.executable, "-m", "data_diff", '--no-tracking'] + list(args), stderr=subprocess.PIPE)
1616
except subprocess.CalledProcessError as e:
1717
logging.error(e.stderr)
1818
raise

0 commit comments

Comments
 (0)