Added tracking

erezsh · erezsh · commit 33438b438b15 · 2022-08-22T18:26:35.000+02:00
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -5,6 +5,8 @@
 import logging
 from itertools import islice
 
+from data_diff.tracking import disable_tracking
+
 from .utils import remove_password_from_url, safezip, match_like
 
 from .diff_tables import (
@@ -81,6 +83,7 @@ def _get_schema(pair):
 @click.option("--json", "json_output", is_flag=True, help="Print JSONL output for machine readability")
 @click.option("-v", "--verbose", is_flag=True, help="Print extra info")
 @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
+@click.option("--no-tracking", is_flag=True, help="Disable anonymous tracking.")
 @click.option(
     "--case-sensitive",
     is_flag=True,
@@ -128,6 +131,7 @@ def _main(
     debug,
     verbose,
     interactive,
+    no_tracking,
     threads,
     case_sensitive,
     json_output,
@@ -137,6 +141,9 @@ def _main(
     __conf__=None,
 ):
 
+    if no_tracking:
+        disable_tracking()
+
     if interactive:
         debug = True
 
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -12,6 +12,9 @@
 
 from runtype import dataclass
 
+from .tracking import create_end_event_json, create_start_event_json, send_event_json, is_tracking_enabled
+
+
 from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, TableName, Time, Value
 from .utils import CaseAwareMapping, CaseInsensitiveDict, safezip, split_space, CaseSensitiveDict, ArithString
 from .databases.base import Database
@@ -257,6 +260,11 @@ def query_key_range(self) -> Tuple[int, int]:
     def is_bounded(self):
         return self.min_key is not None and self.max_key is not None
 
+    def approximate_size(self):
+        if not self.is_bounded:
+            raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.")
+        return self.max_key - self.min_key
+
 
 def diff_sets(a: set, b: set) -> Iterator:
     s1 = set(a)
@@ -322,45 +330,72 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
         if self.bisection_factor < 2:
             raise ValueError("Must have at least two segments per iteration (i.e. bisection_factor >= 2)")
 
-        # Query and validate schema
-        table1, table2 = self._threaded_call("with_schema", [table1, table2])
-        self._validate_and_adjust_columns(table1, table2)
+        if is_tracking_enabled():
+            options = dict(self)
+            event_json = create_start_event_json(options)
+            send_event_json(event_json)
 
-        key_type = table1._schema[table1.key_column]
-        key_type2 = table2._schema[table2.key_column]
-        if not isinstance(key_type, IKey):
-            raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
-        if not isinstance(key_type2, IKey):
-            raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
-        assert key_type.python_type is key_type2.python_type
-
-        # Query min/max values
-        key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
+        self.stats["diff_count"] = 0
+        start = time.time()
+        try:
 
-        # Start with the first completed value, so we don't waste time waiting
-        min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
+            # Query and validate schema
+            table1, table2 = self._threaded_call("with_schema", [table1, table2])
+            self._validate_and_adjust_columns(table1, table2)
 
-        table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
+            key_type = table1._schema[table1.key_column]
+            key_type2 = table2._schema[table2.key_column]
+            if not isinstance(key_type, IKey):
+                raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
+            if not isinstance(key_type2, IKey):
+                raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
+            assert key_type.python_type is key_type2.python_type
 
-        logger.info(
-            f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
-            f"key-range: {table1.min_key}..{table2.max_key}, "
-            f"size: {table2.max_key-table1.min_key}"
-        )
+            # Query min/max values
+            key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
 
-        # Bisect (split) the table into segments, and diff them recursively.
-        yield from self._bisect_and_diff_tables(table1, table2)
+            # Start with the first completed value, so we don't waste time waiting
+            min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
 
-        # Now we check for the second min-max, to diff the portions we "missed".
-        min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
+            table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
 
-        if min_key2 < min_key1:
-            pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
-            yield from self._bisect_and_diff_tables(*pre_tables)
+            logger.info(
+                f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
+                f"key-range: {table1.min_key}..{table2.max_key}, "
+                f"size: {table1.approximate_size()}"
+            )
 
-        if max_key2 > max_key1:
-            post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
-            yield from self._bisect_and_diff_tables(*post_tables)
+            # Bisect (split) the table into segments, and diff them recursively.
+            yield from self._bisect_and_diff_tables(table1, table2)
+
+            # Now we check for the second min-max, to diff the portions we "missed".
+            min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
+
+            if min_key2 < min_key1:
+                pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
+                yield from self._bisect_and_diff_tables(*pre_tables)
+
+            if max_key2 > max_key1:
+                post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
+                yield from self._bisect_and_diff_tables(*post_tables)
+
+            error = None
+        except BaseException as e:  # Catch KeyboardInterrupt too
+            error = e
+        finally:
+            if is_tracking_enabled():
+                runtime = time.time() - start
+                table1_count = self.stats.get("table1_count")
+                table2_count = self.stats.get("table2_count")
+                diff_count = self.stats.get("diff_count")
+                # Report no message on success (str(None) would send the text "None").
+                # Truncate possibly sensitive information.
+                err_message = None if error is None else str(error)[:20]
+                event_json = create_end_event_json(
+                    error is None, runtime, table1.database.name, table2.database.name, table1_count, table2_count, diff_count, err_message
+                )
+                send_event_json(event_json)
+
+            # Compare against None: an exception object may define a falsy __bool__/__len__.
+            if error is not None:
+                raise error
 
     def _parse_key_range_result(self, key_type, key_range):
         mn, mx = key_range
@@ -438,6 +473,8 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
                 self.stats["table1_count"] = len(rows1)
                 self.stats["table2_count"] = len(rows2)
 
+            self.stats["diff_count"] += len(diff)
+
             logger.info(". " * level + f"Diff found {len(diff)} different rows.")
             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
             yield from diff
diff --git a/data_diff/tracking.py b/data_diff/tracking.py
@@ -0,0 +1,114 @@
+#
+# This module contains all the functionality related to the anonymous tracking of data-diff use.
+#
+
+import toml
+import logging
+import os
+import json
+import platform
+from time import time
+from typing import Any, Dict, Optional
+import urllib.request
+from uuid import uuid4
+
+# Endpoint that tracking events are POSTed to.
+TRACK_URL = "https://api.perfalytics.com/track"
+# Event names reported at the start and at the end of a diff run.
+START_EVENT = "os_diff_run_start"
+END_EVENT = "os_diff_run_end"
+# Token included in the properties of every event.
+TOKEN = "ccb8c3a6-3b6f-445c-ad67-994efa7bd020"
+
+# Per-user TOML profile; stores the generated "anonymous_id".
+DEFAULT_PROFILE = os.path.expanduser("~/.datadiff.toml")
+
+
+def _load_profile():
+    try:
+        with open(DEFAULT_PROFILE) as f:
+            conf = toml.load(f)
+    except FileNotFoundError as e:
+        conf = {}
+
+    if "anonymous_id" not in conf:
+        conf["anonymous_id"] = str(uuid4())
+        with open(DEFAULT_PROFILE, "w") as f:
+            toml.dump(conf, f)
+    return conf
+
+
+# Module-level switch; tracking is on unless disable_tracking() is called.
+g_tracking_enabled = True
+# Cached anonymous id; filled in by get_anonymous_id() on first use.
+g_anonymous_id = None
+
+
+def disable_tracking() -> None:
+    """Switch tracking off by clearing the module-level g_tracking_enabled flag."""
+    global g_tracking_enabled
+    g_tracking_enabled = False
+
+
+def is_tracking_enabled() -> bool:
+    """Return the current value of the module-level g_tracking_enabled flag."""
+    return g_tracking_enabled
+
+
+def get_anonymous_id():
+    global g_anonymous_id
+    if g_anonymous_id is None:
+        profile = _load_profile()
+        g_anonymous_id = profile["anonymous_id"]
+    return g_anonymous_id
+
+
+def create_start_event_json(diff_options: Dict[str, Any]):
+    return {
+        "event": "os_diff_run_start",
+        "properties": {
+            "distinct_id": get_anonymous_id(),
+            "token": TOKEN,
+            "time": time(),
+            "os_type": os.name,
+            "os_version": platform.platform(),
+            "python_version": f"{platform.python_version()}/{platform.python_implementation()}",
+            "diff_options": diff_options,
+        },
+    }
+
+
+def create_end_event_json(
+    is_success: bool,
+    runtime_seconds: float,
+    db1: str,
+    db2: str,
+    table1_count: int,
+    table2_count: int,
+    diff_count: int,
+    error: Optional[str],
+):
+    return {
+        "event": "os_diff_run_end",
+        "properties": {
+            "distinct_id": get_anonymous_id(),
+            "token": TOKEN,
+            "time": time(),
+            "is_success": is_success,
+            "runtime_seconds": runtime_seconds,
+            "data_source_1_type": db1,
+            "data_source_2_type": db2,
+            "table_1_rows_cnt": table1_count,
+            "table_2_rows_cnt": table2_count,
+            "diff_rows_cnt": diff_count,
+            "error_message": error,
+        },
+    }
+
+
+def send_event_json(event_json):
+    if not g_tracking_enabled:
+        raise RuntimeError("Won't send; tracking is disabled!")
+
+    headers = {"Content-Type": "application/json"}
+    data = json.dumps(event_json).encode()
+    try:
+        req = urllib.request.Request(TRACK_URL, data=data, headers=headers)
+        with urllib.request.urlopen(req) as f:
+            res = f.read()
+            if f.code != 200:
+                raise RuntimeError(res)
+    except Exception as e:
+        logging.debug(f"Failed to post to freshpaint: {e}")
diff --git a/tests/common.py b/tests/common.py
@@ -5,9 +5,12 @@
 import random
 
 from data_diff import databases as db
+from data_diff import tracking
 import logging
 import subprocess
 
+tracking.disable_tracking()
+
 # We write 'or None' because Github sometimes creates empty env vars for secrets
 TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
 TEST_POSTGRESQL_CONN_STRING: str = "postgresql://postgres:Password1@localhost/postgres"
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -12,7 +12,7 @@
 
 def run_datadiff_cli(*args):
     try:
-        stdout = subprocess.check_output([sys.executable, "-m", "data_diff"] + list(args), stderr=subprocess.PIPE)
+        stdout = subprocess.check_output([sys.executable, "-m", "data_diff", '--no-tracking'] + list(args), stderr=subprocess.PIPE)
     except subprocess.CalledProcessError as e:
         logging.error(e.stderr)
         raise