Refactor _parse_type()

erezsh · erezsh · commit e3fda003560d · 2022-06-28T09:20:28.000+02:00
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -1,13 +1,22 @@
 import math
 import sys
 import logging
-from typing import Dict, Tuple, Optional, Sequence
+from typing import Dict, Tuple, Optional, Sequence, Type
 from functools import lru_cache, wraps
 from concurrent.futures import ThreadPoolExecutor
 import threading
 from abc import abstractmethod
 
-from .database_types import AbstractDatabase, ColType, Integer, Decimal, Float, UnknownColType
+from .database_types import (
+    AbstractDatabase,
+    ColType,
+    Integer,
+    Decimal,
+    Float,
+    PrecisionType,
+    TemporalType,
+    UnknownColType,
+)
 from data_diff.sql import DbPath, SqlOrStr, Compiler, Explain, Select
 
 logger = logging.getLogger("database")
@@ -62,7 +71,7 @@ class Database(AbstractDatabase):
     Instanciated using :meth:`~data_diff.connect_to_uri`
     """
 
-    DATETIME_TYPES: Dict[str, type] = {}
+    TYPE_CLASSES: Dict[str, type] = {}
     default_schema: str = None
 
     @property
@@ -109,6 +118,9 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
         # See: https://en.wikipedia.org/wiki/Single-precision_floating-point_format
         return math.floor(math.log(2**p, 10))
 
+    def _parse_type_repr(self, type_repr: str) -> Optional[Type[ColType]]:
+        return self.TYPE_CLASSES.get(type_repr)
+
     def _parse_type(
         self,
         col_name: str,
@@ -119,36 +131,35 @@ def _parse_type(
     ) -> ColType:
         """ """
 
-        cls = self.DATETIME_TYPES.get(type_repr)
-        if cls:
+        cls = self._parse_type_repr(type_repr)
+        if not cls:
+            return UnknownColType(type_repr)
+
+        if issubclass(cls, TemporalType):
             return cls(
                 precision=datetime_precision if datetime_precision is not None else DEFAULT_DATETIME_PRECISION,
                 rounds=self.ROUNDS_ON_PREC_LOSS,
             )
 
-        cls = self.NUMERIC_TYPES.get(type_repr)
-        if cls:
-            if issubclass(cls, Integer):
-                # Some DBs have a constant numeric_scale, so they don't report it.
-                # We fill in the constant, so we need to ignore it for integers.
-                return cls(precision=0)
-
-            elif issubclass(cls, Decimal):
-                if numeric_scale is None:
-                    raise ValueError(
-                        f"{self.name}: Unexpected numeric_scale is NULL, for column {col_name} of type {type_repr}."
-                    )
-                return cls(precision=numeric_scale)
-
-            assert issubclass(cls, Float)
+        elif issubclass(cls, Integer):
+            return cls()
+
+        elif issubclass(cls, Decimal):
+            if numeric_scale is None:
+                raise ValueError(
+                    f"{self.name}: Unexpected numeric_scale is NULL, for column {col_name} of type {type_repr}."
+                )
+            return cls(precision=numeric_scale)
+
+        elif issubclass(cls, Float):
             # assert numeric_scale is None
             return cls(
                 precision=self._convert_db_precision_to_digits(
                     numeric_precision if numeric_precision is not None else DEFAULT_NUMERIC_PRECISION
                 )
             )
 
-        return UnknownColType(type_repr)
+        raise TypeError(f"Parsing {type_repr} returned an unknown type '{cls}'.")
 
     def select_table_schema(self, path: DbPath) -> str:
         schema, table = self._normalize_table_path(path)
diff --git a/data_diff/databases/bigquery.py b/data_diff/databases/bigquery.py
@@ -11,11 +11,11 @@ def import_bigquery():
 
 
 class BigQuery(Database):
-    DATETIME_TYPES = {
+    TYPE_CLASSES = {
+        # Dates
         "TIMESTAMP": Timestamp,
         "DATETIME": Datetime,
-    }
-    NUMERIC_TYPES = {
+        # Numbers
         "INT64": Integer,
         "INT32": Integer,
         "NUMERIC": Decimal,
diff --git a/data_diff/databases/database_types.py b/data_diff/databases/database_types.py
@@ -53,6 +53,14 @@ class Decimal(FractionalType):
     pass
 
 
+class StringType(ColType):
+    pass
+
+
+class UUID(StringType):
+    pass
+
+
 @dataclass
 class Integer(NumericType):
     def __post_init__(self):
diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py
@@ -11,11 +11,11 @@ def import_mysql():
 
 
 class MySQL(ThreadedDatabase):
-    DATETIME_TYPES = {
+    TYPE_CLASSES = {
+        # Dates
         "datetime": Datetime,
         "timestamp": Timestamp,
-    }
-    NUMERIC_TYPES = {
+        # Numbers
         "double": Float,
         "float": Float,
         "decimal": Decimal,
diff --git a/data_diff/databases/oracle.py b/data_diff/databases/oracle.py
@@ -13,6 +13,10 @@ def import_oracle():
 
 
 class Oracle(ThreadedDatabase):
+    TYPE_CLASSES: Dict[str, type] = {
+        "NUMBER": Decimal,
+        "FLOAT": Float,
+    }
     ROUNDS_ON_PREC_LOSS = True
 
     def __init__(self, host, port, user, password, *, database, thread_count, **kw):
@@ -72,7 +76,6 @@ def _parse_type(
         numeric_precision: int = None,
         numeric_scale: int = None,
     ) -> ColType:
-        """ """
         regexps = {
             r"TIMESTAMP\((\d)\) WITH LOCAL TIME ZONE": Timestamp,
             r"TIMESTAMP\((\d)\) WITH TIME ZONE": TimestampTZ,
@@ -86,20 +89,4 @@ def _parse_type(
                     rounds=self.ROUNDS_ON_PREC_LOSS,
                 )
 
-        n_cls = {
-            "NUMBER": Decimal,
-            "FLOAT": Float,
-        }.get(type_repr, None)
-        if n_cls:
-            if issubclass(n_cls, Decimal):
-                assert numeric_scale is not None, (type_repr, numeric_precision, numeric_scale)
-                return n_cls(precision=numeric_scale)
-
-            assert issubclass(n_cls, Float)
-            return n_cls(
-                precision=self._convert_db_precision_to_digits(
-                    numeric_precision if numeric_precision is not None else DEFAULT_NUMERIC_PRECISION
-                )
-            )
-
-        return UnknownColType(type_repr)
+        return super()._parse_type(col_name=col_name, type_repr=type_repr, datetime_precision=datetime_precision, numeric_precision=numeric_precision, numeric_scale=numeric_scale)
diff --git a/data_diff/databases/postgresql.py b/data_diff/databases/postgresql.py
@@ -13,13 +13,12 @@ def import_postgresql():
 
 
 class PostgreSQL(ThreadedDatabase):
-    DATETIME_TYPES = {
+    TYPE_CLASSES = {
+        # Timestamps
         "timestamp with time zone": TimestampTZ,
         "timestamp without time zone": Timestamp,
         "timestamp": Timestamp,
-        # "datetime": Datetime,
-    }
-    NUMERIC_TYPES = {
+        # Numbers
         "double precision": Float,
         "real": Float,
         "decimal": Decimal,
diff --git a/data_diff/databases/presto.py b/data_diff/databases/presto.py
@@ -20,13 +20,12 @@ def import_presto():
 
 class Presto(Database):
     default_schema = "public"
-    DATETIME_TYPES = {
+    TYPE_CLASSES = {
+        # Timestamps
         "timestamp with time zone": TimestampTZ,
         "timestamp without time zone": Timestamp,
         "timestamp": Timestamp,
-        # "datetime": Datetime,
-    }
-    NUMERIC_TYPES = {
+        # Numbers
         "integer": Integer,
         "real": Float,
         "double": Float,
@@ -104,17 +103,4 @@ def _parse_type(
                 prec, scale = map(int, m.groups())
                 return n_cls(scale)
 
-        n_cls = self.NUMERIC_TYPES.get(type_repr)
-        if n_cls:
-            if issubclass(n_cls, Integer):
-                assert numeric_precision is not None
-                return n_cls(0)
-
-            assert issubclass(n_cls, Float)
-            return n_cls(
-                precision=self._convert_db_precision_to_digits(
-                    numeric_precision if numeric_precision is not None else DEFAULT_NUMERIC_PRECISION
-                )
-            )
-
-        return UnknownColType(type_repr)
+        return super()._parse_type(col_name, type_repr, numeric_precision=numeric_precision)
diff --git a/data_diff/databases/redshift.py b/data_diff/databases/redshift.py
@@ -3,8 +3,8 @@
 
 
 class Redshift(PostgreSQL):
-    NUMERIC_TYPES = {
-        **PostgreSQL.NUMERIC_TYPES,
+    TYPE_CLASSES = {
+        **PostgreSQL.TYPE_CLASSES,
         "double": Float,
         "real": Float,
     }
diff --git a/data_diff/databases/snowflake.py b/data_diff/databases/snowflake.py
@@ -12,12 +12,12 @@ def import_snowflake():
 
 
 class Snowflake(Database):
-    DATETIME_TYPES = {
+    TYPE_CLASSES = {
+        # Timestamps
         "TIMESTAMP_NTZ": Timestamp,
         "TIMESTAMP_LTZ": Timestamp,
         "TIMESTAMP_TZ": TimestampTZ,
-    }
-    NUMERIC_TYPES = {
+        # Numbers
         "NUMBER": Decimal,
         "FLOAT": Float,
     }

| Original file line number | Diff line number | Diff line change |
|---|---|---|
|  |  | `@@ -3,8 +3,8 @@` |
| 3 | 3 |  |
| 4 | 4 |  |
| 5 | 5 | `class Redshift(PostgreSQL):` |
| 6 |  | `-    NUMERIC_TYPES = {` |
| 7 |  | `-        **PostgreSQL.NUMERIC_TYPES,` |
|  | 6 | `+    TYPE_CLASSES = {` |
|  | 7 | `+        **PostgreSQL.TYPE_CLASSES,` |
| 8 | 8 | `        "double": Float,` |
| 9 | 9 | `        "real": Float,` |
| 10 | 10 | `    }` |