@@ -131,6 +131,10 @@ def __post_init__(self):
class UnknownColType(ColType):
    """Column type that is only known by its textual name (``text``)."""

    text: str  # type name, interpolated into the warning emitted below

    def __post_init__(self):
        """Warn that this column type has no compatibility handling."""
        # ``Logger.warn`` is a deprecated alias of ``Logger.warning``.
        logger.warning(
            f"Column of type '{self.text}' has no compatibility handling. "
            "If encoding/formatting differs between databases, it may result in false positives."
        )
137+
134138
135139class AbstractDatabase (ABC ):
136140 @abstractmethod
@@ -173,16 +177,24 @@ def close(self):
173177 "Close connection(s) to the database instance. Querying will stop functioning."
174178 ...
175179
180+
@abstractmethod
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Creates an SQL expression that converts 'value' to a normalized timestamp.

    The returned expression must accept any SQL datetime/timestamp, and return a string.

    Date format: "YYYY-MM-DD HH:mm:SS.FFFFFF"

    Precision of dates should be rounded up/down according to coltype.rounds
    """
    ...
181192
182- - Dates are expected in the format:
183- "YYYY-MM-DD HH:mm:SS.FFFFFF"
193+ @abstractmethod
194+ def normalize_number (self , value : str , coltype : ColType ) -> str :
195+ """Creates an SQL expression, that converts 'value' to a normalized number.
184196
185- Rounded up/down according to coltype.rounds
197+ The returned expression must accept any SQL int/numeric/float, and return a string.
186198
187199 - Floats/Decimals are expected in the format
188200 "I.P"
@@ -191,14 +203,31 @@ def normalize_value_by_type(value: str, coltype: ColType) -> str:
191203 and must be at least one digit (0).
192204 P is the fractional digits, the amount of which is specified with
193205 coltype.precision. Trailing zeroes may be necessary.
206+ If P is 0, the dot is omitted.
194207
195208 Note: This precision is different than the one used by databases. For decimals,
196- it's the same as "numeric_scale", and for floats, who use binary precision,
197- it can be calculated as log10(2**p)
209+ it's the same as ``numeric_scale``, and for floats, which use binary precision,
210+ it can be calculated as ``log10(2**numeric_precision)``.
211+ """
212+ ...
213+
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
    """Creates an SQL expression that converts 'value' to a normalized representation.

    The returned expression must accept any SQL value, and return a string.

    The default implementation dispatches to a method according to ``coltype``:

        TemporalType -> normalize_timestamp()
        NumericType  -> normalize_number()
        -else-       -> to_string()
    """
    if isinstance(coltype, TemporalType):
        return self.normalize_timestamp(value, coltype)
    if isinstance(coltype, NumericType):
        return self.normalize_number(value, coltype)
    # No dedicated normalization for this column type: plain string conversion.
    return self.to_string(value)
202231
203232
204233class Database (AbstractDatabase ):
@@ -410,27 +439,16 @@ def md5_to_int(self, s: str) -> str:
def to_string(self, s: str):
    """Return an SQL expression that casts the SQL expression ``s`` to ``varchar``."""
    return f"{s}::varchar"
412441
413- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
414- if isinstance (coltype , TemporalType ):
415- # if coltype.precision == 0:
416- # return f"to_char({value}::timestamp(0), 'YYYY-mm-dd HH24:MI:SS')"
417- # if coltype.precision == 3:
418- # return f"to_char({value}, 'YYYY-mm-dd HH24:MI:SS.US')"
419- # elif coltype.precision == 6:
420- # return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
421- # else:
422- # # Postgres/Redshift doesn't support arbitrary precision
423- # raise TypeError(f"Bad precision for {type(self).__name__}: {coltype})")
424- if coltype .rounds :
425- return f"to_char({ value } ::timestamp({ coltype .precision } ), 'YYYY-mm-dd HH24:MI:SS.US')"
426- else :
427- timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
428- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
429442
430- elif isinstance (coltype , NumericType ):
431- value = f"{ value } ::decimal(38, { coltype .precision } )"
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression rendering ``value`` as a 'YYYY-mm-dd HH24:MI:SS.US' string.

    When ``coltype.rounds`` is set, the value is cast to ``timestamp(coltype.precision)``
    before formatting. Otherwise it is formatted at 6 fractional digits, cut with LEFT
    to ``coltype.precision`` digits, and right-padded with '0' back to 6 digits.
    """
    if coltype.rounds:
        return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"

    timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
    return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS + coltype.precision}), {TIMESTAMP_PRECISION_POS + 6}, '0')"
449+
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression casting ``value`` to ``decimal(38, coltype.precision)``, then to string."""
    return self.to_string(f"{value}::decimal(38, {coltype.precision})")
434452
435453
436454class Presto (Database ):
@@ -470,25 +488,19 @@ def _query(self, sql_code: str) -> list:
def close(self):
    """Close the connection object stored in ``self._conn``."""
    self._conn.close()
472490
473- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
474- if isinstance (coltype , TemporalType ):
475- if coltype .rounds :
476- if coltype .precision > 3 :
477- pass
478- s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
479- else :
480- s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
481- # datetime = f"date_format(cast({value} as timestamp(6), '%Y-%m-%d %H:%i:%S.%f'))"
482- # datetime = self.to_string(f"cast({value} as datetime(6))")
483-
484- return (
485- f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
486- )
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression rendering ``value`` as a '%Y-%m-%d %H:%i:%S.%f' string.

    The value is cast to ``timestamp(6)`` and formatted, then the string is passed
    through two nested RPAD calls: first to ``coltype.precision`` fractional positions
    (fill '.'), then to 6 fractional positions (fill '0').
    """
    # TODO: ``coltype.rounds`` is not honoured yet. The original code branched on it
    # but produced the same expression in both branches, so the branch is collapsed.
    s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"

    return (
        f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS + coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS + 6}, '0')"
    )
490501
491- return self .to_string (value )
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression casting ``value`` to ``decimal(38,coltype.precision)``, then to string."""
    return self.to_string(f"cast({value} as decimal(38,{coltype.precision}))")
492504
493505 def select_table_schema (self , path : DbPath ) -> str :
494506 schema , table = self ._normalize_table_path (path )
@@ -577,18 +589,16 @@ def md5_to_int(self, s: str) -> str:
def to_string(self, s: str):
    """Return an SQL expression that casts the SQL expression ``s`` to ``char``."""
    return f"cast({s} as char)"
579591
580- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
581- if isinstance (coltype , TemporalType ):
582- if coltype .rounds :
583- return self .to_string (f"cast( cast({ value } as datetime({ coltype .precision } )) as datetime(6))" )
584- else :
585- s = self .to_string (f"cast({ value } as datetime(6))" )
586- return f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression rendering ``value`` as a string with 6 fractional digits.

    When ``coltype.rounds`` is set, the value is cast to ``datetime(coltype.precision)``
    and then widened back to ``datetime(6)`` before the string conversion. Otherwise the
    ``datetime(6)`` string is passed through two nested RPAD calls: to
    ``coltype.precision`` fractional positions (fill '.'), then to 6 (fill '0').
    """
    if coltype.rounds:
        return self.to_string(f"cast( cast({value} as datetime({coltype.precision})) as datetime(6))")

    s = self.to_string(f"cast({value} as datetime(6))")
    return f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS + coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS + 6}, '0')"
598+
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression casting ``value`` to ``decimal(38, coltype.precision)``, then to string."""
    return self.to_string(f"cast({value} as decimal(38, {coltype.precision}))")
590601
591- return self .to_string (f"{ value } " )
592602
593603
594604class Oracle (ThreadedDatabase ):
@@ -633,16 +643,15 @@ def select_table_schema(self, path: DbPath) -> str:
633643 f" FROM USER_TAB_COLUMNS WHERE table_name = '{ table .upper ()} '"
634644 )
635645
636- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
637- if isinstance (coltype , TemporalType ):
638- return f"to_char(cast({ value } as timestamp({ coltype .precision } )), 'YYYY-MM-DD HH24:MI:SS.FF6')"
639- elif isinstance (coltype , NumericType ):
640- # FM999.9990
641- format_str = "FM" + "9" * (38 - coltype .precision )
642- if coltype .precision :
643- format_str += "0." + "9" * (coltype .precision - 1 ) + "0"
644- return f"to_char({ value } , '{ format_str } ')"
645- return self .to_string (f"{ value } " )
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression rendering ``value`` as 'YYYY-MM-DD HH24:MI:SS.FF6'.

    The value is first cast to ``timestamp(coltype.precision)``; ``coltype.rounds``
    is not consulted here.
    """
    return f"to_char(cast({value} as timestamp({coltype.precision})), 'YYYY-MM-DD HH24:MI:SS.FF6')"
648+
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build a ``to_char`` SQL expression formatting ``value`` with a fixed number of decimals.

    The format model is "FM" followed by ``38 - precision`` nines; when
    ``coltype.precision`` is non-zero it continues with "0.", ``precision - 1`` nines
    and a final "0" (e.g. FM999.9990). With precision 0 no decimal point is emitted.
    """
    integer_part = "FM" + "9" * (38 - coltype.precision)
    if not coltype.precision:
        return f"to_char({value}, '{integer_part}')"

    fraction_part = "0." + "9" * (coltype.precision - 1) + "0"
    return f"to_char({value}, '{integer_part}{fraction_part}')"
646655
647656 def _parse_type (
648657 self , type_repr : str , datetime_precision : int = None , numeric_precision : int = None , numeric_scale : int = None
@@ -693,27 +702,25 @@ class Redshift(Postgres):
def md5_to_int(self, s: str) -> str:
    """Build an SQL expression turning the tail of ``md5(s)`` into a ``decimal(38)``.

    The hex digest is cut with ``substring`` starting at position
    ``1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS`` and parsed as base 16 via ``strtol``.
    """
    start = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
    return f"strtol(substring(md5({s}), {start}), 16)::decimal(38)"
695704
696- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
697- if isinstance (coltype , TemporalType ):
698- if coltype .rounds :
699- timestamp = f"{ value } ::timestamp(6)"
700- # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
701- secs = f"timestamp 'epoch' + round(extract(epoch from { timestamp } )::decimal(38)"
702- # Get the milliseconds from timestamp.
703- ms = f"extract(ms from { timestamp } )"
704- # Get the microseconds from timestamp, without the milliseconds!
705- us = f"extract(us from { timestamp } )"
706- # epoch = Total time since epoch in microseconds.
707- epoch = f"{ secs } *1000000 + { ms } *1000 + { us } "
708- timestamp6 = f"to_char({ epoch } , -6+{ coltype .precision } ) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
709- else :
710- timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
711- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
712-
713- elif isinstance (coltype , NumericType ):
714- value = f"{ value } ::decimal(38,{ coltype .precision } )"
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression rendering ``value`` as a 'YYYY-mm-dd HH24:MI:SS.US' string.

    When ``coltype.rounds`` is set, the timestamp is rebuilt from its epoch seconds,
    milliseconds and microseconds so it can be rounded with ``round(..., -6 + precision)``
    before formatting. Otherwise the value is simply formatted at ``timestamp(6)``.
    In both cases the string is cut with LEFT to ``coltype.precision`` fractional
    digits and right-padded with '0' to 6 digits.
    """
    if coltype.rounds:
        timestamp = f"{value}::timestamp(6)"
        # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
        # NOTE: the "round(" opened here is closed by the ", -6+precision)" further down.
        secs = f"timestamp 'epoch' + round(extract(epoch from {timestamp})::decimal(38)"
        # Get the milliseconds from timestamp.
        ms = f"extract(ms from {timestamp})"
        # Get the microseconds from timestamp, without the milliseconds!
        us = f"extract(us from {timestamp})"
        # epoch = Total time since epoch in microseconds.
        epoch = f"{secs}*1000000 + {ms}*1000 + {us}"
        timestamp6 = f"to_char({epoch}, -6+{coltype.precision}) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
    else:
        timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
    return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS + coltype.precision}), {TIMESTAMP_PRECISION_POS + 6}, '0')"
720+
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression casting ``value`` to ``decimal(38,coltype.precision)``, then to string."""
    return self.to_string(f"{value}::decimal(38,{coltype.precision})")
715723
716- return self .to_string (f"{ value } " )
717724
718725 def select_table_schema (self , path : DbPath ) -> str :
719726 schema , table = self ._normalize_table_path (path )
@@ -813,27 +820,23 @@ def select_table_schema(self, path: DbPath) -> str:
813820 f"WHERE table_name = '{ table } ' AND table_schema = '{ schema } '"
814821 )
815822
816- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
817- if isinstance (coltype , TemporalType ):
818- if coltype .rounds :
819- timestamp = f"timestamp_micros(cast(round(unix_micros(cast({ value } as timestamp))/1000000, { coltype .precision } )*1000000 as int))"
820- return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { timestamp } )"
821- else :
822- if coltype .precision == 0 :
823- return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, { value } )"
824- elif coltype .precision == 6 :
825- return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build a ``FORMAT_TIMESTAMP`` SQL expression rendering ``value`` with 6 fractional digits.

    When ``coltype.rounds`` is set, the value is converted to microseconds since epoch,
    rounded to ``coltype.precision`` decimal places of a second, and converted back
    before formatting. Otherwise:

    - precision 0 formats whole seconds followed by a literal ".000000";
    - precision 6 formats with ``%E6S`` directly;
    - anything else formats with ``%E6S``, cuts with LEFT to ``coltype.precision``
      fractional digits and right-pads with '0' to 6 digits.
    """
    if coltype.rounds:
        timestamp = f"timestamp_micros(cast(round(unix_micros(cast({value} as timestamp))/1000000, {coltype.precision})*1000000 as int))"
        return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {timestamp})"

    if coltype.precision == 0:
        # The format literal must be closed before the comma; the original omitted
        # the closing quote, which produced an unterminated SQL string.
        return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000', {value})"
    if coltype.precision == 6:
        return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"

    timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"
    return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS + coltype.precision}), {TIMESTAMP_PRECISION_POS + 6}, '0')"
835835
836- return self .to_string (f"{ value } " )
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression rendering the number ``value`` as a string.

    ``Integer`` columns are converted with ``to_string``; every other numeric type
    is formatted with ``format('%.<precision>f', value)``.
    """
    if isinstance(coltype, Integer):
        return self.to_string(value)
    return f"format('%.{coltype.precision}f', {value})"
837840
838841 def parse_table_name (self , name : str ) -> DbPath :
839842 path = parse_table_name (name )
@@ -907,19 +910,16 @@ def select_table_schema(self, path: DbPath) -> str:
907910 schema , table = self ._normalize_table_path (path )
908911 return super ().select_table_schema ((schema , table ))
909912
910- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
911- if isinstance (coltype , TemporalType ):
912- if coltype .rounds :
913- timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, { value } ::timestamp(9))/1000000000, { coltype .precision } ))"
914- else :
915- timestamp = f"cast({ value } as timestamp({ coltype .precision } ))"
916-
917- return f"to_char({ timestamp } , 'YYYY-MM-DD HH24:MI:SS.FF6')"
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression rendering ``value`` as 'YYYY-MM-DD HH24:MI:SS.FF6'.

    When ``coltype.rounds`` is set, the value is converted to epoch nanoseconds,
    scaled to seconds, rounded to ``coltype.precision`` decimal places and turned
    back into a timestamp. Otherwise it is cast to ``timestamp(coltype.precision)``.
    """
    if coltype.rounds:
        timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, {value}::timestamp(9))/1000000000, {coltype.precision}))"
    else:
        timestamp = f"cast({value} as timestamp({coltype.precision}))"

    return f"to_char({timestamp}, 'YYYY-MM-DD HH24:MI:SS.FF6')"
921920
922- return self .to_string (f"{ value } " )
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression casting ``value`` to ``decimal(38, coltype.precision)``, then to string."""
    return self.to_string(f"cast({value} as decimal(38, {coltype.precision}))")
923923
924924
925925@dataclass
0 commit comments