1+ {-# LANGUAGE NumericUnderscores #-}
12{-# LANGUAGE OverloadedStrings #-}
23{-# LANGUAGE RecordWildCards #-}
4+ {-# LANGUAGE ScopedTypeVariables #-}
35{-# LANGUAGE TypeApplications #-}
46
57module DataFrame.IO.Parquet where
@@ -14,6 +16,8 @@ import qualified Data.List as L
1416import qualified Data.Map as M
1517import qualified Data.Text as T
1618import Data.Text.Encoding
19+ import Data.Time
20+ import Data.Time.Clock.POSIX (posixSecondsToUTCTime )
1721import Data.Word
1822import qualified DataFrame.Internal.Column as DI
1923import DataFrame.Internal.DataFrame (DataFrame )
@@ -28,6 +32,7 @@ import DataFrame.IO.Parquet.Thrift
2832import DataFrame.IO.Parquet.Types
2933import System.Directory (doesDirectoryExist )
3034
35+ import qualified Data.Vector.Unboxed as VU
3136import System.FilePath ((</>) )
3237
3338{- | Read a parquet file from path and load it into a dataframe.
@@ -93,13 +98,15 @@ readParquet path = do
9398 let schemaTail = drop 1 (schema fileMetadata)
9499 let colPath = columnPathInSchema (columnMetaData colChunk)
95100 let (maxDef, maxRep) = levelsForPath schemaTail colPath
101+ let lType = logicalType (schemaTail !! colIdx)
96102 column <-
97103 processColumnPages
98104 (maxDef, maxRep)
99105 pages
100106 (columnType metadata)
101107 primaryEncoding
102108 maybeTypeLength
109+ lType
103110
104111 modifyIORef colMap (M. insertWith DI. concatColumnsEither colName column)
105112
@@ -172,8 +179,9 @@ processColumnPages ::
172179 ParquetType ->
173180 ParquetEncoding ->
174181 Maybe Int32 ->
182+ LogicalType ->
175183 IO DI. Column
176- processColumnPages (maxDef, maxRep) pages pType _ maybeTypeLength = do
184+ processColumnPages (maxDef, maxRep) pages pType _ maybeTypeLength lType = do
177185 let dictPages = filter isDictionaryPage pages
178186 let dataPages = filter isDataPage pages
179187
@@ -206,10 +214,10 @@ processColumnPages (maxDef, maxRep) pages pType _ maybeTypeLength = do
206214 in pure (toMaybeBool maxDef defLvls vals)
207215 PINT32 ->
208216 let (vals, _) = readNInt32 nPresent afterLvls
209- in pure (toMaybeInt32 maxDef defLvls vals)
217+ in pure (applyLogicalType lType $ toMaybeInt32 maxDef defLvls vals)
210218 PINT64 ->
211219 let (vals, _) = readNInt64 nPresent afterLvls
212- in pure (toMaybeInt64 maxDef defLvls vals)
220+ in pure (applyLogicalType lType $ toMaybeInt64 maxDef defLvls vals)
213221 PINT96 ->
214222 let (vals, _) = readNInt96Times nPresent afterLvls
215223 in pure (toMaybeUTCTime maxDef defLvls vals)
@@ -258,10 +266,10 @@ processColumnPages (maxDef, maxRep) pages pType _ maybeTypeLength = do
258266 in pure (toMaybeBool maxDef defLvls vals)
259267 PINT32 ->
260268 let (vals, _) = readNInt32 nPresent afterLvls
261- in pure (toMaybeInt32 maxDef defLvls vals)
269+ in pure (applyLogicalType lType $ toMaybeInt32 maxDef defLvls vals)
262270 PINT64 ->
263271 let (vals, _) = readNInt64 nPresent afterLvls
264- in pure (toMaybeInt64 maxDef defLvls vals)
272+ in pure (applyLogicalType lType $ toMaybeInt64 maxDef defLvls vals)
265273 PINT96 ->
266274 let (vals, _) = readNInt96Times nPresent afterLvls
267275 in pure (toMaybeUTCTime maxDef defLvls vals)
@@ -296,3 +304,37 @@ processColumnPages (maxDef, maxRep) pages pType _ maybeTypeLength = do
296304 (c : cs) ->
297305 pure $
298306 L. foldl' (\ l r -> fromRight (error " concat failed" ) (DI. concatColumns l r)) c cs
307+
{- | Reinterpret a decoded INT32/INT64 column according to its Parquet logical
type annotation.

* 'TimestampType': every raw 'Int64' is read as a count of @unit@s since the
  Unix epoch and turned into a 'UTCTime'.
* 'DecimalType' with precision up to 18: the unscaled integers are divided by
  @10 ^ scale@ and returned as a column of 'Double's.
* Anything else: the column is returned untouched.

Whenever the column does not hold the element type a conversion expects, the
original column is returned unchanged rather than raising an error.
-}
applyLogicalType :: LogicalType -> DI.Column -> DI.Column
applyLogicalType (TimestampType _isUTC unit) col =
    -- NOTE(review): a column of 'Maybe Int64' presumably makes 'DI.mapColumn'
    -- return 'Left', in which case the raw integers are kept — confirm.
    fromRight col (DI.mapColumn toUTC col)
  where
    -- Divide by the unit's per-second count directly. Rescaling to
    -- microseconds first with an integer factor (1_000_000 `div` divisor)
    -- yields a factor of 0 for nanoseconds, collapsing every value to the
    -- epoch, and the multiplication can overflow 'Int64'.
    toUTC :: Int64 -> UTCTime
    toUTC raw =
        posixSecondsToUTCTime
            (fromIntegral raw / fromIntegral (unitDivisor unit))
applyLogicalType (DecimalType precision scale) col
    -- Small-precision decimals are normally stored as INT32, but writers are
    -- allowed to use INT64 for them as well, so fall back to the wider type.
    | precision <= 9 = fromRight viaInt64 viaInt32
    | precision <= 18 = viaInt64
    | otherwise = col
  where
    divisor :: Double
    divisor = 10 ^ scale

    -- Turn a vector of unscaled integers into a column of scaled doubles.
    descale :: (Integral a, VU.Unbox a) => VU.Vector a -> DI.Column
    descale = DI.fromUnboxedVector . VU.map (\raw -> fromIntegral raw / divisor)

    viaInt32 = descale <$> DI.toVector @Int32 @VU.Vector col
    viaInt64 = fromRight col (descale <$> DI.toVector @Int64 @VU.Vector col)
applyLogicalType _ col = col
327+
-- | Convert a count of microseconds since the Unix epoch into a 'UTCTime'.
microsecondsToUTCTime :: Int64 -> UTCTime
microsecondsToUTCTime = posixSecondsToUTCTime . toSeconds
  where
    -- Widen to the POSIX time type first so the division keeps the
    -- sub-second part instead of truncating like integer division would.
    toSeconds micros = fromIntegral micros / 1_000_000
331+
{- | How many of the given time unit make up one second.

An unknown unit maps to 1, i.e. its values are treated as whole seconds by
callers that divide by this number.
-}
unitDivisor :: TimeUnit -> Int64
unitDivisor unit = case unit of
    MILLISECONDS -> 1_000
    MICROSECONDS -> 1_000_000
    NANOSECONDS -> 1_000_000_000
    TIME_UNIT_UNKNOWN -> 1
337+
{- | Scale an unscaled decimal value down by a power of ten.

The first argument is the number of decimal digits to shift by, the second
is the raw integer; the result is @rawValue / 10 ^ scale@ as a 'Double'.
-}
applyScale :: Int32 -> Int32 -> Double
applyScale scale rawValue = unscaled / powerOfTen
  where
    unscaled = fromIntegral rawValue
    powerOfTen = 10 ^ scale
0 commit comments