|
| 1 | +-- This script processes the following source file: |
| 2 | +-- |
| 3 | +-- http://unicode.org/Public/UNIDATA/UnicodeData.txt |
| 4 | +-- |
| 5 | +-- Format description: https://www.unicode.org/reports/tr44/tr44-36.html#UnicodeData.txt |
| 6 | + |
| 7 | +module UnicodeData |
| 8 | + ( UnicodeData |
| 9 | + , Data(..) |
| 10 | + , toTitleUD |
| 11 | + , parseUD |
| 12 | + ) where |
| 13 | + |
| 14 | +import Debug.Trace |
| 15 | +import Arsec hiding (semi) |
| 16 | +import Data.Array |
| 17 | +import Data.Functor (void) |
| 18 | +import Data.List (sort) |
| 19 | +import Data.Maybe (fromMaybe) |
| 20 | + |
-- | The parsed contents of UnicodeData.txt: one 'Data' record per entry.
-- The array is indexed by position (starting at 0), not by code point.
type UnicodeData = Array Int Data
| 22 | + |
-- | Titlecase mapping of a character, falling back to its uppercase mapping
-- when the record carries no explicit titlecase value.
--
-- "Simple_Titlecase_Mapping: If this field is null, then the
-- Simple_Titlecase_Mapping is the same as the Simple_Uppercase_Mapping for
-- this character."
-- -- https://www.unicode.org/reports/tr44/tr44-36.html#UnicodeData.txt
toTitleUD :: Data -> Maybe Char
toTitleUD d = case toTitleUD_ d of
  Nothing  -> toUpperUD d
  explicit -> explicit
| 28 | + |
-- | One entry of UnicodeData.txt, reduced to the fields this module uses.
--
-- The field order is significant: the parser applies the 'Data' constructor
-- positionally, and the derived 'Ord' instance compares 'charUD' first.
data Data = Data {
      charUD     :: {-# UNPACK #-} !Char
      -- ^ The character this entry describes.
    , toUpperUD  :: {-# UNPACK #-} !(Maybe Char)
      -- ^ Uppercase mapping, or 'Nothing' when the field is empty.
    , toLowerUD  :: {-# UNPACK #-} !(Maybe Char)
      -- ^ Lowercase mapping, or 'Nothing' when the field is empty.
    , toTitleUD_ :: {-# UNPACK #-} !(Maybe Char)
      -- ^ Titlecase mapping exactly as it appears in the file, or 'Nothing'
      -- when the field is empty. Use 'toTitleUD' to get the value with the
      -- uppercase fallback applied.
    } deriving (Eq, Ord, Show)
| 35 | + |
-- | Parse the whole input into an array of 'Data' records, one per line.
--
-- UnicodeData.txt is expected to be ordered by code point already, but the
-- records are sorted anyway so that the resulting array is guaranteed to be
-- ordered by 'charUD' (the first field of 'Data', hence the first thing the
-- derived 'Ord' instance compares).
entries :: Parser UnicodeData
entries = toArray . sort <$> many entry <* eof
  where
    -- Index the records by position, starting at 0.
    toArray xs = listArray (0, length xs - 1) xs
    -- One line: the code point, eleven fields that are skipped, then the
    -- uppercase, lowercase and titlecase mappings (each possibly empty),
    -- terminated by a newline.
    entry = Data <$> unichar <* semi
                 <* replicateM_ 11 (ignoreField <* semi)
                 <*> optional unichar <* semi
                 <*> optional unichar <* semi
                 <*> optional unichar <* char '\n'
    semi = char ';'
| 46 | + |
-- | Consume and discard every character up to, but not including, the next
-- @;@. An empty field is accepted: the parser then consumes nothing.
ignoreField :: Parser ()
ignoreField = () <$ many (satisfy notSemi)
  where
    notSemi = (/= ';')
| 49 | + |
-- | Read the file at @name@ and run the 'entries' parser over its contents.
-- @name@ is also handed to 'parse' alongside the contents.
parseUD :: FilePath -> IO (Either ParseError UnicodeData)
parseUD name = do
  contents <- readFile name
  pure (parse entries name contents)
0 commit comments