Skip to content

Commit 4f24d0d

Browse files
committed
Fix CaseMapping generation script to not depend on GHC's Unicode data
1 parent f6a35e5 commit 4f24d0d

7 files changed

Lines changed: 264 additions & 69 deletions

File tree

scripts/Arsec.hs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ module Arsec
88
, unichars
99
, module Control.Applicative
1010
, module Control.Monad
11-
, module Data.Char
1211
, module Text.ParserCombinators.Parsec.Char
1312
, module Text.ParserCombinators.Parsec.Combinator
1413
, module Text.ParserCombinators.Parsec.Error

scripts/CaseFolding.hs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ module CaseFolding
1212

1313
import Arsec
1414
import Data.Bits
15+
import Data.Char (ord)
1516

1617
data Fold = Fold {
1718
code :: Char

scripts/CaseMapping.hs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import Data.Char (isDigit)
2+
import Data.Foldable (toList)
13
import Data.List (stripPrefix)
24
import Data.Maybe (fromJust)
35
import System.Environment
@@ -6,6 +8,7 @@ import System.IO
68
import Arsec
79
import CaseFolding
810
import SpecialCasing
11+
import UnicodeData
912

1013
main = do
1114
args <- getArgs
@@ -14,12 +17,16 @@ main = do
1417
[o] -> o
1518
psc <- parseSC "SpecialCasing.txt"
1619
pcf <- parseCF "CaseFolding.txt"
20+
ud <- parseUD "UnicodeData.txt"
1721
scs <- case psc of
1822
Left err -> print err >> return undefined
1923
Right ms -> return ms
2024
cfs <- case pcf of
2125
Left err -> print err >> return undefined
2226
Right ms -> return ms
27+
ud <- case ud of
28+
Left err -> print err >> return undefined
29+
Right ms -> return ms
2330
h <- openFile oname WriteMode
2431
let comments = map ("--" ++) $
2532
take 2 (cfComments cfs) ++ take 2 (scComments scs)
@@ -40,9 +47,10 @@ main = do
4047
,"unI64 :: Int64 -> _ {- unboxed Int64 -}"
4148
,"unI64 (I64# n) = n"
4249
,""]
43-
mapM_ (hPutStrLn h) (mapSC "upper" upper toUpper scs)
44-
mapM_ (hPutStrLn h) (mapSC "lower" lower toLower scs)
45-
mapM_ (hPutStrLn h) (mapSC "title" title toTitle scs)
50+
let get f = [(k, d) | c <- toList ud, Just d <- [f c], let k = charUD c, k /= d]
51+
mapM_ (hPutStrLn h) (mapSC "upper" upper (get toUpperUD) scs)
52+
mapM_ (hPutStrLn h) (mapSC "lower" lower (get toLowerUD) scs)
53+
mapM_ (hPutStrLn h) (mapSC "title" title (get toTitleUD) scs)
4654
mapM_ (hPutStrLn h) (mapCF cfs)
4755
hClose h
4856

scripts/SpecialCasing.hs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ module SpecialCasing
1212

1313
import Arsec
1414
import Data.Bits
15+
import Data.Char (ord)
1516

1617
data SpecialCasing = SC { scComments :: [Comment], scCasing :: [Case] }
1718
deriving (Show)
@@ -38,17 +39,17 @@ entries = SC <$> many comment <*> many (entry <* many comment)
3839
parseSC :: FilePath -> IO (Either ParseError SpecialCasing)
3940
parseSC name = parse entries name <$> readFile name
4041

41-
mapSC :: String -> (Case -> String) -> (Char -> Char) -> SpecialCasing
42+
mapSC :: String -> (Case -> String) -> [(Char, Char)] -> SpecialCasing
4243
-> [String]
4344
mapSC which access twiddle (SC _ ms) =
4445
typ ++ map printUnusual ms' ++ map printUsual usual ++ [last]
4546
where
4647
ms' = filter p ms
47-
p c = [k] /= a && a /= [twiddle k] && null (conditions c)
48+
p c = [k] /= a && null (conditions c)
4849
where a = access c
4950
k = code c
5051
unusual = map code ms'
51-
usual = filter (\c -> twiddle c /= c && c `notElem` unusual) [minBound..maxBound]
52+
usual = filter (\(c, _) -> c `notElem` unusual) twiddle
5253

5354
typ = [which ++ "Mapping :: Char# -> _ {- unboxed Int64 -}"
5455
,"{-# NOINLINE " ++ which ++ "Mapping #-}"
@@ -57,7 +58,4 @@ mapSC which access twiddle (SC _ ms) =
5758
printUnusual c = " -- " ++ name c ++ "\n" ++
5859
" " ++ showC (code c) ++ "# -> unI64 " ++ show (ord x + (ord y `shiftL` 21) + (ord z `shiftL` 42))
5960
where x:y:z:_ = access c ++ repeat '\0'
60-
printUsual c = " " ++ showC c ++ "# -> unI64 " ++ show (ord (twiddle c))
61-
62-
ucFirst (c:cs) = toUpper c : cs
63-
ucFirst [] = []
61+
printUsual (c, c') = " " ++ showC c ++ "# -> unI64 " ++ show (ord c')

scripts/UnicodeData.hs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
-- This script processes the following source file:
2+
--
3+
-- http://unicode.org/Public/UNIDATA/UnicodeData.txt
4+
--
5+
-- Format description: https://www.unicode.org/reports/tr44/tr44-36.html#UnicodeData.txt
6+
7+
module UnicodeData
8+
( UnicodeData
9+
, Data(..)
10+
, toTitleUD
11+
, parseUD
12+
) where
13+
14+
import Debug.Trace
15+
import Arsec hiding (semi)
16+
import Data.Array
17+
import Data.Functor (void)
18+
import Data.List (sort)
19+
import Data.Maybe (fromMaybe)
20+
21+
type UnicodeData = Array Int Data
22+
23+
-- "Simple_Titlecase_Mapping: If this field is null, then the Simple_Titlecase_Mapping
24+
-- is the same as the Simple_Uppercase_Mapping for this character."
25+
-- -- https://www.unicode.org/reports/tr44/tr44-36.html#UnicodeData.txt
26+
toTitleUD :: Data -> Maybe Char
27+
toTitleUD d = toTitleUD_ d <|> toUpperUD d
28+
29+
data Data = Data {
30+
charUD :: {-# UNPACK #-} !Char
31+
, toUpperUD :: {-# UNPACK #-} !(Maybe Char)
32+
, toLowerUD :: {-# UNPACK #-} !(Maybe Char)
33+
, toTitleUD_ :: {-# UNPACK #-} !(Maybe Char)
34+
} deriving (Eq, Ord, Show)
35+
36+
-- I'm pretty sure UnicodeData.txt is sorted but still sort it to be 100% certain.
37+
entries :: Parser UnicodeData
38+
entries = (\xs -> listArray (0, length xs - 1) xs) <$> many entry <* eof
39+
where
40+
entry = Data <$> unichar <* semi
41+
<* replicateM_ 11 (ignoreField <* semi)
42+
<*> optional unichar <* semi
43+
<*> optional unichar <* semi
44+
<*> optional unichar <* char '\n'
45+
semi = char ';'
46+
47+
ignoreField :: Parser ()
48+
ignoreField = void (many (satisfy (\c -> c /= ';')))
49+
50+
parseUD :: FilePath -> IO (Either ParseError UnicodeData)
51+
parseUD name = parse entries name <$> readFile name

scripts/regenerate.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
2-
set -eu
2+
set -exu
33
cd "${BASH_SOURCE%/*}/"
4-
wget http://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt http://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt
5-
runghc-9.12 CaseMapping
6-
rm SpecialCasing.txt CaseFolding.txt
4+
wget http://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt http://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
5+
runghc CaseMapping
6+
rm SpecialCasing.txt CaseFolding.txt UnicodeData.txt
77
cd -

0 commit comments

Comments
 (0)