@@ -3,9 +3,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
33Released under Apache 2.0 license as described in the file LICENSE.
44-/
55
6+ module
67import UnicodeBasic.CharacterDatabase
78import UnicodeBasic.Hangul
8- import UnicodeBasic.Types
9+ public import UnicodeBasic.Types
10+
11+ public section
912
1013namespace Unicode
1114
@@ -143,29 +146,8 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where
-- Text of `../data/UnicodeData.txt`, brought in with `include_str`; it is the
-- input handed to `UCDStream.ofString` when the `UnicodeData.data` array is built.
143146protected def UnicodeData.txt := include_str "../data/UnicodeData.txt"
144147
145148/-- Parse `UnicodeData.txt` -/
146- private unsafe def UnicodeData.init : IO (Array UnicodeData) := do
147- let stream := UCDStream.ofString UnicodeData.txt
148- let mut arr := #[]
149- for record in stream do
150- arr := arr.push {
151- code := ofHexString! record[0 ]!
152- name := record[1 ]!
153- gc := GC.ofAbbrev! record[2 ]!
154- cc := record[3 ]!.toNat!
155- bidi := BidiClass.ofAbbrev! record[4 ]!
156- decomp := getDecompositionMapping? record[5 ]!
157- numeric := getNumericType? record[6 ]! record[7 ]! record[8 ]!
158- bidiMirrored := record[9 ]! == "Y"
159- uppercase := if record[12 ]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12 ]!
160- lowercase := if record[13 ]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13 ]!
161- titlecase := if record[14 ]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14 ]!
162- }
163- return arr
164-
165- where
166-
167- /-- Get decomposition mapping -/
168- getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
149+ unsafe initialize UnicodeData.data : Array UnicodeData ←
150+ let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
169151 /-
170152 The value of the `Decomposition_Mapping` property for a character is
171153 provided in field 5 of `UnicodeData.txt`. This is a string-valued
@@ -218,8 +200,10 @@ where
218200 some ⟨tag, cs⟩
219201 | [] => unreachable!
220202
221- /-- Get numeric type -/
222- getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
203+ let getDigitUnsafe (char : Char) : Fin 10 :=
204+ unsafeCast (char.val - '0' .val).toNat
205+
206+ let getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
223207 /-
224208 If the character has the property value `Numeric_Type=Decimal`, then the
225209 `Numeric_Value` of that digit is represented with an integer value
@@ -263,14 +247,23 @@ where
263247 else
264248 return .decimal <| getDigitUnsafe <| s₁.front
265249
266- /-- Get decimal digit -/
267- @[inline]
268- getDigitUnsafe (char : Char) : Fin 10 :=
269- unsafeCast (char.val - '0' .val).toNat
270-
271- /-- Parsed data from `UnicodeData.txt` -/
272- @ [init UnicodeData.init]
273- protected def UnicodeData.data : Array UnicodeData := #[]
250+ let stream := UCDStream.ofString UnicodeData.txt
251+ let mut arr := #[]
252+ for record in stream do
253+ arr := arr.push {
254+ code := ofHexString! record[0 ]!
255+ name := record[1 ]!
256+ gc := GC.ofAbbrev! record[2 ]!
257+ cc := record[3 ]!.toNat!
258+ bidi := BidiClass.ofAbbrev! record[4 ]!
259+ decomp := getDecompositionMapping? record[5 ]!
260+ numeric := getNumericType? record[6 ]! record[7 ]! record[8 ]!
261+ bidiMirrored := record[9 ]! == "Y"
262+ uppercase := if record[12 ]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12 ]!
263+ lowercase := if record[13 ]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13 ]!
264+ titlecase := if record[14 ]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14 ]!
265+ }
266+ return arr
274267
275268/-- Get code point data from `UnicodeData.txt` -/
276269partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do
@@ -370,12 +363,12 @@ structure UnicodeDataStream where
370363 default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter
371364 deriving Inhabited
372365
373- private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
366+ def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
374367 let c := s.code
375368 let i := s.index
376369 if c > Unicode.max then
377370 none
378- else if h : i < UnicodeData.data.size.toUSize then
371+ else if h : i.toNat < UnicodeData.data.size then
379372 let d := UnicodeData.data[i]
380373 let n := d.name
381374 if c < d.code then
0 commit comments