Skip to content

Commit aa4315d

Browse files
committed
feat: modularize
1 parent d7f2437 commit aa4315d

9 files changed

Lines changed: 76 additions & 69 deletions

File tree

UnicodeBasic.lean

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5+
module
6+
public import UnicodeBasic.Types
7+
public import UnicodeBasic.TableLookup
58

6-
import UnicodeBasic.Types
7-
import UnicodeBasic.TableLookup
9+
public section
810

911
/-!
1012
# General API #

UnicodeBasic/CharacterDatabase.lean

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
Copyright © 2023 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5+
module
6+
public section
57

68
namespace Unicode
79

@@ -63,5 +65,4 @@ instance : Std.Stream UCDStream (Array String.Slice) where
6365
next? := UCDStream.next?
6466

6567
end UCDStream
66-
6768
end Unicode

UnicodeBasic/Hangul.lean

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,36 @@
22
Copyright © 2024 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5+
module
6+
7+
public section
58

69
namespace Unicode.Hangul
710

8-
private def shortNameL : Array String :=
11+
def shortNameL : Array String :=
912
#["G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
1013
"SS", "", "J", "JJ", "C", "K", "T", "P", "H"]
1114

12-
private def shortNameV : Array String :=
15+
def shortNameV : Array String :=
1316
#["A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
1417
"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
1518
"I"]
1619

17-
private def shortNameT : Array String :=
20+
def shortNameT : Array String :=
1821
#["", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
1922
"LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
2023
"SS", "NG", "J", "C", "K", "T", "P", "H"]
2124

22-
private abbrev sizeL := shortNameL.size -- 19
23-
private abbrev sizeV := shortNameV.size -- 21
24-
private abbrev sizeT := shortNameT.size -- 28
25-
private abbrev sizeLV := sizeL * sizeV -- 399
26-
private abbrev sizeVT := sizeV * sizeT -- 588
27-
private abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172
25+
abbrev sizeL := shortNameL.size -- 19
26+
abbrev sizeV := shortNameV.size -- 21
27+
abbrev sizeT := shortNameT.size -- 28
28+
abbrev sizeLV := sizeL * sizeV -- 399
29+
abbrev sizeVT := sizeV * sizeT -- 588
30+
abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172
2831

29-
private abbrev baseL : UInt32 := 0x1100
30-
private abbrev baseV : UInt32 := 0x1161
31-
private abbrev baseT : UInt32 := 0x11A7
32+
abbrev baseL : UInt32 := 0x1100
33+
abbrev baseV : UInt32 := 0x1161
34+
abbrev baseT : UInt32 := 0x11A7
3235

3336
/-- Number of Hangul syllables -/
3437
def Syllable.size := sizeLVT

UnicodeBasic/TableLookup.lean

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
Copyright © 2024-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
66
import UnicodeBasic.CharacterDatabase
77
import UnicodeBasic.Hangul
8-
import UnicodeBasic.Types
8+
public import UnicodeBasic.Types
9+
public section
910

1011
namespace Unicode
1112

UnicodeBasic/Types.lean

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
5-
5+
module
6+
public section
67
/-- Low-level conversion from `UInt32` to `Char` (*unsafe*)
78
89
This function translates to a no-op in the compiler. However, it does not
@@ -204,6 +205,7 @@ deriving DecidableEq
204205
/-- General category (GC)
205206
206207
Unicode property: `General_Category` -/
208+
@[expose]
207209
def GC := UInt32 deriving DecidableEq, Inhabited
208210

209211
namespace GC
@@ -412,11 +414,12 @@ def toAbbrev! (x : GC) : String :=
412414
| [a] => a
413415
| _ => panic! "invalid general category"
414416

415-
open Std.Format Repr in instance : Repr GC where
416-
reprPrec x := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill)
417+
open Std.Format Repr in
418+
def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill)
419+
instance : Repr GC where reprPrec
417420

418-
instance : ToString GC where
419-
toString x := " | ".intercalate (reprAux x)
421+
def toString (x : GC) := " | ".intercalate (reprAux x)
422+
instance : ToString GC where toString
420423

421424
def ofAbbrev? (s : String.Slice) : Option GC :=
422425
match s.chars.take 3 |>.toList with
@@ -464,7 +467,7 @@ def ofAbbrev? (s : String.Slice) : Option GC :=
464467

465468
def ofAbbrev! (s : String.Slice) : GC :=
466469
match ofAbbrev? s with
467-
| some c => c
470+
| .some c => c
468471
| none => panic! "invalid general category"
469472

470473
def ofString? (s : String.Slice) : Option GC := do
@@ -475,7 +478,7 @@ def ofString? (s : String.Slice) : Option GC := do
475478

476479
def ofString! (s : String.Slice) : GC :=
477480
match ofString? s with
478-
| some c => c
481+
| .some c => c
479482
| none => panic! "invalid general category"
480483

481484
end GC
@@ -1012,3 +1015,4 @@ instance : Repr BidiClass where
10121015
reprPrec bc _ := s!"Unicode.BidiClass.{bc.toAbbrev}"
10131016

10141017
end Unicode
1018+
end

UnicodeData.lean

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ Copyright © 2023-2026 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6-
import UnicodeData.Aliases
7-
import UnicodeData.Basic
8-
import UnicodeData.PropList
6+
module
7+
public import UnicodeData.Aliases
8+
public import UnicodeData.Basic
9+
public import UnicodeData.PropList

UnicodeData/Aliases.lean

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@ Copyright © 2026 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6-
import Std.Data.HashMap
6+
module
7+
public import Std.Data.HashMap
78
import UnicodeBasic.Types
89
import UnicodeBasic.CharacterDatabase
910

11+
public section
12+
1013
namespace Unicode
1114

1215
/-- Type for list of aliases -/

UnicodeData/Basic.lean

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6+
module
67
import UnicodeBasic.CharacterDatabase
78
import UnicodeBasic.Hangul
8-
import UnicodeBasic.Types
9+
public import UnicodeBasic.Types
10+
11+
public section
912

1013
namespace Unicode
1114

@@ -143,29 +146,8 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where
143146
protected def UnicodeData.txt := include_str "../data/UnicodeData.txt"
144147

145148
/-- Parse `UnicodeData.txt` -/
146-
private unsafe def UnicodeData.init : IO (Array UnicodeData) := do
147-
let stream := UCDStream.ofString UnicodeData.txt
148-
let mut arr := #[]
149-
for record in stream do
150-
arr := arr.push {
151-
code := ofHexString! record[0]!
152-
name := record[1]!
153-
gc := GC.ofAbbrev! record[2]!
154-
cc := record[3]!.toNat!
155-
bidi := BidiClass.ofAbbrev! record[4]!
156-
decomp := getDecompositionMapping? record[5]!
157-
numeric := getNumericType? record[6]! record[7]! record[8]!
158-
bidiMirrored := record[9]! == "Y"
159-
uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]!
160-
lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]!
161-
titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]!
162-
}
163-
return arr
164-
165-
where
166-
167-
/-- Get decomposition mapping -/
168-
getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
149+
unsafe initialize UnicodeData.data : Array UnicodeData ←
150+
let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
169151
/-
170152
The value of the `Decomposition_Mapping` property for a character is
171153
provided in field 5 of `UnicodeData.txt`. This is a string-valued
@@ -218,8 +200,10 @@ where
218200
some ⟨tag, cs⟩
219201
| [] => unreachable!
220202

221-
/-- Get numeric type -/
222-
getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
203+
let getDigitUnsafe (char : Char) : Fin 10 :=
204+
unsafeCast (char.val - '0'.val).toNat
205+
206+
let getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
223207
/-
224208
If the character has the property value `Numeric_Type=Decimal`, then the
225209
`Numeric_Value` of that digit is represented with an integer value
@@ -263,14 +247,23 @@ where
263247
else
264248
return .decimal <| getDigitUnsafe <| s₁.front
265249

266-
/-- Get decimal digit -/
267-
@[inline]
268-
getDigitUnsafe (char : Char) : Fin 10 :=
269-
unsafeCast (char.val - '0'.val).toNat
270-
271-
/-- Parsed data from `UnicodeData.txt` -/
272-
@[init UnicodeData.init]
273-
protected def UnicodeData.data : Array UnicodeData := #[]
250+
let stream := UCDStream.ofString UnicodeData.txt
251+
let mut arr := #[]
252+
for record in stream do
253+
arr := arr.push {
254+
code := ofHexString! record[0]!
255+
name := record[1]!
256+
gc := GC.ofAbbrev! record[2]!
257+
cc := record[3]!.toNat!
258+
bidi := BidiClass.ofAbbrev! record[4]!
259+
decomp := getDecompositionMapping? record[5]!
260+
numeric := getNumericType? record[6]! record[7]! record[8]!
261+
bidiMirrored := record[9]! == "Y"
262+
uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]!
263+
lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]!
264+
titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]!
265+
}
266+
return arr
274267

275268
/-- Get code point data from `UnicodeData.txt` -/
276269
partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do
@@ -370,12 +363,12 @@ structure UnicodeDataStream where
370363
default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter
371364
deriving Inhabited
372365

373-
private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
366+
def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
374367
let c := s.code
375368
let i := s.index
376369
if c > Unicode.max then
377370
none
378-
else if h : i < UnicodeData.data.size.toUSize then
371+
else if h : i.toNat < UnicodeData.data.size then
379372
let d := UnicodeData.data[i]
380373
let n := d.name
381374
if c < d.code then

UnicodeData/PropList.lean

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
33
Released under Apache 2.0 license as described in the file LICENSE.
44
-/
55

6+
module
67
import UnicodeBasic.Types
78
import UnicodeBasic.CharacterDatabase
89

10+
public section
11+
912
namespace Unicode
1013

1114
/-- Structure containing supported character properties from `PropList.txt` -/
@@ -35,7 +38,7 @@ deriving Inhabited, Repr
3538
/-- Raw string form `PropList.txt` -/
3639
protected def PropList.txt := include_str "../data/PropList.txt"
3740

38-
private unsafe def PropList.init : IO PropList := do
41+
unsafe initialize PropList.data : PropList
3942
let stream := UCDStream.ofString PropList.txt
4043
let mut list : PropList := {}
4144
for record in stream do
@@ -66,10 +69,6 @@ private unsafe def PropList.init : IO PropList := do
6669
list := {list with deprecated := list.deprecated.push val}
6770
return list
6871

69-
/-- Parsed data from `PropList.txt` -/
70-
@[init PropList.init]
71-
protected def PropList.data : PropList := {}
72-
7372
-- TODO: stop reinventing the wheel!
7473
/-- Binary search -/
7574
private partial def find (code : UInt32) (data : Array (UInt32 × Option UInt32)) (lo hi : Nat) : Nat :=

0 commit comments

Comments
 (0)