diff --git a/UnicodeBasic.lean b/UnicodeBasic.lean index e6d0d91e3..7ca255d8e 100644 --- a/UnicodeBasic.lean +++ b/UnicodeBasic.lean @@ -2,9 +2,11 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module +public import UnicodeBasic.Types +public import UnicodeBasic.TableLookup -import UnicodeBasic.Types -import UnicodeBasic.TableLookup +public section /-! # General API # diff --git a/UnicodeBasic/CharacterDatabase.lean b/UnicodeBasic/CharacterDatabase.lean index b587127b6..feaa80117 100644 --- a/UnicodeBasic/CharacterDatabase.lean +++ b/UnicodeBasic/CharacterDatabase.lean @@ -2,6 +2,8 @@ Copyright © 2023 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module +public section namespace Unicode @@ -63,5 +65,4 @@ instance : Std.Stream UCDStream (Array String.Slice) where next? := UCDStream.next? end UCDStream - end Unicode diff --git a/UnicodeBasic/Hangul.lean b/UnicodeBasic/Hangul.lean index e1b7f9e18..dcc600fe7 100644 --- a/UnicodeBasic/Hangul.lean +++ b/UnicodeBasic/Hangul.lean @@ -2,33 +2,36 @@ Copyright © 2024 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module + +public section namespace Unicode.Hangul -private def shortNameL : Array String := +def shortNameL : Array String := #["G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"] -private def shortNameV : Array String := +def shortNameV : Array String := #["A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"] -private def shortNameT : Array String := +def shortNameT : Array String := #["", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"] -private abbrev sizeL := shortNameL.size -- 19 -private abbrev sizeV := shortNameV.size -- 21 -private abbrev sizeT := shortNameT.size -- 28 -private abbrev sizeLV := sizeL * sizeV -- 399 -private abbrev sizeVT := sizeV * sizeT -- 588 -private abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172 +abbrev sizeL := shortNameL.size -- 19 +abbrev sizeV := shortNameV.size -- 21 +abbrev sizeT := shortNameT.size -- 28 +abbrev sizeLV := sizeL * sizeV -- 399 +abbrev sizeVT := sizeV * sizeT -- 588 +abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172 -private abbrev baseL : UInt32 := 0x1100 -private abbrev baseV : UInt32 := 0x1161 -private abbrev baseT : UInt32 := 0x11A7 +abbrev baseL : UInt32 := 0x1100 +abbrev baseV : UInt32 := 0x1161 +abbrev baseT : UInt32 := 0x11A7 /-- Number of Hangul syllables -/ def Syllable.size := sizeLVT diff --git a/UnicodeBasic/TableLookup.lean b/UnicodeBasic/TableLookup.lean index d4a1eb9dd..e9afe8fa5 100644 --- a/UnicodeBasic/TableLookup.lean +++ b/UnicodeBasic/TableLookup.lean @@ -2,10 +2,11 @@ Copyright © 2024-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - +module import UnicodeBasic.CharacterDatabase import UnicodeBasic.Hangul -import UnicodeBasic.Types +public import UnicodeBasic.Types +public section namespace Unicode diff --git a/UnicodeBasic/Types.lean b/UnicodeBasic/Types.lean index ca6fd1598..4d5488283 100644 --- a/UnicodeBasic/Types.lean +++ b/UnicodeBasic/Types.lean @@ -2,7 +2,8 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - +module +public section /-- Low-level conversion from `UInt32` to `Char` (*unsafe*) This function translates to a no-op in the compiler. However, it does not @@ -204,6 +205,7 @@ deriving DecidableEq /-- General category (GC) Unicode property: `General_Category` -/ +@[expose] def GC := UInt32 deriving DecidableEq, Inhabited namespace GC @@ -412,11 +414,12 @@ def toAbbrev! (x : GC) : String := | [a] => a | _ => panic! "invalid general category" -open Std.Format Repr in instance : Repr GC where - reprPrec x := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill) +open Std.Format Repr in + def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill) + instance : Repr GC where reprPrec -instance : ToString GC where - toString x := " | ".intercalate (reprAux x) +def toString (x : GC) := " | ".intercalate (reprAux x) +instance : ToString GC where toString def ofAbbrev? (s : String.Slice) : Option GC := match s.chars.take 3 |>.toList with @@ -464,7 +467,7 @@ def ofAbbrev? (s : String.Slice) : Option GC := def ofAbbrev! (s : String.Slice) : GC := match ofAbbrev? s with - | some c => c + | .some c => c | none => panic! "invalid general category" def ofString? (s : String.Slice) : Option GC := do @@ -475,7 +478,7 @@ def ofString? (s : String.Slice) : Option GC := do def ofString! (s : String.Slice) : GC := match ofString? s with - | some c => c + | .some c => c | none => panic! "invalid general category" end GC @@ -1012,3 +1015,4 @@ instance : Repr BidiClass where reprPrec bc _ := s!"Unicode.BidiClass.{bc.toAbbrev}" end Unicode +end diff --git a/UnicodeData.lean b/UnicodeData.lean index 760376acf..be1495226 100644 --- a/UnicodeData.lean +++ b/UnicodeData.lean @@ -3,6 +3,7 @@ Copyright © 2023-2026 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ -import UnicodeData.Aliases -import UnicodeData.Basic -import UnicodeData.PropList +module +public import UnicodeData.Aliases +public import UnicodeData.Basic +public import UnicodeData.PropList diff --git a/UnicodeData/Aliases.lean b/UnicodeData/Aliases.lean index 0067ccc13..e4c464a28 100644 --- a/UnicodeData/Aliases.lean +++ b/UnicodeData/Aliases.lean @@ -3,10 +3,13 @@ Copyright © 2026 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ -import Std.Data.HashMap +module +public import Std.Data.HashMap import UnicodeBasic.Types import UnicodeBasic.CharacterDatabase +public section + namespace Unicode /-- Type for list of aliases -/ diff --git a/UnicodeData/Basic.lean b/UnicodeData/Basic.lean index 3ace8974c..64c6e6565 100644 --- a/UnicodeData/Basic.lean +++ b/UnicodeData/Basic.lean @@ -3,9 +3,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module import UnicodeBasic.CharacterDatabase import UnicodeBasic.Hangul -import UnicodeBasic.Types +public import UnicodeBasic.Types + +public section namespace Unicode @@ -143,29 +146,8 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where protected def UnicodeData.txt := include_str "../data/UnicodeData.txt" /-- Parse `UnicodeData.txt` -/ -private unsafe def UnicodeData.init : IO (Array UnicodeData) := do - let stream := UCDStream.ofString UnicodeData.txt - let mut arr := #[] - for record in stream do - arr := arr.push { - code := ofHexString! record[0]! - name := record[1]! - gc := GC.ofAbbrev! record[2]! - cc := record[3]!.toNat! - bidi := BidiClass.ofAbbrev! record[4]! - decomp := getDecompositionMapping? record[5]! - numeric := getNumericType? record[6]! record[7]! record[8]! - bidiMirrored := record[9]! == "Y" - uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]! - lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]! - titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]! - } - return arr - -where - - /-- Get decomposition mapping -/ - getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do +unsafe initialize UnicodeData.data : Array UnicodeData ← + let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do /- The value of the `Decomposition_Mapping` property for a character is provided in field 5 of `UnicodeData.txt`. This is a string-valued @@ -218,8 +200,10 @@ where some ⟨tag, cs⟩ | [] => unreachable! - /-- Get numeric type -/ - getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do + let getDigitUnsafe (char : Char) : Fin 10 := + unsafeCast (char.val - '0'.val).toNat + + let getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do /- If the character has the property value `Numeric_Type=Decimal`, then the `Numeric_Value` of that digit is represented with an integer value @@ -263,14 +247,23 @@ where else return .decimal <| getDigitUnsafe <| s₁.front - /-- Get decimal digit -/ - @[inline] - getDigitUnsafe (char : Char) : Fin 10 := - unsafeCast (char.val - '0'.val).toNat - -/-- Parsed data from `UnicodeData.txt` -/ -@[init UnicodeData.init] -protected def UnicodeData.data : Array UnicodeData := #[] + let stream := UCDStream.ofString UnicodeData.txt + let mut arr := #[] + for record in stream do + arr := arr.push { + code := ofHexString! record[0]! + name := record[1]! + gc := GC.ofAbbrev! record[2]! + cc := record[3]!.toNat! + bidi := BidiClass.ofAbbrev! record[4]! + decomp := getDecompositionMapping? record[5]! + numeric := getNumericType? record[6]! record[7]! record[8]! + bidiMirrored := record[9]! == "Y" + uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]! + lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]! + titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]! + } + return arr /-- Get code point data from `UnicodeData.txt` -/ partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do @@ -370,12 +363,12 @@ structure UnicodeDataStream where default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter deriving Inhabited -private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do +def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do let c := s.code let i := s.index if c > Unicode.max then none - else if h : i < UnicodeData.data.size.toUSize then + else if h : i.toNat < UnicodeData.data.size then let d := UnicodeData.data[i] let n := d.name if c < d.code then diff --git a/UnicodeData/PropList.lean b/UnicodeData/PropList.lean index e43cd62e0..a1865d16c 100644 --- a/UnicodeData/PropList.lean +++ b/UnicodeData/PropList.lean @@ -3,9 +3,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module import UnicodeBasic.Types import UnicodeBasic.CharacterDatabase +public section + namespace Unicode /-- Structure containing supported character properties from `PropList.txt` -/ @@ -35,7 +38,7 @@ deriving Inhabited, Repr /-- Raw string form `PropList.txt` -/ protected def PropList.txt := include_str "../data/PropList.txt" -private unsafe def PropList.init : IO PropList := do +unsafe initialize PropList.data : PropList ← let stream := UCDStream.ofString PropList.txt let mut list : PropList := {} for record in stream do @@ -66,10 +69,6 @@ private unsafe def PropList.init : IO PropList := do list := {list with deprecated := list.deprecated.push val} return list -/-- Parsed data from `PropList.txt` -/ -@[init PropList.init] -protected def PropList.data : PropList := {} - -- TODO: stop reinventing the wheel! /-- Binary search -/ private partial def find (code : UInt32) (data : Array (UInt32 × Option UInt32)) (lo hi : Nat) : Nat :=