Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions UnicodeBasic.lean
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
Copyright © 2023-2025 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/
module
public import UnicodeBasic.Types
public import UnicodeBasic.TableLookup

import UnicodeBasic.Types
import UnicodeBasic.TableLookup
public section

/-!
# General API #
Expand Down
3 changes: 2 additions & 1 deletion UnicodeBasic/CharacterDatabase.lean
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Copyright © 2023 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/
module
public section

namespace Unicode

Expand Down Expand Up @@ -63,5 +65,4 @@ instance : Std.Stream UCDStream (Array String.Slice) where
next? := UCDStream.next?

end UCDStream

end Unicode
27 changes: 15 additions & 12 deletions UnicodeBasic/Hangul.lean
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,36 @@
Copyright © 2024 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/
module

public section

namespace Unicode.Hangul

private def shortNameL : Array String :=
def shortNameL : Array String :=
#["G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
"SS", "", "J", "JJ", "C", "K", "T", "P", "H"]

private def shortNameV : Array String :=
def shortNameV : Array String :=
#["A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
"I"]

private def shortNameT : Array String :=
def shortNameT : Array String :=
#["", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
"LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
"SS", "NG", "J", "C", "K", "T", "P", "H"]

private abbrev sizeL := shortNameL.size -- 19
private abbrev sizeV := shortNameV.size -- 21
private abbrev sizeT := shortNameT.size -- 28
private abbrev sizeLV := sizeL * sizeV -- 399
private abbrev sizeVT := sizeV * sizeT -- 588
private abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172
abbrev sizeL := shortNameL.size -- 19
abbrev sizeV := shortNameV.size -- 21
abbrev sizeT := shortNameT.size -- 28
abbrev sizeLV := sizeL * sizeV -- 399
abbrev sizeVT := sizeV * sizeT -- 588
abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172

private abbrev baseL : UInt32 := 0x1100
private abbrev baseV : UInt32 := 0x1161
private abbrev baseT : UInt32 := 0x11A7
abbrev baseL : UInt32 := 0x1100
abbrev baseV : UInt32 := 0x1161
abbrev baseT : UInt32 := 0x11A7

/-- Number of Hangul syllables -/
def Syllable.size := sizeLVT
Expand Down
5 changes: 3 additions & 2 deletions UnicodeBasic/TableLookup.lean
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
Copyright © 2024-2025 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

module
import UnicodeBasic.CharacterDatabase
import UnicodeBasic.Hangul
import UnicodeBasic.Types
public import UnicodeBasic.Types
public section

namespace Unicode

Expand Down
18 changes: 11 additions & 7 deletions UnicodeBasic/Types.lean
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
Copyright © 2023-2025 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

module
public section
/-- Low-level conversion from `UInt32` to `Char` (*unsafe*)

This function translates to a no-op in the compiler. However, it does not
Expand Down Expand Up @@ -204,6 +205,7 @@ deriving DecidableEq
/-- General category (GC)

Unicode property: `General_Category` -/
@[expose]
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Questionable.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Compilation failed, locally inferred compilation type differs from type that would be inferred in other modules. Some of the following definitions may need to be @[expose]d to fix this mismatch:
GC ↦ 1
This is a current compiler limitation for modules that may be lifted in the future.

def GC := UInt32 deriving DecidableEq, Inhabited

namespace GC
Expand Down Expand Up @@ -412,11 +414,12 @@ def toAbbrev! (x : GC) : String :=
| [a] => a
| _ => panic! "invalid general category"

open Std.Format Repr in instance : Repr GC where
reprPrec x := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill)
open Std.Format Repr in
def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill)
instance : Repr GC where reprPrec
Comment on lines -415 to +419
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know why myself, but if an instance is public, its members are automatically exposed. As such, they can't refer to private names (reprAux in this case).

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make reprAux public and fix later.


instance : ToString GC where
toString x := " | ".intercalate (reprAux x)
def toString (x : GC) := " | ".intercalate (reprAux x)
instance : ToString GC where toString
Comment on lines +421 to +422
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above.


def ofAbbrev? (s : String.Slice) : Option GC :=
match s.chars.take 3 |>.toList with
Expand Down Expand Up @@ -464,7 +467,7 @@ def ofAbbrev? (s : String.Slice) : Option GC :=

def ofAbbrev! (s : String.Slice) : GC :=
match ofAbbrev? s with
| some c => c
| .some c => c
| none => panic! "invalid general category"

def ofString? (s : String.Slice) : Option GC := do
Expand All @@ -475,7 +478,7 @@ def ofString? (s : String.Slice) : Option GC := do

def ofString! (s : String.Slice) : GC :=
match ofString? s with
| some c => c
| .some c => c
| none => panic! "invalid general category"

end GC
Expand Down Expand Up @@ -1012,3 +1015,4 @@ instance : Repr BidiClass where
reprPrec bc _ := s!"Unicode.BidiClass.{bc.toAbbrev}"

end Unicode
end
7 changes: 4 additions & 3 deletions UnicodeData.lean
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Copyright © 2023-2026 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

import UnicodeData.Aliases
import UnicodeData.Basic
import UnicodeData.PropList
module
public import UnicodeData.Aliases
public import UnicodeData.Basic
public import UnicodeData.PropList
5 changes: 4 additions & 1 deletion UnicodeData/Aliases.lean
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ Copyright © 2026 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

import Std.Data.HashMap
module
public import Std.Data.HashMap
import UnicodeBasic.Types
import UnicodeBasic.CharacterDatabase

public section

namespace Unicode

/-- Type for list of aliases -/
Expand Down
65 changes: 29 additions & 36 deletions UnicodeData/Basic.lean
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

module
import UnicodeBasic.CharacterDatabase
import UnicodeBasic.Hangul
import UnicodeBasic.Types
public import UnicodeBasic.Types

public section

namespace Unicode

Expand Down Expand Up @@ -143,29 +146,8 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where
protected def UnicodeData.txt := include_str "../data/UnicodeData.txt"

/-- Parse `UnicodeData.txt` -/
private unsafe def UnicodeData.init : IO (Array UnicodeData) := do
let stream := UCDStream.ofString UnicodeData.txt
let mut arr := #[]
for record in stream do
arr := arr.push {
code := ofHexString! record[0]!
name := record[1]!
gc := GC.ofAbbrev! record[2]!
cc := record[3]!.toNat!
bidi := BidiClass.ofAbbrev! record[4]!
decomp := getDecompositionMapping? record[5]!
numeric := getNumericType? record[6]! record[7]! record[8]!
bidiMirrored := record[9]! == "Y"
uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]!
lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]!
titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]!
}
return arr

where

/-- Get decomposition mapping -/
getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
unsafe initialize UnicodeData.data : Array UnicodeData ←
let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do
/-
The value of the `Decomposition_Mapping` property for a character is
provided in field 5 of `UnicodeData.txt`. This is a string-valued
Expand Down Expand Up @@ -218,8 +200,10 @@ where
some ⟨tag, cs⟩
| [] => unreachable!

/-- Get numeric type -/
getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
let getDigitUnsafe (char : Char) : Fin 10 :=
unsafeCast (char.val - '0'.val).toNat

let getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do
/-
If the character has the property value `Numeric_Type=Decimal`, then the
`Numeric_Value` of that digit is represented with an integer value
Expand Down Expand Up @@ -263,14 +247,23 @@ where
else
return .decimal <| getDigitUnsafe <| s₁.front

/-- Get decimal digit -/
@[inline]
getDigitUnsafe (char : Char) : Fin 10 :=
unsafeCast (char.val - '0'.val).toNat

/-- Parsed data from `UnicodeData.txt` -/
@[init UnicodeData.init]
protected def UnicodeData.data : Array UnicodeData := #[]
let stream := UCDStream.ofString UnicodeData.txt
let mut arr := #[]
for record in stream do
arr := arr.push {
code := ofHexString! record[0]!
name := record[1]!
gc := GC.ofAbbrev! record[2]!
cc := record[3]!.toNat!
bidi := BidiClass.ofAbbrev! record[4]!
decomp := getDecompositionMapping? record[5]!
numeric := getNumericType? record[6]! record[7]! record[8]!
bidiMirrored := record[9]! == "Y"
uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]!
lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]!
titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]!
}
return arr

/-- Get code point data from `UnicodeData.txt` -/
partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do
Expand Down Expand Up @@ -370,12 +363,12 @@ structure UnicodeDataStream where
default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter
deriving Inhabited

private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do
let c := s.code
let i := s.index
if c > Unicode.max then
none
else if h : i < UnicodeData.data.size.toUSize then
else if h : i.toNat < UnicodeData.data.size then
let d := UnicodeData.data[i]
let n := d.name
if c < d.code then
Expand Down
9 changes: 4 additions & 5 deletions UnicodeData/PropList.lean
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

module
import UnicodeBasic.Types
import UnicodeBasic.CharacterDatabase

public section

namespace Unicode

/-- Structure containing supported character properties from `PropList.txt` -/
Expand Down Expand Up @@ -35,7 +38,7 @@ deriving Inhabited, Repr
/-- Raw string form `PropList.txt` -/
protected def PropList.txt := include_str "../data/PropList.txt"

private unsafe def PropList.init : IO PropList := do
unsafe initialize PropList.data : PropList
let stream := UCDStream.ofString PropList.txt
let mut list : PropList := {}
for record in stream do
Expand Down Expand Up @@ -66,10 +69,6 @@ private unsafe def PropList.init : IO PropList := do
list := {list with deprecated := list.deprecated.push val}
return list

/-- Parsed data from `PropList.txt` -/
@[init PropList.init]
protected def PropList.data : PropList := {}

-- TODO: stop reinventing the wheel!
/-- Binary search -/
private partial def find (code : UInt32) (data : Array (UInt32 × Option UInt32)) (lo hi : Nat) : Nat :=
Expand Down