Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions UnicodeBasic.lean
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,27 @@ namespace Unicode
@[inline]
def getName (char : Char) : String := lookupName char.val

/-!
## Script ##
-/

/-- Script assigned to the character `char`.

The result is obtained by passing the character's code point value to
`lookupScript`.

Unicode property: `Script`
-/
@[inline]
def getScript (char : Char) : Script :=
  lookupScript char.val

/-- Name of the script `s` as a `String`.

Returns `none` when `lookupScriptName` has no entry for `s`, that is, when
the script code is unassigned.

Unicode property: `Script`
-/
@[inline]
def getScriptName? (s : Script) : Option String :=
  match lookupScriptName s with
  | some name => some (toString name)
  | none => none

/-!
## Bidirectional Properties ##
-/
Expand Down
21 changes: 21 additions & 0 deletions UnicodeBasic/TableLookup.lean
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ protected abbrev oMath : UInt64 := 0x800000000
@[extern "unicode_prop_lookup"]
protected opaque lookupProp (c : UInt32) : UInt64

/-- Script of the code point `c`.

Implemented by the external C symbol `unicode_script_lookup`; there is no
Lean reference implementation for this declaration. -/
@[extern "unicode_script_lookup"]
protected opaque lookupScript (c : UInt32) : Script

end CLib

/-- Binary search -/
Expand Down Expand Up @@ -386,4 +389,22 @@ where
str : String := include_str "../data/table/White_Space.txt"
table : Thunk <| Array (UInt32 × UInt32) := parsePropTable str

/-- Script of the code point `c`.

Delegates to the C library binding `CLib.lookupScript`.

Unicode property: `Script` -/
@[inline]
def lookupScript (c : UInt32) : Script :=
  CLib.lookupScript c

/-- Name of the script `s`, read from the `Script_Name.txt` table.

Returns `none` when the code of `s` is smaller than the first key of the
table, or when the entry located by `find` carries a different key.

Unicode property: `Script` -/
def lookupScriptName (s : Script) : Option String.Slice :=
  let entries := table.get
  if s.code < entries[0]!.1 then
    -- Below the first key: no entry can match.
    none
  else
    let idx := find s.code (fun i => entries[i]!.1) 0 entries.size.toUSize
    let (key, name) := entries[idx]!
    -- `find` only locates a candidate position; accept it on an exact key match.
    if s.code = key then some name else none
where
  /-- Contents of the data file, embedded at compile time. -/
  str : String := include_str "../data/table/Script_Name.txt"
  /-- Parsed table of `(script code, name)` pairs, wrapped in a `Thunk`. -/
  table : Thunk <| Array (UInt32 × String.Slice) := parseTable str fun _ n => n[0]!

end Unicode
62 changes: 62 additions & 0 deletions UnicodeBasic/Types.lean
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

import Std.Data.HashMap

/-- Low-level conversion from `UInt32` to `Char` (*unsafe*)

This function translates to a no-op in the compiler. However, it does not
Expand Down Expand Up @@ -1011,4 +1013,64 @@ def BidiClass.ofAbbrev! (abbr : String.Slice) : BidiClass :=
-- Renders a `BidiClass` as `Unicode.BidiClass.` followed by its abbreviation;
-- the precedence argument is ignored.
instance : Repr BidiClass where
  reprPrec := fun bc _ => s!"Unicode.BidiClass.{bc.toAbbrev}"

/-!
## Scripts ##
-/

/-- Check whether `c` is a valid script identifier.

The four bytes of `c`, from most to least significant, must be one ASCII
uppercase letter (`A`–`Z`) followed by three ASCII lowercase letters
(`a`–`z`). -/
@[inline]
def Script.isValid (c : UInt32) : Bool :=
  let b0 := (c >>> 24).toUInt8
  let b1 := (c >>> 16).toUInt8
  let b2 := (c >>> 8).toUInt8
  let b3 := c.toUInt8
  ('A'.toUInt8 ≤ b0 && b0 ≤ 'Z'.toUInt8)
    && ('a'.toUInt8 ≤ b1 && b1 ≤ 'z'.toUInt8)
    && ('a'.toUInt8 ≤ b2 && b2 ≤ 'z'.toUInt8)
    && ('a'.toUInt8 ≤ b3 && b3 ≤ 'z'.toUInt8)

/-- Script identifier type

A script is stored as a single `UInt32` code together with a proof that the
code passes `Script.isValid`. -/
structure Script where
/-- Packed identifier; `Script.isValid` requires its four bytes, most
significant first, to be one uppercase and three lowercase ASCII letters. -/
code : UInt32
/-- Evidence that `code` satisfies `Script.isValid`. -/
is_valid : Script.isValid code
deriving DecidableEq, Hashable

namespace Script

/-- The default `Script` is `Zzzz` (`Unknown`). -/
instance : Inhabited Script where
  default :=
    -- Pack the ASCII letters `Z`, `z`, `z`, `z`, most significant byte first.
    ⟨('Z'.val <<< 24) ||| ('z'.val <<< 16) ||| ('z'.val <<< 8) ||| 'z'.val, by decide⟩

/-- Four-letter abbreviation of a script.

The string consists of the four bytes of the script code, most significant
first, each converted with `Char.ofUInt8`. Compiled code calls the external
C symbol `unicode_script_to_abbrev` instead of this definition. -/
@[extern "unicode_script_to_abbrev"]
def toAbbrev : Script → String
  | ⟨code, _⟩ =>
    -- Character for the byte found after shifting `code` right by `shift` bits.
    let byteChar (shift : UInt32) : Char := Char.ofUInt8 (code >>> shift).toUInt8
    String.ofList [byteChar 24, byteChar 16, byteChar 8, byteChar 0]

/-- Raw conversion of an abbreviation string to a packed `UInt32` code.

Implemented by the external C symbol `unicode_script_of_abbrev`, which reads
the first four bytes of `abbr` as a big-endian value. The result is not
validated here, and the C side only asserts that at least four bytes are
present, so callers must check the length before calling. -/
@[extern "unicode_script_of_abbrev"]
private opaque ofAbbrevAux (abbr : String) : UInt32

/-- Get script from abbreviation

Returns `none` unless `abbr` is exactly four UTF-8 bytes long and the code
packed from those bytes passes `Script.isValid`. -/
def ofAbbrev? (abbr : String.Slice) : Option Script :=
  if abbr.utf8ByteSize ≠ 4 then
    -- `ofAbbrevAux` needs exactly four bytes to read.
    none
  else
    let code := ofAbbrevAux abbr.toString
    if valid : Script.isValid code then some ⟨code, valid⟩ else none

@[inline, inherit_doc ofAbbrev?]
def ofAbbrev! (abbr : String.Slice) : Script :=
  -- `Option.get!` panics when `ofAbbrev?` yields `none`.
  (ofAbbrev? abbr).get!

end Script

end Unicode
30 changes: 30 additions & 0 deletions UnicodeCLib/basic.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include <lean/lean.h>
#include <arpa/inet.h>
#include "basic.h"

/*
 * Returns 1 when `c` is a valid packed script code and 0 otherwise.
 *
 * A valid code has, from most to least significant byte, one ASCII uppercase
 * letter ('A'..'Z') followed by three ASCII lowercase letters ('a'..'z').
 */
static inline int unicode_script_is_valid(uint32_t c) {
    const int b0 = (int)((c >> 24) & 0xff);
    const int b1 = (int)((c >> 16) & 0xff);
    const int b2 = (int)((c >> 8) & 0xff);
    const int b3 = (int)(c & 0xff);

    if (b0 < 'A' || b0 > 'Z') return 0;
    if (b1 < 'a' || b1 > 'z') return 0;
    if (b2 < 'a' || b2 > 'z') return 0;
    if (b3 < 'a' || b3 > 'z') return 0;
    return 1;
}

/*
 * Packs the first four bytes of the Lean string `abbr` into a 32-bit code,
 * first byte most significant.
 *
 * The bytes are not validated here. The argument is borrowed, so its
 * reference count is left untouched.
 */
uint32_t unicode_script_of_abbrev(b_lean_obj_arg abbr) {
    /* The stored size counts the terminating NUL, so "> 4" means that at
       least four payload bytes are present. */
    assert(lean_string_size(abbr) > 4);
    /* Read byte by byte through an unsigned char pointer: this avoids
       accessing the char buffer through a uint32_t lvalue and makes the
       result independent of host byte order. */
    const uint8_t * bytes = (const uint8_t *)lean_string_cstr(abbr);
    return ((uint32_t)bytes[0] << 24)
         | ((uint32_t)bytes[1] << 16)
         | ((uint32_t)bytes[2] << 8)
         |  (uint32_t)bytes[3];
}

/*
 * Builds a new Lean string holding the four bytes of the script code `scr`,
 * most significant byte first.
 *
 * `scr` is expected to be a valid script code; this is only checked by an
 * assertion. The caller owns the returned object.
 */
lean_obj_res unicode_script_to_abbrev(uint32_t scr) {
    assert(unicode_script_is_valid(scr));
    /* Five bytes of storage (four letters plus the terminating NUL) and a
       length of four characters. */
    lean_object * abbr = lean_alloc_string(5, 5, 4);
    char * data = lean_to_string(abbr)->m_data;
    /* Store byte by byte: no uint32_t store through the char buffer and no
       dependence on host byte order. */
    data[0] = (char)((scr >> 24) & 0xff);
    data[1] = (char)((scr >> 16) & 0xff);
    data[2] = (char)((scr >> 8) & 0xff);
    data[3] = (char)(scr & 0xff);
    data[4] = 0;
    return abbr;
}
6 changes: 6 additions & 0 deletions UnicodeCLib/basic.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ LEAN_EXPORT uint64_t unicode_prop_lookup(uint32_t c);

LEAN_EXPORT uint64_t unicode_case_lookup(uint32_t c);

/* Script code for the code point `c`; bound from Lean through
   @[extern "unicode_script_lookup"]. */
LEAN_EXPORT uint32_t unicode_script_lookup(uint32_t c);

/* Packs the first four bytes of the borrowed Lean string `abbr` into a
   32-bit code, first byte most significant. The bytes are not validated. */
LEAN_EXPORT uint32_t unicode_script_of_abbrev(b_lean_obj_arg abbr);

/* Returns a new Lean string made of the four bytes of `scr`, most
   significant byte first. */
LEAN_EXPORT lean_obj_res unicode_script_to_abbrev(uint32_t scr);

#ifdef __cplusplus
}
#endif
Loading
Loading