diff --git a/.github/workflows/check_unicode.yml b/.github/workflows/check_unicode.yml new file mode 100644 index 00000000000..c22a4e3b29d --- /dev/null +++ b/.github/workflows/check_unicode.yml @@ -0,0 +1,23 @@ +name: Unicode listing up to date +on: + workflow_dispatch: + schedule: + - cron: '42 3 * * *' + + check-unicode: + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - name: Check unicode file state + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + curl --proto '=https' --tlsv1.2 -fsSL -o /tmp/UnicodeData.txt https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + contrib/gen_unicode_general_category.py /tmp/UnicodeData.txt -o /tmp/unicode.rs + if ! diff -u lightning-types/src/unicode.rs /tmp/unicode.rs; then + TITLE="Unicode listing out of date: ${{ github.workflow }}" + RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + BODY="The unicode character listing is out of date, see $RUN_URL" + gh issue create --title "$TITLE" --body "$BODY" + fi diff --git a/contrib/gen_unicode_general_category.py b/contrib/gen_unicode_general_category.py new file mode 100755 index 00000000000..4871e967b55 --- /dev/null +++ b/contrib/gen_unicode_general_category.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +# This file is Copyright its original authors, visible in version control +# history. +# +# This file is licensed under the Apache License, Version 2.0 or the MIT license +# , at your option. +# You may not use this file except in accordance with one or both of these +# licenses. + +"""Generate Unicode general-category predicates from `UnicodeData.txt`. + +Emits two `pub(crate)` functions taking a `char`, split into two disjoint +buckets across the Unicode top-level `C` ("Other") category so callers can +compose them: + + is_unicode_general_category_other — Cc / Cf / Cs / Co (assigned) + is_unicode_general_category_unassigned — Cn (plus codepoints above + U+10FFFF, which aren't + valid codepoints at all) + +`UnicodeData.txt` is the canonical machine-readable listing of every assigned +codepoint in the Unicode Character Database. Each line is `;`-separated; field +0 is the codepoint (hex), field 1 is the name, and field 2 is the two-letter +general category (e.g. `Lu`, `Cf`, `Mn`). Codepoints absent from the file have +category `Cn` (Unassigned) by convention. + +Two encoding details to preserve: + * Large blocks of contiguous same-category codepoints are written as two + consecutive entries whose names end in `, First>` and `, Last>`. Every + codepoint between First and Last (inclusive) shares the listed category. + * The codepoint range is U+0000..=U+10FFFF. + +Each `matches!` arm in the assigned-Other table carries an end-of-line comment +derived from the `UnicodeData.txt` name field — typically the longest common +word prefix or suffix across the names in the range, falling back to the set +of categories when the names share nothing meaningful. The unassigned table +omits per-arm comments since every range there has the same meaning by +construction. + +Usage: + contrib/gen_unicode_general_category.py UnicodeData.txt > out.rs +""" + +import argparse +import sys +from pathlib import Path + +MAX_CODEPOINT = 0x10FFFF + +LICENSE_HEADER = """\ +// This file is Copyright its original authors, visible in version control +// history. +// +// This file is licensed under the Apache License, Version 2.0 or the MIT license +// , at your option. +// You may not use this file except in accordance with one or both of these +// licenses. +""" + +GENERATED_NOTICE = """\ +// Auto-generated from the Unicode Character Database (UnicodeData.txt) by +// contrib/gen_unicode_general_category.py. Do not edit by hand; rerun the +// generator with an updated UnicodeData.txt to refresh the table. +""" + + +def _normalize_name(name): + """Strip the `<...>` wrapping and `, First` / `, Last` range markers so + that, e.g., `` becomes + `Non Private Use High Surrogate` and `` becomes `control`. + """ + if name.startswith("<") and name.endswith(">"): + inner = name[1:-1] + for suffix in (", First", ", Last"): + if inner.endswith(suffix): + inner = inner[: -len(suffix)] + return inner + return name + + +def parse_categories(path): + """Return `(cats, names)` mapping every codepoint listed in `path` to its + general category and to its (normalised) name. Codepoints absent from the + returned dicts have category `Cn` (Unassigned) and no name. + """ + cats = {} + names = {} + pending_first = None # (first_cp, first_cat, normalised_name) once a range opens. + with path.open() as f: + for lineno, raw in enumerate(f, 1): + line = raw.rstrip("\n") + if not line: + continue + fields = line.split(";") + if len(fields) < 3: + raise ValueError(f"{path}:{lineno}: expected at least 3 fields, got {len(fields)}") + cp = int(fields[0], 16) + name = fields[1] + cat = fields[2] + if pending_first is not None: + first_cp, first_cat, first_name = pending_first + if not name.endswith(", Last>"): + raise ValueError( + f"{path}:{lineno}: expected `, Last>` to close range " + f"opened at U+{first_cp:04X}, got name {name!r}" + ) + if cat != first_cat: + raise ValueError( + f"{path}:{lineno}: range U+{first_cp:04X}..=U+{cp:04X} " + f"has mismatched categories {first_cat!r} / {cat!r}" + ) + for x in range(first_cp, cp + 1): + cats[x] = cat + names[x] = first_name + pending_first = None + elif name.endswith(", First>"): + pending_first = (cp, cat, _normalize_name(name)) + else: + cats[cp] = cat + names[cp] = _normalize_name(name) + if pending_first is not None: + raise ValueError(f"{path}: dangling `, First>` entry at U+{pending_first[0]:04X}") + return cats, names + + +ASSIGNED_OTHER_CATS = frozenset({"Cc", "Cf", "Cs", "Co"}) + + +def coalesce_ranges(cats, names, target_cats, *, label): + """Walk U+0000..=U+10FFFF and return a list of `(start, end, label)` for + every contiguous run of codepoints whose general category is in + `target_cats`. Codepoints absent from `cats` are treated as `Cn`. + + If `label` is `True`, attach a comment summarising the codepoint names in + each range; otherwise every range gets an empty label. + """ + ranges = [] + start = None + for cp in range(MAX_CODEPOINT + 1): + in_target = cats.get(cp, "Cn") in target_cats + if in_target and start is None: + start = cp + elif not in_target and start is not None: + ranges.append((start, cp - 1)) + start = None + if start is not None: + ranges.append((start, MAX_CODEPOINT)) + + if not label: + return [(s, e, "") for s, e in ranges] + + labelled = [] + for s, e in ranges: + range_names = [] + range_cats = set() + for cp in range(s, e + 1): + range_cats.add(cats.get(cp, "Cn")) + n = names.get(cp) + if n is not None: + range_names.append(n) + labelled.append((s, e, _make_label(range_names, range_cats))) + return labelled + + +def _common_word_run(names, *, from_end): + """Return the longest sequence of words shared by every name, taken from + either the start (`from_end=False`) or the end (`from_end=True`) of each + name's whitespace-split tokens. + """ + if not names: + return "" + tokenised = [n.split() for n in names] + if from_end: + tokenised = [list(reversed(t)) for t in tokenised] + limit = min(len(t) for t in tokenised) + common = [] + for i in range(limit): + token = tokenised[0][i] + if all(t[i] == token for t in tokenised): + common.append(token) + else: + break + if from_end: + common.reverse() + return " ".join(common) + + +def _make_label(names, cats_in_range): + """Build a short human-readable label for a coalesced range. Applied to + the assigned-Other buckets only; each range there is `Cc`, `Cf`, `Cs`, + `Co`, or some contiguous union thereof. + + Rules, in order: + 1. All names identical → that name (e.g. `control`). + 2. Common leading or trailing words → the longer of the two. + 3. Otherwise, list the categories present (e.g. `Co / Cs`). + """ + unique = list(dict.fromkeys(names)) + if len(unique) == 1: + return unique[0] + + prefix = _common_word_run(names, from_end=False) + suffix = _common_word_run(names, from_end=True) + # Pick whichever is more informative; when both are non-empty, prefer the + # longer one. A multi-word prefix beats a single-word suffix. + label = prefix if len(prefix) >= len(suffix) else suffix + if label: + return label + return " / ".join(sorted(cats_in_range)) + + +def fmt_codepoint(cp): + # `UnicodeData.txt` uses 4-digit hex for the BMP and wider for higher + # planes; mirror that so the output stays readable next to the source data. + return f"0x{cp:04X}" if cp <= 0xFFFF else f"0x{cp:X}" + + +def _pattern(start, end): + if start == end: + return fmt_codepoint(start) + return f"{fmt_codepoint(start)}..={fmt_codepoint(end)}" + + +def _emit_matches_body(lines, arms): + """Append a `matches!(c as u32, ...)` body to `lines`, with one + `(pattern, label)` tuple per arm. The first arm sits at the `matches!` + argument indent and continuation `| ...` arms indent one level deeper, + matching the rustfmt convention used elsewhere in the tree. + """ + lines.append("\tmatches!(") + lines.append("\t\tc as u32,") + for i, (pattern, label) in enumerate(arms): + prefix = "\t\t" if i == 0 else "\t\t\t| " + comment = f" // {label}" if label else "" + lines.append(f"{prefix}{pattern}{comment}") + lines.append("\t)") + + +def render_rust(other_ranges, unassigned_ranges): + """Render the final Rust source defining both `char`-taking predicates. + + `other_ranges` and `unassigned_ranges` are lists of `(start, end, label)`. + The unassigned function additionally gets a synthetic final arm catching + `u32` values above U+10FFFF — these aren't valid Unicode codepoints, so + by definition they have no general category and the unassigned bucket is + the closest match. + """ + lines = [LICENSE_HEADER, GENERATED_NOTICE] + + lines.append("/// Returns `true` if `c` is in Unicode general category `Cc` (Control), `Cf`") + lines.append("/// (Format), `Cs` (Surrogate), or `Co` (Private Use) — the assigned codepoints") + lines.append("/// in the top-level `C` (\"Other\") category. The `Cs` portion of the table is") + lines.append("/// unreachable for `char` input (a `char` cannot hold a surrogate) but is kept") + lines.append("/// so the table mirrors the source UCD data verbatim. The disjoint `Cn`") + lines.append("/// (Unassigned) bucket is `is_unicode_general_category_unassigned`.") + lines.append("#[allow(dead_code)]") + lines.append("pub(crate) fn is_unicode_general_category_other(c: char) -> bool {") + other_arms = [(_pattern(s, e), label) for s, e, label in other_ranges] + _emit_matches_body(lines, other_arms) + lines.append("}") + lines.append("") + + lines.append("/// Returns `true` if `c` is in Unicode general category `Cn` (Unassigned), or") + lines.append("/// strictly above U+10FFFF. The trailing `0x110000..=u32::MAX` arm is") + lines.append("/// unreachable for `char` input (a `char` is bounded to U+10FFFF) but is kept") + lines.append("/// for defensive coverage of the underlying `u32`. The disjoint Cc / Cf / Cs /") + lines.append("/// Co bucket is `is_unicode_general_category_other`.") + lines.append("#[allow(dead_code)]") + lines.append("pub(crate) fn is_unicode_general_category_unassigned(c: char) -> bool {") + unassigned_arms = [(_pattern(s, e), label) for s, e, label in unassigned_ranges] + unassigned_arms.append(("0x110000..=u32::MAX", "above U+10FFFF — unreachable for `char`")) + _emit_matches_body(lines, unassigned_arms) + lines.append("}") + lines.append("") + + return "\n".join(lines) + + +def main(argv): + ap = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + ap.add_argument("unicode_data", type=Path, help="Path to UnicodeData.txt") + ap.add_argument( + "-o", "--output", type=Path, default=None, + help="Output Rust file (default: stdout)", + ) + args = ap.parse_args(argv) + + cats, names = parse_categories(args.unicode_data) + other = coalesce_ranges(cats, names, ASSIGNED_OTHER_CATS, label=True) + unassigned = coalesce_ranges(cats, names, frozenset({"Cn"}), label=False) + rust = render_rust(other, unassigned) + + if args.output is None: + sys.stdout.write(rust) + else: + args.output.write_text(rust) + print( + f"Wrote {args.output} " + f"({len(other)} assigned-Other ranges, " + f"{len(unassigned)} unassigned ranges).", + file=sys.stderr, + ) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/lightning-types/src/lib.rs b/lightning-types/src/lib.rs index 7f72d6d2671..6a526adaed2 100644 --- a/lightning-types/src/lib.rs +++ b/lightning-types/src/lib.rs @@ -27,3 +27,4 @@ pub mod features; pub mod payment; pub mod routing; pub mod string; +mod unicode; diff --git a/lightning-types/src/string.rs b/lightning-types/src/string.rs index e45c17d8586..a21cad411be 100644 --- a/lightning-types/src/string.rs +++ b/lightning-types/src/string.rs @@ -12,6 +12,8 @@ use alloc::string::String; use core::fmt; +use crate::unicode::*; + /// Struct to `Display` fields in a safe way using `PrintableString` #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] pub struct UntrustedString(pub String); @@ -31,7 +33,9 @@ impl<'a> fmt::Display for PrintableString<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { use core::fmt::Write; for c in self.0.chars() { - let c = if c.is_control() || is_format_char(c) { + let is_other = is_unicode_general_category_other(c); + let is_unassigned = is_unicode_general_category_unassigned(c); + let c = if c.is_control() || is_other || is_unassigned { core::char::REPLACEMENT_CHARACTER } else { c @@ -43,39 +47,6 @@ impl<'a> fmt::Display for PrintableString<'a> { } } -// Codepoints in Unicode general category `Cf` (Format), per Unicode standard. These are not -// matched by `char::is_control` (which only covers `Cc`), but include the bidirectional override / -// isolate controls (e.g. U+202E RLO) and zero-width characters behind the "Trojan Source" attack -// family (CVE-2021-42574), where an attacker-supplied string renders to a human reader as -// something other than its byte content. Strip them alongside `Cc` characters when sanitising -// untrusted input. -fn is_format_char(c: char) -> bool { - matches!( - c as u32, - 0x00AD - | 0x0600..=0x0605 - | 0x061C - | 0x06DD - | 0x070F - | 0x0890..=0x0891 - | 0x08E2 - | 0x180E - | 0x200B..=0x200F - | 0x202A..=0x202E - | 0x2060..=0x2064 - | 0x2066..=0x206F - | 0xFEFF - | 0xFFF9..=0xFFFB - | 0x110BD - | 0x110CD - | 0x13430..=0x1343F - | 0x1BCA0..=0x1BCA3 - | 0x1D173..=0x1D17A - | 0xE0001 - | 0xE0020..=0xE007F - ) -} - #[cfg(test)] mod tests { use super::PrintableString; diff --git a/lightning-types/src/unicode.rs b/lightning-types/src/unicode.rs new file mode 100644 index 00000000000..22b21969365 --- /dev/null +++ b/lightning-types/src/unicode.rs @@ -0,0 +1,799 @@ +// This file is Copyright its original authors, visible in version control +// history. +// +// This file is licensed under the Apache License, Version 2.0 or the MIT license +// , at your option. +// You may not use this file except in accordance with one or both of these +// licenses. + +// Auto-generated from the Unicode Character Database (UnicodeData.txt) by +// contrib/gen_unicode_general_category.py. Do not edit by hand; rerun the +// generator with an updated UnicodeData.txt to refresh the table. + +/// Returns `true` if `c` is in Unicode general category `Cc` (Control), `Cf` +/// (Format), `Cs` (Surrogate), or `Co` (Private Use) — the assigned codepoints +/// in the top-level `C` ("Other") category. The `Cs` portion of the table is +/// unreachable for `char` input (a `char` cannot hold a surrogate) but is kept +/// so the table mirrors the source UCD data verbatim. The disjoint `Cn` +/// (Unassigned) bucket is `is_unicode_general_category_unassigned`. +#[allow(dead_code)] +pub(crate) fn is_unicode_general_category_other(c: char) -> bool { + matches!( + c as u32, + 0x0000..=0x001F // control + | 0x007F..=0x009F // control + | 0x00AD // SOFT HYPHEN + | 0x0600..=0x0605 // ARABIC + | 0x061C // ARABIC LETTER MARK + | 0x06DD // ARABIC END OF AYAH + | 0x070F // SYRIAC ABBREVIATION MARK + | 0x0890..=0x0891 // MARK ABOVE + | 0x08E2 // ARABIC DISPUTED END OF AYAH + | 0x180E // MONGOLIAN VOWEL SEPARATOR + | 0x200B..=0x200F // Cf + | 0x202A..=0x202E // Cf + | 0x2060..=0x2064 // Cf + | 0x2066..=0x206F // Cf + | 0xD800..=0xF8FF // Co / Cs + | 0xFEFF // ZERO WIDTH NO-BREAK SPACE + | 0xFFF9..=0xFFFB // INTERLINEAR ANNOTATION + | 0x110BD // KAITHI NUMBER SIGN + | 0x110CD // KAITHI NUMBER SIGN ABOVE + | 0x13430..=0x1343F // EGYPTIAN HIEROGLYPH + | 0x1BCA0..=0x1BCA3 // SHORTHAND FORMAT + | 0x1D173..=0x1D17A // MUSICAL SYMBOL + | 0xE0001 // LANGUAGE TAG + | 0xE0020..=0xE007F // Cf + | 0xF0000..=0xFFFFD // Plane 15 Private Use + | 0x100000..=0x10FFFD // Plane 16 Private Use + ) +} + +/// Returns `true` if `c` is in Unicode general category `Cn` (Unassigned), or +/// strictly above U+10FFFF. The trailing `0x110000..=u32::MAX` arm is +/// unreachable for `char` input (a `char` is bounded to U+10FFFF) but is kept +/// for defensive coverage of the underlying `u32`. The disjoint Cc / Cf / Cs / +/// Co bucket is `is_unicode_general_category_other`. +#[allow(dead_code)] +pub(crate) fn is_unicode_general_category_unassigned(c: char) -> bool { + matches!( + c as u32, + 0x0378..=0x0379 + | 0x0380..=0x0383 + | 0x038B + | 0x038D + | 0x03A2 + | 0x0530 + | 0x0557..=0x0558 + | 0x058B..=0x058C + | 0x0590 + | 0x05C8..=0x05CF + | 0x05EB..=0x05EE + | 0x05F5..=0x05FF + | 0x070E + | 0x074B..=0x074C + | 0x07B2..=0x07BF + | 0x07FB..=0x07FC + | 0x082E..=0x082F + | 0x083F + | 0x085C..=0x085D + | 0x085F + | 0x086B..=0x086F + | 0x0892..=0x0896 + | 0x0984 + | 0x098D..=0x098E + | 0x0991..=0x0992 + | 0x09A9 + | 0x09B1 + | 0x09B3..=0x09B5 + | 0x09BA..=0x09BB + | 0x09C5..=0x09C6 + | 0x09C9..=0x09CA + | 0x09CF..=0x09D6 + | 0x09D8..=0x09DB + | 0x09DE + | 0x09E4..=0x09E5 + | 0x09FF..=0x0A00 + | 0x0A04 + | 0x0A0B..=0x0A0E + | 0x0A11..=0x0A12 + | 0x0A29 + | 0x0A31 + | 0x0A34 + | 0x0A37 + | 0x0A3A..=0x0A3B + | 0x0A3D + | 0x0A43..=0x0A46 + | 0x0A49..=0x0A4A + | 0x0A4E..=0x0A50 + | 0x0A52..=0x0A58 + | 0x0A5D + | 0x0A5F..=0x0A65 + | 0x0A77..=0x0A80 + | 0x0A84 + | 0x0A8E + | 0x0A92 + | 0x0AA9 + | 0x0AB1 + | 0x0AB4 + | 0x0ABA..=0x0ABB + | 0x0AC6 + | 0x0ACA + | 0x0ACE..=0x0ACF + | 0x0AD1..=0x0ADF + | 0x0AE4..=0x0AE5 + | 0x0AF2..=0x0AF8 + | 0x0B00 + | 0x0B04 + | 0x0B0D..=0x0B0E + | 0x0B11..=0x0B12 + | 0x0B29 + | 0x0B31 + | 0x0B34 + | 0x0B3A..=0x0B3B + | 0x0B45..=0x0B46 + | 0x0B49..=0x0B4A + | 0x0B4E..=0x0B54 + | 0x0B58..=0x0B5B + | 0x0B5E + | 0x0B64..=0x0B65 + | 0x0B78..=0x0B81 + | 0x0B84 + | 0x0B8B..=0x0B8D + | 0x0B91 + | 0x0B96..=0x0B98 + | 0x0B9B + | 0x0B9D + | 0x0BA0..=0x0BA2 + | 0x0BA5..=0x0BA7 + | 0x0BAB..=0x0BAD + | 0x0BBA..=0x0BBD + | 0x0BC3..=0x0BC5 + | 0x0BC9 + | 0x0BCE..=0x0BCF + | 0x0BD1..=0x0BD6 + | 0x0BD8..=0x0BE5 + | 0x0BFB..=0x0BFF + | 0x0C0D + | 0x0C11 + | 0x0C29 + | 0x0C3A..=0x0C3B + | 0x0C45 + | 0x0C49 + | 0x0C4E..=0x0C54 + | 0x0C57 + | 0x0C5B + | 0x0C5E..=0x0C5F + | 0x0C64..=0x0C65 + | 0x0C70..=0x0C76 + | 0x0C8D + | 0x0C91 + | 0x0CA9 + | 0x0CB4 + | 0x0CBA..=0x0CBB + | 0x0CC5 + | 0x0CC9 + | 0x0CCE..=0x0CD4 + | 0x0CD7..=0x0CDB + | 0x0CDF + | 0x0CE4..=0x0CE5 + | 0x0CF0 + | 0x0CF4..=0x0CFF + | 0x0D0D + | 0x0D11 + | 0x0D45 + | 0x0D49 + | 0x0D50..=0x0D53 + | 0x0D64..=0x0D65 + | 0x0D80 + | 0x0D84 + | 0x0D97..=0x0D99 + | 0x0DB2 + | 0x0DBC + | 0x0DBE..=0x0DBF + | 0x0DC7..=0x0DC9 + | 0x0DCB..=0x0DCE + | 0x0DD5 + | 0x0DD7 + | 0x0DE0..=0x0DE5 + | 0x0DF0..=0x0DF1 + | 0x0DF5..=0x0E00 + | 0x0E3B..=0x0E3E + | 0x0E5C..=0x0E80 + | 0x0E83 + | 0x0E85 + | 0x0E8B + | 0x0EA4 + | 0x0EA6 + | 0x0EBE..=0x0EBF + | 0x0EC5 + | 0x0EC7 + | 0x0ECF + | 0x0EDA..=0x0EDB + | 0x0EE0..=0x0EFF + | 0x0F48 + | 0x0F6D..=0x0F70 + | 0x0F98 + | 0x0FBD + | 0x0FCD + | 0x0FDB..=0x0FFF + | 0x10C6 + | 0x10C8..=0x10CC + | 0x10CE..=0x10CF + | 0x1249 + | 0x124E..=0x124F + | 0x1257 + | 0x1259 + | 0x125E..=0x125F + | 0x1289 + | 0x128E..=0x128F + | 0x12B1 + | 0x12B6..=0x12B7 + | 0x12BF + | 0x12C1 + | 0x12C6..=0x12C7 + | 0x12D7 + | 0x1311 + | 0x1316..=0x1317 + | 0x135B..=0x135C + | 0x137D..=0x137F + | 0x139A..=0x139F + | 0x13F6..=0x13F7 + | 0x13FE..=0x13FF + | 0x169D..=0x169F + | 0x16F9..=0x16FF + | 0x1716..=0x171E + | 0x1737..=0x173F + | 0x1754..=0x175F + | 0x176D + | 0x1771 + | 0x1774..=0x177F + | 0x17DE..=0x17DF + | 0x17EA..=0x17EF + | 0x17FA..=0x17FF + | 0x181A..=0x181F + | 0x1879..=0x187F + | 0x18AB..=0x18AF + | 0x18F6..=0x18FF + | 0x191F + | 0x192C..=0x192F + | 0x193C..=0x193F + | 0x1941..=0x1943 + | 0x196E..=0x196F + | 0x1975..=0x197F + | 0x19AC..=0x19AF + | 0x19CA..=0x19CF + | 0x19DB..=0x19DD + | 0x1A1C..=0x1A1D + | 0x1A5F + | 0x1A7D..=0x1A7E + | 0x1A8A..=0x1A8F + | 0x1A9A..=0x1A9F + | 0x1AAE..=0x1AAF + | 0x1ADE..=0x1ADF + | 0x1AEC..=0x1AFF + | 0x1B4D + | 0x1BF4..=0x1BFB + | 0x1C38..=0x1C3A + | 0x1C4A..=0x1C4C + | 0x1C8B..=0x1C8F + | 0x1CBB..=0x1CBC + | 0x1CC8..=0x1CCF + | 0x1CFB..=0x1CFF + | 0x1F16..=0x1F17 + | 0x1F1E..=0x1F1F + | 0x1F46..=0x1F47 + | 0x1F4E..=0x1F4F + | 0x1F58 + | 0x1F5A + | 0x1F5C + | 0x1F5E + | 0x1F7E..=0x1F7F + | 0x1FB5 + | 0x1FC5 + | 0x1FD4..=0x1FD5 + | 0x1FDC + | 0x1FF0..=0x1FF1 + | 0x1FF5 + | 0x1FFF + | 0x2065 + | 0x2072..=0x2073 + | 0x208F + | 0x209D..=0x209F + | 0x20C2..=0x20CF + | 0x20F1..=0x20FF + | 0x218C..=0x218F + | 0x242A..=0x243F + | 0x244B..=0x245F + | 0x2B74..=0x2B75 + | 0x2CF4..=0x2CF8 + | 0x2D26 + | 0x2D28..=0x2D2C + | 0x2D2E..=0x2D2F + | 0x2D68..=0x2D6E + | 0x2D71..=0x2D7E + | 0x2D97..=0x2D9F + | 0x2DA7 + | 0x2DAF + | 0x2DB7 + | 0x2DBF + | 0x2DC7 + | 0x2DCF + | 0x2DD7 + | 0x2DDF + | 0x2E5E..=0x2E7F + | 0x2E9A + | 0x2EF4..=0x2EFF + | 0x2FD6..=0x2FEF + | 0x3040 + | 0x3097..=0x3098 + | 0x3100..=0x3104 + | 0x3130 + | 0x318F + | 0x31E6..=0x31EE + | 0x321F + | 0xA48D..=0xA48F + | 0xA4C7..=0xA4CF + | 0xA62C..=0xA63F + | 0xA6F8..=0xA6FF + | 0xA7DD..=0xA7F0 + | 0xA82D..=0xA82F + | 0xA83A..=0xA83F + | 0xA878..=0xA87F + | 0xA8C6..=0xA8CD + | 0xA8DA..=0xA8DF + | 0xA954..=0xA95E + | 0xA97D..=0xA97F + | 0xA9CE + | 0xA9DA..=0xA9DD + | 0xA9FF + | 0xAA37..=0xAA3F + | 0xAA4E..=0xAA4F + | 0xAA5A..=0xAA5B + | 0xAAC3..=0xAADA + | 0xAAF7..=0xAB00 + | 0xAB07..=0xAB08 + | 0xAB0F..=0xAB10 + | 0xAB17..=0xAB1F + | 0xAB27 + | 0xAB2F + | 0xAB6C..=0xAB6F + | 0xABEE..=0xABEF + | 0xABFA..=0xABFF + | 0xD7A4..=0xD7AF + | 0xD7C7..=0xD7CA + | 0xD7FC..=0xD7FF + | 0xFA6E..=0xFA6F + | 0xFADA..=0xFAFF + | 0xFB07..=0xFB12 + | 0xFB18..=0xFB1C + | 0xFB37 + | 0xFB3D + | 0xFB3F + | 0xFB42 + | 0xFB45 + | 0xFDD0..=0xFDEF + | 0xFE1A..=0xFE1F + | 0xFE53 + | 0xFE67 + | 0xFE6C..=0xFE6F + | 0xFE75 + | 0xFEFD..=0xFEFE + | 0xFF00 + | 0xFFBF..=0xFFC1 + | 0xFFC8..=0xFFC9 + | 0xFFD0..=0xFFD1 + | 0xFFD8..=0xFFD9 + | 0xFFDD..=0xFFDF + | 0xFFE7 + | 0xFFEF..=0xFFF8 + | 0xFFFE..=0xFFFF + | 0x1000C + | 0x10027 + | 0x1003B + | 0x1003E + | 0x1004E..=0x1004F + | 0x1005E..=0x1007F + | 0x100FB..=0x100FF + | 0x10103..=0x10106 + | 0x10134..=0x10136 + | 0x1018F + | 0x1019D..=0x1019F + | 0x101A1..=0x101CF + | 0x101FE..=0x1027F + | 0x1029D..=0x1029F + | 0x102D1..=0x102DF + | 0x102FC..=0x102FF + | 0x10324..=0x1032C + | 0x1034B..=0x1034F + | 0x1037B..=0x1037F + | 0x1039E + | 0x103C4..=0x103C7 + | 0x103D6..=0x103FF + | 0x1049E..=0x1049F + | 0x104AA..=0x104AF + | 0x104D4..=0x104D7 + | 0x104FC..=0x104FF + | 0x10528..=0x1052F + | 0x10564..=0x1056E + | 0x1057B + | 0x1058B + | 0x10593 + | 0x10596 + | 0x105A2 + | 0x105B2 + | 0x105BA + | 0x105BD..=0x105BF + | 0x105F4..=0x105FF + | 0x10737..=0x1073F + | 0x10756..=0x1075F + | 0x10768..=0x1077F + | 0x10786 + | 0x107B1 + | 0x107BB..=0x107FF + | 0x10806..=0x10807 + | 0x10809 + | 0x10836 + | 0x10839..=0x1083B + | 0x1083D..=0x1083E + | 0x10856 + | 0x1089F..=0x108A6 + | 0x108B0..=0x108DF + | 0x108F3 + | 0x108F6..=0x108FA + | 0x1091C..=0x1091E + | 0x1093A..=0x1093E + | 0x1095A..=0x1097F + | 0x109B8..=0x109BB + | 0x109D0..=0x109D1 + | 0x10A04 + | 0x10A07..=0x10A0B + | 0x10A14 + | 0x10A18 + | 0x10A36..=0x10A37 + | 0x10A3B..=0x10A3E + | 0x10A49..=0x10A4F + | 0x10A59..=0x10A5F + | 0x10AA0..=0x10ABF + | 0x10AE7..=0x10AEA + | 0x10AF7..=0x10AFF + | 0x10B36..=0x10B38 + | 0x10B56..=0x10B57 + | 0x10B73..=0x10B77 + | 0x10B92..=0x10B98 + | 0x10B9D..=0x10BA8 + | 0x10BB0..=0x10BFF + | 0x10C49..=0x10C7F + | 0x10CB3..=0x10CBF + | 0x10CF3..=0x10CF9 + | 0x10D28..=0x10D2F + | 0x10D3A..=0x10D3F + | 0x10D66..=0x10D68 + | 0x10D86..=0x10D8D + | 0x10D90..=0x10E5F + | 0x10E7F + | 0x10EAA + | 0x10EAE..=0x10EAF + | 0x10EB2..=0x10EC1 + | 0x10EC8..=0x10ECF + | 0x10ED9..=0x10EF9 + | 0x10F28..=0x10F2F + | 0x10F5A..=0x10F6F + | 0x10F8A..=0x10FAF + | 0x10FCC..=0x10FDF + | 0x10FF7..=0x10FFF + | 0x1104E..=0x11051 + | 0x11076..=0x1107E + | 0x110C3..=0x110CC + | 0x110CE..=0x110CF + | 0x110E9..=0x110EF + | 0x110FA..=0x110FF + | 0x11135 + | 0x11148..=0x1114F + | 0x11177..=0x1117F + | 0x111E0 + | 0x111F5..=0x111FF + | 0x11212 + | 0x11242..=0x1127F + | 0x11287 + | 0x11289 + | 0x1128E + | 0x1129E + | 0x112AA..=0x112AF + | 0x112EB..=0x112EF + | 0x112FA..=0x112FF + | 0x11304 + | 0x1130D..=0x1130E + | 0x11311..=0x11312 + | 0x11329 + | 0x11331 + | 0x11334 + | 0x1133A + | 0x11345..=0x11346 + | 0x11349..=0x1134A + | 0x1134E..=0x1134F + | 0x11351..=0x11356 + | 0x11358..=0x1135C + | 0x11364..=0x11365 + | 0x1136D..=0x1136F + | 0x11375..=0x1137F + | 0x1138A + | 0x1138C..=0x1138D + | 0x1138F + | 0x113B6 + | 0x113C1 + | 0x113C3..=0x113C4 + | 0x113C6 + | 0x113CB + | 0x113D6 + | 0x113D9..=0x113E0 + | 0x113E3..=0x113FF + | 0x1145C + | 0x11462..=0x1147F + | 0x114C8..=0x114CF + | 0x114DA..=0x1157F + | 0x115B6..=0x115B7 + | 0x115DE..=0x115FF + | 0x11645..=0x1164F + | 0x1165A..=0x1165F + | 0x1166D..=0x1167F + | 0x116BA..=0x116BF + | 0x116CA..=0x116CF + | 0x116E4..=0x116FF + | 0x1171B..=0x1171C + | 0x1172C..=0x1172F + | 0x11747..=0x117FF + | 0x1183C..=0x1189F + | 0x118F3..=0x118FE + | 0x11907..=0x11908 + | 0x1190A..=0x1190B + | 0x11914 + | 0x11917 + | 0x11936 + | 0x11939..=0x1193A + | 0x11947..=0x1194F + | 0x1195A..=0x1199F + | 0x119A8..=0x119A9 + | 0x119D8..=0x119D9 + | 0x119E5..=0x119FF + | 0x11A48..=0x11A4F + | 0x11AA3..=0x11AAF + | 0x11AF9..=0x11AFF + | 0x11B0A..=0x11B5F + | 0x11B68..=0x11BBF + | 0x11BE2..=0x11BEF + | 0x11BFA..=0x11BFF + | 0x11C09 + | 0x11C37 + | 0x11C46..=0x11C4F + | 0x11C6D..=0x11C6F + | 0x11C90..=0x11C91 + | 0x11CA8 + | 0x11CB7..=0x11CFF + | 0x11D07 + | 0x11D0A + | 0x11D37..=0x11D39 + | 0x11D3B + | 0x11D3E + | 0x11D48..=0x11D4F + | 0x11D5A..=0x11D5F + | 0x11D66 + | 0x11D69 + | 0x11D8F + | 0x11D92 + | 0x11D99..=0x11D9F + | 0x11DAA..=0x11DAF + | 0x11DDC..=0x11DDF + | 0x11DEA..=0x11EDF + | 0x11EF9..=0x11EFF + | 0x11F11 + | 0x11F3B..=0x11F3D + | 0x11F5B..=0x11FAF + | 0x11FB1..=0x11FBF + | 0x11FF2..=0x11FFE + | 0x1239A..=0x123FF + | 0x1246F + | 0x12475..=0x1247F + | 0x12544..=0x12F8F + | 0x12FF3..=0x12FFF + | 0x13456..=0x1345F + | 0x143FB..=0x143FF + | 0x14647..=0x160FF + | 0x1613A..=0x167FF + | 0x16A39..=0x16A3F + | 0x16A5F + | 0x16A6A..=0x16A6D + | 0x16ABF + | 0x16ACA..=0x16ACF + | 0x16AEE..=0x16AEF + | 0x16AF6..=0x16AFF + | 0x16B46..=0x16B4F + | 0x16B5A + | 0x16B62 + | 0x16B78..=0x16B7C + | 0x16B90..=0x16D3F + | 0x16D7A..=0x16E3F + | 0x16E9B..=0x16E9F + | 0x16EB9..=0x16EBA + | 0x16ED4..=0x16EFF + | 0x16F4B..=0x16F4E + | 0x16F88..=0x16F8E + | 0x16FA0..=0x16FDF + | 0x16FE5..=0x16FEF + | 0x16FF7..=0x16FFF + | 0x18CD6..=0x18CFE + | 0x18D1F..=0x18D7F + | 0x18DF3..=0x1AFEF + | 0x1AFF4 + | 0x1AFFC + | 0x1AFFF + | 0x1B123..=0x1B131 + | 0x1B133..=0x1B14F + | 0x1B153..=0x1B154 + | 0x1B156..=0x1B163 + | 0x1B168..=0x1B16F + | 0x1B2FC..=0x1BBFF + | 0x1BC6B..=0x1BC6F + | 0x1BC7D..=0x1BC7F + | 0x1BC89..=0x1BC8F + | 0x1BC9A..=0x1BC9B + | 0x1BCA4..=0x1CBFF + | 0x1CCFD..=0x1CCFF + | 0x1CEB4..=0x1CEB9 + | 0x1CED1..=0x1CEDF + | 0x1CEF1..=0x1CEFF + | 0x1CF2E..=0x1CF2F + | 0x1CF47..=0x1CF4F + | 0x1CFC4..=0x1CFFF + | 0x1D0F6..=0x1D0FF + | 0x1D127..=0x1D128 + | 0x1D1EB..=0x1D1FF + | 0x1D246..=0x1D2BF + | 0x1D2D4..=0x1D2DF + | 0x1D2F4..=0x1D2FF + | 0x1D357..=0x1D35F + | 0x1D379..=0x1D3FF + | 0x1D455 + | 0x1D49D + | 0x1D4A0..=0x1D4A1 + | 0x1D4A3..=0x1D4A4 + | 0x1D4A7..=0x1D4A8 + | 0x1D4AD + | 0x1D4BA + | 0x1D4BC + | 0x1D4C4 + | 0x1D506 + | 0x1D50B..=0x1D50C + | 0x1D515 + | 0x1D51D + | 0x1D53A + | 0x1D53F + | 0x1D545 + | 0x1D547..=0x1D549 + | 0x1D551 + | 0x1D6A6..=0x1D6A7 + | 0x1D7CC..=0x1D7CD + | 0x1DA8C..=0x1DA9A + | 0x1DAA0 + | 0x1DAB0..=0x1DEFF + | 0x1DF1F..=0x1DF24 + | 0x1DF2B..=0x1DFFF + | 0x1E007 + | 0x1E019..=0x1E01A + | 0x1E022 + | 0x1E025 + | 0x1E02B..=0x1E02F + | 0x1E06E..=0x1E08E + | 0x1E090..=0x1E0FF + | 0x1E12D..=0x1E12F + | 0x1E13E..=0x1E13F + | 0x1E14A..=0x1E14D + | 0x1E150..=0x1E28F + | 0x1E2AF..=0x1E2BF + | 0x1E2FA..=0x1E2FE + | 0x1E300..=0x1E4CF + | 0x1E4FA..=0x1E5CF + | 0x1E5FB..=0x1E5FE + | 0x1E600..=0x1E6BF + | 0x1E6DF + | 0x1E6F6..=0x1E6FD + | 0x1E700..=0x1E7DF + | 0x1E7E7 + | 0x1E7EC + | 0x1E7EF + | 0x1E7FF + | 0x1E8C5..=0x1E8C6 + | 0x1E8D7..=0x1E8FF + | 0x1E94C..=0x1E94F + | 0x1E95A..=0x1E95D + | 0x1E960..=0x1EC70 + | 0x1ECB5..=0x1ED00 + | 0x1ED3E..=0x1EDFF + | 0x1EE04 + | 0x1EE20 + | 0x1EE23 + | 0x1EE25..=0x1EE26 + | 0x1EE28 + | 0x1EE33 + | 0x1EE38 + | 0x1EE3A + | 0x1EE3C..=0x1EE41 + | 0x1EE43..=0x1EE46 + | 0x1EE48 + | 0x1EE4A + | 0x1EE4C + | 0x1EE50 + | 0x1EE53 + | 0x1EE55..=0x1EE56 + | 0x1EE58 + | 0x1EE5A + | 0x1EE5C + | 0x1EE5E + | 0x1EE60 + | 0x1EE63 + | 0x1EE65..=0x1EE66 + | 0x1EE6B + | 0x1EE73 + | 0x1EE78 + | 0x1EE7D + | 0x1EE7F + | 0x1EE8A + | 0x1EE9C..=0x1EEA0 + | 0x1EEA4 + | 0x1EEAA + | 0x1EEBC..=0x1EEEF + | 0x1EEF2..=0x1EFFF + | 0x1F02C..=0x1F02F + | 0x1F094..=0x1F09F + | 0x1F0AF..=0x1F0B0 + | 0x1F0C0 + | 0x1F0D0 + | 0x1F0F6..=0x1F0FF + | 0x1F1AE..=0x1F1E5 + | 0x1F203..=0x1F20F + | 0x1F23C..=0x1F23F + | 0x1F249..=0x1F24F + | 0x1F252..=0x1F25F + | 0x1F266..=0x1F2FF + | 0x1F6D9..=0x1F6DB + | 0x1F6ED..=0x1F6EF + | 0x1F6FD..=0x1F6FF + | 0x1F7DA..=0x1F7DF + | 0x1F7EC..=0x1F7EF + | 0x1F7F1..=0x1F7FF + | 0x1F80C..=0x1F80F + | 0x1F848..=0x1F84F + | 0x1F85A..=0x1F85F + | 0x1F888..=0x1F88F + | 0x1F8AE..=0x1F8AF + | 0x1F8BC..=0x1F8BF + | 0x1F8C2..=0x1F8CF + | 0x1F8D9..=0x1F8FF + | 0x1FA58..=0x1FA5F + | 0x1FA6E..=0x1FA6F + | 0x1FA7D..=0x1FA7F + | 0x1FA8B..=0x1FA8D + | 0x1FAC7 + | 0x1FAC9..=0x1FACC + | 0x1FADD..=0x1FADE + | 0x1FAEB..=0x1FAEE + | 0x1FAF9..=0x1FAFF + | 0x1FB93 + | 0x1FBFB..=0x1FFFF + | 0x2A6E0..=0x2A6FF + | 0x2B81E..=0x2B81F + | 0x2CEAE..=0x2CEAF + | 0x2EBE1..=0x2EBEF + | 0x2EE5E..=0x2F7FF + | 0x2FA1E..=0x2FFFF + | 0x3134B..=0x3134F + | 0x3347A..=0xE0000 + | 0xE0002..=0xE001F + | 0xE0080..=0xE00FF + | 0xE01F0..=0xEFFFF + | 0xFFFFE..=0xFFFFF + | 0x10FFFE..=0x10FFFF + | 0x110000..=u32::MAX // above U+10FFFF — unreachable for `char` + ) +}