diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index 995484e..418bd79 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -174,20 +174,10 @@ def replace_box_unicode_with_ascii(input_string): # This dictionary is used for the default encoding from Unicode/UTF-8 to ASCII +# Start from CHARACTER_TO_NAME, but skipping all the printable ASCII characters +# and some special characters: -UNICODE_CHARACTER_TO_ASCII = CHARACTER_TO_NAME.copy() -if "operator-to-ascii" in NAMED_CHARACTERS_COLLECTION: - UNICODE_CHARACTER_TO_ASCII.update( - { - ch: NAMED_CHARACTERS_COLLECTION["operator-to-ascii"][name] - for name, ch in NAMED_CHARACTERS_COLLECTION["operator-to-unicode"].items() - if name in NAMED_CHARACTERS_COLLECTION["operator-to-ascii"] - } - ) - # TODO: add WL characters to UNICODE_CHARACTER_TO_ASCII. For example, "\uf74c" in WMA is named as - # \[DifferentialD]. Here we are using "\U0001d451" for that name, because is a character - # we can print with standard fonts. For the effects of this table, "\uf74c" should be mapped to - # something that can be print as an ASCII string (probably, "d"). +UNICODE_CHARACTER_TO_ASCII = NAMED_CHARACTERS_COLLECTION.get("unicode-to-ascii", {}) # Deprecated diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index d50c9ba..be6056b 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -101,6 +101,7 @@ # amslatex: "\={a}" is incorrect; \= is an invalid escape sequence. 
AAcute: + ascii: "a'" esc-alias: a' has-unicode-inverse: false is-letter-like: true @@ -114,6 +115,7 @@ AAcute: ABar: amslatex: '\={a}' + ascii: "a-" esc-alias: a- has-unicode-inverse: false is-letter-like: true @@ -126,6 +128,7 @@ ABar: ACup: amslatex: '\u{a}' + ascii: "au" esc-alias: au has-unicode-inverse: false is-letter-like: true @@ -143,6 +146,7 @@ AddTo: operator-name: AddTo ADoubleDot: + ascii: 'a"' esc-alias: a" has-unicode-inverse: false is-letter-like: true @@ -154,6 +158,7 @@ ADoubleDot: wl-unicode-name: LATIN SMALL LETTER A WITH DIAERESIS AE: + ascii: "ae" esc-alias: ae has-unicode-inverse: false is-letter-like: true @@ -165,6 +170,7 @@ AE: wl-unicode-name: LATIN SMALL LETTER AE AGrave: + ascii: 'a`' esc-alias: a` has-unicode-inverse: false is-letter-like: true @@ -177,6 +183,7 @@ AGrave: wl-unicode-name: LATIN SMALL LETTER A WITH GRAVE AHat: + ascii: 'a^' esc-alias: a^ has-unicode-inverse: false is-letter-like: true @@ -217,6 +224,7 @@ ApplyTo: operator-name: ApplyTo ARing: + ascii: 'ao' esc-alias: ao has-unicode-inverse: false is-letter-like: true @@ -565,6 +573,7 @@ Bullet: wl-unicode-name: BULLET CAcute: + ascii: "c'" esc-alias: c' has-unicode-inverse: true is-letter-like: true @@ -577,6 +586,7 @@ CAcute: wl-unicode-name: LATIN SMALL LETTER C WITH ACUTE CCedilla: + ascii: "," esc-alias: c has-unicode-inverse: false is-letter-like: true @@ -589,6 +599,7 @@ CCedilla: wl-unicode-name: LATIN SMALL LETTER C WITH CEDILLA CHacek: + ascii: 'cv' esc-alias: cv has-unicode-inverse: false is-letter-like: true @@ -623,6 +634,7 @@ Cap: wl-unicode-name: FROWN CapitalAAcute: + ascii: "A'" esc-alias: A' has-unicode-inverse: false is-letter-like: true @@ -635,6 +647,7 @@ CapitalAAcute: CapitalABar: amslatex: '\={A}' + ascii: 'A-' esc-alias: A- has-unicode-inverse: false is-letter-like: true @@ -647,6 +660,7 @@ CapitalABar: CapitalACup: amslatex: '\u{A}' + ascii: 'Au' esc-alias: Au has-unicode-inverse: true is-letter-like: true @@ -658,6 +672,7 @@ CapitalACup: 
wl-unicode-name: LATIN CAPITAL LETTER A WITH BREVE CapitalADoubleDot: + ascii: 'A"' esc-alias: A" has-unicode-inverse: true is-letter-like: true @@ -668,6 +683,7 @@ CapitalADoubleDot: wl-unicode-name: LATIN CAPITAL LETTER A WITH DIAERESIS CapitalAE: + ascii: "AE" esc-alias: AE has-unicode-inverse: true is-letter-like: true @@ -678,6 +694,7 @@ CapitalAE: wl-unicode-name: LATIN CAPITAL LETTER AE CapitalAGrave: + ascii: "A`" esc-alias: A` has-unicode-inverse: true is-letter-like: true @@ -689,6 +706,7 @@ CapitalAGrave: wl-unicode-name: LATIN CAPITAL LETTER A WITH GRAVE CapitalAHat: + ascii: "A^" esc-alias: A^ has-unicode-inverse: false is-letter-like: true @@ -699,6 +717,7 @@ CapitalAHat: wl-unicode-name: LATIN CAPITAL LETTER A WITH CIRCUMFLEX CapitalARing: + ascii: "Ao" esc-alias: Ao has-unicode-inverse: true is-letter-like: true @@ -709,6 +728,7 @@ CapitalARing: wl-unicode-name: LATIN CAPITAL LETTER A WITH RING ABOVE CapitalATilde: + ascii: "A~" esc-alias: A~ has-unicode-inverse: true is-letter-like: true @@ -754,6 +774,7 @@ CapitalCAcute: CapitalCCedilla: amslatex: '\c{C}' + ascii: "C," esc-alias: C has-unicode-inverse: false latex: '\c{C}' @@ -765,6 +786,7 @@ CapitalCCedilla: wl-unicode-name: LATIN CAPITAL LETTER C WITH CEDILLA CapitalCHacek: + ascii: 'Cv' esc-alias: Cv has-unicode-inverse: true is-letter-like: true @@ -788,6 +810,7 @@ CapitalChi: wl-unicode-name: GREEK CAPITAL LETTER CHI CapitalDHacek: + ascii: 'Dv' esc-alias: Dv has-unicode-inverse: false is-letter-like: true @@ -835,6 +858,7 @@ CapitalDigamma: wl-unicode-name: GREEK LETTER DIGAMMA CapitalEAcute: + ascii: "E'" esc-alias: E' has-unicode-inverse: false is-letter-like: true @@ -846,6 +870,7 @@ CapitalEAcute: wl-unicode-name: LATIN CAPITAL LETTER E WITH ACUTE CapitalEBar: + ascii: 'E-' esc-alias: E- has-unicode-inverse: false is-letter-like: true @@ -859,6 +884,7 @@ CapitalEBar: CapitalECup: amslatex: '\u{E}' + ascii: 'Eu' esc-alias: Eu has-unicode-inverse: false is-letter-like: true @@ -870,6 
+896,7 @@ CapitalECup: wl-unicode-name: LATIN CAPITAL LETTER E WITH BREVE CapitalEDoubleDot: + ascii: 'E"' esc-alias: E" has-unicode-inverse: false is-letter-like: true @@ -880,6 +907,7 @@ CapitalEDoubleDot: wl-unicode-name: LATIN CAPITAL LETTER E WITH DIAERESIS CapitalEGrave: + ascii: "E`" esc-alias: E` has-unicode-inverse: false is-letter-like: true @@ -891,6 +919,7 @@ CapitalEGrave: wl-unicode-name: LATIN CAPITAL LETTER E WITH GRAVE CapitalEHacek: + ascii: 'Ev' esc-alias: Ev has-unicode-inverse: false is-letter-like: true @@ -903,6 +932,7 @@ CapitalEHacek: wl-unicode-name: LATIN CAPITAL LETTER E WITH CARON CapitalEHat: + ascii: "E^" esc-alias: E^ has-unicode-inverse: false is-letter-like: true @@ -935,6 +965,7 @@ CapitalEta: wl-unicode-name: GREEK CAPITAL LETTER ETA CapitalEth: + ascii: "Eth" esc-alias: D- has-unicode-inverse: false is-letter-like: true @@ -957,6 +988,7 @@ CapitalGamma: wl-unicode-name: GREEK CAPITAL LETTER GAMMA CapitalIAcute: + ascii: "I'" esc-alias: I' has-unicode-inverse: false is-letter-like: true @@ -980,6 +1012,7 @@ CapitalICup: wl-unicode-name: LATIN CAPITAL LETTER I WITH BREVE CapitalIDoubleDot: + ascii: 'I"' esc-alias: I" has-unicode-inverse: false is-letter-like: true @@ -990,6 +1023,7 @@ CapitalIDoubleDot: wl-unicode-name: LATIN CAPITAL LETTER I WITH DIAERESIS CapitalIGrave: + ascii: 'I`' esc-alias: I` has-unicode-inverse: false is-letter-like: true @@ -1001,6 +1035,7 @@ CapitalIGrave: wl-unicode-name: LATIN CAPITAL LETTER I WITH GRAVE CapitalIHat: + ascii: 'I^' esc-alias: I^ has-unicode-inverse: false is-letter-like: true @@ -1079,6 +1114,7 @@ CapitalMu: wl-unicode-name: GREEK CAPITAL LETTER MU CapitalNHacek: + ascii: 'Nv' esc-alias: Nv has-unicode-inverse: false is-letter-like: true @@ -1091,6 +1127,7 @@ CapitalNHacek: wl-unicode-name: LATIN CAPITAL LETTER N WITH CARON CapitalNTilde: + ascii: 'N~' esc-alias: N~ has-unicode-inverse: false is-letter-like: true @@ -1112,6 +1149,7 @@ CapitalNu: wl-unicode-name: GREEK CAPITAL LETTER NU 
CapitalOAcute: + ascii: "O'" esc-alias: O' has-unicode-inverse: false is-letter-like: true @@ -1123,6 +1161,7 @@ CapitalOAcute: wl-unicode-name: LATIN CAPITAL LETTER O WITH ACUTE CapitalODoubleAcute: + ascii: "O''" esc-alias: O'' has-unicode-inverse: false is-letter-like: true @@ -1134,6 +1173,7 @@ CapitalODoubleAcute: wl-unicode-name: LATIN CAPITAL LETTER O WITH DOUBLE ACUTE CapitalODoubleDot: + ascii: 'O"' esc-alias: O" has-unicode-inverse: false is-letter-like: true @@ -1155,6 +1195,7 @@ CapitalOE: wl-unicode-name: LATIN CAPITAL LIGATURE OE CapitalOGrave: + ascii: 'O`' esc-alias: O` has-unicode-inverse: false is-letter-like: true @@ -1166,6 +1207,7 @@ CapitalOGrave: wl-unicode-name: LATIN CAPITAL LETTER O WITH GRAVE CapitalOHat: + ascii: 'O^' esc-alias: O^ has-unicode-inverse: false is-letter-like: true @@ -1178,6 +1220,7 @@ CapitalOHat: CapitalOSlash: amslatex: '\O{}' + ascii: 'O/' esc-alias: O/ has-unicode-inverse: true is-letter-like: true @@ -1188,6 +1231,7 @@ CapitalOSlash: wl-unicode-name: LATIN CAPITAL LETTER O WITH STROKE CapitalOTilde: + ascii: 'O~' esc-alias: O~ has-unicode-inverse: false is-letter-like: true @@ -1257,6 +1301,7 @@ CapitalPsi: wl-unicode-name: GREEK CAPITAL LETTER PSI CapitalRHacek: + ascii: 'Rv' esc-alias: Rv has-unicode-inverse: false latex: "\\v{R}" @@ -1280,6 +1325,7 @@ CapitalRho: wl-unicode-name: GREEK CAPITAL LETTER RHO CapitalSHacek: + ascii: 'Sv' esc-alias: Sv has-unicode-inverse: false is-letter-like: true @@ -1326,6 +1372,7 @@ CapitalStigma: wl-unicode-name: GREEK LETTER STIGMA CapitalTHacek: + ascii: 'Tv' esc-alias: Tv has-unicode-inverse: false is-letter-like: true @@ -1361,6 +1408,7 @@ CapitalTheta: wl-unicode-name: GREEK CAPITAL LETTER THETA CapitalThorn: + ascii: 'Th' esc-alias: Thn has-unicode-inverse: false is-letter-like: true @@ -1371,6 +1419,7 @@ CapitalThorn: wl-unicode-name: LATIN CAPITAL LETTER THORN CapitalUAcute: + ascii: "U'" esc-alias: U' has-unicode-inverse: false is-letter-like: true @@ -1382,6 +1431,7 @@ 
CapitalUAcute: wl-unicode-name: LATIN CAPITAL LETTER U WITH ACUTE CapitalUDoubleAcute: + ascii: "U''" esc-alias: U'' has-unicode-inverse: false is-letter-like: true @@ -1393,6 +1443,7 @@ CapitalUDoubleAcute: wl-unicode-name: LATIN CAPITAL LETTER U WITH DOUBLE ACUTE CapitalUDoubleDot: + ascii: 'U"' esc-alias: U" has-unicode-inverse: false is-letter-like: true @@ -1403,6 +1454,7 @@ CapitalUDoubleDot: wl-unicode-name: LATIN CAPITAL LETTER U WITH DIAERESIS CapitalUGrave: + ascii: 'U`' esc-alias: U` has-unicode-inverse: false is-letter-like: true @@ -1414,6 +1466,7 @@ CapitalUGrave: wl-unicode-name: LATIN CAPITAL LETTER U WITH GRAVE CapitalUHat: + ascii: 'U^' esc-alias: U^ has-unicode-inverse: false is-letter-like: true @@ -1424,6 +1477,7 @@ CapitalUHat: wl-unicode-name: LATIN CAPITAL LETTER U WITH CIRCUMFLEX CapitalURing: + ascii: 'Uo' esc-alias: Uo has-unicode-inverse: false is-letter-like: true @@ -1459,6 +1513,7 @@ CapitalXi: wl-unicode-name: GREEK CAPITAL LETTER XI CapitalYAcute: + ascii: "Y'" esc-alias: Y' has-unicode-inverse: false is-letter-like: true @@ -1470,6 +1525,7 @@ CapitalYAcute: wl-unicode-name: LATIN CAPITAL LETTER Y WITH ACUTE CapitalZHacek: + ascii: 'Zv' esc-alias: Zv has-unicode-inverse: false is-letter-like: true @@ -1815,6 +1871,7 @@ Coproduct: wl-unicode-name: N-ARY COPRODUCT Copyright: + ascii: "(c)" has-unicode-inverse: false is-letter-like: false unicode-equivalent: "\xA9" @@ -1956,6 +2013,7 @@ Currency: wl-unicode-name: CURRENCY SIGN DHacek: + ascii: 'dv' esc-alias: dv has-unicode-inverse: false is-letter-like: true @@ -2023,6 +2081,7 @@ Decrement: # right value is 176 which is the same as the Unicode value. These # are what are used here. Degree: + ascii: "Degree" esc-alias: deg has-unicode-inverse: false is-builtin-constant: true @@ -2334,6 +2393,7 @@ DoubleDagger: wl-unicode-name: DOUBLE DAGGER DoubleDot: + ascii: ".." 
has-unicode-inverse: false is-letter-like: false unicode-equivalent: "\xA8" @@ -2394,6 +2454,7 @@ DoubleLeftTee: DoubleLongLeftArrow: amslatex: '\Longleftarrow' + ascii: '<==' esc-alias: <== has-unicode-inverse: false is-letter-like: false @@ -2407,6 +2468,7 @@ DoubleLongLeftArrow: DoubleLongLeftRightArrow: amslatex: '\Longleftrightarrow' + ascii: '<==>' esc-alias: <==> has-unicode-inverse: false is-letter-like: false @@ -2420,6 +2482,7 @@ DoubleLongLeftRightArrow: DoubleLongRightArrow: amslatex: '\Longrightarrow' + ascii: '==>' esc-alias: ==> has-unicode-inverse: false is-letter-like: false @@ -2432,6 +2495,7 @@ DoubleLongRightArrow: wl-unicode-name: LONG RIGHTWARDS DOUBLE ARROW DoublePrime: + ascii: ',,' esc-alias: '''''' has-unicode-inverse: false is-letter-like: false @@ -2444,6 +2508,7 @@ DoublePrime: DoubleRightArrow: amslatex: '\Rightarrow' + ascii: '=>' esc-alias: ' =>' has-unicode-inverse: false is-letter-like: false @@ -2457,6 +2522,7 @@ DoubleRightArrow: DoubleRightTee: amslatex: '\vDash' + ascii: '=|' has-unicode-inverse: false is-letter-like: false unicode-equivalent: "\u22A8" @@ -3333,6 +3399,7 @@ DownTeeArrow: wl-unicode-name: DOWNWARDS ARROW FROM BAR EAcute: + ascii: "e'" esc-alias: e' has-unicode-inverse: false is-letter-like: true @@ -3345,6 +3412,7 @@ EAcute: wl-unicode-name: LATIN SMALL LETTER E WITH ACUTE EBar: + ascii: 'e-' esc-alias: e- has-unicode-inverse: false is-letter-like: true @@ -3367,6 +3435,7 @@ ECup: wl-unicode-name: LATIN SMALL LETTER E WITH BREVE EDoubleDot: + ascii: 'e"' esc-alias: e" has-unicode-inverse: false is-letter-like: true @@ -3378,6 +3447,7 @@ EDoubleDot: wl-unicode-name: LATIN SMALL LETTER E WITH DIAERESIS EGrave: + ascii: 'e`' esc-alias: e` has-unicode-inverse: false is-letter-like: true @@ -3390,6 +3460,7 @@ EGrave: wl-unicode-name: LATIN SMALL LETTER E WITH GRAVE EHacek: + ascii: 'ev' esc-alias: ev has-unicode-inverse: false is-letter-like: true @@ -3401,6 +3472,7 @@ EHacek: wl-unicode-name: LATIN SMALL LETTER E 
WITH CARON EHat: + ascii: 'e^' esc-alias: e^ has-unicode-inverse: false is-letter-like: true @@ -3446,6 +3518,7 @@ Element: Ellipsis: amslatex: '\dots' + ascii: '...' esc-alias: '...' has-unicode-inverse: false is-letter-like: true @@ -3607,6 +3680,7 @@ Equal: EqualTilde: amslatex: '\eqsim' + ascii: '=~' esc-alias: =~ has-unicode-inverse: false is-letter-like: false @@ -3633,6 +3707,7 @@ Equilibrium: Equivalent: # amslatex: '\unicode{29e6}' + ascii: '<=>' esc-alias: equiv has-unicode-inverse: true is-letter-like: false @@ -3671,6 +3746,7 @@ Eta: wl-unicode-name: GREEK SMALL LETTER ETA Eth: + ascii: 'Eth' esc-alias: d- has-unicode-inverse: false is-letter-like: false @@ -3716,6 +3792,7 @@ Exists: ExponentialE: amslatex: '\ExponentialE' + ascii: 'E' esc-alias: ee has-unicode-inverse: true is-builtin-constant: true @@ -5696,6 +5773,7 @@ GreaterEqual: GreaterEqualLess: amslatex: '\gtreqless' + ascii: '>=<' has-unicode-inverse: false is-letter-like: false operator-name: GreaterEqualLess @@ -5708,6 +5786,7 @@ GreaterEqualLess: GreaterFullEqual: amslatex: '\geqq' + ascii: '>==' has-unicode-inverse: false is-letter-like: false operator-name: GreaterFullEqual @@ -5720,6 +5799,7 @@ GreaterFullEqual: GreaterGreater: amslatex: '\gg' + ascii: '>>' has-unicode-inverse: false is-letter-like: false operator-name: GreaterGreater @@ -5732,6 +5812,7 @@ GreaterGreater: GreaterLess: amslatex: '\gtrless' + ascii: '><' has-unicode-inverse: false is-letter-like: false operator-name: GreaterLess @@ -5744,6 +5825,7 @@ GreaterLess: GreaterSlantEqual: amslatex: '\geq' + ascii: '>=' esc-alias: '>/' has-unicode-inverse: false operator-name: GreaterSlantEqual @@ -5757,6 +5839,7 @@ GreaterSlantEqual: GreaterTilde: amslatex: '\gtrsim' # In WMA, '\gtrsin' which seems a typo... 
+ ascii: '>~' esc-alias: '>~' has-unicode-inverse: false operator-name: GreaterTilde @@ -5781,6 +5864,7 @@ HBar: wl-unicode-name: PLANCK CONSTANT OVER TWO PI Hacek: + ascii: 'v' esc-alias: hck has-unicode-inverse: false is-letter-like: true @@ -5791,6 +5875,7 @@ Hacek: wl-unicode-name: CARON HappySmiley: + ascii: ':-)' esc-alias: :) has-unicode-inverse: false is-letter-like: true @@ -5869,6 +5954,7 @@ Hyphen: wl-unicode-name: HYPHEN IAcute: + ascii: "i'" esc-alias: i' has-unicode-inverse: false is-letter-like: true @@ -5881,6 +5967,7 @@ IAcute: wl-unicode-name: LATIN SMALL LETTER I WITH ACUTE ICup: + ascii: 'iu' esc-alias: iu has-unicode-inverse: false latex: '\u{i}' @@ -5892,6 +5979,7 @@ ICup: wl-unicode-name: LATIN SMALL LETTER I WITH BREVE IDoubleDot: + ascii: 'i"' esc-alias: i" has-unicode-inverse: false is-letter-like: true @@ -5903,6 +5991,7 @@ IDoubleDot: wl-unicode-name: LATIN SMALL LETTER I WITH DIAERESIS IGrave: + ascii: 'i`' esc-alias: i` has-unicode-inverse: false is-letter-like: true @@ -5915,6 +6004,7 @@ IGrave: wl-unicode-name: LATIN SMALL LETTER I WITH GRAVE IHat: + ascii: 'i^' esc-alias: i^ has-unicode-inverse: false is-letter-like: true @@ -5927,6 +6017,7 @@ IHat: ImaginaryI: amslatex: '\ComplexI' + ascii: 'I' esc-alias: ii has-unicode-inverse: true is-builtin-constant: true @@ -5939,6 +6030,7 @@ ImaginaryI: ImaginaryJ: amslatex: '\ComplexJ' + ascii: 'I' esc-alias: jj has-unicode-inverse: true is-builtin-constant: true @@ -5961,6 +6053,7 @@ ImplicitPlus: Implies: amslatex: '\Rightarrow' + ascii: '=>' esc-alias: => has-unicode-inverse: true is-letter-like: false @@ -5987,6 +6080,7 @@ IndentingNewLine: Infinity: amslatex: '\infty' + ascii: 'Infinity' esc-alias: inf has-unicode-inverse: false is-builtin-constant: true @@ -6001,8 +6095,8 @@ Infinity: # This is the default infix symbol used in boxing/formatting Infix[] expressions. 
# See also RawTilde Infix: - ascii: "~" amslatex: '\textasciitilde' + ascii: "~" has-unicode-inverse: false is-letter-like: false @@ -6142,6 +6236,7 @@ Koppa: LSlash: amslatex: '\l{}' + ascii: 'l/' esc-alias: l/ has-unicode-inverse: false is-letter-like: true @@ -6170,6 +6265,7 @@ LastPage: # Opening part of AngleBracket[] - an operator without a builtin-meaning LeftAngleBracket: + ascii: '<' esc-alias: < has-unicode-inverse: false is-letter-like: false @@ -6183,6 +6279,7 @@ LeftAngleBracket: LeftArrow: amslatex: '\leftarrow' + ascii: '<-' esc-alias: <- has-unicode-inverse: false is-letter-like: false @@ -6196,6 +6293,7 @@ LeftArrow: LeftArrowBar: amslatex: '|\leftarrow' + ascii: '|<-' has-unicode-inverse: false is-letter-like: false operator-name: LeftArrowBar @@ -6221,6 +6319,7 @@ LeftArrowRightArrow: # Opening of "Association[]" operator LeftAssociation: amslatex: '\langle\vert' + ascii: '<|' esc-alias: <| has-unicode-inverse: false is-letter-like: false @@ -6230,6 +6329,7 @@ LeftAssociation: # Opening of "BracketingBar[]" operator without builtin-meaning LeftBracketingBar: + ascii: '|' esc-alias: l| has-unicode-inverse: false is-letter-like: false @@ -6265,6 +6365,7 @@ LeftDoubleBracket: # Opening part of DoubleBracketingBar[] - an operator without a builtin-meaning LeftDoubleBracketingBar: + ascii: '||' esc-alias: l|| has-unicode-inverse: false is-letter-like: false @@ -6324,6 +6425,7 @@ LeftFloor: wl-unicode-name: LEFT FLOOR LeftGuillemet: + ascii: "<<" esc-alias: g<< has-unicode-inverse: false is-letter-like: true @@ -6358,6 +6460,7 @@ LeftPointer: LeftRightArrow: amslatex: '\leftrightarrow' + ascii: '<->' esc-alias: <-> has-unicode-inverse: false is-letter-like: false @@ -6382,6 +6485,7 @@ LeftRightVector: wl-unicode-name: LEFT BARB UP RIGHT BARB UP HARPOON LeftSkeleton: + ascii: '<<' has-unicode-inverse: true is-letter-like: false unicode-equivalent: "\xAB" @@ -6391,6 +6495,7 @@ LeftSkeleton: LeftTee: amslatex: '\dashv' + ascii: '-|' esc-alias: lT 
has-unicode-inverse: false is-letter-like: false @@ -6578,6 +6683,7 @@ LessEqualGreater: LessFullEqual: amslatex: '\leqq' + ascii: '<==' has-unicode-inverse: false is-letter-like: false operator-name: LessFullEqual @@ -6590,6 +6696,7 @@ LessFullEqual: LessGreater: amslatex: '\lessgtr' + ascii: '<>' has-unicode-inverse: false is-letter-like: false operator-name: LessGreater @@ -6602,6 +6709,7 @@ LessGreater: LessLess: amslatex: '\ll' + ascii: '<<' has-unicode-inverse: false is-letter-like: false operator-name: LessLess @@ -6614,6 +6722,7 @@ LessLess: LessSlantEqual: amslatex: '\leq' + ascii: '<= ' esc-alias: ' esc-alias: <--> has-unicode-inverse: false is-letter-like: false @@ -6719,6 +6834,7 @@ LongLeftRightArrow: LongRightArrow: amslatex: '\longrightarrow' + ascii: '-->' esc-alias: --> has-unicode-inverse: false is-letter-like: false @@ -6808,6 +6924,7 @@ MeasuredAngle: wl-unicode-name: MEASURED ANGLE MediumSpace: + ascii: ' ' has-unicode-inverse: false is-letter-like: false wl-reference: https://reference.wolfram.com/language/ref/character/MediumSpace.html @@ -6872,6 +6989,7 @@ MinLimit: MinusPlus: amslatex: '\mp' + ascii: '-+' esc-alias: -+ has-unicode-inverse: false is-letter-like: false @@ -6895,6 +7013,7 @@ Mu: wl-unicode-name: GREEK SMALL LETTER MU NHacek: + ascii: 'nv' esc-alias: nv has-unicode-inverse: false is-letter-like: true @@ -6906,6 +7025,7 @@ NHacek: wl-unicode-name: LATIN SMALL LETTER N WITH CARON NTilde: + ascii: 'n~' esc-alias: n~ has-unicode-inverse: false is-letter-like: true @@ -7008,6 +7128,7 @@ NeutralSmiley: wl-unicode: "\uF722" NewLine: + ascii: "\n" has-unicode-inverse: true is-letter-like: false unicode-equivalent: "\u000A" @@ -7023,6 +7144,7 @@ NoBreak: wl-unicode-name: WORD JOINER NonBreakingSpace: + ascii: " " esc-alias: nbs has-unicode-inverse: false is-letter-like: false @@ -7152,6 +7274,7 @@ NotExists: NotGreater: amslatex: '\ngtr' + ascii: '!>' esc-alias: '!>' has-unicode-inverse: false is-letter-like: false @@ -7165,6 +7288,7 @@ 
NotGreater: NotGreaterEqual: amslatex: '\ngeq' + ascii: '!>=' esc-alias: '!>=' has-unicode-inverse: false is-letter-like: false @@ -7178,6 +7302,7 @@ NotGreaterEqual: NotGreaterFullEqual: amslatex: '\ngeqq' + ascii: '!>==' has-unicode-inverse: false is-letter-like: false operator-name: NotGreaterFullEqual @@ -7190,6 +7315,7 @@ NotGreaterFullEqual: NotGreaterGreater: amslatex: '\not{\gg}' + ascii: '!>>' has-unicode-inverse: false is-letter-like: false operator-name: NotGreaterGreater @@ -7199,6 +7325,7 @@ NotGreaterGreater: NotGreaterLess: amslatex: '\not{\gtrless}' + ascii: '!><' has-unicode-inverse: false is-letter-like: false operator-name: NotGreaterLess @@ -7210,6 +7337,7 @@ NotGreaterLess: wl-unicode-name: NEITHER GREATER-THAN NOR LESS-THAN NotGreaterSlantEqual: + ascii: '!>=' esc-alias: '!>/' has-unicode-inverse: false is-letter-like: false @@ -7218,6 +7346,7 @@ NotGreaterSlantEqual: NotGreaterTilde: amslatex: '\not{\gtrsim}' + ascii: '!>~' esc-alias: '!>~' has-unicode-inverse: false is-letter-like: false @@ -7274,6 +7403,7 @@ NotLeftTriangleEqual: NotLess: amslatex: '\nless' + ascii: '!<' esc-alias: '!<' has-unicode-inverse: false is-letter-like: false @@ -7287,6 +7417,7 @@ NotLess: NotLessEqual: amslatex: '\nleq' + ascii: '!<=' esc-alias: '!<=' has-unicode-inverse: false is-letter-like: false @@ -7300,6 +7431,7 @@ NotLessEqual: NotLessFullEqual: amslatex: '\nleqq' + ascii: '!<==' has-unicode-inverse: false is-letter-like: false operator-name: NotLessFullEqual @@ -7312,6 +7444,7 @@ NotLessFullEqual: NotLessGreater: amslatex: '\not{\lessgtr}' + ascii: '!<>' has-unicode-inverse: false is-letter-like: false operator-name: NotLessGreater @@ -7323,12 +7456,14 @@ NotLessGreater: wl-unicode-name: NEITHER LESS-THAN NOR GREATER-THAN NotLessLess: + ascii: '!<<' has-unicode-inverse: false is-letter-like: false wl-reference: https://reference.wolfram.com/language/ref/character/NotLessLess.html wl-unicode: "\uF422" NotLessSlantEqual: + ascii: '!<=' esc-alias: '!' 
has-unicode-inverse: false is-letter-like: false @@ -8563,6 +8714,7 @@ RightAngleBracket: # Note: not the same as \[Rule] or \[DirectedEdge] RightArrow: amslatex: '\rightarrow' + ascii: "->" esc-alias: ' ->' has-unicode-inverse: false is-letter-like: false @@ -8576,6 +8728,7 @@ RightArrow: RightArrowBar: amslatex: '\rightarrow |' + ascii: "->|" has-unicode-inverse: false is-letter-like: false operator-name: RightArrowBar @@ -8601,6 +8754,7 @@ RightArrowLeftArrow: # Closing of "Association[]" operator RightAssociation: amslatex: '\vert\rangle' + ascii: '|>' esc-alias: '|>' has-unicode-inverse: false is-letter-like: false @@ -8611,6 +8765,7 @@ RightAssociation: # Closing of "BracketingBar[]" operator without builtin-meaning # Note: not the same as \[VerticalBar] RightBracketingBar: + ascii: '|' esc-alias: r| has-unicode-inverse: false is-letter-like: false @@ -8710,6 +8865,7 @@ RightFloor: wl-unicode-name: RIGHT FLOOR RightGuillemet: + ascii: ">>" esc-alias: g>> has-unicode-inverse: false is-letter-like: true @@ -8743,6 +8899,7 @@ RightPointer: wl-unicode-name: BLACK RIGHT-POINTING SMALL TRIANGLE RightSkeleton: + ascii: '>>' has-unicode-inverse: true is-letter-like: false unicode-equivalent: "\xBB" @@ -8752,6 +8909,7 @@ RightSkeleton: RightTee: amslatex: '\vdash' + ascii: '|-' esc-alias: rT has-unicode-inverse: false operator-name: RightTee @@ -8944,6 +9102,7 @@ RuleDelayed: wl-unicode: "\uF51F" SHacek: + ascii: 'sv' esc-alias: sv has-unicode-inverse: false is-letter-like: true @@ -8955,6 +9114,7 @@ SHacek: wl-unicode-name: LATIN SMALL LETTER S WITH CARON SZ: + ascii: 'sz' esc-alias: sz has-unicode-inverse: false is-letter-like: false @@ -10145,6 +10305,7 @@ SystemsModelDelay: wl-unicode: "\uF3AF" THacek: + ascii: 'tv' esc-alias: tv has-unicode-inverse: false latex: "\\v{t}" @@ -10246,6 +10407,7 @@ ThinSpace: wl-unicode-name: THIN SPACE Thorn: + ascii: 'th' esc-alias: thn has-unicode-inverse: false is-letter-like: false @@ -10359,6 +10521,7 @@ TripleDot: wl-unicode: 
def _is_plain_ascii(text):
    """Return True if every character of ``text`` is 7-bit ASCII.

    An empty string counts as ASCII: there is nothing in it that would
    need to be replaced.
    """
    return all(ord(char) < 128 for char in text)


def build_unicode_to_ascii_table(data):
    """
    Collect the ``unicode-equivalent`` and ``wl-unicode`` characters
    in ``data``, and associate each of them with an ASCII string.

    ``data`` maps a named-character name to a dict of its properties.
    The replacement string of an entry is, in order of preference:

    1. its ``ascii`` field,
    2. its ``esc-alias`` field,
    3. the named-character form ``\\[Name]``.

    A character is only added as a key when it contains at least one
    non-ASCII character and differs from its replacement.

    When several entries share a character, a ``unicode-equivalent``
    always (re)assigns the key, while a ``wl-unicode`` never replaces a
    key that is already present.

    Returns a dict mapping characters to their replacement strings.
    """
    result = {}
    for name, entry in data.items():
        # First, look at the ascii entry; if there is none, try
        # esc-alias.
        ascii_equiv = entry.get("ascii")
        if ascii_equiv is None:
            ascii_equiv = entry.get("esc-alias")
        # Otherwise, use the NamedCharacter form:
        if ascii_equiv is None:
            ascii_equiv = rf"\[{name}]"

        unicode_equivalent = entry.get("unicode-equivalent")
        if (
            unicode_equivalent is not None
            and unicode_equivalent != ascii_equiv
            # Skip anything that is already plain ASCII, including
            # multi-character and empty strings.
            and not _is_plain_ascii(unicode_equivalent)
        ):
            result[unicode_equivalent] = ascii_equiv

        wl_unicode = entry.get("wl-unicode")
        if (
            wl_unicode is not None
            # Do not override a mapping that was already registered.
            and wl_unicode not in result
            and wl_unicode != ascii_equiv
            and not _is_plain_ascii(wl_unicode)
        ):
            result[wl_unicode] = ascii_equiv

    return result
--- a/test/test_general_yaml_sanity.py +++ b/test/test_general_yaml_sanity.py @@ -82,16 +82,6 @@ def test_operators(): dup_operator_symbols = set(["?", "!"]) for k, v in yaml_data.items(): - if "ascii" in v: - if len(v["ascii"]) > 1: - assert ( - "operator-name" in v - ), f"In {k}: ASCII with more than one characters must be an operator" - pass - else: - assert ( - "wl-unicode" in v - ), f"In {k}: there must be either an ascii name or have a wl-unicode" if "operator-name" not in v: continue