Skip to content

Commit 2a06a89

Browse files
committed
59: add support for \uXXXX escapes within string literals
This is in response to edn-format/edn#65. This is an extension, as string literals as currently documented do not specify support for \uXXXX escapes. https://github.com/edn-format/edn/tree/a51127aecd318096667ae0dafa25353ecb07c9c3 Notes: - A Unicode escape must begin with "\u". This is case sensitive: "\U" will be rejected. - "\u" must be followed by exactly four hex digits taken from this set: 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F - The digits are not case sensitive. - Each such Unicode escape encodes a single 16-bit Java char. Since Java uses UTF-16 internally (for historical reasons), code points beyond the Basic Multilingual Plane must be written as a pair of Unicode escapes (see also "surrogate pairs").
1 parent 89a7e56 commit 2a06a89

2 files changed

Lines changed: 47 additions & 0 deletions

File tree

src/main/java/us/bpsm/edn/parser/ScannerImpl.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,23 @@ private String readStringLiteral(Parseable pbr) throws IOException {
377377
case '\\':
378378
b.append('\\');
379379
break;
380+
case 'u':
381+
// Support for reading unicode escapes within string
382+
// literals is an extension to EDN. It is not currently
383+
// part of the spec described here:
384+
// https://github.com/edn-format/edn
385+
int v = 0;
386+
for (int i = 0; i < 4; i++) {
387+
curr = pbr.read();
388+
int d = Character.digit(curr, 16);
389+
if (d == -1) {
390+
throw new EdnSyntaxException(
391+
"Invalid \\u Unicode escape in string.");
392+
}
393+
v = v * 16 + d;
394+
}
395+
b.append((char)v);
396+
break;
380397
default:
381398
throw new EdnSyntaxException("Unsupported '"+ ((char)curr)
382399
+"' escape in string");

src/test/java/us/bpsm/edn/parser/ScannerTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.junit.Test;
1414

1515
import us.bpsm.edn.EdnException;
16+
import us.bpsm.edn.EdnSyntaxException;
1617
import us.bpsm.edn.Keyword;
1718
import us.bpsm.edn.Symbol;
1819
import us.bpsm.edn.parser.Parseable;
@@ -366,6 +367,35 @@ public void unicodeEscapeCharacterLiterals() {
366367
assertEquals(c, s.nextToken(pbr));
367368
}
368369

370+
@Test
371+
public void unicodeEscapesInStringLiterals() {
372+
String txt = "\"" +
373+
"\\" + "u0000" +
374+
"\\" + "u1234" +
375+
"\\" + "u0Ff0" +
376+
"\"";
377+
String expected = "\u0000\u1234\u0Ff0";
378+
assertEquals(3, expected.length());
379+
Parseable pbr = Parsers.newParseable(txt);
380+
Scanner s = scanner();
381+
assertEquals(expected, s.nextToken(pbr));
382+
}
383+
384+
@Test(expected = EdnSyntaxException.class)
385+
public void truncatedUnicodeEscapeInStringLiteral() {
386+
scanner().nextToken(Parsers.newParseable("\"\\" + "u123\""));
387+
}
388+
389+
@Test(expected = EdnSyntaxException.class)
390+
public void truncatedInputInUnicodeEscapeInStringLiteral() {
391+
scanner().nextToken(Parsers.newParseable("\"\\" + "u123"));
392+
}
393+
394+
@Test(expected = EdnSyntaxException.class)
395+
public void nonDigitInUnicodeEscapeInStringLiteral() {
396+
scanner().nextToken(Parsers.newParseable("\"\\" + "u123?\""));
397+
}
398+
369399
@Test
370400
public void simpleStringWithLinebreak() {
371401
assertEquals("\n", scan("\"\n\""));

0 commit comments

Comments
 (0)