diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 8f9729ef..5bb67b07 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -479,21 +479,16 @@ static const signed char digit_values[256] = { static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p) { - signed char b; - uint32_t result = 0; - b = digit_values[p[0]]; - if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[1]]; - if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[2]]; - if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[3]]; - if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); - result = (result << 4) | (unsigned char)b; - return result; + signed char b0 = digit_values[p[0]]; + signed char b1 = digit_values[p[1]]; + signed char b2 = digit_values[p[2]]; + signed char b3 = digit_values[p[3]]; + + if ((b0 | b1 | b2 | b3) < 0) { + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); + } + + return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3; } #define GET_PARSER_CONFIG \ @@ -643,9 +638,58 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserCon typedef struct _json_unescape_positions { long size; const char **positions; - bool has_more; + unsigned long additional_backslashes; } JSON_UnescapePositions; +ALWAYS_INLINE(static) void *find_backslash(const void *src, size_t n) { +// HAVE_SIMD_NEON and JSON_CPU_LITTLE_ENDIAN_64BITS are implied by __APPLE__ && __aarch64__ +// but they are here for clarity and consistency with code in this file. +#if defined(__APPLE__) && defined(__aarch64__) && HAVE_SIMD_NEON && JSON_CPU_LITTLE_ENDIAN_64BITS + const unsigned char *s = (const unsigned char *)src; + + static const uint8_t offsets[16] = { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 }; + while (n >= sizeof(uint8x16_t)) { + uint8x16_t chunk = vld1q_u8(s); + uint8x16_t backslashes = vdupq_n_u8('\\'); + uint8x16_t has_backslashes = vceqq_u8(chunk, backslashes); + uint8x16_t backslash_offsets = vandq_u8(has_backslashes, vld1q_u8(offsets)); + int first_backslash_offset = vmaxvq_u8(backslash_offsets); + if (first_backslash_offset) { + // The indexes are stored in reverse order so we need to subtract from 16 + // to get the first backslash offset. We do this to avoid having to use + // a negation + OR operation along with a vminvq_u8 if the indexes were stored + // in normal order. + return (void *)(s + (16 - first_backslash_offset)); + } + s += sizeof(uint8x16_t); + n -= sizeof(uint8x16_t); + } + + if (n >= sizeof(uint64_t)) { + uint64_t word; + memcpy(&word, s, sizeof(uint64_t)); + uint64_t xor = word ^ 0x5c5c5c5c5c5c5c5c; + uint64_t has_backslash = (xor - 0x0101010101010101) & ((~xor) & 0x8080808080808080); + if (has_backslash) { + int byte_offset = trailing_zeros64(has_backslash) / CHAR_BIT; + return (void *)(s + byte_offset); + } + s += sizeof(uint64_t); + n -= sizeof(uint64_t); + } + + for (size_t i = 0; i < n; i++) { + if (s[i] == '\\') { + return (void *)(s + i); + } + } + + return NULL; +#else + return memchr(src, '\\', n); +#endif +} + static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions) { while (positions->size) { @@ -657,13 +701,43 @@ static inline const char *json_next_backslash(const char *pe, const char *string } } - if (positions->has_more) { - return memchr(pe, '\\', stringEnd - pe); + if (positions->additional_backslashes) { + positions->additional_backslashes--; + return find_backslash(pe, stringEnd - pe); } return NULL; } +static inline void json_memcpy(char *dest, const char *src, size_t size) { +#if defined(__APPLE__) && defined(__aarch64__) && HAVE_SIMD_NEON && JSON_CPU_LITTLE_ENDIAN_64BITS + while (size >= sizeof(uint8x16_t)) { + uint8x16_t chunk; + chunk = vld1q_u8((const uint8_t *)src); + vst1q_u8((uint8_t *)dest, chunk); + dest += sizeof(uint8x16_t); + src += sizeof(uint8x16_t); + size -= sizeof(uint8x16_t); + } + + if (size >= sizeof(uint64_t)) { + uint64_t chunk; + memcpy(&chunk, src, sizeof(uint64_t)); + memcpy(dest, &chunk, sizeof(uint64_t)); + dest += sizeof(uint64_t); + src += sizeof(uint64_t); + size -= sizeof(uint64_t); + } + + while(size) { + *dest++ = *src++; + size--; + } +#else + memcpy(dest, src, size); +#endif +} + NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions) { bool intern = is_name || config->freeze; @@ -681,7 +755,7 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) { if (pe > p) { - MEMCPY(buffer, p, char, pe - p); + json_memcpy(buffer, p, pe - p); buffer += pe - p; } switch (*++pe) { @@ -746,7 +820,7 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser char buf[4]; int unescape_len = convert_UTF32_to_UTF8(buf, ch); - MEMCPY(buffer, buf, char, unescape_len); + json_memcpy(buffer, buf, unescape_len); buffer += unescape_len; p = ++pe; } @@ -768,7 +842,7 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser #undef APPEND_CHAR if (stringEnd > p) { - MEMCPY(buffer, p, char, stringEnd - p); + json_memcpy(buffer, p, stringEnd - p); buffer += stringEnd - p; } rb_str_set_len(result, buffer - bufferStart); @@ -992,7 +1066,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi JSON_UnescapePositions positions = { .size = 0, .positions = backslashes, - .has_more = false, + .additional_backslashes = 0, }; do { @@ -1007,7 +1081,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi backslashes[positions.size] = state->cursor; positions.size++; } else { - positions.has_more = true; + positions.additional_backslashes++; } state->cursor++; break; diff --git a/test/json/json_parser_test.rb b/test/json/json_parser_test.rb index ec939190..ca1cf3dc 100644 --- a/test/json/json_parser_test.rb +++ b/test/json/json_parser_test.rb @@ -543,6 +543,10 @@ def test_backslash json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]' assert_equal data, parse(json) + data = ['""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""'] + json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]' + assert_equal data, parse(json) + data = '["This is a "test" of the emergency broadcast system."]' json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\"" assert_equal data, parse(json) @@ -611,6 +615,10 @@ def test_backslash json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002c\"" assert_equal data, parse(json) + data = "ab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002c" + json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002c\"" + assert_equal data, parse(json) + data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f" json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\"" assert_equal data, parse(json) @@ -619,9 +627,21 @@ def test_backslash json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\"" assert_equal data, parse(json) + data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b" + json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\"" + assert_equal data, parse(json) + data = "a\n\t\f\b\n\t\f\b\n\t\f\b\n\t" json = "\"a\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\"" assert_equal data, parse(json) + + data = "a\n\t\f\b\n\t\f\b\n\t\f\b\n\ta\n\t\f\b\n\t\f\b\n\t\f\b\n\ta\n\t\f\b\n\t\f\b\n\t\f\b\n\ta\n\t\f\b\n\t\f\b\n\t\f\b\n\ta\n\t\f\b\n\t\f\b\n\t\f\b\n\t" + json = "\"a\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\ta\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\ta\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\ta\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\ta\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\"" + assert_equal data, parse(json) + + data = "\n" * 63 + json = "\""+("\\n" * 63)+"\"" + assert_equal data, parse(json) end class SubArray < Array