diff --git a/cppcache/src/DataInput.cpp b/cppcache/src/DataInput.cpp index 3590dd917e..cd672cbbfa 100644 --- a/cppcache/src/DataInput.cpp +++ b/cppcache/src/DataInput.cpp @@ -24,6 +24,8 @@ #include "util/JavaModifiedUtf8.hpp" #include "util/string.hpp" +using apache::geode::client::internal::JavaModifiedUtf8; + namespace apache { namespace geode { namespace client { @@ -62,8 +64,8 @@ void DataInput::readJavaModifiedUtf8( std::basic_string& value) { uint16_t length = readInt16(); _GEODE_CHECK_BUFFER_SIZE(length); - value = internal::JavaModifiedUtf8::decode( - reinterpret_cast(m_buf), length); + value = + JavaModifiedUtf8::decode(reinterpret_cast(m_buf), length); advanceCursor(length); } template APACHE_GEODE_EXPLICIT_TEMPLATE_EXPORT void @@ -72,10 +74,10 @@ DataInput::readJavaModifiedUtf8(std::u16string&); template void DataInput::readJavaModifiedUtf8( std::basic_string& value) { - // TODO string OPTIMIZE convert from UTF-16 to UCS-4 directly - std::u16string utf16; - readJavaModifiedUtf8(utf16); - value = to_ucs4(utf16); + uint16_t length = readInt16(); + _GEODE_CHECK_BUFFER_SIZE(length); + value = + JavaModifiedUtf8::decodeU32(reinterpret_cast(m_buf), length); } template APACHE_GEODE_EXPLICIT_TEMPLATE_EXPORT void DataInput::readJavaModifiedUtf8(std::u32string&); diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index d6f53b81cd..58cd536b26 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -19,7 +19,9 @@ #include #include +#include +#include "geode/ExceptionTypes.hpp" #include "string.hpp" namespace apache { @@ -57,22 +59,192 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) { return encodedLen; } -std::string JavaModifiedUtf8::fromString(const std::string& utf8) { - return fromString(to_utf16(utf8)); +std::set utf16_surrogate_codes = {{0xD800}, {0xDB7F}, {0xDB80}, {0xDBFF}, + {0xDC00}, {0xDF80}, {0xDFFF}}; + +bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) { + return (code_point > 0x7FF) && (utf16_surrogate_codes.find(code_point) == + utf16_surrogate_codes.end()); +} + +enum class UtfScanState : int32_t { + Initial = 0, + Continuing = 1, +}; + +ju8string JavaModifiedUtf8::decode2byte(const std::string& utf8char) { + ju8string jmutf8char; + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + if (((byte1 & 0xE0) == 0xC0) && ((byte2 & 0x80) == 0x80)) { + int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f); + if (code_point > 0x7F) { + jmutf8char += byte1; + jmutf8char += byte2; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "ASCII character)"); + } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + return jmutf8char; +} + +ju8string JavaModifiedUtf8::decode3byte(const std::string& utf8char) { + ju8string jmutf8char; + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + auto byte3 = utf8char[2]; + if (((byte1 & 0xF0) == 0xE0) && ((byte2 & 0x80) == 0x80) && + ((byte3 & 0x80) == 0x80)) { + uint16_t code_point = + ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F); + if (IsValidCodePoint(code_point)) { + jmutf8char += byte1; + jmutf8char += byte2; + jmutf8char += byte3; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "3-byte encoding)"); + } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + return jmutf8char; +} + +ju8string JavaModifiedUtf8::decode4byte(const std::string& utf8char) { + ju8string jmutf8char; + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + auto byte3 = utf8char[2]; + auto byte4 = utf8char[3]; + if (((byte1 & 0xF8) == 0xF0) && ((byte2 & 0x80) == 0x80) && + ((byte3 & 0x80) == 0x80) && ((byte4 & 0x80) == 0x80)) { + uint32_t code_point = (byte1 & 0x07) << 18; + code_point += (byte2 & 0x3F) << 12; + code_point += (byte3 & 0x3F) << 6; + code_point += byte4 & 0x3F; + + if (code_point > 0xFFFF) { + jmutf8char += static_cast(0xED); + jmutf8char += + static_cast((0xA0 + (((code_point >> 16) - 1) & 0x0F))); + jmutf8char += static_cast((0x80 + ((code_point >> 10) & 0x3F))); + + jmutf8char += static_cast(0xED); + jmutf8char += static_cast((0xB0 + ((code_point >> 6) & 0x0F))); + jmutf8char += byte4; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "4-byte encoding)"); + } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + return jmutf8char; } -std::string JavaModifiedUtf8::fromString(const std::u16string& utf16) { - std::string jmutf8; - jmutf8.reserve(utf16.length()); +ju8string JavaModifiedUtf8::decode(const std::string& utf8char) { + ju8string jmutf8char; - for (auto&& c : utf16) { - encode(c, jmutf8); + switch (utf8char.size()) { + case 2: + jmutf8char = decode2byte(utf8char); + break; + case 3: + jmutf8char = decode3byte(utf8char); + break; + case 4: + jmutf8char = decode4byte(utf8char); + break; + default: + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + return jmutf8char; +} + +ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { + ju8string jmutf8; + size_t cursor = 0; + auto state = UtfScanState::Initial; + std::string current; + + while (cursor < utf8.size()) { + auto byte = utf8[cursor++]; + + switch (state) { + case UtfScanState::Initial: + if ((byte & 0x80) == 0) { + if (byte) { + jmutf8 += byte; + } else { + jmutf8 += static_cast(0xC0); + jmutf8 += static_cast(0x80); + } + } else if ((byte & 0xc0) == 0x80) { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } else { + current += byte; + state = UtfScanState::Continuing; + } + break; + case UtfScanState::Continuing: { + if ((byte & 0xC0) == 0x80) { + current += byte; + } else { + cursor--; + state = UtfScanState::Initial; + jmutf8 += JavaModifiedUtf8::decode(current); + current.clear(); + } + } break; + } + } + + if (current.size() && state == UtfScanState::Continuing) { + state = UtfScanState::Initial; + jmutf8 += JavaModifiedUtf8::decode(current); + current.clear(); + } + + if (state != UtfScanState::Initial) { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); } return jmutf8; } -void JavaModifiedUtf8::encode(const char16_t c, std::string& jmutf8) { +std::string JavaModifiedUtf8::toString(const ju8string& jmutf8) { + // std::string utf8; + // size_t cursor = 0; + + // while (cursor < jmutf8.size()) { + // auto byte1 = jmutf8[cursor++]; + // if ((byte1 & 0x80) == 0) { + // utf8.push_back(byte1); + // } else if ((byte1 & 0xE0) == 0xC0) { + // auto byte2 = jmutf8[cursor++]; + // if () } + //} + return ""; +} + +void JavaModifiedUtf8::encode(const char16_t c, ju8string& jmutf8) { if (c == 0) { // NUL jmutf8 += static_cast(0xc0); @@ -90,6 +262,54 @@ void JavaModifiedUtf8::encode(const char16_t c, std::string& jmutf8) { } } +std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) { + std::u32string result; + + uint16_t i = 0; + while (i < len) { + auto byte1 = buf[i++]; + if (!(byte1 & 0x80)) { + result += static_cast(byte1) & 0x000000FF; + } else if ((i < len) && ((byte1 & 0xE0) == 0xC0)) { + auto byte2 = buf[i++]; + if (((byte1 & 0xFF) == 0xC0) && ((byte2 & 0xFF) == 0x80)) { + result.push_back(static_cast(0)); + } else { + int32_t code_point = static_cast(byte1 & 0x1F) << 6; + code_point += static_cast(byte2 & 0x3F); + result.push_back(code_point); + } + } else if ((i < len - 4) && ((byte1 & 0xED) == 0xED)) { + auto byte2 = buf[i++]; + auto byte3 = buf[i++]; + auto byte4 = buf[i++]; + auto byte5 = buf[i++]; + auto byte6 = buf[i++]; + if ((byte4 & 0xED) == 0xED) { + int32_t code_point = + 0x10000 + (static_cast(byte2 & 0xF) << 16); + code_point += static_cast(byte3 & 0x3F) << 10; + code_point += static_cast(byte5 & 0xF) << 6; + code_point += static_cast(byte6 & 0x3F); + result.push_back(code_point); + } else { + throw IllegalArgumentException("Bad encoding in jmutf-8 string"); + } + } else if ((i < len - 1) && ((byte1 & 0xE0) == 0xE0)) { + auto byte2 = buf[i++]; + auto byte3 = buf[i++]; + int32_t code_point = static_cast(byte1 & 0xF) << 12; + code_point += static_cast(byte2 & 0x3F) << 6; + code_point += static_cast(byte3 & 0x3F); + result.push_back(code_point); + } else { + throw IllegalArgumentException("Bad encoding in jmutf-8 string"); + } + } + + return result; +} + std::u16string JavaModifiedUtf8::decode(const char* buf, uint16_t len) { std::u16string value; const auto end = buf + len; diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp index f6d1722b2c..7029d578f9 100644 --- a/cppcache/src/util/JavaModifiedUtf8.hpp +++ b/cppcache/src/util/JavaModifiedUtf8.hpp @@ -27,6 +27,9 @@ namespace geode { namespace client { namespace internal { +struct ju8type_traits : std::char_traits {}; +typedef std::basic_string ju8string; + struct JavaModifiedUtf8 { /** * Calculate the length of the given UTF-8 string when encoded in Java @@ -45,21 +48,36 @@ struct JavaModifiedUtf8 { /** * Converts given UTF-8 string to Java Modified UTF-8 string. */ - static std::string fromString(const std::string& utf8); + static ju8string fromString(const std::string& utf8); /** * Converts given UTF-16 string to Java Modified UTF-8 string. */ - static std::string fromString(const std::u16string& utf16); + static ju8string fromString(const std::u16string& utf16); + + /** + * Converts Java-Modified UTF-8 string to UTF-8 string. + */ + std::string toString(const ju8string& jmutf8); /** * Converts a single UTF-16 code unit into Java Modified UTF-8 code units. */ - static void encode(const char16_t c, std::string& jmutf8); + static void encode(const char16_t c, ju8string& jmutf8); static std::u16string decode(const char* buf, uint16_t len); + static ju8string decode(const std::string& utf8char); + + static std::u32string decodeU32(const char* buf, uint16_t len); + static char16_t decodeJavaModifiedUtf8Char(const char** pbuf); + + private: + static bool IsValidCodePoint(uint16_t code_point); + static ju8string decode2byte(const std::string& utf8char); + static ju8string decode3byte(const std::string& utf8char); + static ju8string decode4byte(const std::string& utf8char); }; } // namespace internal diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp index 0c330e64a6..5ccec2203a 100644 --- a/cppcache/test/CacheableStringTests.cpp +++ b/cppcache/test/CacheableStringTests.cpp @@ -21,11 +21,13 @@ #include #include +#include #include "ByteArrayFixture.hpp" #include "DataInputInternal.hpp" #include "DataOutputInternal.hpp" #include "SerializationRegistry.hpp" +#include "util/JavaModifiedUtf8.hpp" namespace { @@ -34,7 +36,9 @@ using apache::geode::client::CacheableString; using apache::geode::client::DataInputInternal; using apache::geode::client::DataOutput; using apache::geode::client::DataOutputInternal; +using apache::geode::client::IllegalArgumentException; using apache::geode::client::SerializationRegistry; +using apache::geode::client::internal::JavaModifiedUtf8; class TestDataOutput : public DataOutputInternal { public: @@ -220,4 +224,208 @@ TEST_F(CacheableStringTests, TestFromDataNonAsciiHuge) { EXPECT_EQ(utf8, str->value()); } +std::vector impossible_bytes[] = { + {0xFE}, {0xFF}, {0xFE, 0xFE, 0xFF, 0xFF}}; + +std::vector overlong_ascii_sequences[] = { + {0xC0, 0xAF}, {0xE0, 0x80, 0xAF}, {0xF0, 0x80, 0x80, 0xAF}}; + +std::vector maximum_overlong_sequences[] = { + {0xC1, 0xBF}, {0xE0, 0x9F, 0xBF}, {0xF0, 0x8F, 0xBF, 0xBF}}; + +std::vector overlong_nulls[] = { + {0xC0, 0x80}, {0xE0, 0x80, 0x80}, {0xF0, 0x80, 0x80, 0x80}}; + +std::vector single_utf_16_surrogates[] = { + {0xED, 0xA0, 0x80}, {0xED, 0xAD, 0xBF}, {0xED, 0xAE, 0x80}, + {0xED, 0xAF, 0xBF}, {0xED, 0xB0, 0x80}, {0xED, 0xBE, 0x80}, + {0xED, 0xBF, 0xBF}}; + +TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { + std::string bad_start_code; + bad_start_code += static_cast(0xF8); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code), + IllegalArgumentException); + + std::string too_short_2byte; + too_short_2byte += static_cast(0xC0); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte), + IllegalArgumentException); + + std::string bad_2byte_at_end = "foo"; + bad_2byte_at_end += static_cast(0xC0); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_2byte_at_end), + IllegalArgumentException); + + std::string too_long_3_byte_encode; + too_long_3_byte_encode.push_back(static_cast(0xE0)); + too_long_3_byte_encode.push_back(static_cast(0x80)); + too_long_3_byte_encode.push_back(static_cast(0x80)); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode), + IllegalArgumentException); + + std::string too_short_3byte; + too_short_3byte += static_cast(0xE8); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), + IllegalArgumentException); + + too_short_3byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), + IllegalArgumentException); + + std::string bad_3byte_at_end = "foo"; + bad_3byte_at_end += static_cast(0xE8); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + IllegalArgumentException); + + bad_3byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + IllegalArgumentException); + + std::string too_short_4byte; + too_short_4byte += static_cast(0xF7); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + too_short_4byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + too_short_4byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + std::string bad_4byte_at_end = "foo"; + bad_4byte_at_end += static_cast(0xF7); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + + bad_4byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + + bad_4byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + + for (auto sequence : impossible_bytes) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + IllegalArgumentException); + } + + for (auto sequence : overlong_ascii_sequences) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + IllegalArgumentException); + } + + for (auto sequence : maximum_overlong_sequences) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + IllegalArgumentException); + } + + for (auto sequence : overlong_nulls) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + IllegalArgumentException); + } + + for (auto sequence : single_utf_16_surrogates) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + IllegalArgumentException); + } +} + +std::pair, std::vector> lowest_boundary_sequences[] = { + {{0x00}, {0xC0, 0x80}}, + {{0xD0, 0x80}, {0xD0, 0x80}}, + {{0xE0, 0xA0, 0x80}, {0xE0, 0xA0, 0x80}}, + {{0xF0, 0x90, 0x80, 0x80}, {0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80}}}; + +std::pair, std::vector> highest_boundary_sequences[] = { + {{0x7F}, {0x7F}}, + {{0xDF, 0xBF}, {0xDF, 0xBF}}, + {{0xEF, 0xBF, 0xBF}, {0xEF, 0xBF, 0xBF}}, + {{0xF7, 0xBF, 0xBF, 0xBF}, {0xED, 0xAE, 0xBF, 0xED, 0xBF, 0xBF}}, +}; + +std::pair, std::vector> other_boundary_sequences[] = { + {{0xED, 0x9F, 0xBF}, {0xED, 0x9F, 0xBF}}, + {{0xEE, 0x80, 0x80}, {0xEE, 0x80, 0x80}}, + {{0xEF, 0xBF, 0xBD}, {0xEF, 0xBF, 0xBD}}, +}; + +#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0])) + +TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { + std::string utf8; + std::string expected; + utf8.push_back(0); + + auto jmutf8 = JavaModifiedUtf8::fromString(utf8); + + for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0; + i < ARRAY_SIZE(lowest_boundary_sequences); i++) { + utf8.clear(); + expected.clear(); + for (auto byte_value : std::get<0>(lowest_boundary_sequences[i])) { + utf8 += static_cast(byte_value); + } + for (auto byte_value : std::get<1>(lowest_boundary_sequences[i])) { + expected += static_cast(byte_value); + } + jmutf8 = JavaModifiedUtf8::fromString(utf8); + EXPECT_EQ(expected.size(), jmutf8.size()); + EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); + } + + for (decltype(ARRAY_SIZE(highest_boundary_sequences)) i = 0; + i < ARRAY_SIZE(highest_boundary_sequences); i++) { + utf8.clear(); + expected.clear(); + for (auto byte_value : std::get<0>(highest_boundary_sequences[i])) { + utf8 += static_cast(byte_value); + } + for (auto byte_value : std::get<1>(highest_boundary_sequences[i])) { + expected += static_cast(byte_value); + } + jmutf8 = JavaModifiedUtf8::fromString(utf8); + EXPECT_EQ(expected.size(), jmutf8.size()); + EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); + } + + for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0; + i < ARRAY_SIZE(other_boundary_sequences); i++) { + utf8.clear(); + expected.clear(); + for (auto byte_value : std::get<0>(other_boundary_sequences[i])) { + utf8 += static_cast(byte_value); + } + for (auto byte_value : std::get<1>(other_boundary_sequences[i])) { + expected += static_cast(byte_value); + } + jmutf8 = JavaModifiedUtf8::fromString(utf8); + EXPECT_EQ(expected.size(), jmutf8.size()); + EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); + } +} + } // namespace