From cf799414073bc1fb4b73c645b803e98300812022 Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Thu, 3 Mar 2022 11:08:05 -0800 Subject: [PATCH 01/10] GEODE-4198: Convert directly from utf-8 --> java modified utf-8, rather than utf-8 --> utf-16 --> jmutf-8. --- cppcache/src/util/JavaModifiedUtf8.cpp | 60 +++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index d6f53b81cd..4eee3efb3c 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -20,6 +20,7 @@ #include #include +#include "geode/ExceptionTypes.hpp" #include "string.hpp" namespace apache { @@ -58,7 +59,64 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) { } std::string JavaModifiedUtf8::fromString(const std::string& utf8) { - return fromString(to_utf16(utf8)); + std::string jmutf8; + auto cursor = 0; + + while (cursor < utf8.size()) { + auto byte1 = utf8[cursor++]; + if ((byte1 & 0x80) == 0) { + if (byte1) { + jmutf8 += byte1; + } else { + jmutf8 += static_cast(0xC0); + jmutf8 += static_cast(0x80); + } + } else if ((byte1 & 0xE0) == 0xC0) { + if (cursor <= utf8.size() - 1) { + jmutf8 += byte1; + jmutf8 += utf8[cursor++]; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + } else if ((byte1 & 0xF0) == 0xE0) { + if (cursor <= utf8.size() - 2) { + jmutf8 += byte1; + jmutf8 += utf8[cursor++]; + jmutf8 += utf8[cursor++]; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + } else if ((byte1 & 0xF8) == 0xF0) { + if (cursor <= utf8.size() - 3) { + auto byte2 = utf8[cursor++]; + auto byte3 = utf8[cursor++]; + auto byte4 = utf8[cursor++]; + + uint32_t code_point = (byte1 & 0x07) << 18; + code_point += (byte2 & 0x3F) << 12; + code_point += (byte3 & 0x3F) << 6; + code_point += byte4 & 0x3F; + + jmutf8 += static_cast(0xED); + 
jmutf8 += + static_cast((0xA0 + (((code_point >> 16) - 1) & 0x0F))); + jmutf8 += static_cast((0x80 + ((code_point >> 10) & 0x3F))); + + jmutf8 += static_cast(0xED); + jmutf8 += static_cast((0xB0 + ((code_point >> 6) & 0x0F))); + jmutf8 += byte4; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + } else { + throw IllegalArgumentException("Invalid utf-8 start code"); + } + } + return jmutf8; } std::string JavaModifiedUtf8::fromString(const std::u16string& utf16) { From ba848e02b8a9ab66bb594b668d0432ca22646838 Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Thu, 3 Mar 2022 11:48:20 -0800 Subject: [PATCH 02/10] GEODE-4189: Fix Linux builds --- cppcache/src/util/JavaModifiedUtf8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index 4eee3efb3c..6cb0dd64f0 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -60,7 +60,7 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) { std::string JavaModifiedUtf8::fromString(const std::string& utf8) { std::string jmutf8; - auto cursor = 0; + size_t cursor = 0; while (cursor < utf8.size()) { auto byte1 = utf8[cursor++]; From ad3e0a3bd534361b1ab68f0a15d36a1ae941d9e3 Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Mon, 7 Mar 2022 14:50:18 -0800 Subject: [PATCH 03/10] WIP: add some unit tests, and working on direct jmutf-8 --> UCS-4 --- cppcache/src/DataInput.cpp | 14 +-- cppcache/src/util/JavaModifiedUtf8.cpp | 123 +++++++++++++++++++++++-- cppcache/src/util/JavaModifiedUtf8.hpp | 12 ++- cppcache/test/CacheableStringTests.cpp | 84 +++++++++++++++++ 4 files changed, 215 insertions(+), 18 deletions(-) diff --git a/cppcache/src/DataInput.cpp b/cppcache/src/DataInput.cpp index 3590dd917e..cd672cbbfa 100644 --- a/cppcache/src/DataInput.cpp +++ b/cppcache/src/DataInput.cpp @@ -24,6 +24,8 @@ #include 
"util/JavaModifiedUtf8.hpp" #include "util/string.hpp" +using apache::geode::client::internal::JavaModifiedUtf8; + namespace apache { namespace geode { namespace client { @@ -62,8 +64,8 @@ void DataInput::readJavaModifiedUtf8( std::basic_string& value) { uint16_t length = readInt16(); _GEODE_CHECK_BUFFER_SIZE(length); - value = internal::JavaModifiedUtf8::decode( - reinterpret_cast(m_buf), length); + value = + JavaModifiedUtf8::decode(reinterpret_cast(m_buf), length); advanceCursor(length); } template APACHE_GEODE_EXPLICIT_TEMPLATE_EXPORT void @@ -72,10 +74,10 @@ DataInput::readJavaModifiedUtf8(std::u16string&); template void DataInput::readJavaModifiedUtf8( std::basic_string& value) { - // TODO string OPTIMIZE convert from UTF-16 to UCS-4 directly - std::u16string utf16; - readJavaModifiedUtf8(utf16); - value = to_ucs4(utf16); + uint16_t length = readInt16(); + _GEODE_CHECK_BUFFER_SIZE(length); + value = + JavaModifiedUtf8::decodeU32(reinterpret_cast(m_buf), length); } template APACHE_GEODE_EXPLICIT_TEMPLATE_EXPORT void DataInput::readJavaModifiedUtf8(std::u32string&); diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index 6cb0dd64f0..9d9c93cd64 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -58,8 +58,12 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) { return encodedLen; } -std::string JavaModifiedUtf8::fromString(const std::string& utf8) { - std::string jmutf8; +// Note on error handling in this method: +// Error handling here is done just to serve the purpose of not +// crashing, instead throwing exceptions. Beyond this, we do NOT fully +// validate the incoming utf-8 string, it is assumed to be otherwise correct. 
+ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { + ju8string jmutf8; size_t cursor = 0; while (cursor < utf8.size()) { @@ -72,7 +76,7 @@ std::string JavaModifiedUtf8::fromString(const std::string& utf8) { jmutf8 += static_cast(0x80); } } else if ((byte1 & 0xE0) == 0xC0) { - if (cursor <= utf8.size() - 1) { + if (utf8.size() > 0 && cursor <= utf8.size() - 1) { jmutf8 += byte1; jmutf8 += utf8[cursor++]; } else { @@ -80,7 +84,7 @@ std::string JavaModifiedUtf8::fromString(const std::string& utf8) { "Invalid utf-8 string passed to conversion method"); } } else if ((byte1 & 0xF0) == 0xE0) { - if (cursor <= utf8.size() - 2) { + if (utf8.size() > 2 && cursor <= utf8.size() - 2) { jmutf8 += byte1; jmutf8 += utf8[cursor++]; jmutf8 += utf8[cursor++]; @@ -89,7 +93,7 @@ std::string JavaModifiedUtf8::fromString(const std::string& utf8) { "Invalid utf-8 string passed to conversion method"); } } else if ((byte1 & 0xF8) == 0xF0) { - if (cursor <= utf8.size() - 3) { + if (utf8.size() > 3 && cursor <= utf8.size() - 3) { auto byte2 = utf8[cursor++]; auto byte3 = utf8[cursor++]; auto byte4 = utf8[cursor++]; @@ -119,8 +123,8 @@ std::string JavaModifiedUtf8::fromString(const std::string& utf8) { return jmutf8; } -std::string JavaModifiedUtf8::fromString(const std::u16string& utf16) { - std::string jmutf8; +ju8string JavaModifiedUtf8::fromString(const std::u16string& utf16) { + ju8string jmutf8; jmutf8.reserve(utf16.length()); for (auto&& c : utf16) { @@ -130,7 +134,7 @@ std::string JavaModifiedUtf8::fromString(const std::u16string& utf16) { return jmutf8; } -void JavaModifiedUtf8::encode(const char16_t c, std::string& jmutf8) { +void JavaModifiedUtf8::encode(const char16_t c, ju8string& jmutf8) { if (c == 0) { // NUL jmutf8 += static_cast(0xc0); @@ -147,6 +151,109 @@ void JavaModifiedUtf8::encode(const char16_t c, std::string& jmutf8) { jmutf8 += static_cast(0x80 | (c & 0x3F)); } } +// +// def utf8m_to_utf8s(string) : +// """ +// : param string : modified utf8 encoded 
string +// : return : utf8 encoded string +// """ +// new_string = [] +// length = len(string) +// i = 0 +// while i < length : +// byte1 = string[i] +// if (byte1 & 0x80) == 0 : # 1byte encoding +// new_string.append(byte1) +// elif(byte1 & 0xE0) == 0xC0: # 2byte encoding +// i += 1 +// byte2 = string[i] +// if byte1 != 0xC0 or byte2 != 0x80: +// new_string.append(byte1) +// new_string.append(byte2) +// else: +// new_string.append(0) +// elif(byte1 & 0xF0) == 0xE0 : # 3byte encoding +// i += 1 +// byte2 = string[i] +// i += 1 +// byte3 = string[i] +// if i + 3 < length and byte1 == 0xED and (byte2 & 0xF0) == 0xA0: +//# See if this is a pair of 3byte encodings +// byte4 = string[i + 1] +// byte5 = string[i + 2] +// byte6 = string[i + 3] +// if byte4 == 0xED and (byte5 & 0xF0) == 0xB0: +//# Bits in : 11101101 1010xxxx 10xxxxxx +//# Bits in : 11101101 1011xxxx 10xxxxxx +// i += 3 +// +//# Reconstruct 21 bit code +// u21 = ((byte2 & 0x0F) + 1) << 16 +// u21 += (byte3 & 0x3F) << 10 +// u21 += (byte5 & 0x0F) << 6 +// u21 += byte6 & 0x3F +// +//# Bits out : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx +// +//# Convert to 4byte encoding +// new_string.append(0xF0 + ((u21 >> 18) & 0x07)) +// new_string.append(0x80 + ((u21 >> 12) & 0x3F)) +// new_string.append(0x80 + ((u21 >> 6) & 0x3F)) +// new_string.append(0x80 + (u21 & 0x3F)) +// continue +// new_string.append(byte1) +// new_string.append(byte2) +// new_string.append(byte3) +// i += 1 +// return bytes(new_string).decode("utf-8") +// +std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) { + std::u32string result; + + uint16_t i = 0; + while (i < len) { + auto byte1 = buf[i++]; + if (!(byte1 & 0x80)) { + result += 0x00000000 & byte1; + } else if ((i < len) && ((byte1 & 0xE0) == 0xC0)) { + auto byte2 = buf[i++]; + if (!(byte1 == 0xC0) || !(byte2 == 0x80)) { + int32_t code_point = static_cast(byte1 & 0x1F) << 6; + code_point += static_cast(byte2 & 0x3F); + result += code_point; + } else { + 
result.append(static_cast(0)); + } + } else if ((i < len - 5) && (byte1 == 0xED)) { + auto byte2 = buf[i++]; + auto byte3 = buf[i++]; + auto byte4 = buf[i++]; + auto byte5 = buf[i++]; + auto byte6 = buf[i++]; + if (byte4 == 0xED) { + int32_t code_point = + 0x10000 + (static_cast(byte2 & 0xF) << 16); + code_point += static_cast(byte3 & 0x3F) << 10; + code_point += static_cast(byte5 & 0xF) << 6; + code_point += static_cast(byte6 & 0x3F); + result += code_point; + } else { + throw IllegalArgumentException("Bad encoding in jmutf-8 string"); + } + } else if ((i < len - 2) && ((byte1 & 0xE0) == 0xE0)) { + auto byte2 = buf[i++]; + auto byte3 = buf[i++]; + int32_t code_point = static_cast(byte1 & 0xF) << 12; + code_point += static_cast(byte2 & 0x3F) << 6; + code_point += static_cast(byte3 & 0x3F); + result += code_point; + } else { + throw IllegalArgumentException("Bad encoding in jmutf-8 string"); + } + } + + return result; +} std::u16string JavaModifiedUtf8::decode(const char* buf, uint16_t len) { std::u16string value; diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp index f6d1722b2c..6e08ab9080 100644 --- a/cppcache/src/util/JavaModifiedUtf8.hpp +++ b/cppcache/src/util/JavaModifiedUtf8.hpp @@ -27,6 +27,9 @@ namespace geode { namespace client { namespace internal { +struct ju8type_traits : std::char_traits {}; +typedef std::basic_string ju8string; + struct JavaModifiedUtf8 { /** * Calculate the length of the given UTF-8 string when encoded in Java @@ -45,20 +48,21 @@ struct JavaModifiedUtf8 { /** * Converts given UTF-8 string to Java Modified UTF-8 string. */ - static std::string fromString(const std::string& utf8); - + static ju8string fromString(const std::string& utf8); /** * Converts given UTF-16 string to Java Modified UTF-8 string. 
*/ - static std::string fromString(const std::u16string& utf16); + static ju8string fromString(const std::u16string& utf16); /** * Converts a single UTF-16 code unit into Java Modified UTF-8 code units. */ - static void encode(const char16_t c, std::string& jmutf8); + static void encode(const char16_t c, ju8string& jmutf8); static std::u16string decode(const char* buf, uint16_t len); + static std::u32string decodeU32(const char* buf, uint16_t len); + static char16_t decodeJavaModifiedUtf8Char(const char** pbuf); }; diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp index 0c330e64a6..cb175ebcfb 100644 --- a/cppcache/test/CacheableStringTests.cpp +++ b/cppcache/test/CacheableStringTests.cpp @@ -21,11 +21,13 @@ #include #include +#include #include "ByteArrayFixture.hpp" #include "DataInputInternal.hpp" #include "DataOutputInternal.hpp" #include "SerializationRegistry.hpp" +#include "util/JavaModifiedUtf8.hpp" namespace { @@ -34,7 +36,9 @@ using apache::geode::client::CacheableString; using apache::geode::client::DataInputInternal; using apache::geode::client::DataOutput; using apache::geode::client::DataOutputInternal; +using apache::geode::client::IllegalArgumentException; using apache::geode::client::SerializationRegistry; +using apache::geode::client::internal::JavaModifiedUtf8; class TestDataOutput : public DataOutputInternal { public: @@ -220,4 +224,84 @@ TEST_F(CacheableStringTests, TestFromDataNonAsciiHuge) { EXPECT_EQ(utf8, str->value()); } +TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { + { + std::string bad_start_code; + bad_start_code += static_cast(0xF8); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code), + IllegalArgumentException); + } + + { + std::string too_short_2byte; + too_short_2byte += static_cast(0xC0); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte), + IllegalArgumentException); + + std::string bad_2byte_at_end = "foo"; + too_short_2byte += static_cast(0xC0); + 
EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte), + IllegalArgumentException); + } + + { + std::string too_short_3byte; + too_short_3byte += static_cast(0xE8); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), + IllegalArgumentException); + + too_short_3byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), + IllegalArgumentException); + + std::string bad_3byte_at_end = "foo"; + bad_3byte_at_end += static_cast(0xE8); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + IllegalArgumentException); + + bad_3byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + IllegalArgumentException); + } + + { + std::string too_short_4byte; + too_short_4byte += static_cast(0xF7); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + too_short_4byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + too_short_4byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + std::string bad_4byte_at_end = "foo"; + bad_4byte_at_end += static_cast(0xF7); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + + bad_4byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + + bad_4byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + } +} + +TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { + std::string utf8; + utf8 += 'a'; + + auto jmutf8 = JavaModifiedUtf8::fromString(utf8); + EXPECT_EQ(utf8.size(), jmutf8.size()); + for (size_t i = 0; i < utf8.size(); i++) { + EXPECT_EQ(utf8[i], jmutf8[i]); + } +} + } // namespace From 2f90d4cbbe28881fec0cf815516c69d0c449e96b Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Tue, 8 Mar 2022 
08:54:40 -0800 Subject: [PATCH 04/10] GEODE-4189: Fix build break (and logic error) - Also delete commented block --- cppcache/src/util/JavaModifiedUtf8.cpp | 59 +------------------------- 1 file changed, 2 insertions(+), 57 deletions(-) diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index 9d9c93cd64..f465735fce 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -151,62 +151,7 @@ void JavaModifiedUtf8::encode(const char16_t c, ju8string& jmutf8) { jmutf8 += static_cast(0x80 | (c & 0x3F)); } } -// -// def utf8m_to_utf8s(string) : -// """ -// : param string : modified utf8 encoded string -// : return : utf8 encoded string -// """ -// new_string = [] -// length = len(string) -// i = 0 -// while i < length : -// byte1 = string[i] -// if (byte1 & 0x80) == 0 : # 1byte encoding -// new_string.append(byte1) -// elif(byte1 & 0xE0) == 0xC0: # 2byte encoding -// i += 1 -// byte2 = string[i] -// if byte1 != 0xC0 or byte2 != 0x80: -// new_string.append(byte1) -// new_string.append(byte2) -// else: -// new_string.append(0) -// elif(byte1 & 0xF0) == 0xE0 : # 3byte encoding -// i += 1 -// byte2 = string[i] -// i += 1 -// byte3 = string[i] -// if i + 3 < length and byte1 == 0xED and (byte2 & 0xF0) == 0xA0: -//# See if this is a pair of 3byte encodings -// byte4 = string[i + 1] -// byte5 = string[i + 2] -// byte6 = string[i + 3] -// if byte4 == 0xED and (byte5 & 0xF0) == 0xB0: -//# Bits in : 11101101 1010xxxx 10xxxxxx -//# Bits in : 11101101 1011xxxx 10xxxxxx -// i += 3 -// -//# Reconstruct 21 bit code -// u21 = ((byte2 & 0x0F) + 1) << 16 -// u21 += (byte3 & 0x3F) << 10 -// u21 += (byte5 & 0x0F) << 6 -// u21 += byte6 & 0x3F -// -//# Bits out : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -// -//# Convert to 4byte encoding -// new_string.append(0xF0 + ((u21 >> 18) & 0x07)) -// new_string.append(0x80 + ((u21 >> 12) & 0x3F)) -// new_string.append(0x80 + ((u21 >> 6) & 0x3F)) -// new_string.append(0x80 
+ (u21 & 0x3F)) -// continue -// new_string.append(byte1) -// new_string.append(byte2) -// new_string.append(byte3) -// i += 1 -// return bytes(new_string).decode("utf-8") -// + std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) { std::u32string result; @@ -214,7 +159,7 @@ std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) { while (i < len) { auto byte1 = buf[i++]; if (!(byte1 & 0x80)) { - result += 0x00000000 & byte1; + result += static_cast(byte1) & 0x000000FF; } else if ((i < len) && ((byte1 & 0xE0) == 0xC0)) { auto byte2 = buf[i++]; if (!(byte1 == 0xC0) || !(byte2 == 0x80)) { From f744b7cadd22a3f3661c06dc0c0dec003f4464eb Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Tue, 8 Mar 2022 09:24:42 -0800 Subject: [PATCH 05/10] GEODE-4189: Fix Linux build break --- cppcache/src/util/JavaModifiedUtf8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index f465735fce..6e3d301cbb 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -167,7 +167,7 @@ std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) { code_point += static_cast(byte2 & 0x3F); result += code_point; } else { - result.append(static_cast(0)); + result += static_cast(0); } } else if ((i < len - 5) && (byte1 == 0xED)) { auto byte2 = buf[i++]; From 4d47a9f5bc1f55de9f1211922ad53ef1b5682e2f Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Thu, 10 Mar 2022 07:26:38 -0800 Subject: [PATCH 06/10] GEODE-4189: Unit tests for utf-8 --> jmutf-8 complete --- cppcache/src/util/JavaModifiedUtf8.cpp | 76 +++++--- cppcache/src/util/JavaModifiedUtf8.hpp | 3 + cppcache/test/CacheableStringTests.cpp | 229 +++++++++++++++++++------ 3 files changed, 233 insertions(+), 75 deletions(-) diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index 6e3d301cbb..6c62019ad8 100644 --- 
a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -19,6 +19,7 @@ #include #include +#include #include "geode/ExceptionTypes.hpp" #include "string.hpp" @@ -58,6 +59,14 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) { return encodedLen; } +std::set utf16_surrogate_codes = {{0xD800}, {0xDB7F}, {0xDB80}, {0xDBFF}, + {0xDC00}, {0xDF80}, {0xDFFF}}; + +bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) { + return (code_point > 0x7FF) && (utf16_surrogate_codes.find(code_point) == + utf16_surrogate_codes.end()); +} + // Note on error handling in this method: // Error handling here is done just to serve the purpose of not // crashing, instead throwing exceptions. Beyond this, we do NOT fully @@ -77,17 +86,36 @@ ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { } } else if ((byte1 & 0xE0) == 0xC0) { if (utf8.size() > 0 && cursor <= utf8.size() - 1) { - jmutf8 += byte1; - jmutf8 += utf8[cursor++]; + auto byte2 = utf8[cursor++]; + int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f); + if (code_point > 0x7F) { + jmutf8 += byte1; + jmutf8 += byte2; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "ASCII character)"); + } } else { throw IllegalArgumentException( "Invalid utf-8 string passed to conversion method"); } } else if ((byte1 & 0xF0) == 0xE0) { if (utf8.size() > 2 && cursor <= utf8.size() - 2) { - jmutf8 += byte1; - jmutf8 += utf8[cursor++]; - jmutf8 += utf8[cursor++]; + auto byte2 = utf8[cursor++]; + auto byte3 = utf8[cursor++]; + + uint16_t code_point = + ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F); + if (IsValidCodePoint(code_point)) { + jmutf8 += byte1; + jmutf8 += byte2; + jmutf8 += byte3; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "3-byte encoding)"); + } } else { throw IllegalArgumentException( "Invalid utf-8 
string passed to conversion method"); @@ -103,14 +131,20 @@ ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { code_point += (byte3 & 0x3F) << 6; code_point += byte4 & 0x3F; - jmutf8 += static_cast(0xED); - jmutf8 += - static_cast((0xA0 + (((code_point >> 16) - 1) & 0x0F))); - jmutf8 += static_cast((0x80 + ((code_point >> 10) & 0x3F))); + if (code_point > 0xFFFF) { + jmutf8 += static_cast(0xED); + jmutf8 += + static_cast((0xA0 + (((code_point >> 16) - 1) & 0x0F))); + jmutf8 += static_cast((0x80 + ((code_point >> 10) & 0x3F))); - jmutf8 += static_cast(0xED); - jmutf8 += static_cast((0xB0 + ((code_point >> 6) & 0x0F))); - jmutf8 += byte4; + jmutf8 += static_cast(0xED); + jmutf8 += static_cast((0xB0 + ((code_point >> 6) & 0x0F))); + jmutf8 += byte4; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "4-byte encoding)"); + } } else { throw IllegalArgumentException( "Invalid utf-8 string passed to conversion method"); @@ -162,36 +196,36 @@ std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) { result += static_cast(byte1) & 0x000000FF; } else if ((i < len) && ((byte1 & 0xE0) == 0xC0)) { auto byte2 = buf[i++]; - if (!(byte1 == 0xC0) || !(byte2 == 0x80)) { + if (((byte1 & 0xFF) == 0xC0) && ((byte2 & 0xFF) == 0x80)) { + result.push_back(static_cast(0)); + } else { int32_t code_point = static_cast(byte1 & 0x1F) << 6; code_point += static_cast(byte2 & 0x3F); - result += code_point; - } else { - result += static_cast(0); + result.push_back(code_point); } - } else if ((i < len - 5) && (byte1 == 0xED)) { + } else if ((i < len - 4) && ((byte1 & 0xED) == 0xED)) { auto byte2 = buf[i++]; auto byte3 = buf[i++]; auto byte4 = buf[i++]; auto byte5 = buf[i++]; auto byte6 = buf[i++]; - if (byte4 == 0xED) { + if ((byte4 & 0xED) == 0xED) { int32_t code_point = 0x10000 + (static_cast(byte2 & 0xF) << 16); code_point += static_cast(byte3 & 0x3F) << 10; code_point += static_cast(byte5 & 0xF) << 
6; code_point += static_cast(byte6 & 0x3F); - result += code_point; + result.push_back(code_point); } else { throw IllegalArgumentException("Bad encoding in jmutf-8 string"); } - } else if ((i < len - 2) && ((byte1 & 0xE0) == 0xE0)) { + } else if ((i < len - 1) && ((byte1 & 0xE0) == 0xE0)) { auto byte2 = buf[i++]; auto byte3 = buf[i++]; int32_t code_point = static_cast(byte1 & 0xF) << 12; code_point += static_cast(byte2 & 0x3F) << 6; code_point += static_cast(byte3 & 0x3F); - result += code_point; + result.push_back(code_point); } else { throw IllegalArgumentException("Bad encoding in jmutf-8 string"); } diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp index 6e08ab9080..8a483db116 100644 --- a/cppcache/src/util/JavaModifiedUtf8.hpp +++ b/cppcache/src/util/JavaModifiedUtf8.hpp @@ -64,6 +64,9 @@ struct JavaModifiedUtf8 { static std::u32string decodeU32(const char* buf, uint16_t len); static char16_t decodeJavaModifiedUtf8Char(const char** pbuf); + + private: + static bool IsValidCodePoint(uint16_t code_point); }; } // namespace internal diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp index cb175ebcfb..6445882a3f 100644 --- a/cppcache/test/CacheableStringTests.cpp +++ b/cppcache/test/CacheableStringTests.cpp @@ -224,83 +224,204 @@ TEST_F(CacheableStringTests, TestFromDataNonAsciiHuge) { EXPECT_EQ(utf8, str->value()); } -TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { - { - std::string bad_start_code; - bad_start_code += static_cast(0xF8); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code), - IllegalArgumentException); - } +std::vector impossible_bytes[] = { + {0xFE}, {0xFF}, {0xFE, 0xFE, 0xFF, 0xFF}}; - { - std::string too_short_2byte; - too_short_2byte += static_cast(0xC0); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte), - IllegalArgumentException); +std::vector overlong_ascii_sequences[] = { + {0xC0, 0xAF}, {0xE0, 0x80, 0xAF}, {0xF0, 0x80, 0x80, 
0xAF}}; - std::string bad_2byte_at_end = "foo"; - too_short_2byte += static_cast(0xC0); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte), - IllegalArgumentException); - } +std::vector maximum_overlong_sequences[] = { + {0xC1, 0xBF}, {0xE0, 0x9F, 0xBF}, {0xF0, 0x8F, 0xBF, 0xBF}}; - { - std::string too_short_3byte; - too_short_3byte += static_cast(0xE8); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), - IllegalArgumentException); +std::vector overlong_nulls[] = { + {0xC0, 0x80}, {0xE0, 0x80, 0x80}, {0xF0, 0x80, 0x80, 0x80}}; - too_short_3byte += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), - IllegalArgumentException); +std::vector single_utf_16_surrogates[] = { + {0xED, 0xA0, 0x80}, {0xED, 0xAD, 0xBF}, {0xED, 0xAE, 0x80}, + {0xED, 0xAF, 0xBF}, {0xED, 0xB0, 0x80}, {0xED, 0xBE, 0x80}, + {0xED, 0xBF, 0xBF}}; - std::string bad_3byte_at_end = "foo"; - bad_3byte_at_end += static_cast(0xE8); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), +TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { + std::string bad_start_code; + bad_start_code += static_cast(0xF8); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code), + IllegalArgumentException); + + std::string too_short_2byte; + too_short_2byte += static_cast(0xC0); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte), + IllegalArgumentException); + + std::string bad_2byte_at_end = "foo"; + bad_2byte_at_end += static_cast(0xC0); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_2byte_at_end), + IllegalArgumentException); + + std::string too_long_3_byte_encode; + too_long_3_byte_encode.push_back(0xE0); + too_long_3_byte_encode.push_back(0x80); + too_long_3_byte_encode.push_back(0x80); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode), + IllegalArgumentException); + + std::string too_short_3byte; + too_short_3byte += static_cast(0xE8); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), + 
IllegalArgumentException); + + too_short_3byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), + IllegalArgumentException); + + std::string bad_3byte_at_end = "foo"; + bad_3byte_at_end += static_cast(0xE8); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + IllegalArgumentException); + + bad_3byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + IllegalArgumentException); + + std::string too_short_4byte; + too_short_4byte += static_cast(0xF7); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + too_short_4byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + too_short_4byte += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + IllegalArgumentException); + + std::string bad_4byte_at_end = "foo"; + bad_4byte_at_end += static_cast(0xF7); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + + bad_4byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + + bad_4byte_at_end += static_cast(0x1); + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + IllegalArgumentException); + + for (auto sequence : impossible_bytes) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); + } - bad_3byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + for (auto sequence : overlong_ascii_sequences) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); } - { - std::string too_short_4byte; - too_short_4byte += 
static_cast(0xF7); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + for (auto sequence : maximum_overlong_sequences) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); + } - too_short_4byte += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + for (auto sequence : overlong_nulls) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); + } - too_short_4byte += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + for (auto sequence : single_utf_16_surrogates) { + std::string bad_sequence; + for (auto byte_value : sequence) { + bad_sequence += static_cast(byte_value); + } + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); + } +} - std::string bad_4byte_at_end = "foo"; - bad_4byte_at_end += static_cast(0xF7); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), - IllegalArgumentException); +std::pair, std::vector> lowest_boundary_sequences[] = { + {{0x00}, {0xC0, 0x80}}, + {{0xD0, 0x80}, {0xD0, 0x80}}, + {{0xE0, 0xA0, 0x80}, {0xE0, 0xA0, 0x80}}, + {{0xF0, 0x90, 0x80, 0x80}, {0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80}}}; + +std::pair, std::vector> highest_boundary_sequences[] = { + {{0x7F}, {0x7F}}, + {{0xDF, 0xCF}, {0xDF, 0xCF}}, + {{0xEF, 0xBF, 0xBF}, {0xEF, 0xBF, 0xBF}}, + {{0xF7, 0xBF, 0xBF, 0xBF}, {0xED, 0xAE, 0xBF, 0xED, 0xBF, 0xBF}}, +}; - bad_4byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), - IllegalArgumentException); +std::pair, std::vector> other_boundary_sequences[] = { + {{0xED, 0x9F, 0xBF}, {0xED, 0x9F, 0xBF}}, + {{0xEE, 0x80, 0x80}, {0xEE, 0x80, 0x80}}, + {{0xEF, 0xBF, 0xBD}, {0xEF, 0xBF, 0xBD}}, +}; - 
bad_4byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), - IllegalArgumentException); - } -} +#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0])) TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { std::string utf8; - utf8 += 'a'; + std::string expected; + utf8.push_back(0); auto jmutf8 = JavaModifiedUtf8::fromString(utf8); - EXPECT_EQ(utf8.size(), jmutf8.size()); - for (size_t i = 0; i < utf8.size(); i++) { - EXPECT_EQ(utf8[i], jmutf8[i]); + + for (auto i = 0; i < ARRAY_SIZE(lowest_boundary_sequences); i++) { + utf8.clear(); + expected.clear(); + for (auto byte_value : std::get<0>(lowest_boundary_sequences[i])) { + utf8 += static_cast(byte_value); + } + for (auto byte_value : std::get<1>(lowest_boundary_sequences[i])) { + expected += static_cast(byte_value); + } + jmutf8 = JavaModifiedUtf8::fromString(utf8); + EXPECT_EQ(expected.size(), jmutf8.size()); + EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); + } + + for (auto i = 0; i < ARRAY_SIZE(highest_boundary_sequences); i++) { + utf8.clear(); + expected.clear(); + for (auto byte_value : std::get<0>(highest_boundary_sequences[i])) { + utf8 += static_cast(byte_value); + } + for (auto byte_value : std::get<1>(highest_boundary_sequences[i])) { + expected += static_cast(byte_value); + } + jmutf8 = JavaModifiedUtf8::fromString(utf8); + EXPECT_EQ(expected.size(), jmutf8.size()); + EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); + } + + for (auto i = 0; i < ARRAY_SIZE(other_boundary_sequences); i++) { + utf8.clear(); + expected.clear(); + for (auto byte_value : std::get<0>(other_boundary_sequences[i])) { + utf8 += static_cast(byte_value); + } + for (auto byte_value : std::get<1>(other_boundary_sequences[i])) { + expected += static_cast(byte_value); + } + jmutf8 = JavaModifiedUtf8::fromString(utf8); + EXPECT_EQ(expected.size(), jmutf8.size()); + EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); } } From 
63b417a8ed77395f50a45e6de5e5485d9ac56ec2 Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Thu, 10 Mar 2022 07:59:40 -0800 Subject: [PATCH 07/10] GEODE-4189: Fix Linux build break - Also delete obsolete comment block --- cppcache/src/util/JavaModifiedUtf8.cpp | 4 ---- cppcache/test/CacheableStringTests.cpp | 9 ++++++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index 6c62019ad8..875e1efcd7 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -67,10 +67,6 @@ bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) { utf16_surrogate_codes.end()); } -// Note on error handling in this method: -// Error handling here is done just to serve the purpose of not -// crashing, instead throwing exceptions. Beyond this, we do NOT fully -// validate the incoming utf-8 string, it is assumed to be otherwise correct. ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { ju8string jmutf8; size_t cursor = 0; diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp index 6445882a3f..1927103628 100644 --- a/cppcache/test/CacheableStringTests.cpp +++ b/cppcache/test/CacheableStringTests.cpp @@ -382,7 +382,8 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { auto jmutf8 = JavaModifiedUtf8::fromString(utf8); - for (auto i = 0; i < ARRAY_SIZE(lowest_boundary_sequences); i++) { + for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0; + i < ARRAY_SIZE(lowest_boundary_sequences); i++) { utf8.clear(); expected.clear(); for (auto byte_value : std::get<0>(lowest_boundary_sequences[i])) { @@ -396,7 +397,8 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); } - for (auto i = 0; i < ARRAY_SIZE(highest_boundary_sequences); i++) { + for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0; + i < 
ARRAY_SIZE(highest_boundary_sequences); i++) { utf8.clear(); expected.clear(); for (auto byte_value : std::get<0>(highest_boundary_sequences[i])) { @@ -410,7 +412,8 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); } - for (auto i = 0; i < ARRAY_SIZE(other_boundary_sequences); i++) { + for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0; + i < ARRAY_SIZE(other_boundary_sequences); i++) { utf8.clear(); expected.clear(); for (auto byte_value : std::get<0>(other_boundary_sequences[i])) { From a426c960763005ddd1c20c80d64d91ba3925e4c0 Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Thu, 10 Mar 2022 09:30:22 -0800 Subject: [PATCH 08/10] GEODE-4189: Fix another Linux build break --- cppcache/test/CacheableStringTests.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp index 1927103628..026826a49f 100644 --- a/cppcache/test/CacheableStringTests.cpp +++ b/cppcache/test/CacheableStringTests.cpp @@ -258,9 +258,9 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { IllegalArgumentException); std::string too_long_3_byte_encode; - too_long_3_byte_encode.push_back(0xE0); - too_long_3_byte_encode.push_back(0x80); - too_long_3_byte_encode.push_back(0x80); + too_long_3_byte_encode.push_back(static_cast(0xE0)); + too_long_3_byte_encode.push_back(static_cast(0x80)); + too_long_3_byte_encode.push_back(static_cast(0x80)); EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode), IllegalArgumentException); From 15fb7e90b696b42e1f1011210790ca9742c8208b Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Fri, 11 Mar 2022 07:26:41 -0800 Subject: [PATCH 09/10] GEODE-4189: experimenting with simpler scanning code --- cppcache/src/util/JavaModifiedUtf8.cpp | 163 +++++++++++++++++++++++-- cppcache/src/util/JavaModifiedUtf8.hpp | 9 ++ cppcache/test/CacheableStringTests.cpp | 46 +++---- 3 files 
changed, 187 insertions(+), 31 deletions(-) diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index 875e1efcd7..dcf5978b93 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -67,6 +67,149 @@ bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) { utf16_surrogate_codes.end()); } +enum class UtfScanState : int32_t { + Initial = 0, + Need1 = 1, + Need2 = 2, + Need3 = 3, + Need4 = 4, + Need5 = 5 +}; + +ju8string JavaModifiedUtf8::decode(const std::string& utf8char) { + ju8string jmutf8char; + + if (utf8char.size() == 2) { + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + if (((byte1 & 0xE0) == 0xC0) && ((byte2 & 0x80) == 0x80)) { + int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f); + if (code_point > 0x7F) { + jmutf8char += byte1; + jmutf8char += byte2; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "ASCII character)"); + } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + } else if (utf8char.size() == 3) { + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + auto byte3 = utf8char[2]; + if (((byte1 & 0xF0) == 0xE0) && ((byte2 & 0x80) == 0x80) && + ((byte3 & 0x80) == 0x80)) { + uint16_t code_point = + ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F); + if (IsValidCodePoint(code_point)) { + jmutf8char += byte1; + jmutf8char += byte2; + jmutf8char += byte3; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "3-byte encoding)"); + } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + } else if (utf8char.size() == 4) { + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + auto byte3 = utf8char[2]; + auto byte4 = utf8char[3]; + if (((byte1 & 0xF8) == 0xF0) && ((byte2 & 0x80) == 0x80) && + ((byte3 & 
0x80) == 0x80) && ((byte4 & 0x80) == 0x80)) { + uint32_t code_point = (byte1 & 0x07) << 18; + code_point += (byte2 & 0x3F) << 12; + code_point += (byte3 & 0x3F) << 6; + code_point += byte4 & 0x3F; + + if (code_point > 0xFFFF) { + jmutf8char += static_cast(0xED); + jmutf8char += + static_cast((0xA0 + (((code_point >> 16) - 1) & 0x0F))); + jmutf8char += + static_cast((0x80 + ((code_point >> 10) & 0x3F))); + + jmutf8char += static_cast(0xED); + jmutf8char += static_cast((0xB0 + ((code_point >> 6) & 0x0F))); + jmutf8char += byte4; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method (overly long " + "4-byte encoding)"); + } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + } + + return jmutf8char; +} + +ju8string JavaModifiedUtf8::fromStringImproved(const std::string& utf8) { + ju8string jmutf8; + size_t cursor = 0; + auto state = UtfScanState::Initial; + std::string current; + + while (cursor < utf8.size()) { + auto byte = utf8[cursor++]; + + switch (state) { + case UtfScanState::Initial: + if ((byte & 0x80) == 0) { + if (byte) { + jmutf8 += byte; + } else { + jmutf8 += static_cast(0xC0); + jmutf8 += static_cast(0x80); + } + } else if ((byte & 0xE0) == 0xC0) { + current += byte; + state = UtfScanState::Need1; + } else if ((byte & 0xF0) == 0xE0) { + current += byte; + state = UtfScanState::Need2; + } else if ((byte & 0xF8) == 0xF0) { + current += byte; + state = UtfScanState::Need3; + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + break; + case UtfScanState::Need1: { + current += byte; + state = UtfScanState::Initial; + jmutf8 += JavaModifiedUtf8::decode(current); + current.clear(); + } break; + case UtfScanState::Need2: + current += byte; + state = UtfScanState::Need1; + break; + case UtfScanState::Need3: + current += byte; + state = UtfScanState::Need2; + break; + } + } + + if (state != UtfScanState::Initial) { + 
throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + return jmutf8; +} + ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { ju8string jmutf8; size_t cursor = 0; @@ -153,15 +296,19 @@ ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { return jmutf8; } -ju8string JavaModifiedUtf8::fromString(const std::u16string& utf16) { - ju8string jmutf8; - jmutf8.reserve(utf16.length()); +std::string JavaModifiedUtf8::toString(const ju8string& jmutf8) { + // std::string utf8; + // size_t cursor = 0; - for (auto&& c : utf16) { - encode(c, jmutf8); - } - - return jmutf8; + // while (cursor < jmutf8.size()) { + // auto byte1 = jmutf8[cursor++]; + // if ((byte1 & 0x80) == 0) { + // utf8.push_back(byte1); + // } else if ((byte1 & 0xE0) == 0xC0) { + // auto byte2 = jmutf8[cursor++]; + // if () } + //} + return ""; } void JavaModifiedUtf8::encode(const char16_t c, ju8string& jmutf8) { diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp index 8a483db116..cdd19367d1 100644 --- a/cppcache/src/util/JavaModifiedUtf8.hpp +++ b/cppcache/src/util/JavaModifiedUtf8.hpp @@ -49,11 +49,18 @@ struct JavaModifiedUtf8 { * Converts given UTF-8 string to Java Modified UTF-8 string. */ static ju8string fromString(const std::string& utf8); + static ju8string fromStringImproved(const std::string& utf8); + /** * Converts given UTF-16 string to Java Modified UTF-8 string. */ static ju8string fromString(const std::u16string& utf16); + /** + * Converts Java-Modified UTF-8 string to UTF-8 string. + */ + std::string toString(const ju8string& jmutf8); + /** * Converts a single UTF-16 code unit into Java Modified UTF-8 code units. 
*/ @@ -61,6 +68,8 @@ struct JavaModifiedUtf8 { static std::u16string decode(const char* buf, uint16_t len); + static ju8string decode(const std::string& utf8char); + static std::u32string decodeU32(const char* buf, uint16_t len); static char16_t decodeJavaModifiedUtf8Char(const char** pbuf); diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp index 026826a49f..38c82ed77a 100644 --- a/cppcache/test/CacheableStringTests.cpp +++ b/cppcache/test/CacheableStringTests.cpp @@ -244,68 +244,68 @@ std::vector single_utf_16_surrogates[] = { TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { std::string bad_start_code; bad_start_code += static_cast(0xF8); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_start_code), IllegalArgumentException); std::string too_short_2byte; too_short_2byte += static_cast(0xC0); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_2byte), IllegalArgumentException); std::string bad_2byte_at_end = "foo"; bad_2byte_at_end += static_cast(0xC0); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_2byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_2byte_at_end), IllegalArgumentException); std::string too_long_3_byte_encode; too_long_3_byte_encode.push_back(static_cast(0xE0)); too_long_3_byte_encode.push_back(static_cast(0x80)); too_long_3_byte_encode.push_back(static_cast(0x80)); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_long_3_byte_encode), IllegalArgumentException); std::string too_short_3byte; too_short_3byte += static_cast(0xE8); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_3byte), IllegalArgumentException); too_short_3byte += static_cast(0x1); - 
EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_3byte), IllegalArgumentException); std::string bad_3byte_at_end = "foo"; bad_3byte_at_end += static_cast(0xE8); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_3byte_at_end), IllegalArgumentException); bad_3byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_3byte_at_end), IllegalArgumentException); std::string too_short_4byte; too_short_4byte += static_cast(0xF7); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte), IllegalArgumentException); too_short_4byte += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte), IllegalArgumentException); too_short_4byte += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte), IllegalArgumentException); std::string bad_4byte_at_end = "foo"; bad_4byte_at_end += static_cast(0xF7); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end), IllegalArgumentException); bad_4byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end), IllegalArgumentException); bad_4byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end), IllegalArgumentException); for (auto sequence : impossible_bytes) { @@ -313,7 +313,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : 
sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), IllegalArgumentException); } @@ -322,7 +322,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), IllegalArgumentException); } @@ -331,7 +331,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), IllegalArgumentException); } @@ -340,7 +340,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), IllegalArgumentException); } @@ -349,7 +349,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), IllegalArgumentException); } } @@ -380,7 +380,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { std::string expected; utf8.push_back(0); - auto jmutf8 = JavaModifiedUtf8::fromString(utf8); + auto jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8); for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0; i < ARRAY_SIZE(lowest_boundary_sequences); i++) { @@ -392,7 +392,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { for (auto byte_value : std::get<1>(lowest_boundary_sequences[i])) { expected += static_cast(byte_value); } - jmutf8 = 
JavaModifiedUtf8::fromString(utf8); + jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8); EXPECT_EQ(expected.size(), jmutf8.size()); EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); } @@ -407,7 +407,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { for (auto byte_value : std::get<1>(highest_boundary_sequences[i])) { expected += static_cast(byte_value); } - jmutf8 = JavaModifiedUtf8::fromString(utf8); + jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8); EXPECT_EQ(expected.size(), jmutf8.size()); EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); } @@ -422,7 +422,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { for (auto byte_value : std::get<1>(other_boundary_sequences[i])) { expected += static_cast(byte_value); } - jmutf8 = JavaModifiedUtf8::fromString(utf8); + jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8); EXPECT_EQ(expected.size(), jmutf8.size()); EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); } From 959dcff7c949d7583adb46bf32ea24db30a4a632 Mon Sep 17 00:00:00 2001 From: Blake Bender Date: Fri, 11 Mar 2022 08:29:51 -0800 Subject: [PATCH 10/10] GEODE-4189: Unit tests passing with new scanner --- cppcache/src/util/JavaModifiedUtf8.cpp | 287 ++++++++++--------------- cppcache/src/util/JavaModifiedUtf8.hpp | 4 +- cppcache/test/CacheableStringTests.cpp | 50 ++--- 3 files changed, 138 insertions(+), 203 deletions(-) diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp index dcf5978b93..58cd536b26 100644 --- a/cppcache/src/util/JavaModifiedUtf8.cpp +++ b/cppcache/src/util/JavaModifiedUtf8.cpp @@ -69,91 +69,114 @@ bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) { enum class UtfScanState : int32_t { Initial = 0, - Need1 = 1, - Need2 = 2, - Need3 = 3, - Need4 = 4, - Need5 = 5 + Continuing = 1, }; -ju8string JavaModifiedUtf8::decode(const std::string& utf8char) { +ju8string JavaModifiedUtf8::decode2byte(const std::string& utf8char) { 
ju8string jmutf8char; - - if (utf8char.size() == 2) { - auto byte1 = utf8char[0]; - auto byte2 = utf8char[1]; - if (((byte1 & 0xE0) == 0xC0) && ((byte2 & 0x80) == 0x80)) { - int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f); - if (code_point > 0x7F) { - jmutf8char += byte1; - jmutf8char += byte2; - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method (overly long " - "ASCII character)"); - } + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + if (((byte1 & 0xE0) == 0xC0) && ((byte2 & 0x80) == 0x80)) { + int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f); + if (code_point > 0x7F) { + jmutf8char += byte1; + jmutf8char += byte2; } else { throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method"); + "Invalid utf-8 string passed to conversion method (overly long " + "ASCII character)"); } - } else if (utf8char.size() == 3) { - auto byte1 = utf8char[0]; - auto byte2 = utf8char[1]; - auto byte3 = utf8char[2]; - if (((byte1 & 0xF0) == 0xE0) && ((byte2 & 0x80) == 0x80) && - ((byte3 & 0x80) == 0x80)) { - uint16_t code_point = - ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F); - if (IsValidCodePoint(code_point)) { - jmutf8char += byte1; - jmutf8char += byte2; - jmutf8char += byte3; - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method (overly long " - "3-byte encoding)"); - } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + return jmutf8char; +} + +ju8string JavaModifiedUtf8::decode3byte(const std::string& utf8char) { + ju8string jmutf8char; + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + auto byte3 = utf8char[2]; + if (((byte1 & 0xF0) == 0xE0) && ((byte2 & 0x80) == 0x80) && + ((byte3 & 0x80) == 0x80)) { + uint16_t code_point = + ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F); + if (IsValidCodePoint(code_point)) { + jmutf8char += byte1; + 
jmutf8char += byte2; + jmutf8char += byte3; } else { throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method"); + "Invalid utf-8 string passed to conversion method (overly long " + "3-byte encoding)"); } - } else if (utf8char.size() == 4) { - auto byte1 = utf8char[0]; - auto byte2 = utf8char[1]; - auto byte3 = utf8char[2]; - auto byte4 = utf8char[3]; - if (((byte1 & 0xF8) == 0xF0) && ((byte2 & 0x80) == 0x80) && - ((byte3 & 0x80) == 0x80) && ((byte4 & 0x80) == 0x80)) { - uint32_t code_point = (byte1 & 0x07) << 18; - code_point += (byte2 & 0x3F) << 12; - code_point += (byte3 & 0x3F) << 6; - code_point += byte4 & 0x3F; - - if (code_point > 0xFFFF) { - jmutf8char += static_cast(0xED); - jmutf8char += - static_cast((0xA0 + (((code_point >> 16) - 1) & 0x0F))); - jmutf8char += - static_cast((0x80 + ((code_point >> 10) & 0x3F))); - - jmutf8char += static_cast(0xED); - jmutf8char += static_cast((0xB0 + ((code_point >> 6) & 0x0F))); - jmutf8char += byte4; - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method (overly long " - "4-byte encoding)"); - } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + return jmutf8char; +} + +ju8string JavaModifiedUtf8::decode4byte(const std::string& utf8char) { + ju8string jmutf8char; + auto byte1 = utf8char[0]; + auto byte2 = utf8char[1]; + auto byte3 = utf8char[2]; + auto byte4 = utf8char[3]; + if (((byte1 & 0xF8) == 0xF0) && ((byte2 & 0x80) == 0x80) && + ((byte3 & 0x80) == 0x80) && ((byte4 & 0x80) == 0x80)) { + uint32_t code_point = (byte1 & 0x07) << 18; + code_point += (byte2 & 0x3F) << 12; + code_point += (byte3 & 0x3F) << 6; + code_point += byte4 & 0x3F; + + if (code_point > 0xFFFF) { + jmutf8char += static_cast(0xED); + jmutf8char += + static_cast((0xA0 + (((code_point >> 16) - 1) & 0x0F))); + jmutf8char += static_cast((0x80 + ((code_point >> 10) & 0x3F))); + + jmutf8char += static_cast(0xED); + jmutf8char += 
static_cast((0xB0 + ((code_point >> 6) & 0x0F))); + jmutf8char += byte4; } else { throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method"); + "Invalid utf-8 string passed to conversion method (overly long " + "4-byte encoding)"); } + } else { + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); } return jmutf8char; } -ju8string JavaModifiedUtf8::fromStringImproved(const std::string& utf8) { +ju8string JavaModifiedUtf8::decode(const std::string& utf8char) { + ju8string jmutf8char; + + switch (utf8char.size()) { + case 2: + jmutf8char = decode2byte(utf8char); + break; + case 3: + jmutf8char = decode3byte(utf8char); + break; + case 4: + jmutf8char = decode4byte(utf8char); + break; + default: + throw IllegalArgumentException( + "Invalid utf-8 string passed to conversion method"); + } + + return jmutf8char; +} + +ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { ju8string jmutf8; size_t cursor = 0; auto state = UtfScanState::Initial; @@ -171,37 +194,33 @@ ju8string JavaModifiedUtf8::fromStringImproved(const std::string& utf8) { jmutf8 += static_cast(0xC0); jmutf8 += static_cast(0x80); } - } else if ((byte & 0xE0) == 0xC0) { - current += byte; - state = UtfScanState::Need1; - } else if ((byte & 0xF0) == 0xE0) { - current += byte; - state = UtfScanState::Need2; - } else if ((byte & 0xF8) == 0xF0) { - current += byte; - state = UtfScanState::Need3; - } else { + } else if ((byte & 0xc0) == 0x80) { throw IllegalArgumentException( "Invalid utf-8 string passed to conversion method"); + } else { + current += byte; + state = UtfScanState::Continuing; } break; - case UtfScanState::Need1: { - current += byte; - state = UtfScanState::Initial; - jmutf8 += JavaModifiedUtf8::decode(current); - current.clear(); + case UtfScanState::Continuing: { + if ((byte & 0xC0) == 0x80) { + current += byte; + } else { + cursor--; + state = UtfScanState::Initial; + jmutf8 += JavaModifiedUtf8::decode(current); + 
current.clear(); + } } break; - case UtfScanState::Need2: - current += byte; - state = UtfScanState::Need1; - break; - case UtfScanState::Need3: - current += byte; - state = UtfScanState::Need2; - break; } } + if (current.size() && state == UtfScanState::Continuing) { + state = UtfScanState::Initial; + jmutf8 += JavaModifiedUtf8::decode(current); + current.clear(); + } + if (state != UtfScanState::Initial) { throw IllegalArgumentException( "Invalid utf-8 string passed to conversion method"); @@ -210,92 +229,6 @@ ju8string JavaModifiedUtf8::fromStringImproved(const std::string& utf8) { return jmutf8; } -ju8string JavaModifiedUtf8::fromString(const std::string& utf8) { - ju8string jmutf8; - size_t cursor = 0; - - while (cursor < utf8.size()) { - auto byte1 = utf8[cursor++]; - if ((byte1 & 0x80) == 0) { - if (byte1) { - jmutf8 += byte1; - } else { - jmutf8 += static_cast(0xC0); - jmutf8 += static_cast(0x80); - } - } else if ((byte1 & 0xE0) == 0xC0) { - if (utf8.size() > 0 && cursor <= utf8.size() - 1) { - auto byte2 = utf8[cursor++]; - int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f); - if (code_point > 0x7F) { - jmutf8 += byte1; - jmutf8 += byte2; - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method (overly long " - "ASCII character)"); - } - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method"); - } - } else if ((byte1 & 0xF0) == 0xE0) { - if (utf8.size() > 2 && cursor <= utf8.size() - 2) { - auto byte2 = utf8[cursor++]; - auto byte3 = utf8[cursor++]; - - uint16_t code_point = - ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F); - if (IsValidCodePoint(code_point)) { - jmutf8 += byte1; - jmutf8 += byte2; - jmutf8 += byte3; - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method (overly long " - "3-byte encoding)"); - } - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method"); 
- } - } else if ((byte1 & 0xF8) == 0xF0) { - if (utf8.size() > 3 && cursor <= utf8.size() - 3) { - auto byte2 = utf8[cursor++]; - auto byte3 = utf8[cursor++]; - auto byte4 = utf8[cursor++]; - - uint32_t code_point = (byte1 & 0x07) << 18; - code_point += (byte2 & 0x3F) << 12; - code_point += (byte3 & 0x3F) << 6; - code_point += byte4 & 0x3F; - - if (code_point > 0xFFFF) { - jmutf8 += static_cast(0xED); - jmutf8 += - static_cast((0xA0 + (((code_point >> 16) - 1) & 0x0F))); - jmutf8 += static_cast((0x80 + ((code_point >> 10) & 0x3F))); - - jmutf8 += static_cast(0xED); - jmutf8 += static_cast((0xB0 + ((code_point >> 6) & 0x0F))); - jmutf8 += byte4; - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method (overly long " - "4-byte encoding)"); - } - } else { - throw IllegalArgumentException( - "Invalid utf-8 string passed to conversion method"); - } - - } else { - throw IllegalArgumentException("Invalid utf-8 start code"); - } - } - return jmutf8; -} - std::string JavaModifiedUtf8::toString(const ju8string& jmutf8) { // std::string utf8; // size_t cursor = 0; diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp index cdd19367d1..7029d578f9 100644 --- a/cppcache/src/util/JavaModifiedUtf8.hpp +++ b/cppcache/src/util/JavaModifiedUtf8.hpp @@ -49,7 +49,6 @@ struct JavaModifiedUtf8 { * Converts given UTF-8 string to Java Modified UTF-8 string. */ static ju8string fromString(const std::string& utf8); - static ju8string fromStringImproved(const std::string& utf8); /** * Converts given UTF-16 string to Java Modified UTF-8 string. 
@@ -76,6 +75,9 @@ struct JavaModifiedUtf8 { private: static bool IsValidCodePoint(uint16_t code_point); + static ju8string decode2byte(const std::string& utf8char); + static ju8string decode3byte(const std::string& utf8char); + static ju8string decode4byte(const std::string& utf8char); }; } // namespace internal diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp index 38c82ed77a..5ccec2203a 100644 --- a/cppcache/test/CacheableStringTests.cpp +++ b/cppcache/test/CacheableStringTests.cpp @@ -244,68 +244,68 @@ std::vector single_utf_16_surrogates[] = { TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { std::string bad_start_code; bad_start_code += static_cast(0xF8); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_start_code), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code), IllegalArgumentException); std::string too_short_2byte; too_short_2byte += static_cast(0xC0); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_2byte), + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte), IllegalArgumentException); std::string bad_2byte_at_end = "foo"; bad_2byte_at_end += static_cast(0xC0); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_2byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_2byte_at_end), IllegalArgumentException); std::string too_long_3_byte_encode; too_long_3_byte_encode.push_back(static_cast(0xE0)); too_long_3_byte_encode.push_back(static_cast(0x80)); too_long_3_byte_encode.push_back(static_cast(0x80)); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_long_3_byte_encode), + EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode), IllegalArgumentException); std::string too_short_3byte; too_short_3byte += static_cast(0xE8); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_3byte), + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), IllegalArgumentException); too_short_3byte += static_cast(0x1); - 
EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_3byte), + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte), IllegalArgumentException); std::string bad_3byte_at_end = "foo"; bad_3byte_at_end += static_cast(0xE8); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_3byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), IllegalArgumentException); bad_3byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_3byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end), IllegalArgumentException); std::string too_short_4byte; too_short_4byte += static_cast(0xF7); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte), + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), IllegalArgumentException); too_short_4byte += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte), + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), IllegalArgumentException); too_short_4byte += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte), + EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte), IllegalArgumentException); std::string bad_4byte_at_end = "foo"; bad_4byte_at_end += static_cast(0xF7); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), IllegalArgumentException); bad_4byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), IllegalArgumentException); bad_4byte_at_end += static_cast(0x1); - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end), IllegalArgumentException); for (auto sequence : impossible_bytes) { @@ -313,7 +313,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : 
sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); } @@ -322,7 +322,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); } @@ -331,7 +331,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); } @@ -340,7 +340,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); } @@ -349,7 +349,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) { for (auto byte_value : sequence) { bad_sequence += static_cast(byte_value); } - EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence), + EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence), IllegalArgumentException); } } @@ -362,7 +362,7 @@ std::pair, std::vector> lowest_boundary_sequences[] = { std::pair, std::vector> highest_boundary_sequences[] = { {{0x7F}, {0x7F}}, - {{0xDF, 0xCF}, {0xDF, 0xCF}}, + {{0xDF, 0xBF}, {0xDF, 0xBF}}, {{0xEF, 0xBF, 0xBF}, {0xEF, 0xBF, 0xBF}}, {{0xF7, 0xBF, 0xBF, 0xBF}, {0xED, 0xAE, 0xBF, 0xED, 0xBF, 0xBF}}, }; @@ -380,7 +380,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { std::string expected; utf8.push_back(0); - auto jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8); + auto jmutf8 = 
JavaModifiedUtf8::fromString(utf8); for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0; i < ARRAY_SIZE(lowest_boundary_sequences); i++) { @@ -392,12 +392,12 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { for (auto byte_value : std::get<1>(lowest_boundary_sequences[i])) { expected += static_cast(byte_value); } - jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8); + jmutf8 = JavaModifiedUtf8::fromString(utf8); EXPECT_EQ(expected.size(), jmutf8.size()); EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); } - for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0; + for (decltype(ARRAY_SIZE(highest_boundary_sequences)) i = 0; i < ARRAY_SIZE(highest_boundary_sequences); i++) { utf8.clear(); expected.clear(); @@ -407,7 +407,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { for (auto byte_value : std::get<1>(highest_boundary_sequences[i])) { expected += static_cast(byte_value); } - jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8); + jmutf8 = JavaModifiedUtf8::fromString(utf8); EXPECT_EQ(expected.size(), jmutf8.size()); EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); } @@ -422,7 +422,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) { for (auto byte_value : std::get<1>(other_boundary_sequences[i])) { expected += static_cast(byte_value); } - jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8); + jmutf8 = JavaModifiedUtf8::fromString(utf8); EXPECT_EQ(expected.size(), jmutf8.size()); EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size())); }