From cf799414073bc1fb4b73c645b803e98300812022 Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Thu, 3 Mar 2022 11:08:05 -0800
Subject: [PATCH 01/10] GEODE-4198: Convert directly from utf-8 --> java
 modified utf-8, rather than utf-8 --> utf-16 --> jmutf-8.

---
 cppcache/src/util/JavaModifiedUtf8.cpp | 60 +++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)
diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index d6f53b81cd..4eee3efb3c 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -20,6 +20,7 @@
 #include <codecvt>
 #include <locale>
 
+#include "geode/ExceptionTypes.hpp"
 #include "string.hpp"
 
 namespace apache {
@@ -58,7 +59,64 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) {
 }
 
 std::string JavaModifiedUtf8::fromString(const std::string& utf8) {
-  return fromString(to_utf16(utf8));
+  std::string jmutf8;
+  auto cursor = 0;
+
+  while (cursor < utf8.size()) {
+    auto byte1 = utf8[cursor++];
+    if ((byte1 & 0x80) == 0) {
+      if (byte1) {
+        jmutf8 += byte1;
+      } else {
+        jmutf8 += static_cast<uint8_t>(0xC0);
+        jmutf8 += static_cast<uint8_t>(0x80);
+      }
+    } else if ((byte1 & 0xE0) == 0xC0) {
+      if (cursor <= utf8.size() - 1) {
+        jmutf8 += byte1;
+        jmutf8 += utf8[cursor++];
+      } else {
+        throw IllegalArgumentException(
+            "Invalid utf-8 string passed to conversion method");
+      }
+    } else if ((byte1 & 0xF0) == 0xE0) {
+      if (cursor <= utf8.size() - 2) {
+        jmutf8 += byte1;
+        jmutf8 += utf8[cursor++];
+        jmutf8 += utf8[cursor++];
+      } else {
+        throw IllegalArgumentException(
+            "Invalid utf-8 string passed to conversion method");
+      }
+    } else if ((byte1 & 0xF8) == 0xF0) {
+      if (cursor <= utf8.size() - 3) {
+        auto byte2 = utf8[cursor++];
+        auto byte3 = utf8[cursor++];
+        auto byte4 = utf8[cursor++];
+
+        uint32_t code_point = (byte1 & 0x07) << 18;
+        code_point += (byte2 & 0x3F) << 12;
+        code_point += (byte3 & 0x3F) << 6;
+        code_point += byte4 & 0x3F;
+
+        jmutf8 += static_cast<uint8_t>(0xED);
+        jmutf8 +=
+            static_cast<uint8_t>((0xA0 + (((code_point >> 16) - 1) & 0x0F)));
+        jmutf8 += static_cast<uint8_t>((0x80 + ((code_point >> 10) & 0x3F)));
+
+        jmutf8 += static_cast<uint8_t>(0xED);
+        jmutf8 += static_cast<uint8_t>((0xB0 + ((code_point >> 6) & 0x0F)));
+        jmutf8 += byte4;
+      } else {
+        throw IllegalArgumentException(
+            "Invalid utf-8 string passed to conversion method");
+      }
+
+    } else {
+      throw IllegalArgumentException("Invalid utf-8 start code");
+    }
+  }
+  return jmutf8;
 }
 
 std::string JavaModifiedUtf8::fromString(const std::u16string& utf16) {

From ba848e02b8a9ab66bb594b668d0432ca22646838 Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Thu, 3 Mar 2022 11:48:20 -0800
Subject: [PATCH 02/10] GEODE-4189: Fix Linux builds

---
 cppcache/src/util/JavaModifiedUtf8.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index 4eee3efb3c..6cb0dd64f0 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -60,7 +60,7 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) {
 
 std::string JavaModifiedUtf8::fromString(const std::string& utf8) {
   std::string jmutf8;
-  auto cursor = 0;
+  size_t cursor = 0;
 
   while (cursor < utf8.size()) {
     auto byte1 = utf8[cursor++];

From ad3e0a3bd534361b1ab68f0a15d36a1ae941d9e3 Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Mon, 7 Mar 2022 14:50:18 -0800
Subject: [PATCH 03/10] WIP: add some unit tests, and working on direct jmutf-8
 --> UCS-4

---
 cppcache/src/DataInput.cpp             |  14 +--
 cppcache/src/util/JavaModifiedUtf8.cpp | 123 +++++++++++++++++++++++--
 cppcache/src/util/JavaModifiedUtf8.hpp |  12 ++-
 cppcache/test/CacheableStringTests.cpp |  84 +++++++++++++++++
 4 files changed, 215 insertions(+), 18 deletions(-)

diff --git a/cppcache/src/DataInput.cpp b/cppcache/src/DataInput.cpp
index 3590dd917e..cd672cbbfa 100644
--- a/cppcache/src/DataInput.cpp
+++ b/cppcache/src/DataInput.cpp
@@ -24,6 +24,8 @@
 #include "util/JavaModifiedUtf8.hpp"
 #include "util/string.hpp"
 
+using apache::geode::client::internal::JavaModifiedUtf8;
+
 namespace apache {
 namespace geode {
 namespace client {
@@ -62,8 +64,8 @@ void DataInput::readJavaModifiedUtf8(
     std::basic_string<char16_t, _Traits, _Allocator>& value) {
   uint16_t length = readInt16();
   _GEODE_CHECK_BUFFER_SIZE(length);
-  value = internal::JavaModifiedUtf8::decode(
-      reinterpret_cast<const char*>(m_buf), length);
+  value =
+      JavaModifiedUtf8::decode(reinterpret_cast<const char*>(m_buf), length);
   advanceCursor(length);
 }
 template APACHE_GEODE_EXPLICIT_TEMPLATE_EXPORT void
@@ -72,10 +74,10 @@ DataInput::readJavaModifiedUtf8(std::u16string&);
 template <class _Traits, class _Allocator>
 void DataInput::readJavaModifiedUtf8(
     std::basic_string<char32_t, _Traits, _Allocator>& value) {
-  // TODO string OPTIMIZE convert from UTF-16 to UCS-4 directly
-  std::u16string utf16;
-  readJavaModifiedUtf8(utf16);
-  value = to_ucs4(utf16);
+  uint16_t length = readInt16();
+  _GEODE_CHECK_BUFFER_SIZE(length);
+  value =
+      JavaModifiedUtf8::decodeU32(reinterpret_cast<const char*>(m_buf), length);
 }
 template APACHE_GEODE_EXPLICIT_TEMPLATE_EXPORT void
 DataInput::readJavaModifiedUtf8(std::u32string&);
diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index 6cb0dd64f0..9d9c93cd64 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -58,8 +58,12 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) {
   return encodedLen;
 }
 
-std::string JavaModifiedUtf8::fromString(const std::string& utf8) {
-  std::string jmutf8;
+// Note on error handling in this method:
+// Error handling here is done just to serve the purpose of not
+// crashing, instead throwing exceptions.  Beyond this, we do NOT fully
+// validate the incoming utf-8 string, it is assumed to be otherwise correct.
+ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
+  ju8string jmutf8;
   size_t cursor = 0;
 
   while (cursor < utf8.size()) {
@@ -72,7 +76,7 @@ std::string JavaModifiedUtf8::fromString(const std::string& utf8) {
         jmutf8 += static_cast<uint8_t>(0x80);
       }
     } else if ((byte1 & 0xE0) == 0xC0) {
-      if (cursor <= utf8.size() - 1) {
+      if (utf8.size() > 0 && cursor <= utf8.size() - 1) {
         jmutf8 += byte1;
         jmutf8 += utf8[cursor++];
       } else {
@@ -80,7 +84,7 @@ std::string JavaModifiedUtf8::fromString(const std::string& utf8) {
             "Invalid utf-8 string passed to conversion method");
       }
     } else if ((byte1 & 0xF0) == 0xE0) {
-      if (cursor <= utf8.size() - 2) {
+      if (utf8.size() > 2 && cursor <= utf8.size() - 2) {
         jmutf8 += byte1;
         jmutf8 += utf8[cursor++];
         jmutf8 += utf8[cursor++];
@@ -89,7 +93,7 @@ std::string JavaModifiedUtf8::fromString(const std::string& utf8) {
             "Invalid utf-8 string passed to conversion method");
       }
     } else if ((byte1 & 0xF8) == 0xF0) {
-      if (cursor <= utf8.size() - 3) {
+      if (utf8.size() > 3 && cursor <= utf8.size() - 3) {
         auto byte2 = utf8[cursor++];
         auto byte3 = utf8[cursor++];
         auto byte4 = utf8[cursor++];
@@ -119,8 +123,8 @@ std::string JavaModifiedUtf8::fromString(const std::string& utf8) {
   return jmutf8;
 }
 
-std::string JavaModifiedUtf8::fromString(const std::u16string& utf16) {
-  std::string jmutf8;
+ju8string JavaModifiedUtf8::fromString(const std::u16string& utf16) {
+  ju8string jmutf8;
   jmutf8.reserve(utf16.length());
 
   for (auto&& c : utf16) {
@@ -130,7 +134,7 @@ std::string JavaModifiedUtf8::fromString(const std::u16string& utf16) {
   return jmutf8;
 }
 
-void JavaModifiedUtf8::encode(const char16_t c, std::string& jmutf8) {
+void JavaModifiedUtf8::encode(const char16_t c, ju8string& jmutf8) {
   if (c == 0) {
     // NUL
     jmutf8 += static_cast<uint8_t>(0xc0);
@@ -147,6 +151,109 @@ void JavaModifiedUtf8::encode(const char16_t c, std::string& jmutf8) {
     jmutf8 += static_cast<uint8_t>(0x80 | (c & 0x3F));
   }
 }
+//
+// def utf8m_to_utf8s(string) :
+//  """
+//  : param string : modified utf8 encoded string
+//  : return : utf8 encoded string
+//  """
+//  new_string = []
+//  length = len(string)
+//  i = 0
+//  while i < length :
+//    byte1 = string[i]
+//    if (byte1 & 0x80) == 0 : # 1byte encoding
+//      new_string.append(byte1)
+//      elif(byte1 & 0xE0) == 0xC0:  # 2byte encoding
+//      i += 1
+//      byte2 = string[i]
+//      if byte1 != 0xC0 or byte2 != 0x80:
+// new_string.append(byte1)
+// new_string.append(byte2)
+//      else:
+// new_string.append(0)
+// elif(byte1 & 0xF0) == 0xE0 : # 3byte encoding
+// i += 1
+// byte2 = string[i]
+// i += 1
+// byte3 = string[i]
+// if i + 3 < length and byte1 == 0xED and (byte2 & 0xF0) == 0xA0:
+//# See if this is a pair of 3byte encodings
+// byte4 = string[i + 1]
+// byte5 = string[i + 2]
+// byte6 = string[i + 3]
+// if byte4 == 0xED and (byte5 & 0xF0) == 0xB0:
+//# Bits in : 11101101 1010xxxx 10xxxxxx
+//# Bits in : 11101101 1011xxxx 10xxxxxx
+// i += 3
+//
+//# Reconstruct 21 bit code
+// u21 = ((byte2 & 0x0F) + 1) << 16
+// u21 += (byte3 & 0x3F) << 10
+// u21 += (byte5 & 0x0F) << 6
+// u21 += byte6 & 0x3F
+//
+//# Bits out : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+//
+//# Convert to 4byte encoding
+// new_string.append(0xF0 + ((u21 >> 18) & 0x07))
+// new_string.append(0x80 + ((u21 >> 12) & 0x3F))
+// new_string.append(0x80 + ((u21 >> 6) & 0x3F))
+// new_string.append(0x80 + (u21 & 0x3F))
+// continue
+// new_string.append(byte1)
+// new_string.append(byte2)
+// new_string.append(byte3)
+// i += 1
+// return bytes(new_string).decode("utf-8")
+//
+std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) {
+  std::u32string result;
+
+  uint16_t i = 0;
+  while (i < len) {
+    auto byte1 = buf[i++];
+    if (!(byte1 & 0x80)) {
+      result += 0x00000000 & byte1;
+    } else if ((i < len) && ((byte1 & 0xE0) == 0xC0)) {
+      auto byte2 = buf[i++];
+      if (!(byte1 == 0xC0) || !(byte2 == 0x80)) {
+        int32_t code_point = static_cast<int32_t>(byte1 & 0x1F) << 6;
+        code_point += static_cast<int32_t>(byte2 & 0x3F);
+        result += code_point;
+      } else {
+        result.append(static_cast<int32_t>(0));
+      }
+    } else if ((i < len - 5) && (byte1 == 0xED)) {
+      auto byte2 = buf[i++];
+      auto byte3 = buf[i++];
+      auto byte4 = buf[i++];
+      auto byte5 = buf[i++];
+      auto byte6 = buf[i++];
+      if (byte4 == 0xED) {
+        int32_t code_point =
+            0x10000 + (static_cast<int32_t>(byte2 & 0xF) << 16);
+        code_point += static_cast<int32_t>(byte3 & 0x3F) << 10;
+        code_point += static_cast<int32_t>(byte5 & 0xF) << 6;
+        code_point += static_cast<int32_t>(byte6 & 0x3F);
+        result += code_point;
+      } else {
+        throw IllegalArgumentException("Bad encoding in jmutf-8 string");
+      }
+    } else if ((i < len - 2) && ((byte1 & 0xE0) == 0xE0)) {
+      auto byte2 = buf[i++];
+      auto byte3 = buf[i++];
+      int32_t code_point = static_cast<int32_t>(byte1 & 0xF) << 12;
+      code_point += static_cast<int32_t>(byte2 & 0x3F) << 6;
+      code_point += static_cast<int32_t>(byte3 & 0x3F);
+      result += code_point;
+    } else {
+      throw IllegalArgumentException("Bad encoding in jmutf-8 string");
+    }
+  }
+
+  return result;
+}
 
 std::u16string JavaModifiedUtf8::decode(const char* buf, uint16_t len) {
   std::u16string value;
diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp
index f6d1722b2c..6e08ab9080 100644
--- a/cppcache/src/util/JavaModifiedUtf8.hpp
+++ b/cppcache/src/util/JavaModifiedUtf8.hpp
@@ -27,6 +27,9 @@ namespace geode {
 namespace client {
 namespace internal {
 
+struct ju8type_traits : std::char_traits<char> {};
+typedef std::basic_string<char, ju8type_traits> ju8string;
+
 struct JavaModifiedUtf8 {
   /**
    * Calculate the length of the given UTF-8 string when encoded in Java
@@ -45,20 +48,21 @@ struct JavaModifiedUtf8 {
   /**
    * Converts given UTF-8 string to Java Modified UTF-8 string.
    */
-  static std::string fromString(const std::string& utf8);
-
+  static ju8string fromString(const std::string& utf8);
   /**
    * Converts given UTF-16 string to Java Modified UTF-8 string.
    */
-  static std::string fromString(const std::u16string& utf16);
+  static ju8string fromString(const std::u16string& utf16);
 
   /**
    * Converts a single UTF-16 code unit into Java Modified UTF-8 code units.
    */
-  static void encode(const char16_t c, std::string& jmutf8);
+  static void encode(const char16_t c, ju8string& jmutf8);
 
   static std::u16string decode(const char* buf, uint16_t len);
 
+  static std::u32string decodeU32(const char* buf, uint16_t len);
+
   static char16_t decodeJavaModifiedUtf8Char(const char** pbuf);
 };
 
diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp
index 0c330e64a6..cb175ebcfb 100644
--- a/cppcache/test/CacheableStringTests.cpp
+++ b/cppcache/test/CacheableStringTests.cpp
@@ -21,11 +21,13 @@
 #include <gtest/gtest.h>
 
 #include <geode/DataOutput.hpp>
+#include <geode/ExceptionTypes.hpp>
 
 #include "ByteArrayFixture.hpp"
 #include "DataInputInternal.hpp"
 #include "DataOutputInternal.hpp"
 #include "SerializationRegistry.hpp"
+#include "util/JavaModifiedUtf8.hpp"
 
 namespace {
 
@@ -34,7 +36,9 @@ using apache::geode::client::CacheableString;
 using apache::geode::client::DataInputInternal;
 using apache::geode::client::DataOutput;
 using apache::geode::client::DataOutputInternal;
+using apache::geode::client::IllegalArgumentException;
 using apache::geode::client::SerializationRegistry;
+using apache::geode::client::internal::JavaModifiedUtf8;
 
 class TestDataOutput : public DataOutputInternal {
  public:
@@ -220,4 +224,84 @@ TEST_F(CacheableStringTests, TestFromDataNonAsciiHuge) {
   EXPECT_EQ(utf8, str->value());
 }
 
+TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
+  {
+    std::string bad_start_code;
+    bad_start_code += static_cast<int8_t>(0xF8);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code),
+                 IllegalArgumentException);
+  }
+
+  {
+    std::string too_short_2byte;
+    too_short_2byte += static_cast<int8_t>(0xC0);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte),
+                 IllegalArgumentException);
+
+    std::string bad_2byte_at_end = "foo";
+    too_short_2byte += static_cast<int8_t>(0xC0);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte),
+                 IllegalArgumentException);
+  }
+
+  {
+    std::string too_short_3byte;
+    too_short_3byte += static_cast<int8_t>(0xE8);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
+                 IllegalArgumentException);
+
+    too_short_3byte += static_cast<int8_t>(0x1);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
+                 IllegalArgumentException);
+
+    std::string bad_3byte_at_end = "foo";
+    bad_3byte_at_end += static_cast<int8_t>(0xE8);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
+                 IllegalArgumentException);
+
+    bad_3byte_at_end += static_cast<int8_t>(0x1);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
+                 IllegalArgumentException);
+  }
+
+  {
+    std::string too_short_4byte;
+    too_short_4byte += static_cast<int8_t>(0xF7);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+                 IllegalArgumentException);
+
+    too_short_4byte += static_cast<int8_t>(0x1);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+                 IllegalArgumentException);
+
+    too_short_4byte += static_cast<int8_t>(0x1);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+                 IllegalArgumentException);
+
+    std::string bad_4byte_at_end = "foo";
+    bad_4byte_at_end += static_cast<int8_t>(0xF7);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+                 IllegalArgumentException);
+
+    bad_4byte_at_end += static_cast<int8_t>(0x1);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+                 IllegalArgumentException);
+
+    bad_4byte_at_end += static_cast<int8_t>(0x1);
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+                 IllegalArgumentException);
+  }
+}
+
+TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
+  std::string utf8;
+  utf8 += 'a';
+
+  auto jmutf8 = JavaModifiedUtf8::fromString(utf8);
+  EXPECT_EQ(utf8.size(), jmutf8.size());
+  for (size_t i = 0; i < utf8.size(); i++) {
+    EXPECT_EQ(utf8[i], jmutf8[i]);
+  }
+}
+
 }  // namespace

From 2f90d4cbbe28881fec0cf815516c69d0c449e96b Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Tue, 8 Mar 2022 08:54:40 -0800
Subject: [PATCH 04/10] GEODE-4189: Fix build break (and logic error) - Also
 delete commented block

---
 cppcache/src/util/JavaModifiedUtf8.cpp | 59 +-------------------------
 1 file changed, 2 insertions(+), 57 deletions(-)

diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index 9d9c93cd64..f465735fce 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -151,62 +151,7 @@ void JavaModifiedUtf8::encode(const char16_t c, ju8string& jmutf8) {
     jmutf8 += static_cast<uint8_t>(0x80 | (c & 0x3F));
   }
 }
-//
-// def utf8m_to_utf8s(string) :
-//  """
-//  : param string : modified utf8 encoded string
-//  : return : utf8 encoded string
-//  """
-//  new_string = []
-//  length = len(string)
-//  i = 0
-//  while i < length :
-//    byte1 = string[i]
-//    if (byte1 & 0x80) == 0 : # 1byte encoding
-//      new_string.append(byte1)
-//      elif(byte1 & 0xE0) == 0xC0:  # 2byte encoding
-//      i += 1
-//      byte2 = string[i]
-//      if byte1 != 0xC0 or byte2 != 0x80:
-// new_string.append(byte1)
-// new_string.append(byte2)
-//      else:
-// new_string.append(0)
-// elif(byte1 & 0xF0) == 0xE0 : # 3byte encoding
-// i += 1
-// byte2 = string[i]
-// i += 1
-// byte3 = string[i]
-// if i + 3 < length and byte1 == 0xED and (byte2 & 0xF0) == 0xA0:
-//# See if this is a pair of 3byte encodings
-// byte4 = string[i + 1]
-// byte5 = string[i + 2]
-// byte6 = string[i + 3]
-// if byte4 == 0xED and (byte5 & 0xF0) == 0xB0:
-//# Bits in : 11101101 1010xxxx 10xxxxxx
-//# Bits in : 11101101 1011xxxx 10xxxxxx
-// i += 3
-//
-//# Reconstruct 21 bit code
-// u21 = ((byte2 & 0x0F) + 1) << 16
-// u21 += (byte3 & 0x3F) << 10
-// u21 += (byte5 & 0x0F) << 6
-// u21 += byte6 & 0x3F
-//
-//# Bits out : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-//
-//# Convert to 4byte encoding
-// new_string.append(0xF0 + ((u21 >> 18) & 0x07))
-// new_string.append(0x80 + ((u21 >> 12) & 0x3F))
-// new_string.append(0x80 + ((u21 >> 6) & 0x3F))
-// new_string.append(0x80 + (u21 & 0x3F))
-// continue
-// new_string.append(byte1)
-// new_string.append(byte2)
-// new_string.append(byte3)
-// i += 1
-// return bytes(new_string).decode("utf-8")
-//
+
 std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) {
   std::u32string result;
 
@@ -214,7 +159,7 @@ std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) {
   while (i < len) {
     auto byte1 = buf[i++];
     if (!(byte1 & 0x80)) {
-      result += 0x00000000 & byte1;
+      result += static_cast<int32_t>(byte1) & 0x000000FF;
     } else if ((i < len) && ((byte1 & 0xE0) == 0xC0)) {
       auto byte2 = buf[i++];
       if (!(byte1 == 0xC0) || !(byte2 == 0x80)) {

From f744b7cadd22a3f3661c06dc0c0dec003f4464eb Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Tue, 8 Mar 2022 09:24:42 -0800
Subject: [PATCH 05/10] GEODE-4189: Fix Linux build break

---
 cppcache/src/util/JavaModifiedUtf8.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index f465735fce..6e3d301cbb 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -167,7 +167,7 @@ std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) {
         code_point += static_cast<int32_t>(byte2 & 0x3F);
         result += code_point;
       } else {
-        result.append(static_cast<int32_t>(0));
+        result += static_cast<int32_t>(0);
       }
     } else if ((i < len - 5) && (byte1 == 0xED)) {
       auto byte2 = buf[i++];

From 4d47a9f5bc1f55de9f1211922ad53ef1b5682e2f Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Thu, 10 Mar 2022 07:26:38 -0800
Subject: [PATCH 06/10] GEODE-4189: Unit tests for utf-8 --> jmutf-8 complete

---
 cppcache/src/util/JavaModifiedUtf8.cpp |  76 +++++---
 cppcache/src/util/JavaModifiedUtf8.hpp |   3 +
 cppcache/test/CacheableStringTests.cpp | 229 +++++++++++++++++++------
 3 files changed, 233 insertions(+), 75 deletions(-)

diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index 6e3d301cbb..6c62019ad8 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -19,6 +19,7 @@
 
 #include <codecvt>
 #include <locale>
+#include <set>
 
 #include "geode/ExceptionTypes.hpp"
 #include "string.hpp"
@@ -58,6 +59,14 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) {
   return encodedLen;
 }
 
+std::set<int> utf16_surrogate_codes = {{0xD800}, {0xDB7F}, {0xDB80}, {0xDBFF},
+                                       {0xDC00}, {0xDF80}, {0xDFFF}};
+
+bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) {
+  return (code_point > 0x7FF) && (utf16_surrogate_codes.find(code_point) ==
+                                  utf16_surrogate_codes.end());
+}
+
 // Note on error handling in this method:
 // Error handling here is done just to serve the purpose of not
 // crashing, instead throwing exceptions.  Beyond this, we do NOT fully
@@ -77,17 +86,36 @@ ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
       }
     } else if ((byte1 & 0xE0) == 0xC0) {
       if (utf8.size() > 0 && cursor <= utf8.size() - 1) {
-        jmutf8 += byte1;
-        jmutf8 += utf8[cursor++];
+        auto byte2 = utf8[cursor++];
+        int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f);
+        if (code_point > 0x7F) {
+          jmutf8 += byte1;
+          jmutf8 += byte2;
+        } else {
+          throw IllegalArgumentException(
+              "Invalid utf-8 string passed to conversion method (overly long "
+              "ASCII character)");
+        }
       } else {
         throw IllegalArgumentException(
             "Invalid utf-8 string passed to conversion method");
       }
     } else if ((byte1 & 0xF0) == 0xE0) {
       if (utf8.size() > 2 && cursor <= utf8.size() - 2) {
-        jmutf8 += byte1;
-        jmutf8 += utf8[cursor++];
-        jmutf8 += utf8[cursor++];
+        auto byte2 = utf8[cursor++];
+        auto byte3 = utf8[cursor++];
+
+        uint16_t code_point =
+            ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F);
+        if (IsValidCodePoint(code_point)) {
+          jmutf8 += byte1;
+          jmutf8 += byte2;
+          jmutf8 += byte3;
+        } else {
+          throw IllegalArgumentException(
+              "Invalid utf-8 string passed to conversion method (overly long "
+              "3-byte encoding)");
+        }
       } else {
         throw IllegalArgumentException(
             "Invalid utf-8 string passed to conversion method");
@@ -103,14 +131,20 @@ ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
         code_point += (byte3 & 0x3F) << 6;
         code_point += byte4 & 0x3F;
 
-        jmutf8 += static_cast<uint8_t>(0xED);
-        jmutf8 +=
-            static_cast<uint8_t>((0xA0 + (((code_point >> 16) - 1) & 0x0F)));
-        jmutf8 += static_cast<uint8_t>((0x80 + ((code_point >> 10) & 0x3F)));
+        if (code_point > 0xFFFF) {
+          jmutf8 += static_cast<uint8_t>(0xED);
+          jmutf8 +=
+              static_cast<uint8_t>((0xA0 + (((code_point >> 16) - 1) & 0x0F)));
+          jmutf8 += static_cast<uint8_t>((0x80 + ((code_point >> 10) & 0x3F)));
 
-        jmutf8 += static_cast<uint8_t>(0xED);
-        jmutf8 += static_cast<uint8_t>((0xB0 + ((code_point >> 6) & 0x0F)));
-        jmutf8 += byte4;
+          jmutf8 += static_cast<uint8_t>(0xED);
+          jmutf8 += static_cast<uint8_t>((0xB0 + ((code_point >> 6) & 0x0F)));
+          jmutf8 += byte4;
+        } else {
+          throw IllegalArgumentException(
+              "Invalid utf-8 string passed to conversion method (overly long "
+              "4-byte encoding)");
+        }
       } else {
         throw IllegalArgumentException(
             "Invalid utf-8 string passed to conversion method");
@@ -162,36 +196,36 @@ std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) {
       result += static_cast<int32_t>(byte1) & 0x000000FF;
     } else if ((i < len) && ((byte1 & 0xE0) == 0xC0)) {
       auto byte2 = buf[i++];
-      if (!(byte1 == 0xC0) || !(byte2 == 0x80)) {
+      if (((byte1 & 0xFF) == 0xC0) && ((byte2 & 0xFF) == 0x80)) {
+        result.push_back(static_cast<int32_t>(0));
+      } else {
         int32_t code_point = static_cast<int32_t>(byte1 & 0x1F) << 6;
         code_point += static_cast<int32_t>(byte2 & 0x3F);
-        result += code_point;
-      } else {
-        result += static_cast<int32_t>(0);
+        result.push_back(code_point);
       }
-    } else if ((i < len - 5) && (byte1 == 0xED)) {
+    } else if ((i < len - 4) && ((byte1 & 0xED) == 0xED)) {
       auto byte2 = buf[i++];
       auto byte3 = buf[i++];
       auto byte4 = buf[i++];
       auto byte5 = buf[i++];
       auto byte6 = buf[i++];
-      if (byte4 == 0xED) {
+      if ((byte4 & 0xED) == 0xED) {
         int32_t code_point =
             0x10000 + (static_cast<int32_t>(byte2 & 0xF) << 16);
         code_point += static_cast<int32_t>(byte3 & 0x3F) << 10;
         code_point += static_cast<int32_t>(byte5 & 0xF) << 6;
         code_point += static_cast<int32_t>(byte6 & 0x3F);
-        result += code_point;
+        result.push_back(code_point);
       } else {
         throw IllegalArgumentException("Bad encoding in jmutf-8 string");
       }
-    } else if ((i < len - 2) && ((byte1 & 0xE0) == 0xE0)) {
+    } else if ((i < len - 1) && ((byte1 & 0xE0) == 0xE0)) {
       auto byte2 = buf[i++];
       auto byte3 = buf[i++];
       int32_t code_point = static_cast<int32_t>(byte1 & 0xF) << 12;
       code_point += static_cast<int32_t>(byte2 & 0x3F) << 6;
       code_point += static_cast<int32_t>(byte3 & 0x3F);
-      result += code_point;
+      result.push_back(code_point);
     } else {
       throw IllegalArgumentException("Bad encoding in jmutf-8 string");
     }
diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp
index 6e08ab9080..8a483db116 100644
--- a/cppcache/src/util/JavaModifiedUtf8.hpp
+++ b/cppcache/src/util/JavaModifiedUtf8.hpp
@@ -64,6 +64,9 @@ struct JavaModifiedUtf8 {
   static std::u32string decodeU32(const char* buf, uint16_t len);
 
   static char16_t decodeJavaModifiedUtf8Char(const char** pbuf);
+
+ private:
+  static bool IsValidCodePoint(uint16_t code_point);
 };
 
 }  // namespace internal
diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp
index cb175ebcfb..6445882a3f 100644
--- a/cppcache/test/CacheableStringTests.cpp
+++ b/cppcache/test/CacheableStringTests.cpp
@@ -224,83 +224,204 @@ TEST_F(CacheableStringTests, TestFromDataNonAsciiHuge) {
   EXPECT_EQ(utf8, str->value());
 }
 
-TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
-  {
-    std::string bad_start_code;
-    bad_start_code += static_cast<int8_t>(0xF8);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code),
-                 IllegalArgumentException);
-  }
+std::vector<int> impossible_bytes[] = {
+    {0xFE}, {0xFF}, {0xFE, 0xFE, 0xFF, 0xFF}};
 
-  {
-    std::string too_short_2byte;
-    too_short_2byte += static_cast<int8_t>(0xC0);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte),
-                 IllegalArgumentException);
+std::vector<int> overlong_ascii_sequences[] = {
+    {0xC0, 0xAF}, {0xE0, 0x80, 0xAF}, {0xF0, 0x80, 0x80, 0xAF}};
 
-    std::string bad_2byte_at_end = "foo";
-    too_short_2byte += static_cast<int8_t>(0xC0);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte),
-                 IllegalArgumentException);
-  }
+std::vector<int> maximum_overlong_sequences[] = {
+    {0xC1, 0xBF}, {0xE0, 0x9F, 0xBF}, {0xF0, 0x8F, 0xBF, 0xBF}};
 
-  {
-    std::string too_short_3byte;
-    too_short_3byte += static_cast<int8_t>(0xE8);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
-                 IllegalArgumentException);
+std::vector<int> overlong_nulls[] = {
+    {0xC0, 0x80}, {0xE0, 0x80, 0x80}, {0xF0, 0x80, 0x80, 0x80}};
 
-    too_short_3byte += static_cast<int8_t>(0x1);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
-                 IllegalArgumentException);
+std::vector<int> single_utf_16_surrogates[] = {
+    {0xED, 0xA0, 0x80}, {0xED, 0xAD, 0xBF}, {0xED, 0xAE, 0x80},
+    {0xED, 0xAF, 0xBF}, {0xED, 0xB0, 0x80}, {0xED, 0xBE, 0x80},
+    {0xED, 0xBF, 0xBF}};
 
-    std::string bad_3byte_at_end = "foo";
-    bad_3byte_at_end += static_cast<int8_t>(0xE8);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
+TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
+  std::string bad_start_code;
+  bad_start_code += static_cast<int8_t>(0xF8);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code),
+               IllegalArgumentException);
+
+  std::string too_short_2byte;
+  too_short_2byte += static_cast<int8_t>(0xC0);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte),
+               IllegalArgumentException);
+
+  std::string bad_2byte_at_end = "foo";
+  bad_2byte_at_end += static_cast<int8_t>(0xC0);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_2byte_at_end),
+               IllegalArgumentException);
+
+  std::string too_long_3_byte_encode;
+  too_long_3_byte_encode.push_back(0xE0);
+  too_long_3_byte_encode.push_back(0x80);
+  too_long_3_byte_encode.push_back(0x80);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode),
+               IllegalArgumentException);
+
+  std::string too_short_3byte;
+  too_short_3byte += static_cast<int8_t>(0xE8);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
+               IllegalArgumentException);
+
+  too_short_3byte += static_cast<int8_t>(0x1);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
+               IllegalArgumentException);
+
+  std::string bad_3byte_at_end = "foo";
+  bad_3byte_at_end += static_cast<int8_t>(0xE8);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
+               IllegalArgumentException);
+
+  bad_3byte_at_end += static_cast<int8_t>(0x1);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
+               IllegalArgumentException);
+
+  std::string too_short_4byte;
+  too_short_4byte += static_cast<int8_t>(0xF7);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+               IllegalArgumentException);
+
+  too_short_4byte += static_cast<int8_t>(0x1);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+               IllegalArgumentException);
+
+  too_short_4byte += static_cast<int8_t>(0x1);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+               IllegalArgumentException);
+
+  std::string bad_4byte_at_end = "foo";
+  bad_4byte_at_end += static_cast<int8_t>(0xF7);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+               IllegalArgumentException);
+
+  bad_4byte_at_end += static_cast<int8_t>(0x1);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+               IllegalArgumentException);
+
+  bad_4byte_at_end += static_cast<int8_t>(0x1);
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+               IllegalArgumentException);
+
+  for (auto sequence : impossible_bytes) {
+    std::string bad_sequence;
+    for (auto byte_value : sequence) {
+      bad_sequence += static_cast<int8_t>(byte_value);
+    }
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
+  }
 
-    bad_3byte_at_end += static_cast<int8_t>(0x1);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
+  for (auto sequence : overlong_ascii_sequences) {
+    std::string bad_sequence;
+    for (auto byte_value : sequence) {
+      bad_sequence += static_cast<int8_t>(byte_value);
+    }
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
   }
 
-  {
-    std::string too_short_4byte;
-    too_short_4byte += static_cast<int8_t>(0xF7);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+  for (auto sequence : maximum_overlong_sequences) {
+    std::string bad_sequence;
+    for (auto byte_value : sequence) {
+      bad_sequence += static_cast<int8_t>(byte_value);
+    }
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
+  }
 
-    too_short_4byte += static_cast<int8_t>(0x1);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+  for (auto sequence : overlong_nulls) {
+    std::string bad_sequence;
+    for (auto byte_value : sequence) {
+      bad_sequence += static_cast<int8_t>(byte_value);
+    }
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
+  }
 
-    too_short_4byte += static_cast<int8_t>(0x1);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+  for (auto sequence : single_utf_16_surrogates) {
+    std::string bad_sequence;
+    for (auto byte_value : sequence) {
+      bad_sequence += static_cast<int8_t>(byte_value);
+    }
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
+  }
+}
 
-    std::string bad_4byte_at_end = "foo";
-    bad_4byte_at_end += static_cast<int8_t>(0xF7);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
-                 IllegalArgumentException);
+std::pair<std::vector<int>, std::vector<int>> lowest_boundary_sequences[] = {
+    {{0x00}, {0xC0, 0x80}},
+    {{0xD0, 0x80}, {0xD0, 0x80}},
+    {{0xE0, 0xA0, 0x80}, {0xE0, 0xA0, 0x80}},
+    {{0xF0, 0x90, 0x80, 0x80}, {0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80}}};
+
+std::pair<std::vector<int>, std::vector<int>> highest_boundary_sequences[] = {
+    {{0x7F}, {0x7F}},
+    {{0xDF, 0xCF}, {0xDF, 0xCF}},
+    {{0xEF, 0xBF, 0xBF}, {0xEF, 0xBF, 0xBF}},
+    {{0xF7, 0xBF, 0xBF, 0xBF}, {0xED, 0xAE, 0xBF, 0xED, 0xBF, 0xBF}},
+};
 
-    bad_4byte_at_end += static_cast<int8_t>(0x1);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
-                 IllegalArgumentException);
+std::pair<std::vector<int>, std::vector<int>> other_boundary_sequences[] = {
+    {{0xED, 0x9F, 0xBF}, {0xED, 0x9F, 0xBF}},
+    {{0xEE, 0x80, 0x80}, {0xEE, 0x80, 0x80}},
+    {{0xEF, 0xBF, 0xBD}, {0xEF, 0xBF, 0xBD}},
+};
 
-    bad_4byte_at_end += static_cast<int8_t>(0x1);
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
-                 IllegalArgumentException);
-  }
-}
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
 
 TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
   std::string utf8;
-  utf8 += 'a';
+  std::string expected;
+  utf8.push_back(0);
 
   auto jmutf8 = JavaModifiedUtf8::fromString(utf8);
-  EXPECT_EQ(utf8.size(), jmutf8.size());
-  for (size_t i = 0; i < utf8.size(); i++) {
-    EXPECT_EQ(utf8[i], jmutf8[i]);
+
+  for (auto i = 0; i < ARRAY_SIZE(lowest_boundary_sequences); i++) {
+    utf8.clear();
+    expected.clear();
+    for (auto byte_value : std::get<0>(lowest_boundary_sequences[i])) {
+      utf8 += static_cast<int8_t>(byte_value);
+    }
+    for (auto byte_value : std::get<1>(lowest_boundary_sequences[i])) {
+      expected += static_cast<int8_t>(byte_value);
+    }
+    jmutf8 = JavaModifiedUtf8::fromString(utf8);
+    EXPECT_EQ(expected.size(), jmutf8.size());
+    EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
+  }
+
+  for (auto i = 0; i < ARRAY_SIZE(highest_boundary_sequences); i++) {
+    utf8.clear();
+    expected.clear();
+    for (auto byte_value : std::get<0>(highest_boundary_sequences[i])) {
+      utf8 += static_cast<int8_t>(byte_value);
+    }
+    for (auto byte_value : std::get<1>(highest_boundary_sequences[i])) {
+      expected += static_cast<int8_t>(byte_value);
+    }
+    jmutf8 = JavaModifiedUtf8::fromString(utf8);
+    EXPECT_EQ(expected.size(), jmutf8.size());
+    EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
+  }
+
+  for (auto i = 0; i < ARRAY_SIZE(other_boundary_sequences); i++) {
+    utf8.clear();
+    expected.clear();
+    for (auto byte_value : std::get<0>(other_boundary_sequences[i])) {
+      utf8 += static_cast<int8_t>(byte_value);
+    }
+    for (auto byte_value : std::get<1>(other_boundary_sequences[i])) {
+      expected += static_cast<int8_t>(byte_value);
+    }
+    jmutf8 = JavaModifiedUtf8::fromString(utf8);
+    EXPECT_EQ(expected.size(), jmutf8.size());
+    EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }
 }
 

From 63b417a8ed77395f50a45e6de5e5485d9ac56ec2 Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Thu, 10 Mar 2022 07:59:40 -0800
Subject: [PATCH 07/10] GEODE-4189: Fix Linux build break - Also delete
 obsolete comment block

---
 cppcache/src/util/JavaModifiedUtf8.cpp | 4 ----
 cppcache/test/CacheableStringTests.cpp | 9 ++++++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index 6c62019ad8..875e1efcd7 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -67,10 +67,6 @@ bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) {
                                   utf16_surrogate_codes.end());
 }
 
-// Note on error handling in this method:
-// Error handling here is done just to serve the purpose of not
-// crashing, instead throwing exceptions.  Beyond this, we do NOT fully
-// validate the incoming utf-8 string, it is assumed to be otherwise correct.
 ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
   ju8string jmutf8;
   size_t cursor = 0;
diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp
index 6445882a3f..1927103628 100644
--- a/cppcache/test/CacheableStringTests.cpp
+++ b/cppcache/test/CacheableStringTests.cpp
@@ -382,7 +382,8 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
 
   auto jmutf8 = JavaModifiedUtf8::fromString(utf8);
 
-  for (auto i = 0; i < ARRAY_SIZE(lowest_boundary_sequences); i++) {
+  for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0;
+       i < ARRAY_SIZE(lowest_boundary_sequences); i++) {
     utf8.clear();
     expected.clear();
     for (auto byte_value : std::get<0>(lowest_boundary_sequences[i])) {
@@ -396,7 +397,8 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
     EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }
 
-  for (auto i = 0; i < ARRAY_SIZE(highest_boundary_sequences); i++) {
+  for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0;
+       i < ARRAY_SIZE(highest_boundary_sequences); i++) {
     utf8.clear();
     expected.clear();
     for (auto byte_value : std::get<0>(highest_boundary_sequences[i])) {
@@ -410,7 +412,8 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
     EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }
 
-  for (auto i = 0; i < ARRAY_SIZE(other_boundary_sequences); i++) {
+  for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0;
+       i < ARRAY_SIZE(other_boundary_sequences); i++) {
     utf8.clear();
     expected.clear();
     for (auto byte_value : std::get<0>(other_boundary_sequences[i])) {

From a426c960763005ddd1c20c80d64d91ba3925e4c0 Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Thu, 10 Mar 2022 09:30:22 -0800
Subject: [PATCH 08/10] GEODE-4189: Fix another Linux build break - cast
 byte literals to int8_t in CacheableStringTests

---
 cppcache/test/CacheableStringTests.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp
index 1927103628..026826a49f 100644
--- a/cppcache/test/CacheableStringTests.cpp
+++ b/cppcache/test/CacheableStringTests.cpp
@@ -258,9 +258,9 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
                IllegalArgumentException);
 
   std::string too_long_3_byte_encode;
-  too_long_3_byte_encode.push_back(0xE0);
-  too_long_3_byte_encode.push_back(0x80);
-  too_long_3_byte_encode.push_back(0x80);
+  too_long_3_byte_encode.push_back(static_cast<int8_t>(0xE0));
+  too_long_3_byte_encode.push_back(static_cast<int8_t>(0x80));
+  too_long_3_byte_encode.push_back(static_cast<int8_t>(0x80));
   EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode),
                IllegalArgumentException);
 

From 15fb7e90b696b42e1f1011210790ca9742c8208b Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Fri, 11 Mar 2022 07:26:41 -0800
Subject: [PATCH 09/10] GEODE-4189: Experiment with simpler scanning code

---
 cppcache/src/util/JavaModifiedUtf8.cpp | 163 +++++++++++++++++++++++--
 cppcache/src/util/JavaModifiedUtf8.hpp |   9 ++
 cppcache/test/CacheableStringTests.cpp |  46 +++----
 3 files changed, 187 insertions(+), 31 deletions(-)

diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index 875e1efcd7..dcf5978b93 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -67,6 +67,149 @@ bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) {
                                   utf16_surrogate_codes.end());
 }
 
+enum class UtfScanState : int32_t {
+  Initial = 0,
+  Need1 = 1,
+  Need2 = 2,
+  Need3 = 3,
+  Need4 = 4,
+  Need5 = 5
+};
+
+ju8string JavaModifiedUtf8::decode(const std::string& utf8char) {
+  ju8string jmutf8char;
+
+  if (utf8char.size() == 2) {
+    auto byte1 = utf8char[0];
+    auto byte2 = utf8char[1];
+    if (((byte1 & 0xE0) == 0xC0) && ((byte2 & 0x80) == 0x80)) {
+      int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f);
+      if (code_point > 0x7F) {
+        jmutf8char += byte1;
+        jmutf8char += byte2;
+      } else {
+        throw IllegalArgumentException(
+            "Invalid utf-8 string passed to conversion method (overly long "
+            "ASCII character)");
+      }
+    } else {
+      throw IllegalArgumentException(
+          "Invalid utf-8 string passed to conversion method");
+    }
+  } else if (utf8char.size() == 3) {
+    auto byte1 = utf8char[0];
+    auto byte2 = utf8char[1];
+    auto byte3 = utf8char[2];
+    if (((byte1 & 0xF0) == 0xE0) && ((byte2 & 0x80) == 0x80) &&
+        ((byte3 & 0x80) == 0x80)) {
+      uint16_t code_point =
+          ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F);
+      if (IsValidCodePoint(code_point)) {
+        jmutf8char += byte1;
+        jmutf8char += byte2;
+        jmutf8char += byte3;
+      } else {
+        throw IllegalArgumentException(
+            "Invalid utf-8 string passed to conversion method (overly long "
+            "3-byte encoding)");
+      }
+    } else {
+      throw IllegalArgumentException(
+          "Invalid utf-8 string passed to conversion method");
+    }
+  } else if (utf8char.size() == 4) {
+    auto byte1 = utf8char[0];
+    auto byte2 = utf8char[1];
+    auto byte3 = utf8char[2];
+    auto byte4 = utf8char[3];
+    if (((byte1 & 0xF8) == 0xF0) && ((byte2 & 0x80) == 0x80) &&
+        ((byte3 & 0x80) == 0x80) && ((byte4 & 0x80) == 0x80)) {
+      uint32_t code_point = (byte1 & 0x07) << 18;
+      code_point += (byte2 & 0x3F) << 12;
+      code_point += (byte3 & 0x3F) << 6;
+      code_point += byte4 & 0x3F;
+
+      if (code_point > 0xFFFF) {
+        jmutf8char += static_cast<uint8_t>(0xED);
+        jmutf8char +=
+            static_cast<uint8_t>((0xA0 + (((code_point >> 16) - 1) & 0x0F)));
+        jmutf8char +=
+            static_cast<uint8_t>((0x80 + ((code_point >> 10) & 0x3F)));
+
+        jmutf8char += static_cast<uint8_t>(0xED);
+        jmutf8char += static_cast<uint8_t>((0xB0 + ((code_point >> 6) & 0x0F)));
+        jmutf8char += byte4;
+      } else {
+        throw IllegalArgumentException(
+            "Invalid utf-8 string passed to conversion method (overly long "
+            "4-byte encoding)");
+      }
+    } else {
+      throw IllegalArgumentException(
+          "Invalid utf-8 string passed to conversion method");
+    }
+  }
+
+  return jmutf8char;
+}
+
+ju8string JavaModifiedUtf8::fromStringImproved(const std::string& utf8) {
+  ju8string jmutf8;
+  size_t cursor = 0;
+  auto state = UtfScanState::Initial;
+  std::string current;
+
+  while (cursor < utf8.size()) {
+    auto byte = utf8[cursor++];
+
+    switch (state) {
+      case UtfScanState::Initial:
+        if ((byte & 0x80) == 0) {
+          if (byte) {
+            jmutf8 += byte;
+          } else {
+            jmutf8 += static_cast<uint8_t>(0xC0);
+            jmutf8 += static_cast<uint8_t>(0x80);
+          }
+        } else if ((byte & 0xE0) == 0xC0) {
+          current += byte;
+          state = UtfScanState::Need1;
+        } else if ((byte & 0xF0) == 0xE0) {
+          current += byte;
+          state = UtfScanState::Need2;
+        } else if ((byte & 0xF8) == 0xF0) {
+          current += byte;
+          state = UtfScanState::Need3;
+        } else {
+          throw IllegalArgumentException(
+              "Invalid utf-8 string passed to conversion method");
+        }
+        break;
+      case UtfScanState::Need1: {
+        current += byte;
+        state = UtfScanState::Initial;
+        jmutf8 += JavaModifiedUtf8::decode(current);
+        current.clear();
+      } break;
+      case UtfScanState::Need2:
+        current += byte;
+        state = UtfScanState::Need1;
+        break;
+      case UtfScanState::Need3:
+        current += byte;
+        state = UtfScanState::Need2;
+        break;
+    }
+  }
+
+  if (state != UtfScanState::Initial) {
+    throw IllegalArgumentException(
+        "Invalid utf-8 string passed to conversion method");
+  }
+
+  return jmutf8;
+}
+
 ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
   ju8string jmutf8;
   size_t cursor = 0;
@@ -153,15 +296,19 @@ ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
   return jmutf8;
 }
 
-ju8string JavaModifiedUtf8::fromString(const std::u16string& utf16) {
-  ju8string jmutf8;
-  jmutf8.reserve(utf16.length());
+std::string JavaModifiedUtf8::toString(const ju8string& jmutf8) {
+  // std::string utf8;
+  // size_t cursor = 0;
 
-  for (auto&& c : utf16) {
-    encode(c, jmutf8);
-  }
-
-  return jmutf8;
+  // while (cursor < jmutf8.size()) {
+  //  auto byte1 = jmutf8[cursor++];
+  //  if ((byte1 & 0x80) == 0) {
+  //    utf8.push_back(byte1);
+  //  } else if ((byte1 & 0xE0) == 0xC0) {
+  //    auto byte2 = jmutf8[cursor++];
+  //    if () }
+  //}
+  return "";
 }
 
 void JavaModifiedUtf8::encode(const char16_t c, ju8string& jmutf8) {
diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp
index 8a483db116..cdd19367d1 100644
--- a/cppcache/src/util/JavaModifiedUtf8.hpp
+++ b/cppcache/src/util/JavaModifiedUtf8.hpp
@@ -49,11 +49,18 @@ struct JavaModifiedUtf8 {
    * Converts given UTF-8 string to Java Modified UTF-8 string.
    */
   static ju8string fromString(const std::string& utf8);
+  static ju8string fromStringImproved(const std::string& utf8);
+
   /**
    * Converts given UTF-16 string to Java Modified UTF-8 string.
    */
   static ju8string fromString(const std::u16string& utf16);
 
+  /**
+   * Converts Java-Modified UTF-8 string to UTF-8 string.
+   */
+  std::string toString(const ju8string& jmutf8);
+
   /**
    * Converts a single UTF-16 code unit into Java Modified UTF-8 code units.
    */
@@ -61,6 +68,8 @@ struct JavaModifiedUtf8 {
 
   static std::u16string decode(const char* buf, uint16_t len);
 
+  static ju8string decode(const std::string& utf8char);
+
   static std::u32string decodeU32(const char* buf, uint16_t len);
 
   static char16_t decodeJavaModifiedUtf8Char(const char** pbuf);
diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp
index 026826a49f..38c82ed77a 100644
--- a/cppcache/test/CacheableStringTests.cpp
+++ b/cppcache/test/CacheableStringTests.cpp
@@ -244,68 +244,68 @@ std::vector<int> single_utf_16_surrogates[] = {
 TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
   std::string bad_start_code;
   bad_start_code += static_cast<int8_t>(0xF8);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_start_code),
                IllegalArgumentException);
 
   std::string too_short_2byte;
   too_short_2byte += static_cast<int8_t>(0xC0);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_2byte),
                IllegalArgumentException);
 
   std::string bad_2byte_at_end = "foo";
   bad_2byte_at_end += static_cast<int8_t>(0xC0);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_2byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_2byte_at_end),
                IllegalArgumentException);
 
   std::string too_long_3_byte_encode;
   too_long_3_byte_encode.push_back(static_cast<int8_t>(0xE0));
   too_long_3_byte_encode.push_back(static_cast<int8_t>(0x80));
   too_long_3_byte_encode.push_back(static_cast<int8_t>(0x80));
-  EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_long_3_byte_encode),
                IllegalArgumentException);
 
   std::string too_short_3byte;
   too_short_3byte += static_cast<int8_t>(0xE8);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_3byte),
                IllegalArgumentException);
 
   too_short_3byte += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_3byte),
                IllegalArgumentException);
 
   std::string bad_3byte_at_end = "foo";
   bad_3byte_at_end += static_cast<int8_t>(0xE8);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_3byte_at_end),
                IllegalArgumentException);
 
   bad_3byte_at_end += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_3byte_at_end),
                IllegalArgumentException);
 
   std::string too_short_4byte;
   too_short_4byte += static_cast<int8_t>(0xF7);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte),
                IllegalArgumentException);
 
   too_short_4byte += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte),
                IllegalArgumentException);
 
   too_short_4byte += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte),
                IllegalArgumentException);
 
   std::string bad_4byte_at_end = "foo";
   bad_4byte_at_end += static_cast<int8_t>(0xF7);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end),
                IllegalArgumentException);
 
   bad_4byte_at_end += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end),
                IllegalArgumentException);
 
   bad_4byte_at_end += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end),
                IllegalArgumentException);
 
   for (auto sequence : impossible_bytes) {
@@ -313,7 +313,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
                  IllegalArgumentException);
   }
 
@@ -322,7 +322,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
                  IllegalArgumentException);
   }
 
@@ -331,7 +331,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
                  IllegalArgumentException);
   }
 
@@ -340,7 +340,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
                  IllegalArgumentException);
   }
 
@@ -349,7 +349,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
                  IllegalArgumentException);
   }
 }
@@ -380,7 +380,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
   std::string expected;
   utf8.push_back(0);
 
-  auto jmutf8 = JavaModifiedUtf8::fromString(utf8);
+  auto jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8);
 
   for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0;
        i < ARRAY_SIZE(lowest_boundary_sequences); i++) {
@@ -392,7 +392,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
     for (auto byte_value : std::get<1>(lowest_boundary_sequences[i])) {
       expected += static_cast<int8_t>(byte_value);
     }
-    jmutf8 = JavaModifiedUtf8::fromString(utf8);
+    jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8);
     EXPECT_EQ(expected.size(), jmutf8.size());
     EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }
@@ -407,7 +407,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
     for (auto byte_value : std::get<1>(highest_boundary_sequences[i])) {
       expected += static_cast<int8_t>(byte_value);
     }
-    jmutf8 = JavaModifiedUtf8::fromString(utf8);
+    jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8);
     EXPECT_EQ(expected.size(), jmutf8.size());
     EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }
@@ -422,7 +422,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
     for (auto byte_value : std::get<1>(other_boundary_sequences[i])) {
       expected += static_cast<int8_t>(byte_value);
     }
-    jmutf8 = JavaModifiedUtf8::fromString(utf8);
+    jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8);
     EXPECT_EQ(expected.size(), jmutf8.size());
     EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }

From 959dcff7c949d7583adb46bf32ea24db30a4a632 Mon Sep 17 00:00:00 2001
From: Blake Bender <bblake@vmware.com>
Date: Fri, 11 Mar 2022 08:29:51 -0800
Subject: [PATCH 10/10] GEODE-4189: Unit tests passing with new scanner

---
 cppcache/src/util/JavaModifiedUtf8.cpp | 287 ++++++++++---------------
 cppcache/src/util/JavaModifiedUtf8.hpp |   4 +-
 cppcache/test/CacheableStringTests.cpp |  50 ++---
 3 files changed, 138 insertions(+), 203 deletions(-)

diff --git a/cppcache/src/util/JavaModifiedUtf8.cpp b/cppcache/src/util/JavaModifiedUtf8.cpp
index dcf5978b93..58cd536b26 100644
--- a/cppcache/src/util/JavaModifiedUtf8.cpp
+++ b/cppcache/src/util/JavaModifiedUtf8.cpp
@@ -69,91 +69,114 @@ bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) {
 
 enum class UtfScanState : int32_t {
   Initial = 0,
-  Need1 = 1,
-  Need2 = 2,
-  Need3 = 3,
-  Need4 = 4,
-  Need5 = 5
+  Continuing = 1,
 };
 
-ju8string JavaModifiedUtf8::decode(const std::string& utf8char) {
+ju8string JavaModifiedUtf8::decode2byte(const std::string& utf8char) {
   ju8string jmutf8char;
-
-  if (utf8char.size() == 2) {
-    auto byte1 = utf8char[0];
-    auto byte2 = utf8char[1];
-    if (((byte1 & 0xE0) == 0xC0) && ((byte2 & 0x80) == 0x80)) {
-      int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f);
-      if (code_point > 0x7F) {
-        jmutf8char += byte1;
-        jmutf8char += byte2;
-      } else {
-        throw IllegalArgumentException(
-            "Invalid utf-8 string passed to conversion method (overly long "
-            "ASCII character)");
-      }
+  auto byte1 = utf8char[0];
+  auto byte2 = utf8char[1];
+  if (((byte1 & 0xE0) == 0xC0) && ((byte2 & 0x80) == 0x80)) {
+    int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f);
+    if (code_point > 0x7F) {
+      jmutf8char += byte1;
+      jmutf8char += byte2;
     } else {
       throw IllegalArgumentException(
-          "Invalid utf-8 string passed to conversion method");
+          "Invalid utf-8 string passed to conversion method (overly long "
+          "ASCII character)");
     }
-  } else if (utf8char.size() == 3) {
-    auto byte1 = utf8char[0];
-    auto byte2 = utf8char[1];
-    auto byte3 = utf8char[2];
-    if (((byte1 & 0xF0) == 0xE0) && ((byte2 & 0x80) == 0x80) &&
-        ((byte3 & 0x80) == 0x80)) {
-      uint16_t code_point =
-          ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F);
-      if (IsValidCodePoint(code_point)) {
-        jmutf8char += byte1;
-        jmutf8char += byte2;
-        jmutf8char += byte3;
-      } else {
-        throw IllegalArgumentException(
-            "Invalid utf-8 string passed to conversion method (overly long "
-            "3-byte encoding)");
-      }
+  } else {
+    throw IllegalArgumentException(
+        "Invalid utf-8 string passed to conversion method");
+  }
+
+  return jmutf8char;
+}
+
+ju8string JavaModifiedUtf8::decode3byte(const std::string& utf8char) {
+  ju8string jmutf8char;
+  auto byte1 = utf8char[0];
+  auto byte2 = utf8char[1];
+  auto byte3 = utf8char[2];
+  if (((byte1 & 0xF0) == 0xE0) && ((byte2 & 0x80) == 0x80) &&
+      ((byte3 & 0x80) == 0x80)) {
+    uint16_t code_point =
+        ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F);
+    if (IsValidCodePoint(code_point)) {
+      jmutf8char += byte1;
+      jmutf8char += byte2;
+      jmutf8char += byte3;
     } else {
       throw IllegalArgumentException(
-          "Invalid utf-8 string passed to conversion method");
+          "Invalid utf-8 string passed to conversion method (overly long "
+          "3-byte encoding)");
     }
-  } else if (utf8char.size() == 4) {
-    auto byte1 = utf8char[0];
-    auto byte2 = utf8char[1];
-    auto byte3 = utf8char[2];
-    auto byte4 = utf8char[3];
-    if (((byte1 & 0xF8) == 0xF0) && ((byte2 & 0x80) == 0x80) &&
-        ((byte3 & 0x80) == 0x80) && ((byte4 & 0x80) == 0x80)) {
-      uint32_t code_point = (byte1 & 0x07) << 18;
-      code_point += (byte2 & 0x3F) << 12;
-      code_point += (byte3 & 0x3F) << 6;
-      code_point += byte4 & 0x3F;
-
-      if (code_point > 0xFFFF) {
-        jmutf8char += static_cast<uint8_t>(0xED);
-        jmutf8char +=
-            static_cast<uint8_t>((0xA0 + (((code_point >> 16) - 1) & 0x0F)));
-        jmutf8char +=
-            static_cast<uint8_t>((0x80 + ((code_point >> 10) & 0x3F)));
-
-        jmutf8char += static_cast<uint8_t>(0xED);
-        jmutf8char += static_cast<uint8_t>((0xB0 + ((code_point >> 6) & 0x0F)));
-        jmutf8char += byte4;
-      } else {
-        throw IllegalArgumentException(
-            "Invalid utf-8 string passed to conversion method (overly long "
-            "4-byte encoding)");
-      }
+  } else {
+    throw IllegalArgumentException(
+        "Invalid utf-8 string passed to conversion method");
+  }
+
+  return jmutf8char;
+}
+
+ju8string JavaModifiedUtf8::decode4byte(const std::string& utf8char) {
+  ju8string jmutf8char;
+  auto byte1 = utf8char[0];
+  auto byte2 = utf8char[1];
+  auto byte3 = utf8char[2];
+  auto byte4 = utf8char[3];
+  if (((byte1 & 0xF8) == 0xF0) && ((byte2 & 0x80) == 0x80) &&
+      ((byte3 & 0x80) == 0x80) && ((byte4 & 0x80) == 0x80)) {
+    uint32_t code_point = (byte1 & 0x07) << 18;
+    code_point += (byte2 & 0x3F) << 12;
+    code_point += (byte3 & 0x3F) << 6;
+    code_point += byte4 & 0x3F;
+
+    if (code_point > 0xFFFF) {
+      jmutf8char += static_cast<uint8_t>(0xED);
+      jmutf8char +=
+          static_cast<uint8_t>((0xA0 + (((code_point >> 16) - 1) & 0x0F)));
+      jmutf8char += static_cast<uint8_t>((0x80 + ((code_point >> 10) & 0x3F)));
+
+      jmutf8char += static_cast<uint8_t>(0xED);
+      jmutf8char += static_cast<uint8_t>((0xB0 + ((code_point >> 6) & 0x0F)));
+      jmutf8char += byte4;
     } else {
       throw IllegalArgumentException(
-          "Invalid utf-8 string passed to conversion method");
+          "Invalid utf-8 string passed to conversion method (overly long "
+          "4-byte encoding)");
     }
+  } else {
+    throw IllegalArgumentException(
+        "Invalid utf-8 string passed to conversion method");
   }
 
   return jmutf8char;
 }
 
-ju8string JavaModifiedUtf8::fromStringImproved(const std::string& utf8) {
+ju8string JavaModifiedUtf8::decode(const std::string& utf8char) {
+  ju8string jmutf8char;
+
+  switch (utf8char.size()) {
+    case 2:
+      jmutf8char = decode2byte(utf8char);
+      break;
+    case 3:
+      jmutf8char = decode3byte(utf8char);
+      break;
+    case 4:
+      jmutf8char = decode4byte(utf8char);
+      break;
+    default:
+      throw IllegalArgumentException(
+          "Invalid utf-8 string passed to conversion method");
+  }
+
+  return jmutf8char;
+}
+
+ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
   ju8string jmutf8;
   size_t cursor = 0;
   auto state = UtfScanState::Initial;
@@ -171,37 +194,33 @@ ju8string JavaModifiedUtf8::fromStringImproved(const std::string& utf8) {
             jmutf8 += static_cast<uint8_t>(0xC0);
             jmutf8 += static_cast<uint8_t>(0x80);
           }
-        } else if ((byte & 0xE0) == 0xC0) {
-          current += byte;
-          state = UtfScanState::Need1;
-        } else if ((byte & 0xF0) == 0xE0) {
-          current += byte;
-          state = UtfScanState::Need2;
-        } else if ((byte & 0xF8) == 0xF0) {
-          current += byte;
-          state = UtfScanState::Need3;
-        } else {
+        } else if ((byte & 0xc0) == 0x80) {
           throw IllegalArgumentException(
               "Invalid utf-8 string passed to conversion method");
+        } else {
+          current += byte;
+          state = UtfScanState::Continuing;
         }
         break;
-      case UtfScanState::Need1: {
-        current += byte;
-        state = UtfScanState::Initial;
-        jmutf8 += JavaModifiedUtf8::decode(current);
-        current.clear();
+      case UtfScanState::Continuing: {
+        if ((byte & 0xC0) == 0x80) {
+          current += byte;
+        } else {
+          cursor--;
+          state = UtfScanState::Initial;
+          jmutf8 += JavaModifiedUtf8::decode(current);
+          current.clear();
+        }
       } break;
-      case UtfScanState::Need2:
-        current += byte;
-        state = UtfScanState::Need1;
-        break;
-      case UtfScanState::Need3:
-        current += byte;
-        state = UtfScanState::Need2;
-        break;
     }
   }
 
+  if (current.size() && state == UtfScanState::Continuing) {
+    state = UtfScanState::Initial;
+    jmutf8 += JavaModifiedUtf8::decode(current);
+    current.clear();
+  }
+
   if (state != UtfScanState::Initial) {
     throw IllegalArgumentException(
         "Invalid utf-8 string passed to conversion method");
@@ -210,92 +229,6 @@ ju8string JavaModifiedUtf8::fromStringImproved(const std::string& utf8) {
   return jmutf8;
 }
 
-ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
-  ju8string jmutf8;
-  size_t cursor = 0;
-
-  while (cursor < utf8.size()) {
-    auto byte1 = utf8[cursor++];
-    if ((byte1 & 0x80) == 0) {
-      if (byte1) {
-        jmutf8 += byte1;
-      } else {
-        jmutf8 += static_cast<uint8_t>(0xC0);
-        jmutf8 += static_cast<uint8_t>(0x80);
-      }
-    } else if ((byte1 & 0xE0) == 0xC0) {
-      if (utf8.size() > 0 && cursor <= utf8.size() - 1) {
-        auto byte2 = utf8[cursor++];
-        int32_t code_point = ((byte1 & 0x1f) << 6) + (byte2 & 0x3f);
-        if (code_point > 0x7F) {
-          jmutf8 += byte1;
-          jmutf8 += byte2;
-        } else {
-          throw IllegalArgumentException(
-              "Invalid utf-8 string passed to conversion method (overly long "
-              "ASCII character)");
-        }
-      } else {
-        throw IllegalArgumentException(
-            "Invalid utf-8 string passed to conversion method");
-      }
-    } else if ((byte1 & 0xF0) == 0xE0) {
-      if (utf8.size() > 2 && cursor <= utf8.size() - 2) {
-        auto byte2 = utf8[cursor++];
-        auto byte3 = utf8[cursor++];
-
-        uint16_t code_point =
-            ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F);
-        if (IsValidCodePoint(code_point)) {
-          jmutf8 += byte1;
-          jmutf8 += byte2;
-          jmutf8 += byte3;
-        } else {
-          throw IllegalArgumentException(
-              "Invalid utf-8 string passed to conversion method (overly long "
-              "3-byte encoding)");
-        }
-      } else {
-        throw IllegalArgumentException(
-            "Invalid utf-8 string passed to conversion method");
-      }
-    } else if ((byte1 & 0xF8) == 0xF0) {
-      if (utf8.size() > 3 && cursor <= utf8.size() - 3) {
-        auto byte2 = utf8[cursor++];
-        auto byte3 = utf8[cursor++];
-        auto byte4 = utf8[cursor++];
-
-        uint32_t code_point = (byte1 & 0x07) << 18;
-        code_point += (byte2 & 0x3F) << 12;
-        code_point += (byte3 & 0x3F) << 6;
-        code_point += byte4 & 0x3F;
-
-        if (code_point > 0xFFFF) {
-          jmutf8 += static_cast<uint8_t>(0xED);
-          jmutf8 +=
-              static_cast<uint8_t>((0xA0 + (((code_point >> 16) - 1) & 0x0F)));
-          jmutf8 += static_cast<uint8_t>((0x80 + ((code_point >> 10) & 0x3F)));
-
-          jmutf8 += static_cast<uint8_t>(0xED);
-          jmutf8 += static_cast<uint8_t>((0xB0 + ((code_point >> 6) & 0x0F)));
-          jmutf8 += byte4;
-        } else {
-          throw IllegalArgumentException(
-              "Invalid utf-8 string passed to conversion method (overly long "
-              "4-byte encoding)");
-        }
-      } else {
-        throw IllegalArgumentException(
-            "Invalid utf-8 string passed to conversion method");
-      }
-
-    } else {
-      throw IllegalArgumentException("Invalid utf-8 start code");
-    }
-  }
-  return jmutf8;
-}
-
 std::string JavaModifiedUtf8::toString(const ju8string& jmutf8) {
   // std::string utf8;
   // size_t cursor = 0;
diff --git a/cppcache/src/util/JavaModifiedUtf8.hpp b/cppcache/src/util/JavaModifiedUtf8.hpp
index cdd19367d1..7029d578f9 100644
--- a/cppcache/src/util/JavaModifiedUtf8.hpp
+++ b/cppcache/src/util/JavaModifiedUtf8.hpp
@@ -49,7 +49,6 @@ struct JavaModifiedUtf8 {
    * Converts given UTF-8 string to Java Modified UTF-8 string.
    */
   static ju8string fromString(const std::string& utf8);
-  static ju8string fromStringImproved(const std::string& utf8);
 
   /**
    * Converts given UTF-16 string to Java Modified UTF-8 string.
@@ -76,6 +75,9 @@ struct JavaModifiedUtf8 {
 
  private:
   static bool IsValidCodePoint(uint16_t code_point);
+  static ju8string decode2byte(const std::string& utf8char);
+  static ju8string decode3byte(const std::string& utf8char);
+  static ju8string decode4byte(const std::string& utf8char);
 };
 
 }  // namespace internal
diff --git a/cppcache/test/CacheableStringTests.cpp b/cppcache/test/CacheableStringTests.cpp
index 38c82ed77a..5ccec2203a 100644
--- a/cppcache/test/CacheableStringTests.cpp
+++ b/cppcache/test/CacheableStringTests.cpp
@@ -244,68 +244,68 @@ std::vector<int> single_utf_16_surrogates[] = {
 TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
   std::string bad_start_code;
   bad_start_code += static_cast<int8_t>(0xF8);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_start_code),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_start_code),
                IllegalArgumentException);
 
   std::string too_short_2byte;
   too_short_2byte += static_cast<int8_t>(0xC0);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_2byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_2byte),
                IllegalArgumentException);
 
   std::string bad_2byte_at_end = "foo";
   bad_2byte_at_end += static_cast<int8_t>(0xC0);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_2byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_2byte_at_end),
                IllegalArgumentException);
 
   std::string too_long_3_byte_encode;
   too_long_3_byte_encode.push_back(static_cast<int8_t>(0xE0));
   too_long_3_byte_encode.push_back(static_cast<int8_t>(0x80));
   too_long_3_byte_encode.push_back(static_cast<int8_t>(0x80));
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_long_3_byte_encode),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_long_3_byte_encode),
                IllegalArgumentException);
 
   std::string too_short_3byte;
   too_short_3byte += static_cast<int8_t>(0xE8);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_3byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
                IllegalArgumentException);
 
   too_short_3byte += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_3byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_3byte),
                IllegalArgumentException);
 
   std::string bad_3byte_at_end = "foo";
   bad_3byte_at_end += static_cast<int8_t>(0xE8);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_3byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
                IllegalArgumentException);
 
   bad_3byte_at_end += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_3byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_3byte_at_end),
                IllegalArgumentException);
 
   std::string too_short_4byte;
   too_short_4byte += static_cast<int8_t>(0xF7);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
                IllegalArgumentException);
 
   too_short_4byte += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
                IllegalArgumentException);
 
   too_short_4byte += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(too_short_4byte),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(too_short_4byte),
                IllegalArgumentException);
 
   std::string bad_4byte_at_end = "foo";
   bad_4byte_at_end += static_cast<int8_t>(0xF7);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
                IllegalArgumentException);
 
   bad_4byte_at_end += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
                IllegalArgumentException);
 
   bad_4byte_at_end += static_cast<int8_t>(0x1);
-  EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_4byte_at_end),
+  EXPECT_THROW(JavaModifiedUtf8::fromString(bad_4byte_at_end),
                IllegalArgumentException);
 
   for (auto sequence : impossible_bytes) {
@@ -313,7 +313,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
   }
 
@@ -322,7 +322,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
   }
 
@@ -331,7 +331,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
   }
 
@@ -340,7 +340,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
   }
 
@@ -349,7 +349,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8BadStrings) {
     for (auto byte_value : sequence) {
       bad_sequence += static_cast<int8_t>(byte_value);
     }
-    EXPECT_THROW(JavaModifiedUtf8::fromStringImproved(bad_sequence),
+    EXPECT_THROW(JavaModifiedUtf8::fromString(bad_sequence),
                  IllegalArgumentException);
   }
 }
@@ -362,7 +362,7 @@ std::pair<std::vector<int>, std::vector<int>> lowest_boundary_sequences[] = {
 
 std::pair<std::vector<int>, std::vector<int>> highest_boundary_sequences[] = {
     {{0x7F}, {0x7F}},
-    {{0xDF, 0xCF}, {0xDF, 0xCF}},
+    {{0xDF, 0xBF}, {0xDF, 0xBF}},
     {{0xEF, 0xBF, 0xBF}, {0xEF, 0xBF, 0xBF}},
     {{0xF7, 0xBF, 0xBF, 0xBF}, {0xED, 0xAE, 0xBF, 0xED, 0xBF, 0xBF}},
 };
@@ -380,7 +380,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
   std::string expected;
   utf8.push_back(0);
 
-  auto jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8);
+  auto jmutf8 = JavaModifiedUtf8::fromString(utf8);
 
   for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0;
        i < ARRAY_SIZE(lowest_boundary_sequences); i++) {
@@ -392,12 +392,12 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
     for (auto byte_value : std::get<1>(lowest_boundary_sequences[i])) {
       expected += static_cast<int8_t>(byte_value);
     }
-    jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8);
+    jmutf8 = JavaModifiedUtf8::fromString(utf8);
     EXPECT_EQ(expected.size(), jmutf8.size());
     EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }
 
-  for (decltype(ARRAY_SIZE(lowest_boundary_sequences)) i = 0;
+  for (decltype(ARRAY_SIZE(highest_boundary_sequences)) i = 0;
        i < ARRAY_SIZE(highest_boundary_sequences); i++) {
     utf8.clear();
     expected.clear();
@@ -407,7 +407,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
     for (auto byte_value : std::get<1>(highest_boundary_sequences[i])) {
       expected += static_cast<int8_t>(byte_value);
     }
-    jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8);
+    jmutf8 = JavaModifiedUtf8::fromString(utf8);
     EXPECT_EQ(expected.size(), jmutf8.size());
     EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }
@@ -422,7 +422,7 @@ TEST_F(CacheableStringTests, TestUtf8ToJmUtf8Boundaries) {
     for (auto byte_value : std::get<1>(other_boundary_sequences[i])) {
       expected += static_cast<int8_t>(byte_value);
     }
-    jmutf8 = JavaModifiedUtf8::fromStringImproved(utf8);
+    jmutf8 = JavaModifiedUtf8::fromString(utf8);
     EXPECT_EQ(expected.size(), jmutf8.size());
     EXPECT_TRUE(!memcmp(&expected[0], &jmutf8[0], expected.size()));
   }