Skip to content
Draft
14 changes: 8 additions & 6 deletions cppcache/src/DataInput.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include "util/JavaModifiedUtf8.hpp"
#include "util/string.hpp"

using apache::geode::client::internal::JavaModifiedUtf8;

namespace apache {
namespace geode {
namespace client {
Expand Down Expand Up @@ -62,8 +64,8 @@ void DataInput::readJavaModifiedUtf8(
std::basic_string<char16_t, _Traits, _Allocator>& value) {
uint16_t length = readInt16();
_GEODE_CHECK_BUFFER_SIZE(length);
value = internal::JavaModifiedUtf8::decode(
reinterpret_cast<const char*>(m_buf), length);
value =
JavaModifiedUtf8::decode(reinterpret_cast<const char*>(m_buf), length);
advanceCursor(length);
}
template APACHE_GEODE_EXPLICIT_TEMPLATE_EXPORT void
Expand All @@ -72,10 +74,10 @@ DataInput::readJavaModifiedUtf8(std::u16string&);
template <class _Traits, class _Allocator>
void DataInput::readJavaModifiedUtf8(
std::basic_string<char32_t, _Traits, _Allocator>& value) {
// TODO string OPTIMIZE convert from UTF-16 to UCS-4 directly
std::u16string utf16;
readJavaModifiedUtf8(utf16);
value = to_ucs4(utf16);
uint16_t length = readInt16();
_GEODE_CHECK_BUFFER_SIZE(length);
value =
JavaModifiedUtf8::decodeU32(reinterpret_cast<const char*>(m_buf), length);
}
template APACHE_GEODE_EXPLICIT_TEMPLATE_EXPORT void
DataInput::readJavaModifiedUtf8(std::u32string&);
Expand Down
236 changes: 228 additions & 8 deletions cppcache/src/util/JavaModifiedUtf8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

#include <codecvt>
#include <locale>
#include <set>

#include "geode/ExceptionTypes.hpp"
#include "string.hpp"

namespace apache {
Expand Down Expand Up @@ -57,22 +59,192 @@ size_t JavaModifiedUtf8::encodedLength(const char16_t* data, size_t length) {
return encodedLen;
}

std::string JavaModifiedUtf8::fromString(const std::string& utf8) {
return fromString(to_utf16(utf8));
// Seven code units taken from the UTF-16 surrogate area (U+D800..U+DFFF):
// 0xD800/0xDBFF and 0xDC00/0xDFFF are the first/last values of the high and
// low surrogate halves; 0xDB7F, 0xDB80 and 0xDF80 lie inside those halves.
// NOTE(review): this is a sample of the range, not every surrogate value.
std::set<int> utf16_surrogate_codes = {{0xD800}, {0xDB7F}, {0xDB80}, {0xDBFF},
{0xDC00}, {0xDF80}, {0xDFFF}};

/**
 * Tells whether a value decoded from a three-byte UTF-8 sequence is
 * acceptable.
 *
 * @param code_point value decoded from the three-byte sequence.
 * @return false when the value is at most 0x7FF (it would fit a shorter
 *         encoding, so the sequence is overlong) or lies anywhere in the
 *         UTF-16 surrogate range U+D800..U+DFFF; true otherwise.
 */
bool JavaModifiedUtf8::IsValidCodePoint(uint16_t code_point) {
  const bool overlong = code_point <= 0x7FF;
  // The entire surrogate block is reserved for UTF-16 and must never appear
  // as a scalar value in UTF-8, not merely its boundary values.
  const bool surrogate = (code_point >= 0xD800) && (code_point <= 0xDFFF);
  return !overlong && !surrogate;
}

/**
 * Scanner position while walking a UTF-8 byte stream.
 */
enum class UtfScanState : int32_t {
  // Not inside a multi-byte sequence (value 0).
  Initial,
  // A lead byte has been seen and trail bytes are being collected (value 1).
  Continuing,
};

/**
 * Validates a two-byte UTF-8 sequence and returns the same two bytes as a
 * Java Modified UTF-8 string.
 *
 * @param utf8char exactly two bytes: a 110xxxxx lead and a 10xxxxxx trail.
 * @return the two input bytes.
 * @throws IllegalArgumentException if the length or bit patterns are wrong,
 *         or if the sequence encodes a value of 0x7F or less (overlong).
 */
ju8string JavaModifiedUtf8::decode2byte(const std::string& utf8char) {
  if (utf8char.size() != 2) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method");
  }

  const auto byte1 = utf8char[0];
  const auto byte2 = utf8char[1];

  // A trail byte is 10xxxxxx; testing only the top bit would also accept
  // 11xxxxxx, which is a lead byte.
  if (((byte1 & 0xE0) != 0xC0) || ((byte2 & 0xC0) != 0x80)) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method");
  }

  const int32_t code_point = ((byte1 & 0x1F) << 6) + (byte2 & 0x3F);
  if (code_point <= 0x7F) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method (overly long "
        "ASCII character)");
  }

  ju8string jmutf8char;
  jmutf8char += byte1;
  jmutf8char += byte2;
  return jmutf8char;
}

/**
 * Validates a three-byte UTF-8 sequence and returns the same three bytes as a
 * Java Modified UTF-8 string.
 *
 * @param utf8char exactly three bytes: a 1110xxxx lead and two 10xxxxxx
 *        trails.
 * @return the three input bytes.
 * @throws IllegalArgumentException if the length or bit patterns are wrong,
 *         or if IsValidCodePoint() rejects the decoded value.
 */
ju8string JavaModifiedUtf8::decode3byte(const std::string& utf8char) {
  if (utf8char.size() != 3) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method");
  }

  const auto byte1 = utf8char[0];
  const auto byte2 = utf8char[1];
  const auto byte3 = utf8char[2];

  // Trail bytes must be 10xxxxxx; a top-bit-only test would let lead bytes
  // (11xxxxxx) through.
  const bool wellFormed = ((byte1 & 0xF0) == 0xE0) &&
                          ((byte2 & 0xC0) == 0x80) &&
                          ((byte3 & 0xC0) == 0x80);
  if (!wellFormed) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method");
  }

  const auto code_point = static_cast<uint16_t>(
      ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F));
  if (!IsValidCodePoint(code_point)) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method (overly long "
        "3-byte encoding)");
  }

  ju8string jmutf8char;
  jmutf8char += byte1;
  jmutf8char += byte2;
  jmutf8char += byte3;
  return jmutf8char;
}

/**
 * Converts a four-byte UTF-8 sequence (a supplementary-plane character) into
 * the six-byte Java Modified UTF-8 form: the character's UTF-16 surrogate
 * pair, each half written as a three-byte sequence starting with 0xED.
 *
 * @param utf8char exactly four bytes: a 11110xxx lead and three 10xxxxxx
 *        trails.
 * @return six bytes holding the encoded high and low surrogates.
 * @throws IllegalArgumentException if the length or bit patterns are wrong,
 *         if the value is 0xFFFF or less (overlong), or if it exceeds
 *         U+10FFFF.
 */
ju8string JavaModifiedUtf8::decode4byte(const std::string& utf8char) {
  if (utf8char.size() != 4) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method");
  }

  const auto byte1 = utf8char[0];
  const auto byte2 = utf8char[1];
  const auto byte3 = utf8char[2];
  const auto byte4 = utf8char[3];

  const bool wellFormed =
      ((byte1 & 0xF8) == 0xF0) && ((byte2 & 0xC0) == 0x80) &&
      ((byte3 & 0xC0) == 0x80) && ((byte4 & 0xC0) == 0x80);
  if (!wellFormed) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method");
  }

  uint32_t code_point = static_cast<uint32_t>(byte1 & 0x07) << 18;
  code_point += static_cast<uint32_t>(byte2 & 0x3F) << 12;
  code_point += static_cast<uint32_t>(byte3 & 0x3F) << 6;
  code_point += static_cast<uint32_t>(byte4 & 0x3F);

  if (code_point <= 0xFFFF) {
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method (overly long "
        "4-byte encoding)");
  }
  if (code_point > 0x10FFFF) {
    // Beyond the last code point a surrogate pair can express; the 4-bit
    // field below would wrap and silently produce a different character.
    throw IllegalArgumentException(
        "Invalid utf-8 string passed to conversion method (code point above "
        "U+10FFFF)");
  }

  // Split into the UTF-16 surrogate pair.
  const uint32_t offset = code_point - 0x10000;
  const uint32_t highSurrogate = 0xD800 + (offset >> 10);
  const uint32_t lowSurrogate = 0xDC00 + (offset & 0x3FF);

  ju8string jmutf8char;
  for (const auto surrogate : {highSurrogate, lowSurrogate}) {
    // Three-byte form 1110xxxx 10xxxxxx 10xxxxxx; the first byte is always
    // 0xED because every surrogate has 0xD as its top four bits.
    jmutf8char += static_cast<char>(0xED);
    jmutf8char += static_cast<char>(0x80 | ((surrogate >> 6) & 0x3F));
    jmutf8char += static_cast<char>(0x80 | (surrogate & 0x3F));
  }
  return jmutf8char;
}

std::string JavaModifiedUtf8::fromString(const std::u16string& utf16) {
std::string jmutf8;
jmutf8.reserve(utf16.length());
ju8string JavaModifiedUtf8::decode(const std::string& utf8char) {
ju8string jmutf8char;

for (auto&& c : utf16) {
encode(c, jmutf8);
switch (utf8char.size()) {
case 2:
jmutf8char = decode2byte(utf8char);
break;
case 3:
jmutf8char = decode3byte(utf8char);
break;
case 4:
jmutf8char = decode4byte(utf8char);
break;
default:
throw IllegalArgumentException(
"Invalid utf-8 string passed to conversion method");
}

return jmutf8char;
}

/**
 * Converts a UTF-8 string to Java Modified UTF-8.
 *
 * Non-zero ASCII bytes are copied, a zero byte becomes the pair 0xC0 0x80,
 * and each multi-byte character (a lead byte plus the trail bytes that follow
 * it) is handed to decode().
 *
 * @param utf8 source string.
 * @return the converted string.
 * @throws IllegalArgumentException if a trail byte appears where a character
 *         should start, or if decode() rejects a collected character.
 */
ju8string JavaModifiedUtf8::fromString(const std::string& utf8) {
  ju8string converted;
  // Bytes of the multi-byte character being collected; non-empty exactly
  // while the scanner is in the middle of such a character.
  std::string pending;

  const auto flushPending = [&converted, &pending]() {
    converted += JavaModifiedUtf8::decode(pending);
    pending.clear();
  };

  for (size_t pos = 0; pos < utf8.size(); ++pos) {
    const auto octet = utf8[pos];
    const bool isTrail = (octet & 0xC0) == 0x80;

    if (!pending.empty()) {
      if (isTrail) {
        pending += octet;
        continue;
      }
      // The character ended; convert it, then handle this byte as the start
      // of the next one.
      flushPending();
    }

    if ((octet & 0x80) != 0) {
      if (isTrail) {
        throw IllegalArgumentException(
            "Invalid utf-8 string passed to conversion method");
      }
      pending += octet;
    } else if (octet) {
      converted += octet;
    } else {
      converted += static_cast<uint8_t>(0xC0);
      converted += static_cast<uint8_t>(0x80);
    }
  }

  // Input ended while a character was still being collected.
  if (!pending.empty()) {
    flushPending();
  }

  return converted;
}

void JavaModifiedUtf8::encode(const char16_t c, std::string& jmutf8) {
std::string JavaModifiedUtf8::toString(const ju8string& jmutf8) {
// std::string utf8;
// size_t cursor = 0;

// while (cursor < jmutf8.size()) {
// auto byte1 = jmutf8[cursor++];
// if ((byte1 & 0x80) == 0) {
// utf8.push_back(byte1);
// } else if ((byte1 & 0xE0) == 0xC0) {
// auto byte2 = jmutf8[cursor++];
// if () }
//}
return "";
}

void JavaModifiedUtf8::encode(const char16_t c, ju8string& jmutf8) {
if (c == 0) {
// NUL
jmutf8 += static_cast<uint8_t>(0xc0);
Expand All @@ -90,6 +262,54 @@ void JavaModifiedUtf8::encode(const char16_t c, std::string& jmutf8) {
}
}

/**
 * Decodes Java Modified UTF-8 bytes directly into a UCS-4 string.
 *
 * One-, two- and three-byte sequences yield one code point each (0xC0 0x80
 * yields U+0000). A high surrogate written as 0xED 0xA? .. must be followed
 * by a low surrogate written as 0xED 0xB? ..; the pair is combined into one
 * supplementary code point.
 *
 * @param buf encoded bytes.
 * @param len number of bytes available at \p buf.
 * @return the decoded code points.
 * @throws IllegalArgumentException on truncated sequences, bad lead or trail
 *         bytes, or unpaired surrogates.
 */
std::u32string JavaModifiedUtf8::decodeU32(const char* buf, uint16_t len) {
  const auto require = [](bool ok) {
    if (!ok) {
      throw IllegalArgumentException("Bad encoding in jmutf-8 string");
    }
  };
  const auto isTrail = [](uint8_t octet) { return (octet & 0xC0) == 0x80; };
  const auto octetAt = [buf](size_t index) {
    return static_cast<uint8_t>(buf[index]);
  };

  std::u32string result;
  result.reserve(len);

  size_t i = 0;
  while (i < len) {
    const size_t remaining = len - i;
    const uint8_t byte1 = octetAt(i);

    if ((byte1 & 0x80) == 0) {
      result.push_back(static_cast<char32_t>(byte1));
      i += 1;
    } else if ((byte1 & 0xE0) == 0xC0) {
      require(remaining >= 2);
      const uint8_t byte2 = octetAt(i + 1);
      require(isTrail(byte2));
      // 0xC0 0x80 naturally evaluates to 0 here.
      result.push_back(
          static_cast<char32_t>(((byte1 & 0x1F) << 6) | (byte2 & 0x3F)));
      i += 2;
    } else if ((byte1 & 0xF0) == 0xE0) {
      require(remaining >= 3);
      const uint8_t byte2 = octetAt(i + 1);
      const uint8_t byte3 = octetAt(i + 2);
      require(isTrail(byte2) && isTrail(byte3));

      // Only 0xED followed by 0xA0..0xAF starts a surrogate pair; other
      // 0xED-led sequences (U+D000..U+D7FF) are ordinary characters.
      if (byte1 == 0xED && (byte2 & 0xF0) == 0xA0) {
        require(remaining >= 6);
        const uint8_t byte4 = octetAt(i + 3);
        const uint8_t byte5 = octetAt(i + 4);
        const uint8_t byte6 = octetAt(i + 5);
        require(byte4 == 0xED && (byte5 & 0xF0) == 0xB0 && isTrail(byte6));

        char32_t code_point = 0x10000;
        code_point += static_cast<char32_t>(byte2 & 0x0F) << 16;
        code_point += static_cast<char32_t>(byte3 & 0x3F) << 10;
        code_point += static_cast<char32_t>(byte5 & 0x0F) << 6;
        code_point += static_cast<char32_t>(byte6 & 0x3F);
        result.push_back(code_point);
        i += 6;
      } else {
        // A low surrogate with no preceding high surrogate is unpaired.
        require(!(byte1 == 0xED && (byte2 & 0xF0) == 0xB0));
        char32_t code_point = static_cast<char32_t>(byte1 & 0x0F) << 12;
        code_point += static_cast<char32_t>(byte2 & 0x3F) << 6;
        code_point += static_cast<char32_t>(byte3 & 0x3F);
        result.push_back(code_point);
        i += 3;
      }
    } else {
      // Stray trail byte, or a 0xF0..0xFF lead that Modified UTF-8 never
      // produces.
      require(false);
    }
  }

  return result;
}

std::u16string JavaModifiedUtf8::decode(const char* buf, uint16_t len) {
std::u16string value;
const auto end = buf + len;
Expand Down
24 changes: 21 additions & 3 deletions cppcache/src/util/JavaModifiedUtf8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ namespace geode {
namespace client {
namespace internal {

/**
 * Traits class identical in behavior to std::char_traits<char>; using it as
 * the traits argument makes ju8string a type distinct from std::string.
 */
struct ju8type_traits : std::char_traits<char> {};

/** String of Java Modified UTF-8 bytes. */
using ju8string = std::basic_string<char, ju8type_traits>;

struct JavaModifiedUtf8 {
/**
* Calculate the length of the given UTF-8 string when encoded in Java
Expand All @@ -45,21 +48,36 @@ struct JavaModifiedUtf8 {
/**
* Converts given UTF-8 string to Java Modified UTF-8 string.
*/
static std::string fromString(const std::string& utf8);
static ju8string fromString(const std::string& utf8);

/**
* Converts given UTF-16 string to Java Modified UTF-8 string.
*/
static std::string fromString(const std::u16string& utf16);
static ju8string fromString(const std::u16string& utf16);

/**
* Converts Java-Modified UTF-8 string to UTF-8 string.
*/
std::string toString(const ju8string& jmutf8);

/**
* Converts a single UTF-16 code unit into Java Modified UTF-8 code units.
*/
static void encode(const char16_t c, std::string& jmutf8);
static void encode(const char16_t c, ju8string& jmutf8);

static std::u16string decode(const char* buf, uint16_t len);

static ju8string decode(const std::string& utf8char);

static std::u32string decodeU32(const char* buf, uint16_t len);

static char16_t decodeJavaModifiedUtf8Char(const char** pbuf);

private:
static bool IsValidCodePoint(uint16_t code_point);
static ju8string decode2byte(const std::string& utf8char);
static ju8string decode3byte(const std::string& utf8char);
static ju8string decode4byte(const std::string& utf8char);
};

} // namespace internal
Expand Down
Loading