From c440a7aeecdb9dec0ce21bb31cef58ed6d1724e1 Mon Sep 17 00:00:00 2001 From: Stephen Webb Date: Wed, 8 Apr 2026 12:26:00 +1000 Subject: [PATCH 1/4] InputStreamReader failed to load a multibyte UTF-8 sequence on a read boundary --- src/main/cpp/inputstreamreader.cpp | 22 ++++------- src/test/cpp/filetestcase.cpp | 60 +++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/src/main/cpp/inputstreamreader.cpp b/src/main/cpp/inputstreamreader.cpp index ba5ce19bc..449a5f110 100644 --- a/src/main/cpp/inputstreamreader.cpp +++ b/src/main/cpp/inputstreamreader.cpp @@ -79,25 +79,17 @@ LogString InputStreamReader::read(Pool& p) while (m_priv->in->read(buf) >= 0) { buf.flip(); + auto lastAvailableCount = buf.remaining(); log4cxx_status_t stat = m_priv->dec->decode(buf, output); - if (stat != 0) + if (buf.remaining() == lastAvailableCount) { - throw IOException(LOG4CXX_STR("decode"), stat); - } - - if (buf.remaining() > 0) - { - if (buf.remaining() == BUFSIZE) - { - throw IOException(LOG4CXX_STR("Decoder made no progress")); - } - buf.carry(); - } - else - { - buf.clear(); + if (stat != 0) + throw IOException(LOG4CXX_STR("decode"), stat); + else + throw IOException(LOG4CXX_STR("decode made no progress")); } + buf.carry(); } return output; diff --git a/src/test/cpp/filetestcase.cpp b/src/test/cpp/filetestcase.cpp index 00146c834..9910afdf3 100644 --- a/src/test/cpp/filetestcase.cpp +++ b/src/test/cpp/filetestcase.cpp @@ -28,6 +28,8 @@ #include #include #include +#include +#include #if LOG4CXX_CFSTRING_API #include @@ -58,6 +60,8 @@ LOGUNIT_CLASS(FileTestCase) LOGUNIT_TEST(copyConstructor); LOGUNIT_TEST(assignment); LOGUNIT_TEST(deleteBackslashedFileName); + LOGUNIT_TEST(testSplitMultibyteUtf8); + LOGUNIT_TEST(testInvalidUtf8); LOGUNIT_TEST_SUITE_END(); #ifdef _DEBUG @@ -102,6 +106,8 @@ LOGUNIT_CLASS(FileTestCase) } catch (IOException& ex) { + LOG4CXX_DECODE_CHAR(msg, ex.what()); + LogLog::debug(msg); } } @@ -206,7 +212,59 @@ LOGUNIT_CLASS(FileTestCase) Pool pool; /*bool deleted = */file.deleteFile(pool); } -}; + class MockInputStream : public InputStream + { + ByteBuffer m_data; + public: + MockInputStream(const char* data, size_t charCount) + : m_data(const_cast(data), charCount) + {} + + int read(ByteBuffer& dst) override + { + auto availableBytes = m_data.remaining(); + if (availableBytes < 1) + return -1; + int count = 0; + for (auto p = m_data.current(); count < availableBytes && dst.put(*p); ++p) + ++count; + m_data.increment_position(count); + return count; + } + + void close() override {} + }; + + /** + * Tests behavior when a multibyte UTF-8 sequence occurs on a read boundary + */ + void testSplitMultibyteUtf8() + { + Pool p; + // InputStreamReader uses a buffer of size 4096 + std::string input( 4094, 'A' ); + // räksmörgås.josefsson.org + input.append("\162\303\244\153\163\155\303\266\162\147\303\245\163\056\152\157\163\145\146\163\163\157\156\056\157\162\147"); + InputStreamReader reader(std::make_shared(input.c_str(), input.size()), CharsetDecoder::getUTF8Decoder()); + auto contentLS = reader.read(p); + LOG4CXX_ENCODE_CHAR(content, contentLS); + LOGUNIT_ASSERT_EQUAL(input, content); + } + + /** + * Tests behavior given an incomplete multibyte UTF-8 sequence in the input + */ + void testInvalidUtf8() + { + Pool p; + // 0xC2 is a generic start byte for a 2-byte sequence in UTF-8. + char input[] = { 'A', (char)0xC2, 'B', 'C', 0 }; + InputStreamReader reader(std::make_shared(input, 4), CharsetDecoder::getUTF8Decoder()); + auto contentLS = reader.read(p); + LOG4CXX_ENCODE_CHAR(content, contentLS); + LOGUNIT_ASSERT_EQUAL("A", content); + } +}; LOGUNIT_TEST_SUITE_REGISTRATION(FileTestCase); From ae5bcc435b1a587c0aba439ab2986586621305d4 Mon Sep 17 00:00:00 2001 From: Stephen Webb Date: Wed, 8 Apr 2026 13:09:37 +1000 Subject: [PATCH 2/4] Raise an exception on a decoding failure --- src/main/cpp/inputstreamreader.cpp | 28 ++++++++++++++++++++-------- src/test/cpp/filetestcase.cpp | 13 ++++++++++--- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/main/cpp/inputstreamreader.cpp b/src/main/cpp/inputstreamreader.cpp index 449a5f110..bb89a8398 100644 --- a/src/main/cpp/inputstreamreader.cpp +++ b/src/main/cpp/inputstreamreader.cpp @@ -20,6 +20,8 @@ #include #include #include +#include +#include using namespace LOG4CXX_NS; using namespace LOG4CXX_NS::helpers; @@ -74,23 +76,33 @@ LogString InputStreamReader::read(Pool& p) const size_t BUFSIZE = 4096; ByteBuffer buf(p.pstralloc(BUFSIZE), BUFSIZE); LogString output; + log4cxx_status_t stat{ 0 }; // read whole file while (m_priv->in->read(buf) >= 0) { buf.flip(); auto lastAvailableCount = buf.remaining(); - log4cxx_status_t stat = m_priv->dec->decode(buf, output); - + stat = m_priv->dec->decode(buf, output); if (buf.remaining() == lastAvailableCount) - { - if (stat != 0) - throw IOException(LOG4CXX_STR("decode"), stat); - else - throw IOException(LOG4CXX_STR("decode made no progress")); - } + break; buf.carry(); } + if (stat != 0 && 0 < buf.remaining()) + { + auto toHexDigit = [](int ch) -> int + { + return (10 <= ch ? (0x61 - 10) : 0x30) + ch; + }; + LogString msg(LOG4CXX_STR("Unable to decode character 0x")); + auto ch = static_cast(*buf.current()); + msg.push_back(toHexDigit((ch & 0xF0) >> 4)); + msg.push_back(toHexDigit((ch & 0xF))); + msg += LOG4CXX_STR(" at offset "); + Pool p; + StringHelper::toString(output.size(), p, msg); + throw RuntimeException(msg); + } return output; } diff --git a/src/test/cpp/filetestcase.cpp b/src/test/cpp/filetestcase.cpp index 9910afdf3..e0c45d179 100644 --- a/src/test/cpp/filetestcase.cpp +++ b/src/test/cpp/filetestcase.cpp @@ -261,9 +261,16 @@ LOGUNIT_CLASS(FileTestCase) // 0xC2 is a generic start byte for a 2-byte sequence in UTF-8. char input[] = { 'A', (char)0xC2, 'B', 'C', 0 }; InputStreamReader reader(std::make_shared(input, 4), CharsetDecoder::getUTF8Decoder()); - auto contentLS = reader.read(p); - LOG4CXX_ENCODE_CHAR(content, contentLS); - LOGUNIT_ASSERT_EQUAL("A", content); + try + { + reader.read(p); + LOGUNIT_ASSERT(false); + } + catch (const Exception& ex) + { + LOG4CXX_DECODE_CHAR(msg, ex.what()); + LogLog::debug(msg); + } } }; From d9fcfd8daf7e71a380fab598dfbaeef4ad6657bc Mon Sep 17 00:00:00 2001 From: Stephen Webb Date: Wed, 8 Apr 2026 13:13:58 +1000 Subject: [PATCH 3/4] Raise an exception on a decoding failure --- src/main/cpp/inputstreamreader.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/cpp/inputstreamreader.cpp b/src/main/cpp/inputstreamreader.cpp index bb89a8398..53f839ac1 100644 --- a/src/main/cpp/inputstreamreader.cpp +++ b/src/main/cpp/inputstreamreader.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include using namespace LOG4CXX_NS; @@ -85,7 +84,11 @@ LogString InputStreamReader::read(Pool& p) auto lastAvailableCount = buf.remaining(); stat = m_priv->dec->decode(buf, output); if (buf.remaining() == lastAvailableCount) + { + if (stat == 0) + stat = -1; break; + } buf.carry(); } if (stat != 0 && 0 < buf.remaining()) From f80574b9cce45f7801e05dee9c759970c166a885 Mon Sep 17 00:00:00 2001 From: Stephen Webb Date: Wed, 8 Apr 2026 13:41:14 +1000 Subject: [PATCH 4/4] CharsetDecoder::getUTF8Decoder should not return a TrivialCharsetDecoder --- src/main/cpp/charsetdecoder.cpp | 27 +++++++++------------------ src/main/cpp/domconfigurator.cpp | 2 +- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp index 7aadc1ee5..b7f852a7f 100644 --- a/src/main/cpp/charsetdecoder.cpp +++ b/src/main/cpp/charsetdecoder.cpp @@ -280,10 +280,6 @@ class TrivialCharsetDecoder : public CharsetDecoder TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&); }; - -#if LOG4CXX_LOGCHAR_IS_UTF8 -typedef TrivialCharsetDecoder UTF8CharsetDecoder; -#else /** * Converts from UTF-8 to std::wstring * @@ -333,7 +329,6 @@ class UTF8CharsetDecoder : public CharsetDecoder UTF8CharsetDecoder(const UTF8CharsetDecoder&); UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&); }; -#endif /** * Converts from ISO-8859-1 to LogString. @@ -504,7 +499,11 @@ CharsetDecoder::~CharsetDecoder() CharsetDecoder* CharsetDecoder::createDefaultDecoder() { #if LOG4CXX_CHARSET_UTF8 +#if LOG4CXX_LOGCHAR_IS_UTF8 + return new TrivialCharsetDecoder(); +#else return new UTF8CharsetDecoder(); +#endif #elif LOG4CXX_CHARSET_ISO88591 || defined(_WIN32_WCE) return new ISOLatinCharsetDecoder(); #elif LOG4CXX_CHARSET_USASCII @@ -535,19 +534,7 @@ CharsetDecoderPtr CharsetDecoder::getDefaultDecoder() CharsetDecoderPtr CharsetDecoder::getUTF8Decoder() { - static WideLife decoder(new UTF8CharsetDecoder()); - - // - // if invoked after static variable destruction - // (if logging is called in the destructor of a static object) - // then create a new decoder. - // - if (decoder.value() == 0) - { - return std::make_shared(); - } - - return decoder; + return std::make_shared(); } CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder() @@ -562,7 +549,11 @@ CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset) StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) || StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001"))) { +#if LOG4CXX_LOGCHAR_IS_UTF8 + return std::make_shared(); +#else return std::make_shared(); +#endif } else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) || charset == LOG4CXX_STR("646") || diff --git a/src/main/cpp/domconfigurator.cpp b/src/main/cpp/domconfigurator.cpp index 82b0d0d92..69e457db0 100644 --- a/src/main/cpp/domconfigurator.cpp +++ b/src/main/cpp/domconfigurator.cpp @@ -85,7 +85,7 @@ struct DOMConfigurator::DOMConfiguratorPrivate bool appenderAdded{ false }; AppenderMap appenders; Pool p; - CharsetDecoderPtr utf8Decoder{ CharsetDecoder::getUTF8Decoder() }; + CharsetDecoderPtr utf8Decoder{ CharsetDecoder::getDecoder(LOG4CXX_STR("UTF-8")) }; apr_xml_doc* doc{ nullptr }; public: // ...structor