From c440a7aeecdb9dec0ce21bb31cef58ed6d1724e1 Mon Sep 17 00:00:00 2001
From: Stephen Webb <swebb2066@gmail.com>
Date: Wed, 8 Apr 2026 12:26:00 +1000
Subject: [PATCH 1/4] InputStreamReader failed to load a multibyte UTF-8
 sequence on a read boundary

---
 src/main/cpp/inputstreamreader.cpp | 22 ++++-------
 src/test/cpp/filetestcase.cpp      | 60 +++++++++++++++++++++++++++++-
 2 files changed, 66 insertions(+), 16 deletions(-)
diff --git a/src/main/cpp/inputstreamreader.cpp b/src/main/cpp/inputstreamreader.cpp
index ba5ce19bc..449a5f110 100644
--- a/src/main/cpp/inputstreamreader.cpp
+++ b/src/main/cpp/inputstreamreader.cpp
@@ -79,25 +79,17 @@ LogString InputStreamReader::read(Pool& p)
 	while (m_priv->in->read(buf) >= 0)
 	{
 		buf.flip();
+		auto lastAvailableCount = buf.remaining();
 		log4cxx_status_t stat = m_priv->dec->decode(buf, output);
 
-		if (stat != 0)
+		if (buf.remaining() == lastAvailableCount)
 		{
-			throw IOException(LOG4CXX_STR("decode"), stat);
-		}
-
-		if (buf.remaining() > 0)
-		{
-			if (buf.remaining() == BUFSIZE)
-			{
-				throw IOException(LOG4CXX_STR("Decoder made no progress"));
-			}
-			buf.carry();
-		}
-		else
-		{
-			buf.clear();
+			if (stat != 0)
+				throw IOException(LOG4CXX_STR("decode"), stat);
+			else
+				throw IOException(LOG4CXX_STR("decode made no progress"));
 		}
+		buf.carry();
 	}
 
 	return output;
diff --git a/src/test/cpp/filetestcase.cpp b/src/test/cpp/filetestcase.cpp
index 00146c834..9910afdf3 100644
--- a/src/test/cpp/filetestcase.cpp
+++ b/src/test/cpp/filetestcase.cpp
@@ -28,6 +28,8 @@
 #include <log4cxx/helpers/inputstreamreader.h>
 #include <log4cxx/helpers/fileinputstream.h>
 #include <log4cxx/helpers/loglog.h>
+#include <log4cxx/helpers/bytebuffer.h>
+#include <log4cxx/helpers/transcoder.h>
 
 #if LOG4CXX_CFSTRING_API
 	#include <CoreFoundation/CFString.h>
@@ -58,6 +60,8 @@ LOGUNIT_CLASS(FileTestCase)
 	LOGUNIT_TEST(copyConstructor);
 	LOGUNIT_TEST(assignment);
 	LOGUNIT_TEST(deleteBackslashedFileName);
+	LOGUNIT_TEST(testSplitMultibyteUtf8);
+	LOGUNIT_TEST(testInvalidUtf8);
 	LOGUNIT_TEST_SUITE_END();
 
 #ifdef _DEBUG
@@ -102,6 +106,8 @@ LOGUNIT_CLASS(FileTestCase)
 		}
 		catch (IOException& ex)
 		{
+			LOG4CXX_DECODE_CHAR(msg, ex.what());
+			LogLog::debug(msg);
 		}
 	}
 
@@ -206,7 +212,59 @@ LOGUNIT_CLASS(FileTestCase)
 		Pool pool;
 		/*bool deleted = */file.deleteFile(pool);
 	}
-};
 
+	class MockInputStream : public InputStream
+	{
+		ByteBuffer m_data;
+	public:
+		MockInputStream(const char* data, size_t charCount)
+			: m_data(const_cast<char*>(data), charCount)
+		{}
+
+		int read(ByteBuffer& dst) override
+		{
+			auto availableBytes = m_data.remaining();
+			if (availableBytes < 1)
+				return -1;
+			int count = 0;
+			for (auto p = m_data.current(); count < availableBytes && dst.put(*p); ++p)
+				++count;
+			m_data.increment_position(count);
+			return count;
+		}
+
+		void close() override {}
+	};
+
+	/**
+	 * Tests behavior when a multibyte UTF-8 sequence occurs on a read boundary
+	 */
+	void testSplitMultibyteUtf8()
+	{
+		Pool p;
+		// InputStreamReader uses a buffer of size 4096
+		std::string input( 4094, 'A' );
+		// räksmörgås.josefsson.org
+		input.append("\162\303\244\153\163\155\303\266\162\147\303\245\163\056\152\157\163\145\146\163\163\157\156\056\157\162\147");
+		InputStreamReader reader(std::make_shared<MockInputStream>(input.c_str(), input.size()), CharsetDecoder::getUTF8Decoder());
+		auto contentLS = reader.read(p);
+		LOG4CXX_ENCODE_CHAR(content, contentLS);
+		LOGUNIT_ASSERT_EQUAL(input, content);
+	}
+
+	/**
+	 * Tests behavior given an incomplete multibyte UTF-8 sequence in the input
+	 */
+	void testInvalidUtf8()
+	{
+		Pool p;
+		// 0xC2 is a generic start byte for a 2-byte sequence in UTF-8.
+		char input[] = { 'A', (char)0xC2, 'B', 'C', 0 };
+		InputStreamReader reader(std::make_shared<MockInputStream>(input, 4), CharsetDecoder::getUTF8Decoder());
+		auto contentLS = reader.read(p);
+		LOG4CXX_ENCODE_CHAR(content, contentLS);
+		LOGUNIT_ASSERT_EQUAL("A", content);
+	}
+};
 
 LOGUNIT_TEST_SUITE_REGISTRATION(FileTestCase);

From ae5bcc435b1a587c0aba439ab2986586621305d4 Mon Sep 17 00:00:00 2001
From: Stephen Webb <swebb2066@gmail.com>
Date: Wed, 8 Apr 2026 13:09:37 +1000
Subject: [PATCH 2/4] Raise an exception on a decoding failure

---
 src/main/cpp/inputstreamreader.cpp | 28 ++++++++++++++++++++--------
 src/test/cpp/filetestcase.cpp      | 13 ++++++++++---
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/src/main/cpp/inputstreamreader.cpp b/src/main/cpp/inputstreamreader.cpp
index 449a5f110..bb89a8398 100644
--- a/src/main/cpp/inputstreamreader.cpp
+++ b/src/main/cpp/inputstreamreader.cpp
@@ -20,6 +20,8 @@
 #include <log4cxx/helpers/exception.h>
 #include <log4cxx/helpers/pool.h>
 #include <log4cxx/helpers/bytebuffer.h>
+#include <log4cxx/helpers/loglog.h>
+#include <log4cxx/helpers/stringhelper.h>
 
 using namespace LOG4CXX_NS;
 using namespace LOG4CXX_NS::helpers;
@@ -74,23 +76,33 @@ LogString InputStreamReader::read(Pool& p)
 	const size_t BUFSIZE = 4096;
 	ByteBuffer buf(p.pstralloc(BUFSIZE), BUFSIZE);
 	LogString output;
+	log4cxx_status_t stat{ 0 };
 
 	// read whole file
 	while (m_priv->in->read(buf) >= 0)
 	{
 		buf.flip();
 		auto lastAvailableCount = buf.remaining();
-		log4cxx_status_t stat = m_priv->dec->decode(buf, output);
-
+		stat = m_priv->dec->decode(buf, output);
 		if (buf.remaining() == lastAvailableCount)
-		{
-			if (stat != 0)
-				throw IOException(LOG4CXX_STR("decode"), stat);
-			else
-				throw IOException(LOG4CXX_STR("decode made no progress"));
-		}
+			break;
 		buf.carry();
 	}
+	if (stat != 0 && 0 < buf.remaining())
+	{
+		auto toHexDigit = [](int ch) -> int
+		{
+			return (10 <= ch ? (0x61 - 10) : 0x30) + ch;
+		};
+		LogString msg(LOG4CXX_STR("Unable to decode character 0x"));
+		auto ch = static_cast<unsigned int>(*buf.current());
+		msg.push_back(toHexDigit((ch & 0xF0) >> 4));
+		msg.push_back(toHexDigit((ch & 0xF)));
+		msg += LOG4CXX_STR(" at offset ");
+		Pool p;
+		StringHelper::toString(output.size(), p, msg);
+		throw RuntimeException(msg);
+	}
 
 	return output;
 }
diff --git a/src/test/cpp/filetestcase.cpp b/src/test/cpp/filetestcase.cpp
index 9910afdf3..e0c45d179 100644
--- a/src/test/cpp/filetestcase.cpp
+++ b/src/test/cpp/filetestcase.cpp
@@ -261,9 +261,16 @@ LOGUNIT_CLASS(FileTestCase)
 		// 0xC2 is a generic start byte for a 2-byte sequence in UTF-8.
 		char input[] = { 'A', (char)0xC2, 'B', 'C', 0 };
 		InputStreamReader reader(std::make_shared<MockInputStream>(input, 4), CharsetDecoder::getUTF8Decoder());
-		auto contentLS = reader.read(p);
-		LOG4CXX_ENCODE_CHAR(content, contentLS);
-		LOGUNIT_ASSERT_EQUAL("A", content);
+		try
+		{
+			reader.read(p);
+			LOGUNIT_ASSERT(false);
+		}
+		catch (const Exception& ex)
+		{
+			LOG4CXX_DECODE_CHAR(msg, ex.what());
+			LogLog::debug(msg);
+		}
 	}
 };
 

From d9fcfd8daf7e71a380fab598dfbaeef4ad6657bc Mon Sep 17 00:00:00 2001
From: Stephen Webb <swebb2066@gmail.com>
Date: Wed, 8 Apr 2026 13:13:58 +1000
Subject: [PATCH 3/4] Raise an exception on a decoding failure

---
 src/main/cpp/inputstreamreader.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/main/cpp/inputstreamreader.cpp b/src/main/cpp/inputstreamreader.cpp
index bb89a8398..53f839ac1 100644
--- a/src/main/cpp/inputstreamreader.cpp
+++ b/src/main/cpp/inputstreamreader.cpp
@@ -20,7 +20,6 @@
 #include <log4cxx/helpers/exception.h>
 #include <log4cxx/helpers/pool.h>
 #include <log4cxx/helpers/bytebuffer.h>
-#include <log4cxx/helpers/loglog.h>
 #include <log4cxx/helpers/stringhelper.h>
 
 using namespace LOG4CXX_NS;
@@ -85,7 +84,11 @@ LogString InputStreamReader::read(Pool& p)
 		auto lastAvailableCount = buf.remaining();
 		stat = m_priv->dec->decode(buf, output);
 		if (buf.remaining() == lastAvailableCount)
+		{
+			if (stat == 0)
+				stat = -1;
 			break;
+		}
 		buf.carry();
 	}
 	if (stat != 0 && 0 < buf.remaining())

From f80574b9cce45f7801e05dee9c759970c166a885 Mon Sep 17 00:00:00 2001
From: Stephen Webb <swebb2066@gmail.com>
Date: Wed, 8 Apr 2026 13:41:14 +1000
Subject: [PATCH 4/4] CharsetDecoder::getUTF8Decoder should not return a
 TrivialCharsetDecoder

---
 src/main/cpp/charsetdecoder.cpp  | 27 +++++++++------------------
 src/main/cpp/domconfigurator.cpp |  2 +-
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp
index 7aadc1ee5..b7f852a7f 100644
--- a/src/main/cpp/charsetdecoder.cpp
+++ b/src/main/cpp/charsetdecoder.cpp
@@ -280,10 +280,6 @@ class TrivialCharsetDecoder : public CharsetDecoder
 		TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&);
 };
 
-
-#if LOG4CXX_LOGCHAR_IS_UTF8
-typedef TrivialCharsetDecoder UTF8CharsetDecoder;
-#else
 /**
 *    Converts from UTF-8 to std::wstring
 *
@@ -333,7 +329,6 @@ class UTF8CharsetDecoder : public CharsetDecoder
 		UTF8CharsetDecoder(const UTF8CharsetDecoder&);
 		UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&);
 };
-#endif
 
 /**
 *    Converts from ISO-8859-1 to LogString.
@@ -504,7 +499,11 @@ CharsetDecoder::~CharsetDecoder()
 CharsetDecoder* CharsetDecoder::createDefaultDecoder()
 {
 #if LOG4CXX_CHARSET_UTF8
+#if LOG4CXX_LOGCHAR_IS_UTF8
+	return new TrivialCharsetDecoder();
+#else
 	return new UTF8CharsetDecoder();
+#endif
 #elif LOG4CXX_CHARSET_ISO88591 || defined(_WIN32_WCE)
 	return new ISOLatinCharsetDecoder();
 #elif LOG4CXX_CHARSET_USASCII
@@ -535,19 +534,7 @@ CharsetDecoderPtr CharsetDecoder::getDefaultDecoder()
 
 CharsetDecoderPtr CharsetDecoder::getUTF8Decoder()
 {
-	static WideLife<CharsetDecoderPtr> decoder(new UTF8CharsetDecoder());
-
-	//
-	//  if invoked after static variable destruction
-	//     (if logging is called in the destructor of a static object)
-	//     then create a new decoder.
-	//
-	if (decoder.value() == 0)
-	{
-		return std::make_shared<UTF8CharsetDecoder>();
-	}
-
-	return decoder;
+	return std::make_shared<UTF8CharsetDecoder>();
 }
 
 CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder()
@@ -562,7 +549,11 @@ CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset)
 		StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) ||
 		StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001")))
 	{
+#if LOG4CXX_LOGCHAR_IS_UTF8
+		return std::make_shared<TrivialCharsetDecoder>();
+#else
 		return std::make_shared<UTF8CharsetDecoder>();
+#endif
 	}
 	else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) ||
 		charset == LOG4CXX_STR("646") ||
diff --git a/src/main/cpp/domconfigurator.cpp b/src/main/cpp/domconfigurator.cpp
index 82b0d0d92..69e457db0 100644
--- a/src/main/cpp/domconfigurator.cpp
+++ b/src/main/cpp/domconfigurator.cpp
@@ -85,7 +85,7 @@ struct DOMConfigurator::DOMConfiguratorPrivate
 	bool appenderAdded{ false };
 	AppenderMap	appenders;
 	Pool p;
-	CharsetDecoderPtr utf8Decoder{ CharsetDecoder::getUTF8Decoder() };
+	CharsetDecoderPtr utf8Decoder{ CharsetDecoder::getDecoder(LOG4CXX_STR("UTF-8")) };
 	apr_xml_doc* doc{ nullptr };
 
 public: // ...structor