hyphanet · ArneBab · Jul 25, 2024 · Sep 22, 2024 · Nov 11, 2024 · bertm
diff --git a/src/freenet/client/filter/CSSReadFilter.java b/src/freenet/client/filter/CSSReadFilter.java
@@ -127,6 +127,11 @@ static byte[] parse(String s) {
 
 	@Override
 	public BOMDetection getCharsetByBOM(byte[] input, int length) throws DataFilterException, IOException {
+		return detectCharsetFromBOM(input, length); // delegate to the static helper, which ContentFilter also calls without a CSSReadFilter instance
+	}
+
+	public static BOMDetection detectCharsetFromBOM(byte[] input, int length)
+			throws UnsupportedCharsetInFilterException {
 		if(ContentFilter.startsWith(input, ascii, length))
 			return new BOMDetection("UTF-8", true);
 		if(ContentFilter.startsWith(input, utf16be, length))

diff --git a/src/freenet/client/filter/ContentFilter.java b/src/freenet/client/filter/ContentFilter.java
@@ -32,8 +32,9 @@ public class ContentFilter {
 
 	/** The HTML mime types are defined here, to allow other modules to identify it*/
 	public static final String[] HTML_MIME_TYPES=new String[]{"text/html", "application/xhtml+xml", "text/xml+xhtml", "text/xhtml", "application/xhtml"};
+	private static final int CHARSET_DETECTION_FALLBACK_BUFFERSIZE = 64; // number of leading bytes peeked for BOM detection on the safeToRead pass-through path of filter()
 
-        private static volatile boolean logMINOR;
+	private static volatile boolean logMINOR;
 	static {
 		Logger.registerLogThresholdCallback(new LogThresholdCallback(){
 			@Override
@@ -54,7 +55,7 @@ public static void init() {
 		register(new FilterMIMEType("text/plain", "txt", new String[0], new String[] { "text", "pot" },
 				true, true, null, false, false, false, false, false, false,
 				l10n("textPlainReadAdvice"),
-				true, "US-ASCII", null, false));
+				true, "utf-8", null, false));
 
 		// GIF - has a filter
 		register(new FilterMIMEType("image/gif", "gif", new String[0], new String[0],
@@ -343,16 +344,8 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
 			if(handler.readFilter != null) {
 				if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
 					int bufferSize = handler.charsetExtractor.getCharsetBufferSize();
-					input.mark(bufferSize);
 					byte[] charsetBuffer = new byte[bufferSize];
-					int bytesRead = 0, offset = 0, toread=0;
-					while(true) {
-						toread = bufferSize - offset;
-						bytesRead = input.read(charsetBuffer, offset, toread);
-						if(bytesRead == -1 || toread == 0) break;
-						offset += bytesRead;
-					}
-					input.reset();
+					int offset = readIntoBuffer(input, bufferSize, charsetBuffer);
 					charset = detectCharset(charsetBuffer, offset, handler, maybeCharset);
 				}
 				try {
@@ -374,6 +367,16 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
 			}
 
 			if(handler.safeToRead) {
+				if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) { // no charset known yet: look for a BOM, else fall back to the handler's default
+					byte[] charsetBuffer = new byte[CHARSET_DETECTION_FALLBACK_BUFFERSIZE];
+					int offset = readIntoBuffer(input, CHARSET_DETECTION_FALLBACK_BUFFERSIZE, charsetBuffer);
+					BOMDetection bom = CSSReadFilter.detectCharsetFromBOM(charsetBuffer, offset); // length = bytes actually read (as for detectCharset earlier in filter), not the buffer capacity
+					if (bom != null) {
+						charset = bom.charset;
+					} else if (handler.defaultCharset != null) {
+						charset = handler.defaultCharset;
+					}
+				}
 				FileUtil.copy(input, output, -1);
 				output.flush();
 				return new FilterStatus(charset, typeName);
@@ -384,6 +387,20 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
 		return null;
 	}
 
+	/** Peeks at the start of {@code input}: reads up to {@code bufferSize} bytes into {@code charsetBuffer}, rewinds the stream via mark/reset, and returns the number of bytes read. */
+	private static int readIntoBuffer(InputStream input, int bufferSize, byte[] charsetBuffer) throws IOException {
+		input.mark(bufferSize);
+		int filled = 0;
+		for(;;) {
+			int remaining = bufferSize - filled;
+			int count = input.read(charsetBuffer, filled, remaining);
+			if(remaining == 0 || count == -1) break; // requested amount buffered, or end of stream
+			filled += count;
+		}
+		input.reset(); // rewind so the caller sees the stream from its original position
+		return filled;
+	}
+
 	public static String detectCharset(byte[] input, int length, FilterMIMEType handler, String maybeCharset) throws IOException {
 		// Detect charset
 		String charset = detectBOM(input, length);

diff --git a/test/freenet/client/filter/ContentFilterTest.java b/test/freenet/client/filter/ContentFilterTest.java
@@ -380,6 +380,37 @@ public void testEvilCharset() throws IOException {
         }
     }
 
+
+    /** UTF-8 BOM (EF BB BF) before text/plain data must yield "utf-8". NOTE(review): "utf-8" also appears to be the default charset registered for text/plain, so this may pass without the BOM being recognised - confirm. */
+    @Test
+    public void byteOrderMarkForUtf8IsDetectedCorrectly() throws IOException {
+        ArrayBucket input = new ArrayBucket(new byte[] { (byte) 0xef, (byte) 0xbb, (byte) 0xbf, 0x40 });
+        FilterStatus status = ContentFilter.filter(input.getInputStream(), new ArrayBucket().getOutputStream(), "text/plain", null, null, null);
+        assertTrue("utf-8".equals(status.charset));
+    }
+
+    /** A UTF-16 big-endian BOM (FE FF) in front of text/plain data must yield the charset "UTF-16BE". */
+    @Test
+    public void byteOrderMarkForUtf16BeIsDetectedCorrectly() throws IOException {
+        ArrayBucket input = new ArrayBucket(new byte[] { (byte) 0xfe, (byte) 0xff, 0x00, 0x40 });
+        FilterStatus status = ContentFilter.filter(input.getInputStream(), new ArrayBucket().getOutputStream(), "text/plain", null, null, null);
+        assertTrue("UTF-16BE".equals(status.charset));
+    }
+
+    /**
+     * A UTF-16 little-endian BOM (FF FE) in front of text/plain data must yield
+     * the charset "UTF-16LE".
+     */
+    @Test
+    public void byteOrderMarkForUtf16LeIsDetectedCorrectly() throws IOException {
+        byte[] bomThenAt = { (byte) 0xff, (byte) 0xfe, 0x40, 0x00 };
+        ArrayBucket sink = new ArrayBucket();
+        ArrayBucket source = new ArrayBucket(bomThenAt);
+        FilterStatus status = ContentFilter.filter(
+            source.getInputStream(), sink.getOutputStream(), "text/plain", null, null, null);
+        assertTrue("UTF-16LE".equals(status.charset));
+    }
+
     public static String htmlFilter(String data) throws Exception {
         if (data.startsWith("<html")) return htmlFilter(data, false);
         if (data.startsWith("<?")) return htmlFilter(data, false);