Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/freenet/client/filter/CSSReadFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ static byte[] parse(String s) {

@Override
public BOMDetection getCharsetByBOM(byte[] input, int length) throws DataFilterException, IOException {
return detectCharsetFromBOM(input, length);
}

public static BOMDetection detectCharsetFromBOM(byte[] input, int length)
throws UnsupportedCharsetInFilterException {
if(ContentFilter.startsWith(input, ascii, length))
return new BOMDetection("UTF-8", true);
if(ContentFilter.startsWith(input, utf16be, length))
Expand Down
39 changes: 28 additions & 11 deletions src/freenet/client/filter/ContentFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ public class ContentFilter {

/** The HTML mime types are defined here, to allow other modules to identify it*/
public static final String[] HTML_MIME_TYPES=new String[]{"text/html", "application/xhtml+xml", "text/xml+xhtml", "text/xhtml", "application/xhtml"};
private static final int CHARSET_DETECTION_FALLBACK_BUFFERSIZE = 64;

private static volatile boolean logMINOR;
private static volatile boolean logMINOR;
static {
Logger.registerLogThresholdCallback(new LogThresholdCallback(){
@Override
Expand All @@ -54,7 +55,7 @@ public static void init() {
register(new FilterMIMEType("text/plain", "txt", new String[0], new String[] { "text", "pot" },
true, true, null, false, false, false, false, false, false,
l10n("textPlainReadAdvice"),
true, "US-ASCII", null, false));
true, "utf-8", null, false));

// GIF - has a filter
register(new FilterMIMEType("image/gif", "gif", new String[0], new String[0],
Expand Down Expand Up @@ -343,16 +344,8 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
if(handler.readFilter != null) {
if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
int bufferSize = handler.charsetExtractor.getCharsetBufferSize();
input.mark(bufferSize);
byte[] charsetBuffer = new byte[bufferSize];
int bytesRead = 0, offset = 0, toread=0;
while(true) {
toread = bufferSize - offset;
bytesRead = input.read(charsetBuffer, offset, toread);
if(bytesRead == -1 || toread == 0) break;
offset += bytesRead;
}
input.reset();
int offset = readIntoBuffer(input, bufferSize, charsetBuffer);
charset = detectCharset(charsetBuffer, offset, handler, maybeCharset);
Comment on lines 345 to 349
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the correct solution to this problem is moving this block of code right before the if(handler.readFilter != null) check: text/plain does not have a readFilter, but does takesACharset so this would run the detectCharset appropriately.

Few things to consider:

  • handler.charsetExtractor.getCharsetBufferSize() will NPE, so we need to choose the bufferSize to the max BOM length (5?) when handler.charsetExtractor is absent.
  • Alternatively a dummy CharsetExtractor can be used that always fails to detect a charset, so the BOM one is used automagically.
  • this will return UTF-8 rather than utf-8 so the related test would need some adjustment.

}
try {
Expand All @@ -374,6 +367,16 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
}

if(handler.safeToRead) {
if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
byte[] charsetBuffer = new byte[CHARSET_DETECTION_FALLBACK_BUFFERSIZE];
int offset = readIntoBuffer(input, CHARSET_DETECTION_FALLBACK_BUFFERSIZE, charsetBuffer);
BOMDetection bom = CSSReadFilter.detectCharsetFromBOM(charsetBuffer, CHARSET_DETECTION_FALLBACK_BUFFERSIZE);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m pretty sure this is 100% wrong. That method detects an encoding from the representation of the string @charset. It is also gloriously misnamed as it has nothing to do with a BOM. 😀

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I agree with @Bombe here. See my other comment for something that does appear to work.

if (bom != null) {
charset = bom.charset;
} else if (handler.defaultCharset != null){
charset = handler.defaultCharset;
}
}
FileUtil.copy(input, output, -1);
output.flush();
return new FilterStatus(charset, typeName);
Expand All @@ -384,6 +387,20 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
return null;
}

private static int readIntoBuffer(InputStream input, int bufferSize, byte[] charsetBuffer)
throws IOException {
input.mark(bufferSize);
int bytesRead = 0, offset = 0, toread=0;
while(true) {
toread = bufferSize - offset;
bytesRead = input.read(charsetBuffer, offset, toread);
if(bytesRead == -1 || toread == 0) break;
offset += bytesRead;
}
input.reset();
return offset;
}

public static String detectCharset(byte[] input, int length, FilterMIMEType handler, String maybeCharset) throws IOException {
// Detect charset
String charset = detectBOM(input, length);
Expand Down
31 changes: 31 additions & 0 deletions test/freenet/client/filter/ContentFilterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,37 @@ public void testEvilCharset() throws IOException {
}
}


@Test
public void byteOrderMarkForUtf8IsDetectedCorrectly() throws IOException {
byte[] buf = { (byte) 0xef, (byte) 0xbb, (byte) 0xbf, 0x40 };
ArrayBucket out = new ArrayBucket();
FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null);
assertTrue("utf-8".equals(fo.charset));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use assertThat(actual, equalTo(expected)) or assertEquals(expected, actual) for checking equality - this just yields a non-descriptive AssertionError when it fails instead of showing what the actual value was.

}

@Test
public void byteOrderMarkForUtf16BeIsDetectedCorrectly() throws IOException {
byte[] buf = { (byte) 0xfe, (byte) 0xff, 0x00, 0x40 };
ArrayBucket out = new ArrayBucket();
FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null);
assertTrue("UTF-16BE".equals(fo.charset));
}

@Test
public void byteOrderMarkForUtf16LeIsDetectedCorrectly() throws IOException {
byte[] buf = { (byte) 0xff, (byte) 0xfe, 0x40, 0x00 };
ArrayBucket out = new ArrayBucket();
FilterStatus fo = ContentFilter.filter(
new ArrayBucket(buf).getInputStream(),
out.getOutputStream(),
"text/plain",
null,
null,
null);
assertTrue("UTF-16LE".equals(fo.charset));
}

public static String htmlFilter(String data) throws Exception {
if (data.startsWith("<html")) return htmlFilter(data, false);
if (data.startsWith("<?")) return htmlFilter(data, false);
Expand Down