-
Notifications
You must be signed in to change notification settings - Fork 232
Provide text with UTF-8 MIME Type by default #970
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,8 +32,9 @@ public class ContentFilter { | |
|
|
||
| /** The HTML mime types are defined here, to allow other modules to identify it*/ | ||
| public static final String[] HTML_MIME_TYPES=new String[]{"text/html", "application/xhtml+xml", "text/xml+xhtml", "text/xhtml", "application/xhtml"}; | ||
| private static final int CHARSET_DETECTION_FALLBACK_BUFFERSIZE = 64; | ||
|
|
||
| private static volatile boolean logMINOR; | ||
| private static volatile boolean logMINOR; | ||
| static { | ||
| Logger.registerLogThresholdCallback(new LogThresholdCallback(){ | ||
| @Override | ||
|
|
@@ -54,7 +55,7 @@ public static void init() { | |
| register(new FilterMIMEType("text/plain", "txt", new String[0], new String[] { "text", "pot" }, | ||
| true, true, null, false, false, false, false, false, false, | ||
| l10n("textPlainReadAdvice"), | ||
| true, "US-ASCII", null, false)); | ||
| true, "utf-8", null, false)); | ||
|
|
||
| // GIF - has a filter | ||
| register(new FilterMIMEType("image/gif", "gif", new String[0], new String[0], | ||
|
|
@@ -343,16 +344,8 @@ public static FilterStatus filter(InputStream input, OutputStream output, String | |
| if(handler.readFilter != null) { | ||
| if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) { | ||
| int bufferSize = handler.charsetExtractor.getCharsetBufferSize(); | ||
| input.mark(bufferSize); | ||
| byte[] charsetBuffer = new byte[bufferSize]; | ||
| int bytesRead = 0, offset = 0, toread=0; | ||
| while(true) { | ||
| toread = bufferSize - offset; | ||
| bytesRead = input.read(charsetBuffer, offset, toread); | ||
| if(bytesRead == -1 || toread == 0) break; | ||
| offset += bytesRead; | ||
| } | ||
| input.reset(); | ||
| int offset = readIntoBuffer(input, bufferSize, charsetBuffer); | ||
| charset = detectCharset(charsetBuffer, offset, handler, maybeCharset); | ||
| } | ||
| try { | ||
|
|
@@ -374,6 +367,16 @@ public static FilterStatus filter(InputStream input, OutputStream output, String | |
| } | ||
|
|
||
| if(handler.safeToRead) { | ||
| if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) { | ||
| byte[] charsetBuffer = new byte[CHARSET_DETECTION_FALLBACK_BUFFERSIZE]; | ||
| int offset = readIntoBuffer(input, CHARSET_DETECTION_FALLBACK_BUFFERSIZE, charsetBuffer); | ||
| BOMDetection bom = CSSReadFilter.detectCharsetFromBOM(charsetBuffer, CHARSET_DETECTION_FALLBACK_BUFFERSIZE); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I’m pretty sure this is 100% wrong. That method detects an encoding from the representation of the string
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, I agree with @Bombe here. See my other comment for something that does appear to work. |
||
| if (bom != null) { | ||
| charset = bom.charset; | ||
| } else if (handler.defaultCharset != null){ | ||
| charset = handler.defaultCharset; | ||
| } | ||
| } | ||
| FileUtil.copy(input, output, -1); | ||
| output.flush(); | ||
| return new FilterStatus(charset, typeName); | ||
|
|
@@ -384,6 +387,20 @@ public static FilterStatus filter(InputStream input, OutputStream output, String | |
| return null; | ||
| } | ||
|
|
||
| private static int readIntoBuffer(InputStream input, int bufferSize, byte[] charsetBuffer) | ||
| throws IOException { | ||
| input.mark(bufferSize); | ||
| int bytesRead = 0, offset = 0, toread=0; | ||
| while(true) { | ||
| toread = bufferSize - offset; | ||
| bytesRead = input.read(charsetBuffer, offset, toread); | ||
| if(bytesRead == -1 || toread == 0) break; | ||
| offset += bytesRead; | ||
| } | ||
| input.reset(); | ||
| return offset; | ||
| } | ||
|
|
||
| public static String detectCharset(byte[] input, int length, FilterMIMEType handler, String maybeCharset) throws IOException { | ||
| // Detect charset | ||
| String charset = detectBOM(input, length); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -380,6 +380,37 @@ public void testEvilCharset() throws IOException { | |
| } | ||
| } | ||
|
|
||
|
|
||
| @Test | ||
| public void byteOrderMarkForUtf8IsDetectedCorrectly() throws IOException { | ||
| byte[] buf = { (byte) 0xef, (byte) 0xbb, (byte) 0xbf, 0x40 }; | ||
| ArrayBucket out = new ArrayBucket(); | ||
| FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null); | ||
| assertTrue("utf-8".equals(fo.charset)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please use |
||
| } | ||
|
|
||
| @Test | ||
| public void byteOrderMarkForUtf16BeIsDetectedCorrectly() throws IOException { | ||
| byte[] buf = { (byte) 0xfe, (byte) 0xff, 0x00, 0x40 }; | ||
| ArrayBucket out = new ArrayBucket(); | ||
| FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null); | ||
| assertTrue("UTF-16BE".equals(fo.charset)); | ||
| } | ||
|
|
||
| @Test | ||
| public void byteOrderMarkForUtf16LeIsDetectedCorrectly() throws IOException { | ||
| byte[] buf = { (byte) 0xff, (byte) 0xfe, 0x40, 0x00 }; | ||
| ArrayBucket out = new ArrayBucket(); | ||
| FilterStatus fo = ContentFilter.filter( | ||
| new ArrayBucket(buf).getInputStream(), | ||
| out.getOutputStream(), | ||
| "text/plain", | ||
| null, | ||
| null, | ||
| null); | ||
| assertTrue("UTF-16LE".equals(fo.charset)); | ||
| } | ||
|
|
||
| public static String htmlFilter(String data) throws Exception { | ||
| if (data.startsWith("<html")) return htmlFilter(data, false); | ||
| if (data.startsWith("<?")) return htmlFilter(data, false); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe the correct solution to this problem is moving this block of code right before the `if(handler.readFilter != null)` check: text/plain does not have a `readFilter`, but does have `takesACharset`, so this would run `detectCharset` appropriately.

A few things to consider:
- `handler.charsetExtractor.getCharsetBufferSize()` will NPE, so we need to set the bufferSize to the max BOM length (5?) when `handler.charsetExtractor` is absent.
- The charset comes back as `UTF-8` rather than `utf-8`, so the related test would need some adjustment.