@@ -5,18 +5,16 @@ use std::io::{self, Read};
55use std:: str:: Utf8Error ;
66
77#[ cfg( feature = "encoding" ) ]
8- use encoding_rs:: { DecoderResult , Encoding , UTF_16BE , UTF_16LE , UTF_8 } ;
8+ use encoding_rs;
99
1010/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
1111/// See <https://unicode.org/faq/utf_bom.html#bom1>
1212pub ( crate ) const UTF8_BOM : & [ u8 ] = & [ 0xEF , 0xBB , 0xBF ] ;
1313/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with little-endian byte order.
1414/// See <https://unicode.org/faq/utf_bom.html#bom1>
15- #[ cfg( feature = "encoding" ) ]
1615pub ( crate ) const UTF16_LE_BOM : & [ u8 ] = & [ 0xFF , 0xFE ] ;
1716/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with big-endian byte order.
1817/// See <https://unicode.org/faq/utf_bom.html#bom1>
19- #[ cfg( feature = "encoding" ) ]
2018pub ( crate ) const UTF16_BE_BOM : & [ u8 ] = & [ 0xFE , 0xFF ] ;
2119
2220/// An error type representing UTF-8 validation failure.
@@ -77,7 +75,7 @@ pub enum EncodingError {
7775 Utf8 ( Utf8ValidationError ) ,
7876 /// Input did not adhere to the given encoding
7977 #[ cfg( feature = "encoding" ) ]
80- Other ( & ' static Encoding ) ,
78+ Other ( & ' static encoding_rs :: Encoding ) ,
8179}
8280
8381impl From < Utf8Error > for EncodingError {
@@ -131,20 +129,20 @@ impl std::fmt::Display for EncodingError {
131129#[ derive( Clone , Copy , Debug , Eq , PartialEq ) ]
132130pub struct Decoder {
    /// Encoding held by this decoder. The field only exists when the `encoding`
    /// feature is enabled; without it the struct has no fields.
133131 #[ cfg( feature = "encoding" ) ]
134- pub ( crate ) encoding : & ' static Encoding ,
132+ pub ( crate ) encoding : & ' static encoding_rs :: Encoding ,
135133}
136134
137135impl Decoder {
    /// Creates a decoder for UTF-8.
    ///
    /// With the `encoding` feature enabled the stored encoding is `encoding_rs::UTF_8`;
    /// without it there is no field to initialise.
138136 pub ( crate ) const fn utf8 ( ) -> Self {
139137 Decoder {
140138 #[ cfg( feature = "encoding" ) ]
141- encoding : UTF_8 ,
139+ encoding : encoding_rs :: UTF_8 ,
142140 }
143141 }
144142

    /// Creates a decoder whose encoding is UTF-16 little-endian.
    ///
    /// Only compiled for test builds that enable both the `encoding` and
    /// `serialize` features.
145143 #[ cfg( all( test, feature = "encoding" , feature = "serialize" ) ) ]
146144 pub ( crate ) const fn utf16 ( ) -> Self {
147- Decoder { encoding : UTF_16LE }
145+ Decoder { encoding : encoding_rs :: UTF_16LE }
148146 }
149147}
150148
@@ -155,7 +153,7 @@ impl Decoder {
155153 ///
156154 /// [`decode`]: Self::decode
157155 #[ cfg( feature = "encoding" ) ]
158- pub const fn encoding ( & self ) -> & ' static Encoding {
156+ pub const fn encoding ( & self ) -> & ' static encoding_rs :: Encoding {
159157 self . encoding
160158 }
161159
@@ -236,7 +234,7 @@ impl Decoder {
236234#[ cfg( feature = "encoding" ) ]
237235pub fn decode < ' b > (
238236 bytes : & ' b [ u8 ] ,
239- encoding : & ' static Encoding ,
237+ encoding : & ' static encoding_rs :: Encoding ,
240238) -> Result < Cow < ' b , str > , EncodingError > {
241239 encoding
242240 . decode_without_bom_handling_and_without_replacement ( bytes)
@@ -247,10 +245,10 @@ pub fn decode<'b>(
247245#[ cfg( feature = "encoding" ) ]
248246pub fn decode_into (
249247 bytes : & [ u8 ] ,
250- encoding : & ' static Encoding ,
248+ encoding : & ' static encoding_rs :: Encoding ,
251249 buf : & mut String ,
252250) -> Result < ( ) , EncodingError > {
253- if encoding == UTF_8 {
251+ if encoding == encoding_rs :: UTF_8 {
254252 buf. push_str ( std:: str:: from_utf8 ( bytes) ?) ;
255253 return Ok ( ( ) ) ;
256254 }
@@ -265,22 +263,22 @@ pub fn decode_into(
265263 ) ;
266264 let ( result, read) = decoder. decode_to_string_without_replacement ( bytes, buf, true ) ;
267265 match result {
268- DecoderResult :: InputEmpty => {
266+ encoding_rs :: DecoderResult :: InputEmpty => {
269267 debug_assert_eq ! ( read, bytes. len( ) ) ;
270268 Ok ( ( ) )
271269 }
272- DecoderResult :: Malformed ( _, _) => Err ( EncodingError :: Other ( encoding) ) ,
270+ encoding_rs :: DecoderResult :: Malformed ( _, _) => Err ( EncodingError :: Other ( encoding) ) ,
273271 // SAFETY: We allocate enough space above
274- DecoderResult :: OutputFull => unreachable ! ( ) ,
272+ encoding_rs :: DecoderResult :: OutputFull => unreachable ! ( ) ,
275273 }
276274}
277275
278276/// Automatic encoding detection of XML files using the
279277/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
280278///
281- /// If encoding is detected, `Some` is returned with an encoding and size of BOM
282- /// in bytes, if detection was performed using BOM, or zero , if detection was
283- /// performed without BOM.
279+ /// If the encoding is detected, `Some` is returned with a [`DetectedEncoding`];
280+ /// [`DetectedEncoding::bom_len`] gives the size of the BOM in bytes if detection was
281+ /// performed using a BOM, or zero if detection was performed without one.
284282///
285283/// If the encoding was not recognized, `None` is returned.
286284///
@@ -300,25 +298,63 @@ pub fn decode_into(
300298/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
301299/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
302300/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
303- #[ cfg( feature = "encoding" ) ]
304- pub fn detect_encoding ( bytes : & [ u8 ] ) -> Option < ( & ' static Encoding , usize ) > {
301+ pub fn detect_encoding ( bytes : & [ u8 ] ) -> Option < DetectedEncoding > {
305302 // Stops clippy from suggesting b"<?xm": every arm should be formatted the same way.
306303 #[ allow( clippy:: byte_char_slices) ]
307304 match bytes {
308305 // with BOM
309- _ if bytes. starts_with ( UTF16_BE_BOM ) => Some ( ( UTF_16BE , 2 ) ) ,
310- _ if bytes. starts_with ( UTF16_LE_BOM ) => Some ( ( UTF_16LE , 2 ) ) ,
311- _ if bytes. starts_with ( UTF8_BOM ) => Some ( ( UTF_8 , 3 ) ) ,
306+ _ if bytes. starts_with ( UTF16_BE_BOM ) => Some ( DetectedEncoding :: Utf16BeBom ) ,
307+ _ if bytes. starts_with ( UTF16_LE_BOM ) => Some ( DetectedEncoding :: Utf16LeBom ) ,
308+ _ if bytes. starts_with ( UTF8_BOM ) => Some ( DetectedEncoding :: Utf8Bom ) ,
312309
313310 // without BOM
314- _ if bytes. starts_with ( & [ 0x00 , b'<' , 0x00 , b'?' ] ) => Some ( ( UTF_16BE , 0 ) ) , // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
315- _ if bytes. starts_with ( & [ b'<' , 0x00 , b'?' , 0x00 ] ) => Some ( ( UTF_16LE , 0 ) ) , // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
316- _ if bytes. starts_with ( & [ b'<' , b'?' , b'x' , b'm' ] ) => Some ( ( UTF_8 , 0 ) ) , // Some ASCII compatible
311+ _ if bytes. starts_with ( & [ 0x00 , b'<' , 0x00 , b'?' ] ) => Some ( DetectedEncoding :: Utf16Be ) , // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
312+ _ if bytes. starts_with ( & [ b'<' , 0x00 , b'?' , 0x00 ] ) => Some ( DetectedEncoding :: Utf16Le ) , // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
313+ _ if bytes. starts_with ( & [ b'<' , b'?' , b'x' , b'm' ] ) => Some ( DetectedEncoding :: AsciiCompatible ) , // Some ASCII compatible
317314
318315 _ => None ,
319316 }
320317}
321318
/// Possible outcomes of sniffing the encoding from the first bytes of an XML document.
///
/// See the documentation of [`detect_encoding`] for the byte patterns that produce
/// each variant.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DetectedEncoding {
    /// No BOM; the input matches UTF-8 or some other ASCII-compatible encoding
    AsciiCompatible,
    /// We saw a UTF-8 BOM
    Utf8Bom,
    /// No BOM; the input matches UTF-16-LE or some other 16-bit little-endian
    /// encoding (e.g. ISO-10646-UCS-2)
    Utf16Le,
    /// We saw a UTF-16 BOM in little-endian orientation
    Utf16LeBom,
    /// No BOM; the input matches UTF-16-BE or some other 16-bit big-endian
    /// encoding (e.g. ISO-10646-UCS-2)
    Utf16Be,
    /// We saw a UTF-16 BOM in big-endian orientation
    Utf16BeBom,
}

impl DetectedEncoding {
    /// Returns the [`encoding_rs::Encoding`] that corresponds to the detected encoding.
    ///
    /// The variants without a BOM only identify a family of encodings, so the
    /// returned value is the UTF-8 / UTF-16 member of that family.
    #[cfg(feature = "encoding")]
    pub fn encoding(&self) -> &'static encoding_rs::Encoding {
        match *self {
            Self::AsciiCompatible | Self::Utf8Bom => encoding_rs::UTF_8,
            Self::Utf16Le | Self::Utf16LeBom => encoding_rs::UTF_16LE,
            Self::Utf16Be | Self::Utf16BeBom => encoding_rs::UTF_16BE,
        }
    }

    /// Length of the BOM in bytes, which may need to be stripped from the input.
    ///
    /// Returns `3` for a UTF-8 BOM, `2` for either UTF-16 BOM, and `0` when the
    /// encoding was detected without a BOM.
    pub const fn bom_len(&self) -> usize {
        match *self {
            Self::Utf8Bom => 3,
            Self::Utf16LeBom | Self::Utf16BeBom => 2,
            Self::AsciiCompatible | Self::Utf16Le | Self::Utf16Be => 0,
        }
    }
}
357+
322358/// A struct for transparently decoding / validating bytes as UTF-8.
323359#[ derive( Debug ) ]
324360pub struct Utf8BytesReader < R > {
@@ -328,6 +364,8 @@ pub struct Utf8BytesReader<R> {
328364 reader : io:: BufReader < Utf8ValidatingReader < R > > ,
329365}
330366
367+ // TODO: Utf8BytesReader should manage encoding detection and BOM stripping - that responsibility
368+ // can then be removed from the readers, with perhaps an exception for slice reader.
331369impl < R : io:: Read > Utf8BytesReader < R > {
332370 /// Build a new reader which decodes a stream of bytes in an unknown encoding into UTF-8.
333371 /// (TODO: well, not yet - right now it's just a dumb wrapper)
0 commit comments