Remove feature gating from detect_encoding()

dralley · dralley · commit aa577c1a72f5 · 2026-02-22T12:37:30.000-05:00
Have it return a generic enum so that it can be used with or without the
encoding feature.
diff --git a/src/encoding.rs b/src/encoding.rs
@@ -5,18 +5,16 @@ use std::io::{self, Read};
 use std::str::Utf8Error;
 
 #[cfg(feature = "encoding")]
-use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};
+use encoding_rs;
 
 /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
 /// See <https://unicode.org/faq/utf_bom.html#bom1>
 pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
 /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with little-endian byte order.
 /// See <https://unicode.org/faq/utf_bom.html#bom1>
-#[cfg(feature = "encoding")]
 pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
 /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with big-endian byte order.
 /// See <https://unicode.org/faq/utf_bom.html#bom1>
-#[cfg(feature = "encoding")]
 pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
 
 /// An error type representing UTF-8 validation failure.
@@ -77,7 +75,7 @@ pub enum EncodingError {
     Utf8(Utf8ValidationError),
     /// Input did not adhere to the given encoding
     #[cfg(feature = "encoding")]
-    Other(&'static Encoding),
+    Other(&'static encoding_rs::Encoding),
 }
 
 impl From<Utf8Error> for EncodingError {
@@ -131,20 +129,20 @@ impl std::fmt::Display for EncodingError {
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub struct Decoder {
     #[cfg(feature = "encoding")]
-    pub(crate) encoding: &'static Encoding,
+    pub(crate) encoding: &'static encoding_rs::Encoding,
 }
 
 impl Decoder {
     pub(crate) const fn utf8() -> Self {
         Decoder {
             #[cfg(feature = "encoding")]
-            encoding: UTF_8,
+            encoding: encoding_rs::UTF_8,
         }
     }
 
     #[cfg(all(test, feature = "encoding", feature = "serialize"))]
     pub(crate) const fn utf16() -> Self {
-        Decoder { encoding: UTF_16LE }
+        Decoder { encoding: encoding_rs::UTF_16LE }
     }
 }
 
@@ -155,7 +153,7 @@ impl Decoder {
     ///
     /// [`decode`]: Self::decode
     #[cfg(feature = "encoding")]
-    pub const fn encoding(&self) -> &'static Encoding {
+    pub const fn encoding(&self) -> &'static encoding_rs::Encoding {
         self.encoding
     }
 
@@ -236,7 +234,7 @@ impl Decoder {
 #[cfg(feature = "encoding")]
 pub fn decode<'b>(
     bytes: &'b [u8],
-    encoding: &'static Encoding,
+    encoding: &'static encoding_rs::Encoding,
 ) -> Result<Cow<'b, str>, EncodingError> {
     encoding
         .decode_without_bom_handling_and_without_replacement(bytes)
@@ -247,10 +245,10 @@ pub fn decode<'b>(
 #[cfg(feature = "encoding")]
 pub fn decode_into(
     bytes: &[u8],
-    encoding: &'static Encoding,
+    encoding: &'static encoding_rs::Encoding,
     buf: &mut String,
 ) -> Result<(), EncodingError> {
-    if encoding == UTF_8 {
+    if encoding == encoding_rs::UTF_8 {
         buf.push_str(std::str::from_utf8(bytes)?);
         return Ok(());
     }
@@ -265,22 +263,22 @@ pub fn decode_into(
     );
     let (result, read) = decoder.decode_to_string_without_replacement(bytes, buf, true);
     match result {
-        DecoderResult::InputEmpty => {
+        encoding_rs::DecoderResult::InputEmpty => {
             debug_assert_eq!(read, bytes.len());
             Ok(())
         }
-        DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)),
+        encoding_rs::DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)),
         // SAFETY: We allocate enough space above
-        DecoderResult::OutputFull => unreachable!(),
+        encoding_rs::DecoderResult::OutputFull => unreachable!(),
     }
 }
 
 /// Automatic encoding detection of XML files based using the
 /// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
 ///
-/// If encoding is detected, `Some` is returned with an encoding and size of BOM
-/// in bytes, if detection was performed using BOM, or zero, if detection was
-/// performed without BOM.
+/// If encoding is detected, `Some` is returned with a [`DetectedEncoding`]. Its
+/// [`bom_len`](DetectedEncoding::bom_len) method returns the size of the BOM in bytes
+/// if detection was performed using BOM, or zero if it was performed without BOM.
 ///
 /// IF encoding was not recognized, `None` is returned.
 ///
@@ -300,25 +298,63 @@ pub fn decode_into(
 /// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
 /// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
 /// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
-#[cfg(feature = "encoding")]
-pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
+pub fn detect_encoding(bytes: &[u8]) -> Option<DetectedEncoding> {
     // Prevent suggesting "<?xm". We want to have the same formatted lines for all arms.
     #[allow(clippy::byte_char_slices)]
     match bytes {
         // with BOM
-        _ if bytes.starts_with(UTF16_BE_BOM) => Some((UTF_16BE, 2)),
-        _ if bytes.starts_with(UTF16_LE_BOM) => Some((UTF_16LE, 2)),
-        _ if bytes.starts_with(UTF8_BOM) => Some((UTF_8, 3)),
+        _ if bytes.starts_with(UTF16_BE_BOM) => Some(DetectedEncoding::Utf16BeBom),
+        _ if bytes.starts_with(UTF16_LE_BOM) => Some(DetectedEncoding::Utf16LeBom),
+        _ if bytes.starts_with(UTF8_BOM) => Some(DetectedEncoding::Utf8Bom),
 
         // without BOM
-        _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some((UTF_16BE, 0)), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
-        _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some((UTF_16LE, 0)), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
-        _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some((UTF_8, 0)), // Some ASCII compatible
+        _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(DetectedEncoding::Utf16Be), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
+        _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(DetectedEncoding::Utf16Le), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
+        _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(DetectedEncoding::AsciiCompatible), // Some ASCII compatible
 
         _ => None,
     }
 }
 
+/// Possible scenarios for start-of-xml detection of encoding.
+/// See the documentation of [`detect_encoding`].
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum DetectedEncoding {
+    /// Matches UTF-8 or some other ascii-compatible encoding
+    AsciiCompatible,
+    /// We saw a UTF-8 BOM
+    Utf8Bom,
+    /// Matches UTF-16-LE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2)
+    Utf16Le,
+    /// We saw a UTF-16 BOM in little-endian orientation
+    Utf16LeBom,
+    /// Matches UTF-16-BE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2)
+    Utf16Be,
+    /// We saw a UTF-16 BOM in big-endian orientation
+    Utf16BeBom,
+}
+
+impl DetectedEncoding {
+    /// Return an Encoding object appropriate for the detected encoding
+    #[cfg(feature = "encoding")]
+    pub fn encoding(&self) -> &'static encoding_rs::Encoding {
+        match self {
+            DetectedEncoding::AsciiCompatible | DetectedEncoding::Utf8Bom => encoding_rs::UTF_8,
+            DetectedEncoding::Utf16Le | DetectedEncoding::Utf16LeBom => encoding_rs::UTF_16LE,
+            DetectedEncoding::Utf16Be | DetectedEncoding::Utf16BeBom => encoding_rs::UTF_16BE,
+        }
+    }
+
+    /// Length of the BOM, which may need to be stripped from the input
+    pub fn bom_len(&self) -> usize {
+        match self {
+            DetectedEncoding::Utf8Bom => 3,
+            DetectedEncoding::Utf16LeBom | DetectedEncoding::Utf16BeBom => 2,
+            DetectedEncoding::AsciiCompatible | DetectedEncoding::Utf16Le | DetectedEncoding::Utf16Be => 0,
+        }
+    }
+}
+
 /// A struct for transparently decoding / validating bytes as UTF-8.
 #[derive(Debug)]
 pub struct Utf8BytesReader<R> {
@@ -328,6 +364,8 @@ pub struct Utf8BytesReader<R> {
     reader: io::BufReader<Utf8ValidatingReader<R>>,
 }
 
+// TODO: Utf8BytesReader should manage encoding detection and BOM stripping - that responsibility
+// can then be removed from the readers, with perhaps an exception for slice reader.
 impl<R: io::Read> Utf8BytesReader<R> {
     /// Build a new reader which decodes a stream of bytes in an unknown encoding into UTF-8.
     /// (TODO: well, not yet - right now it's just a dumb wrapper)
diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs
@@ -5,7 +5,7 @@ use std::fs::File;
 use std::io::{self, BufRead, BufReader};
 use std::path::Path;
 
-use crate::encoding::Utf8BytesReader;
+use crate::encoding::{self, DetectedEncoding, Utf8BytesReader};
 use crate::errors::{Error, Result};
 use crate::events::{BytesText, Event};
 use crate::name::QName;
@@ -36,12 +36,12 @@ macro_rules! impl_buffered_source {
 
         #[cfg(feature = "encoding")]
         #[inline]
-        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
+        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>> {
             loop {
                 break match self $(.$reader)? .fill_buf() $(.$await)? {
-                    Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
-                        self $(.$reader)? .consume(bom_len);
-                        Ok(Some(enc))
+                    Ok(n) => if let Some(detected) = encoding::detect_encoding(n) {
+                        self $(.$reader)? .consume(detected.bom_len());
+                        Ok(Some(detected))
                     } else {
                         Ok(None)
                     },
diff --git a/src/reader/mod.rs b/src/reader/mod.rs
@@ -5,7 +5,7 @@ use encoding_rs::Encoding;
 use std::io;
 use std::ops::Range;
 
-use crate::encoding::{Decoder, Utf8BytesReader};
+use crate::encoding::{Decoder, DetectedEncoding, Utf8BytesReader};
 use crate::errors::{Error, IllFormedError, SyntaxError};
 use crate::events::{BytesRef, Event};
 use crate::parser::{DtdParser, ElementParser, Parser, PiParser};
@@ -267,7 +267,7 @@ macro_rules! read_event_impl {
                     #[cfg(feature = "encoding")]
                     if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                         if $self.state.encoding.can_be_refined() {
-                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
+                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding.encoding());
                         }
                     }
 
@@ -1086,7 +1086,7 @@ trait XmlSource<'r, B> {
 
     /// Determines encoding from the start of input and removes BOM if it is present
     #[cfg(feature = "encoding")]
-    fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;
+    fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>>;
 
     /// Read input until start of markup (the `<`) is found, start of general entity
     /// reference (the `&`) is found or end of input is reached.
diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs
@@ -4,6 +4,7 @@
 
 use std::io;
 
+use crate::encoding::DetectedEncoding;
 #[cfg(feature = "encoding")]
 use crate::reader::EncodingRef;
 #[cfg(feature = "encoding")]
@@ -253,10 +254,10 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
 
     #[cfg(feature = "encoding")]
     #[inline]
-    fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>> {
-        if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
-            *self = &self[bom_len..];
-            return Ok(Some(enc));
+    fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>> {
+        if let Some(detected) = crate::encoding::detect_encoding(self) {
+            *self = &self[detected.bom_len()..];
+            return Ok(Some(detected));
         }
         Ok(None)
     }
diff --git a/tests/encodings.rs b/tests/encodings.rs
@@ -21,12 +21,25 @@ mod decode {
 
     #[test]
     fn test_detect_encoding() {
+        use quick_xml::encoding::DetectedEncoding;
+
         // No BOM
-        assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some((UTF_8, 0)));
+        let detected = detect_encoding(UTF8_TEXT.as_bytes()).unwrap();
+        assert_eq!(detected.encoding(), UTF_8);
+        assert_eq!(detected.bom_len(), 0);
+
         // BOM
-        assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some((UTF_8, 3)));
-        assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some((UTF_16BE, 2)));
-        assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some((UTF_16LE, 2)));
+        let detected = detect_encoding(UTF8_TEXT_WITH_BOM).unwrap();
+        assert_eq!(detected.encoding(), UTF_8);
+        assert_eq!(detected.bom_len(), 3);
+
+        let detected = detect_encoding(UTF16BE_TEXT_WITH_BOM).unwrap();
+        assert_eq!(detected.encoding(), UTF_16BE);
+        assert_eq!(detected.bom_len(), 2);
+
+        let detected = detect_encoding(UTF16LE_TEXT_WITH_BOM).unwrap();
+        assert_eq!(detected.encoding(), UTF_16LE);
+        assert_eq!(detected.bom_len(), 2);
     }
 }
 #[cfg(not(feature = "encoding"))]