Skip to content

Commit aa577c1

Browse files
committed
Remove feature gating from detect_encoding()
Have it return a generic enum so that it can be used with or without the encoding feature.
1 parent 05cd262 commit aa577c1

5 files changed

Lines changed: 93 additions & 41 deletions

File tree

src/encoding.rs

Lines changed: 63 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,16 @@ use std::io::{self, Read};
55
use std::str::Utf8Error;
66

77
#[cfg(feature = "encoding")]
8-
use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};
8+
use encoding_rs;
99

1010
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
1111
/// See <https://unicode.org/faq/utf_bom.html#bom1>
1212
pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
1313
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with little-endian byte order.
1414
/// See <https://unicode.org/faq/utf_bom.html#bom1>
15-
#[cfg(feature = "encoding")]
1615
pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
1716
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with big-endian byte order.
1817
/// See <https://unicode.org/faq/utf_bom.html#bom1>
19-
#[cfg(feature = "encoding")]
2018
pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
2119

2220
/// An error type representing UTF-8 validation failure.
@@ -77,7 +75,7 @@ pub enum EncodingError {
7775
Utf8(Utf8ValidationError),
7876
/// Input did not adhere to the given encoding
7977
#[cfg(feature = "encoding")]
80-
Other(&'static Encoding),
78+
Other(&'static encoding_rs::Encoding),
8179
}
8280

8381
impl From<Utf8Error> for EncodingError {
@@ -131,20 +129,20 @@ impl std::fmt::Display for EncodingError {
131129
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
132130
pub struct Decoder {
133131
#[cfg(feature = "encoding")]
134-
pub(crate) encoding: &'static Encoding,
132+
pub(crate) encoding: &'static encoding_rs::Encoding,
135133
}
136134

137135
impl Decoder {
138136
pub(crate) const fn utf8() -> Self {
139137
Decoder {
140138
#[cfg(feature = "encoding")]
141-
encoding: UTF_8,
139+
encoding: encoding_rs::UTF_8,
142140
}
143141
}
144142

145143
#[cfg(all(test, feature = "encoding", feature = "serialize"))]
146144
pub(crate) const fn utf16() -> Self {
147-
Decoder { encoding: UTF_16LE }
145+
Decoder { encoding: encoding_rs::UTF_16LE }
148146
}
149147
}
150148

@@ -155,7 +153,7 @@ impl Decoder {
155153
///
156154
/// [`decode`]: Self::decode
157155
#[cfg(feature = "encoding")]
158-
pub const fn encoding(&self) -> &'static Encoding {
156+
pub const fn encoding(&self) -> &'static encoding_rs::Encoding {
159157
self.encoding
160158
}
161159

@@ -236,7 +234,7 @@ impl Decoder {
236234
#[cfg(feature = "encoding")]
237235
pub fn decode<'b>(
238236
bytes: &'b [u8],
239-
encoding: &'static Encoding,
237+
encoding: &'static encoding_rs::Encoding,
240238
) -> Result<Cow<'b, str>, EncodingError> {
241239
encoding
242240
.decode_without_bom_handling_and_without_replacement(bytes)
@@ -247,10 +245,10 @@ pub fn decode<'b>(
247245
#[cfg(feature = "encoding")]
248246
pub fn decode_into(
249247
bytes: &[u8],
250-
encoding: &'static Encoding,
248+
encoding: &'static encoding_rs::Encoding,
251249
buf: &mut String,
252250
) -> Result<(), EncodingError> {
253-
if encoding == UTF_8 {
251+
if encoding == encoding_rs::UTF_8 {
254252
buf.push_str(std::str::from_utf8(bytes)?);
255253
return Ok(());
256254
}
@@ -265,22 +263,22 @@ pub fn decode_into(
265263
);
266264
let (result, read) = decoder.decode_to_string_without_replacement(bytes, buf, true);
267265
match result {
268-
DecoderResult::InputEmpty => {
266+
encoding_rs::DecoderResult::InputEmpty => {
269267
debug_assert_eq!(read, bytes.len());
270268
Ok(())
271269
}
272-
DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)),
270+
encoding_rs::DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)),
273271
// SAFETY: We allocate enough space above
274-
DecoderResult::OutputFull => unreachable!(),
272+
encoding_rs::DecoderResult::OutputFull => unreachable!(),
275273
}
276274
}
277275

278276
/// Automatic encoding detection of XML files using the
279277
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
280278
///
281-
/// If encoding is detected, `Some` is returned with an encoding and size of BOM
282-
/// in bytes, if detection was performed using BOM, or zero, if detection was
283-
/// performed without BOM.
279+
/// If encoding is detected, `Some` is returned with a [`DetectedEncoding`], from which
280+
/// can be derived the size of the BOM in bytes, if detection was performed using BOM
281+
/// (or zero, if detection was performed without BOM).
284282
///
285283
/// If encoding was not recognized, `None` is returned.
286284
///
@@ -300,25 +298,63 @@ pub fn decode_into(
300298
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
301299
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
302300
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
303-
#[cfg(feature = "encoding")]
304-
pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
301+
pub fn detect_encoding(bytes: &[u8]) -> Option<DetectedEncoding> {
305302
// Prevent suggesting "<?xm". We want to have the same formatted lines for all arms.
306303
#[allow(clippy::byte_char_slices)]
307304
match bytes {
308305
// with BOM
309-
_ if bytes.starts_with(UTF16_BE_BOM) => Some((UTF_16BE, 2)),
310-
_ if bytes.starts_with(UTF16_LE_BOM) => Some((UTF_16LE, 2)),
311-
_ if bytes.starts_with(UTF8_BOM) => Some((UTF_8, 3)),
306+
_ if bytes.starts_with(UTF16_BE_BOM) => Some(DetectedEncoding::Utf16BeBom),
307+
_ if bytes.starts_with(UTF16_LE_BOM) => Some(DetectedEncoding::Utf16LeBom),
308+
_ if bytes.starts_with(UTF8_BOM) => Some(DetectedEncoding::Utf8Bom),
312309

313310
// without BOM
314-
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some((UTF_16BE, 0)), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
315-
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some((UTF_16LE, 0)), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
316-
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some((UTF_8, 0)), // Some ASCII compatible
311+
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(DetectedEncoding::Utf16Be), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
312+
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(DetectedEncoding::Utf16Le), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
313+
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(DetectedEncoding::AsciiCompatible), // Some ASCII compatible
317314

318315
_ => None,
319316
}
320317
}
321318

319+
/// Possible scenarios for start-of-xml detection of encoding
///
/// Variants with the `Bom` suffix mean that a byte order mark was found at the
/// very start of the input. The remaining variants mean that no BOM was present
/// and the encoding family was guessed from how the leading `<?` / `<?xm`
/// characters are laid out in the input.
///
/// See the documentation of [`detect_encoding`]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DetectedEncoding {
    /// Matches UTF-8 or some other ascii-compatible encoding
    /// (no BOM; the input starts with the bytes of `<?xm`)
    AsciiCompatible,
    /// We saw a UTF-8 BOM
    Utf8Bom,
    /// Matches UTF-16-LE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2)
    /// (no BOM; the input starts with `<?` as 16-bit little-endian units)
    Utf16Le,
    /// We saw a UTF-16 BOM in little-endian orientation
    Utf16LeBom,
    /// Matches UTF-16-BE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2)
    /// (no BOM; the input starts with `<?` as 16-bit big-endian units)
    Utf16Be,
    /// We saw a UTF-16 BOM in big-endian orientation
    Utf16BeBom,
}

impl DetectedEncoding {
    /// Return an Encoding object appropriate for the detected encoding
    ///
    /// For the variants without a BOM this is only the most likely candidate
    /// of the detected family: the encoding declared in the XML declaration
    /// may still name a different, compatible encoding.
    #[cfg(feature = "encoding")]
    pub fn encoding(&self) -> &'static encoding_rs::Encoding {
        match self {
            DetectedEncoding::AsciiCompatible | DetectedEncoding::Utf8Bom => encoding_rs::UTF_8,
            DetectedEncoding::Utf16Le | DetectedEncoding::Utf16LeBom => encoding_rs::UTF_16LE,
            DetectedEncoding::Utf16Be | DetectedEncoding::Utf16BeBom => encoding_rs::UTF_16BE,
        }
    }

    /// Length of the BOM, which may need to be stripped from the input
    ///
    /// Returns `3` for a UTF-8 BOM, `2` for either UTF-16 BOM and `0` when the
    /// encoding was detected without a BOM.
    pub const fn bom_len(&self) -> usize {
        match self {
            DetectedEncoding::Utf8Bom => 3,
            DetectedEncoding::Utf16LeBom | DetectedEncoding::Utf16BeBom => 2,
            DetectedEncoding::AsciiCompatible
            | DetectedEncoding::Utf16Le
            | DetectedEncoding::Utf16Be => 0,
        }
    }
}
357+
322358
/// A struct for transparently decoding / validating bytes as UTF-8.
323359
#[derive(Debug)]
324360
pub struct Utf8BytesReader<R> {
@@ -328,6 +364,8 @@ pub struct Utf8BytesReader<R> {
328364
reader: io::BufReader<Utf8ValidatingReader<R>>,
329365
}
330366

367+
// TODO: Utf8BytesReader should manage encoding detection and BOM stripping - that responsibility
368+
// can then be removed from the readers, with perhaps an exception for slice reader.
331369
impl<R: io::Read> Utf8BytesReader<R> {
332370
/// Build a new reader which decodes a stream of bytes in an unknown encoding into UTF-8.
333371
/// (TODO: well, not yet - right now it's just a dumb wrapper)

src/reader/buffered_reader.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use std::fs::File;
55
use std::io::{self, BufRead, BufReader};
66
use std::path::Path;
77

8-
use crate::encoding::Utf8BytesReader;
8+
use crate::encoding::{self, DetectedEncoding, Utf8BytesReader};
99
use crate::errors::{Error, Result};
1010
use crate::events::{BytesText, Event};
1111
use crate::name::QName;
@@ -36,12 +36,12 @@ macro_rules! impl_buffered_source {
3636

3737
#[cfg(feature = "encoding")]
3838
#[inline]
39-
$($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
39+
$($async)? fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>> {
4040
loop {
4141
break match self $(.$reader)? .fill_buf() $(.$await)? {
42-
Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
43-
self $(.$reader)? .consume(bom_len);
44-
Ok(Some(enc))
42+
Ok(n) => if let Some(detected) = encoding::detect_encoding(n) {
43+
self $(.$reader)? .consume(detected.bom_len());
44+
Ok(Some(detected))
4545
} else {
4646
Ok(None)
4747
},

src/reader/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use encoding_rs::Encoding;
55
use std::io;
66
use std::ops::Range;
77

8-
use crate::encoding::{Decoder, Utf8BytesReader};
8+
use crate::encoding::{Decoder, DetectedEncoding, Utf8BytesReader};
99
use crate::errors::{Error, IllFormedError, SyntaxError};
1010
use crate::events::{BytesRef, Event};
1111
use crate::parser::{DtdParser, ElementParser, Parser, PiParser};
@@ -267,7 +267,7 @@ macro_rules! read_event_impl {
267267
#[cfg(feature = "encoding")]
268268
if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
269269
if $self.state.encoding.can_be_refined() {
270-
$self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
270+
// `DetectedEncoding::encoding()` is the accessor that yields the `encoding_rs` encoding.
$self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding.encoding());
271271
}
272272
}
273273

@@ -1086,7 +1086,7 @@ trait XmlSource<'r, B> {
10861086

10871087
/// Determines encoding from the start of input and removes BOM if it is present
10881088
#[cfg(feature = "encoding")]
1089-
fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;
1089+
fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>>;
10901090

10911091
/// Read input until start of markup (the `<`) is found, start of general entity
10921092
/// reference (the `&`) is found or end of input is reached.

src/reader/slice_reader.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
55
use std::io;
66

7+
use crate::encoding::DetectedEncoding;
78
#[cfg(feature = "encoding")]
89
use crate::reader::EncodingRef;
910
#[cfg(feature = "encoding")]
@@ -253,10 +254,10 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
253254

254255
#[cfg(feature = "encoding")]
255256
#[inline]
256-
fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>> {
257-
if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
258-
*self = &self[bom_len..];
259-
return Ok(Some(enc));
257+
fn detect_encoding(&mut self) -> io::Result<Option<DetectedEncoding>> {
258+
if let Some(detected) = crate::encoding::detect_encoding(self) {
259+
*self = &self[detected.bom_len() as usize..];
260+
return Ok(Some(detected));
260261
}
261262
Ok(None)
262263
}

tests/encodings.rs

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,25 @@ mod decode {
2121

2222
#[test]
2323
fn test_detect_encoding() {
24+
use quick_xml::encoding::DetectedEncoding;
25+
2426
// No BOM
25-
assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some((UTF_8, 0)));
27+
let detected = detect_encoding(UTF8_TEXT.as_bytes()).unwrap();
28+
assert_eq!(detected.encoding(), UTF_8);
29+
assert_eq!(detected.bom_len(), 0);
30+
2631
// BOM
27-
assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some((UTF_8, 3)));
28-
assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some((UTF_16BE, 2)));
29-
assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some((UTF_16LE, 2)));
32+
let detected = detect_encoding(UTF8_TEXT_WITH_BOM).unwrap();
33+
assert_eq!(detected.encoding(), UTF_8);
34+
assert_eq!(detected.bom_len(), 3);
35+
36+
let detected = detect_encoding(UTF16BE_TEXT_WITH_BOM).unwrap();
37+
assert_eq!(detected.encoding(), UTF_16BE);
38+
assert_eq!(detected.bom_len(), 2);
39+
40+
let detected = detect_encoding(UTF16LE_TEXT_WITH_BOM).unwrap();
41+
assert_eq!(detected.encoding(), UTF_16LE);
42+
assert_eq!(detected.bom_len(), 2);
3043
}
3144
}
3245
#[cfg(not(feature = "encoding"))]

0 commit comments

Comments
 (0)