tafia
diff --git a/src/encoding.rs b/src/encoding.rs
Lines changed: 276 additions & 6 deletions
@@ -31,6 +31,8 @@ pub enum Utf8ValidationError {
     },
     /// Incomplete UTF-8 sequence at end of stream
     IncompleteSequence,
+    /// Non-UTF-8 encoding detected at start of stream
+    NonUtf8EncodingDetected(DetectedEncoding),
 }
 
 impl From<Utf8Error> for Utf8ValidationError {
@@ -50,6 +52,13 @@ impl std::fmt::Display for Utf8ValidationError {
             Self::IncompleteSequence => {
                 write!(f, "incomplete UTF-8 sequence at end of stream")
             }
+            Self::NonUtf8EncodingDetected(detected) => {
+                write!(
+                    f,
+                    "non-UTF-8 encoding detected at start of stream: {:?}",
+                    detected
+                )
+            }
         }
     }
 }
@@ -323,6 +332,7 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<DetectedEncoding> {
 /// Possible scenarios for start-of-xml detection of encoding
 ///
 /// See the documentation of [`detect_encoding`]
+#[derive(Clone, Debug, PartialEq, Eq)]
 pub enum DetectedEncoding {
     /// Matches UTF-8 or some other ascii-compatible encoding
     AsciiCompatible,
@@ -417,6 +427,10 @@ impl<R: io::Read> io::BufRead for Utf8BytesReader<R> {
 /// that only valid UTF-8 bytes are written to the output buffer. Incomplete UTF-8
 /// sequences at read boundaries are buffered and combined with subsequent reads.
 ///
+/// Additionally, this reader checks the very beginning of the stream for encoding
+/// signatures (BOMs or XML declaration patterns): a leading UTF-8 BOM is stripped,
+/// and streams that appear to be encoded in UTF-16 are rejected with an error.
+///
 /// # Examples
 ///
 /// ```
@@ -434,6 +448,8 @@ pub struct Utf8ValidatingReader<R> {
     inner: R,
     /// Buffer of bytes not yet handed to the caller: incomplete UTF-8 sequences from
     /// previous reads, plus the data read ahead for start-of-stream encoding detection
     buffer: Vec<u8>,
+    /// Whether we've checked for encoding at the start of the stream
+    encoding_checked: bool,
 }
 
 impl<R> Utf8ValidatingReader<R> {
@@ -442,6 +458,7 @@ impl<R> Utf8ValidatingReader<R> {
         Self {
             inner,
             buffer: Vec::with_capacity(4),
+            encoding_checked: false,
         }
     }
 
@@ -467,6 +484,49 @@ impl<R: Read> Read for Utf8ValidatingReader<R> {
             return Ok(0);
         }
 
+        // Check for encoding at the start of the stream
+        if !self.encoding_checked {
+            self.encoding_checked = true;
+
+            // Read initial data to detect encoding
+            // Read enough for encoding detection (4 bytes) plus fill up to caller's buffer size
+            let read_size = buf.len().max(64); // Read at least 64 bytes for efficiency
+            let mut temp = vec![0u8; read_size];
+            let n = self.inner.read(&mut temp)?;
+
+            if n > 0 {
+                self.buffer.extend_from_slice(&temp[..n]);
+
+                // Try to detect encoding if we have at least 4 bytes
+                if self.buffer.len() >= 4 {
+                    if let Some(detected) = detect_encoding(&self.buffer) {
+                        match detected {
+                            DetectedEncoding::Utf8Bom | DetectedEncoding::AsciiCompatible => {
+                                // Strip BOM if present
+                                let bom_len = detected.bom_len();
+                                if bom_len > 0 {
+                                    self.buffer.drain(..bom_len);
+                                }
+                            }
+                            DetectedEncoding::Utf16Le
+                            | DetectedEncoding::Utf16LeBom
+                            | DetectedEncoding::Utf16Be
+                            | DetectedEncoding::Utf16BeBom => {
+                                // Reject UTF-16 encodings
+                                return Err(io::Error::new(
+                                    io::ErrorKind::InvalidData,
+                                    EncodingError::Utf8(
+                                        Utf8ValidationError::NonUtf8EncodingDetected(detected),
+                                    ),
+                                ));
+                            }
+                        }
+                    }
+                }
+            }
+            // If the first read returned 0 bytes or fewer than 4 bytes, detection is skipped:
+            // the data is assumed to be UTF-8 and a leading BOM, if any, is not stripped
+        }
+
         loop {
             // If we have buffered data, check if it's complete UTF-8
             if !self.buffer.is_empty() {
@@ -775,7 +835,21 @@ mod utf8_validating_reader_tests {
         // Second read should fail because incomplete sequence at EOF
         let result = reader.read(&mut buf);
         assert!(result.is_err());
-        assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidData);
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // Verify the error can be downcast to EncodingError
+        let encoding_err = err
+            .get_ref()
+            .unwrap()
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // Verify it's the IncompleteSequence error
+        match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::IncompleteSequence) => {}
+            other => panic!("Expected IncompleteSequence error, got: {:?}", other),
+        }
     }
 
     #[test]
@@ -786,7 +860,23 @@ mod utf8_validating_reader_tests {
         let mut buf = [0u8; 10];
         let result = reader.read(&mut buf);
         assert!(result.is_err());
-        assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidData);
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // Verify the error can be downcast to EncodingError
+        let encoding_err = err
+            .get_ref()
+            .unwrap()
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // Verify it's an InvalidSequence error
+        match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => {
+                assert_eq!(*error_len, 1, "Expected 1-byte invalid sequence (0xFF)");
+            }
+            other => panic!("Expected InvalidSequence error, got: {:?}", other),
+        }
     }
 
     #[test]
@@ -797,7 +887,23 @@ mod utf8_validating_reader_tests {
         let mut buf = [0u8; 10];
         let result = reader.read(&mut buf);
         assert!(result.is_err());
-        assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidData);
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // Verify the error can be downcast to EncodingError
+        let encoding_err = err
+            .get_ref()
+            .unwrap()
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // Verify it's an InvalidSequence error
+        match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => {
+                assert_eq!(*error_len, 1, "Expected 1-byte invalid sequence");
+            }
+            other => panic!("Expected InvalidSequence error, got: {:?}", other),
+        }
     }
 
     #[test]
@@ -942,7 +1048,23 @@ mod utf8_validating_reader_tests {
 
         let result = reader.read(&mut buf);
         assert!(result.is_err());
-        assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidData);
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // Verify the error can be downcast to EncodingError
+        let encoding_err = err
+            .get_ref()
+            .unwrap()
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // Verify it's the expected error variant
+        match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => {
+                assert_eq!(*error_len, 1, "Expected 1-byte invalid sequence (0xFF)");
+            }
+            other => panic!("Expected InvalidSequence error, got: {:?}", other),
+        }
     }
 
     #[test]
@@ -1013,7 +1135,21 @@ mod utf8_validating_reader_tests {
 
         let result = reader.read(&mut buf);
         assert!(result.is_err());
-        assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidData);
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // Verify the error can be downcast to EncodingError
+        let encoding_err = err
+            .get_ref()
+            .unwrap()
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // Verify it's the IncompleteSequence error
+        match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::IncompleteSequence) => {}
+            other => panic!("Expected IncompleteSequence error, got: {:?}", other),
+        }
     }
 
     #[test]
@@ -1028,7 +1164,21 @@ mod utf8_validating_reader_tests {
 
         let result = reader.read(&mut buf);
         assert!(result.is_err());
-        assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidData);
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // Verify the error can be downcast to EncodingError
+        let encoding_err = err
+            .get_ref()
+            .unwrap()
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // Verify it's the IncompleteSequence error
+        match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::IncompleteSequence) => {}
+            other => panic!("Expected IncompleteSequence error, got: {:?}", other),
+        }
     }
 
     #[test]
@@ -1042,6 +1192,26 @@ mod utf8_validating_reader_tests {
 
         let result = reader.read(&mut buf);
         assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // Verify the error can be downcast to EncodingError
+        let encoding_err = err
+            .get_ref()
+            .unwrap()
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // Verify it's an InvalidSequence error
+        match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => {
+                assert_eq!(
+                    *error_len, 1,
+                    "Expected 1-byte invalid sequence (0xC0 is invalid)"
+                );
+            }
+            other => panic!("Expected InvalidSequence error, got: {:?}", other),
+        }
     }
 
     #[test]
@@ -1054,6 +1224,26 @@ mod utf8_validating_reader_tests {
 
         let result = reader.read(&mut buf);
         assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // Verify the error can be downcast to EncodingError
+        let encoding_err = err
+            .get_ref()
+            .unwrap()
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // Verify it's an InvalidSequence error
+        match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => {
+                assert_eq!(
+                    *error_len, 1,
+                    "Expected 1-byte invalid sequence (0xED starts surrogate)"
+                );
+            }
+            other => panic!("Expected InvalidSequence error, got: {:?}", other),
+        }
     }
 
     #[test]
@@ -1078,4 +1268,84 @@ mod utf8_validating_reader_tests {
         assert_eq!(n, 5);
         assert_eq!(&buf[..n], data);
     }
+
+    #[test]
+    fn test_utf8_bom_stripped() {
+        // Input: the three UTF-8 BOM bytes (EF BB BF) directly in front of the ASCII
+        // payload "Hello".
+        let input: &[u8] = b"\xEF\xBB\xBFHello";
+        let mut reader = Utf8ValidatingReader::new(input);
+        let mut out = [0u8; 20];
+
+        let len = reader.read(&mut out).unwrap();
+        let produced = &out[..len];
+
+        // The BOM must not reach the caller: only the payload is returned, and it is
+        // checked both as raw bytes and as decoded text.
+        assert_eq!(produced, b"Hello");
+        assert_eq!(std::str::from_utf8(produced).unwrap(), "Hello");
+    }
+
+    #[test]
+    fn test_utf16le_bom_rejected() {
+        // Input opens with the UTF-16 little-endian BOM (FF FE), followed by a few
+        // ASCII bytes.
+        let input: &[u8] = b"\xFF\xFE<?xml";
+        let mut reader = Utf8ValidatingReader::new(input);
+        let mut scratch = [0u8; 20];
+
+        // The very first read is expected to fail with an InvalidData I/O error.
+        let result = reader.read(&mut scratch);
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // The I/O error has to wrap an EncodingError as its inner payload.
+        let payload = err.get_ref().unwrap();
+        let encoding_err = payload
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // That payload must be NonUtf8EncodingDetected and name the BOM-marked
+        // UTF-16 LE encoding.
+        let detected = match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected(found)) => found,
+            other => panic!("Expected NonUtf8EncodingDetected error, got: {:?}", other),
+        };
+        assert_eq!(*detected, DetectedEncoding::Utf16LeBom);
+    }
+
+    #[test]
+    fn test_utf16be_bom_rejected() {
+        // Input opens with the UTF-16 big-endian BOM (FE FF), followed by "<?" written
+        // as two big-endian 16-bit units.
+        let input: &[u8] = b"\xFE\xFF\x00<\x00?";
+        let mut reader = Utf8ValidatingReader::new(input);
+        let mut scratch = [0u8; 20];
+
+        // The very first read is expected to fail with an InvalidData I/O error.
+        let result = reader.read(&mut scratch);
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+        // The I/O error has to wrap an EncodingError as its inner payload.
+        let payload = err.get_ref().unwrap();
+        let encoding_err = payload
+            .downcast_ref::<EncodingError>()
+            .expect("Error should downcast to EncodingError");
+
+        // That payload must be NonUtf8EncodingDetected and name the BOM-marked
+        // UTF-16 BE encoding.
+        let detected = match encoding_err {
+            EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected(found)) => found,
+            other => panic!("Expected NonUtf8EncodingDetected error, got: {:?}", other),
+        };
+        assert_eq!(*detected, DetectedEncoding::Utf16BeBom);
+    }
+
+    #[test]
+    fn test_ascii_compatible_encoding_allowed() {
+        // A plain ASCII XML declaration with no BOM in front of it.
+        let input: &[u8] = b"<?xml version=\"1.0\"?><root/>";
+        let mut reader = Utf8ValidatingReader::new(input);
+        let mut out = [0u8; 50];
+
+        // The read must succeed and hand the bytes back unmodified.
+        let len = reader.read(&mut out).unwrap();
+        assert_eq!(&out[..len], input);
+    }
 }