diff --git a/Cargo.toml b/Cargo.toml index f7944de1f..1a839ca9f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -233,6 +233,11 @@ name = "serde-de-enum" required-features = ["serialize"] path = "tests/serde-de-enum.rs" +[[test]] +name = "serde-de-references" +required-features = ["serialize"] +path = "tests/serde-de-references.rs" + [[test]] name = "serde-de-seq" required-features = ["serialize"] diff --git a/Changelog.md b/Changelog.md index 56b6e81fb..c4f0bdf67 100644 --- a/Changelog.md +++ b/Changelog.md @@ -14,6 +14,10 @@ ## Unreleased +A new `XmlReader` type was added that automatically resolves general entity references. + +`quick_xml::de::resolver` was replaced by the `quick_xml::resolver` module. + ### New Features - [#938]: Add new enumeration `XmlVersion` and typified getter `BytesDecl::xml_version()`. @@ -33,6 +37,10 @@ Deprecated functions now behaves the same as newly added. +- [#948]: Add `quick_xml::reader::EntityResolver` which is able to resolve external entities. +- [#948]: Add `quick_xml::reader::XmlReader`, a new high-level reader which should be preferred + over the old `Reader`. + ### Bug Fixes - [#938]: Use correct rules for EOL normalization in `Deserializer` when parse XML 1.0 documents.
@@ -51,6 +59,7 @@ [#914]: https://github.com/tafia/quick-xml/pull/914 [#938]: https://github.com/tafia/quick-xml/pull/938 [#944]: https://github.com/tafia/quick-xml/pull/944 +[#948]: https://github.com/tafia/quick-xml/pull/948 ## 0.39.2 -- 2026-02-20 diff --git a/compare/benches/low-level.rs b/compare/benches/low-level.rs index 631fc0c68..cf475ab75 100644 --- a/compare/benches/low-level.rs +++ b/compare/benches/low-level.rs @@ -1,7 +1,7 @@ use criterion::{self, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use pretty_assertions::assert_eq; use quick_xml::events::Event; -use quick_xml::reader::Reader; +use quick_xml::reader::{self, Reader, XmlReader}; use std::hint::black_box; use xml::reader::{EventReader, XmlEvent}; @@ -94,6 +94,26 @@ fn low_level_comparison(c: &mut Criterion) { }, ); + group.bench_with_input( + BenchmarkId::new("quick_xml:reader", filename), + *data, + |b, input| { + b.iter(|| { + let mut reader = XmlReader::from_str(input); + // TODO: reader.config_mut().check_end_names = false; + let mut count = black_box(0); + loop { + match reader.read_event() { + Ok(reader::Event::Start(_)) | Ok(reader::Event::Empty(_)) => count += 1, + Ok(reader::Event::Eof) => break, + _ => (), + } + } + assert_eq!(count, total_tags, "Overall tag count in {}", filename); + }) + }, + ); + group.bench_with_input( BenchmarkId::new("maybe_xml:0.10", filename), *data, diff --git a/examples/high-level-entities.rs b/examples/high-level-entities.rs new file mode 100644 index 000000000..f42c50257 --- /dev/null +++ b/examples/high-level-entities.rs @@ -0,0 +1,224 @@ +//! This example demonstrates how custom entities can be extracted from the DOCTYPE +//! and usage of the high-level `Reader` API. +//! +//! NB: this example is deliberately kept simple: +//! * the regex in this example is simple but brittle.
+ +use std::borrow::Cow; +use std::collections::HashMap; +use std::convert::Infallible; +use std::fmt; +use std::io::{BufRead, Cursor}; + +use quick_xml::events::{BytesEnd, BytesStart, BytesText}; +use quick_xml::reader::{ + EntityResolver, EntityResolverFactory, Reader, ReplacementText, XmlEvent, XmlReader, +}; +use regex::bytes::Regex; + +use pretty_assertions::assert_eq; + +const XML1: &str = r#" + +" > + &element1; " > +]> +&element2; +&external; +"#; + +/// Additional document which in reality would be referenced by +/// `` +const XML2: &str = r#" + +text +"#; + +struct MyResolver<'i> { + /// Map of captured internal _parsed general entities_. _Parsed_ means that + /// value of the entity is parsed by XML reader. + entities: HashMap, Cow<'i, [u8]>>, + /// In this example we use simple regular expression to capture entities from DTD. + /// In real application you should use DTD parser. + entity_re: Regex, +} +impl<'i> fmt::Debug for MyResolver<'i> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_map() + .entries(self.entities.iter().map(|(k, v)| { + ( + std::str::from_utf8(k).unwrap(), + std::str::from_utf8(v).unwrap(), + ) + })) + .finish() + } +} + +impl<'i> MyResolver<'i> { + fn new() -> Result { + Ok(Self { + entities: Default::default(), + // Capture "name" and "content" from such string: + // + entity_re: Regex::new(r#""#)?, + }) + } + fn capture_borrowed(&mut self, doctype: &'i [u8]) { + for cap in self.entity_re.captures_iter(doctype) { + self.entities.insert( + cap.get(1).unwrap().as_bytes().into(), + cap.get(2).unwrap().as_bytes().into(), + ); + } + } + fn capture_owned(&mut self, doctype: Vec) { + for cap in self.entity_re.captures_iter(&doctype) { + self.entities.insert( + cap.get(1).unwrap().as_bytes().to_owned().into(), + cap.get(2).unwrap().as_bytes().to_owned().into(), + ); + } + } +} + +impl<'i> EntityResolverFactory<'i> for MyResolver<'i> { + type CaptureError = Infallible; + type Resolver = Self; + + fn 
new_resolver(&mut self) -> Self::Resolver { + // We use valid regex so cannot fail + Self::new().unwrap() + } +} + +impl<'i> EntityResolver<'i> for MyResolver<'i> { + type CaptureError = Infallible; + + fn capture(&mut self, doctype: BytesText<'i>) -> Result<(), Self::CaptureError> { + dbg!(&doctype); + match doctype.into_inner() { + Cow::Borrowed(doctype) => self.capture_borrowed(doctype), + Cow::Owned(doctype) => self.capture_owned(doctype), + } + dbg!(self); + Ok(()) + } + + fn resolve<'e>(&self, entity: &str) -> Option> { + dbg!((entity, self)); + if entity == "external" { + return Some(ReplacementText::External(Box::new(Cursor::new( + XML2.as_bytes(), + )))); + } + match self.entities.get(entity.as_bytes()) { + Some(replacement) => Some(ReplacementText::Internal(replacement.clone())), + None => None, + } + } +} + +/// In this example the events will borrow from the first document +fn borrowed() -> Result<(), Box> { + let mut reader = Reader::from_str(XML1); + reader.config_mut().trim_text(true); + + let mut r = XmlReader::borrowed(reader, MyResolver::new()?); + + assert_eq!( + r.read_event()?, + XmlEvent::Start(BytesStart::from_content( + r#"test label="Message: &text;""#, + 4 + )) + ); + + //-------------------------------------------------------------------------- + // This part was inserted into original document from entity defined in DTD + assert_eq!(r.read_event()?, XmlEvent::Start(BytesStart::new("a"))); + assert_eq!( + r.read_event()?, + XmlEvent::Empty(BytesStart::from_content( + r#"dtd attr = 'Message: &text;'"#, + 3 + )) + ); + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("a"))); + //-------------------------------------------------------------------------- + + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("test"))); + + //-------------------------------------------------------------------------- + // Start of external document + assert_eq!( + r.read_event()?, + XmlEvent::Start(BytesStart::new("external")) + ); + 
assert_eq!(r.read_event()?, XmlEvent::Text(BytesText::new("text"))); + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("external"))); + //-------------------------------------------------------------------------- + + assert_eq!(r.read_event()?, XmlEvent::Eof); + + Ok(()) +} + +/// In this example the events will always copy data +fn buffered() -> Result<(), Box> { + let boxed: Box = Box::new(Cursor::new(XML1.as_bytes())); + let mut reader = Reader::from_reader(boxed); + reader.config_mut().trim_text(true); + + let mut r = XmlReader::buffered(reader, MyResolver::new()?); + + assert_eq!( + r.read_event()?, + XmlEvent::Start(BytesStart::from_content( + r#"test label="Message: &text;""#, + 4 + )) + ); + + //-------------------------------------------------------------------------- + // This part was inserted into original document from entity defined in DTD + assert_eq!(r.read_event()?, XmlEvent::Start(BytesStart::new("a"))); + assert_eq!( + r.read_event()?, + XmlEvent::Empty(BytesStart::from_content( + r#"dtd attr = 'Message: &text;'"#, + 3 + )) + ); + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("a"))); + //-------------------------------------------------------------------------- + + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("test"))); + + //-------------------------------------------------------------------------- + // Start of external document + assert_eq!( + r.read_event()?, + XmlEvent::Start(BytesStart::new("external")) + ); + assert_eq!(r.read_event()?, XmlEvent::Text(BytesText::new("text"))); + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("external"))); + //-------------------------------------------------------------------------- + + assert_eq!(r.read_event()?, XmlEvent::Eof); + + Ok(()) +} + +fn main() -> Result<(), Box> { + println!("{}", XML1); + // In this example the events will borrow from the first document + borrowed()?; + + println!("----------------------------------------------------------------"); + 
println!("{}", XML1); + // In this example the events will always copy data + buffered()?; + Ok(()) +} diff --git a/examples/custom_entities.rs b/examples/low-level-entities.rs similarity index 97% rename from examples/custom_entities.rs rename to examples/low-level-entities.rs index ed8c082a2..61b5ea0bb 100644 --- a/examples/custom_entities.rs +++ b/examples/low-level-entities.rs @@ -16,7 +16,6 @@ use std::str::from_utf8; use quick_xml::encoding::Decoder; use quick_xml::errors::Error; -use quick_xml::escape::EscapeError; use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event}; use quick_xml::name::QName; use quick_xml::reader::Reader; @@ -107,11 +106,10 @@ impl<'i> MyReader<'i> { } } - fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], EscapeError> { + fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], Error> { match self.entities.get(entity) { Some(replacement) => Ok(replacement), - None => Err(EscapeError::UnrecognizedEntity( - 0..0, + None => Err(Error::UnrecognizedGeneralEntity( String::from_utf8_lossy(entity).into_owned(), )), } diff --git a/src/de/map.rs b/src/de/map.rs index 3d25a9411..d5b4cfb6e 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -2,15 +2,15 @@ use crate::{ de::key::QNameDeserializer, - de::resolver::EntityResolver, de::simple_type::SimpleTypeDeserializer, de::text::TextDeserializer, - de::{DeEvent, Deserializer, XmlRead, TEXT_KEY, VALUE_KEY}, + de::{DeEvent, Deserializer, TEXT_KEY, VALUE_KEY}, errors::serialize::DeError, errors::Error, events::attributes::IterState, events::BytesStart, name::QName, + reader::EntityResolverFactory, }; use serde::de::value::BorrowedStrDeserializer; use serde::de::{self, DeserializeSeed, Deserializer as _, MapAccess, SeqAccess, Visitor}; @@ -166,14 +166,13 @@ enum ValueSource { /// /// - `'d` lifetime represents a parent deserializer, which could own the data /// buffer. 
-pub(crate) struct ElementMapAccess<'de, 'd, R, E> +pub(crate) struct ElementMapAccess<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Tag -- owner of attributes start: BytesStart<'de>, - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, EF>, /// State of the iterator over attributes. Contains the next position in the /// inner `start` slice, from which next attribute should be parsed. iter: IterState, @@ -195,14 +194,13 @@ where has_text_field: bool, } -impl<'de, 'd, R, E> ElementMapAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> ElementMapAccess<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Create a new ElementMapAccess pub fn new( - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, EF>, start: BytesStart<'de>, fields: &'static [&'static str], ) -> Self { @@ -240,10 +238,9 @@ where } } -impl<'de, 'd, R, E> MapAccess<'de> for ElementMapAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> MapAccess<'de> for ElementMapAccess<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -446,14 +443,13 @@ where /// /// [`deserialize_tuple`]: #method.deserialize_tuple /// [`deserialize_struct`]: #method.deserialize_struct -struct MapValueDeserializer<'de, 'd, 'm, R, E> +struct MapValueDeserializer<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Access to the map that created this deserializer. Gives access to the /// context, such as list of fields, that current map known about. - map: &'m mut ElementMapAccess<'de, 'd, R, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>, /// Whether this deserializer was created for deserialization from an element /// with fixed name, or the elements with different names or even text are allowed. 
/// @@ -531,10 +527,9 @@ where fixed_name: bool, } -impl<'de, 'd, 'm, R, E> MapValueDeserializer<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, EF> MapValueDeserializer<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Returns a next string as concatenated content of consequent [`Text`] and /// [`CData`] events, used inside [`deserialize_primitives!()`]. @@ -548,10 +543,9 @@ where } } -impl<'de, 'd, 'm, R, E> de::Deserializer<'de> for MapValueDeserializer<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, EF> de::Deserializer<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -685,13 +679,12 @@ where } } -impl<'de, 'd, 'm, R, E> de::EnumAccess<'de> for MapValueDeserializer<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, EF> de::EnumAccess<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; - type Variant = MapValueVariantAccess<'de, 'd, 'm, R, E>; + type Variant = MapValueVariantAccess<'de, 'e, 'd, 'm, EF>; fn variant_seed(self, seed: V) -> Result<(V::Value, Self::Variant), Self::Error> where @@ -716,23 +709,21 @@ where } } -struct MapValueVariantAccess<'de, 'd, 'm, R, E> +struct MapValueVariantAccess<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Access to the map that created this enum accessor. Gives access to the /// context, such as list of fields, that current map known about. 
- map: &'m mut ElementMapAccess<'de, 'd, R, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>, /// `true` if variant should be deserialized from a textual content /// and `false` if from tag is_text: bool, } -impl<'de, 'd, 'm, R, E> de::VariantAccess<'de> for MapValueVariantAccess<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, EF> de::VariantAccess<'de> for MapValueVariantAccess<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -907,14 +898,13 @@ impl<'de> TagFilter<'de> { /// /// [`Text`]: crate::events::Event::Text /// [`CData`]: crate::events::Event::CData -struct MapValueSeqAccess<'de, 'd, 'm, R, E> +struct MapValueSeqAccess<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Accessor to a map that creates this accessor and to a deserializer for /// a sequence items. - map: &'m mut ElementMapAccess<'de, 'd, R, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>, /// Filter that determines whether a tag is a part of this sequence. 
/// /// When feature [`overlapped-lists`] is not activated, iteration will stop @@ -934,20 +924,18 @@ where } #[cfg(feature = "overlapped-lists")] -impl<'de, 'd, 'm, R, E> Drop for MapValueSeqAccess<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, EF> Drop for MapValueSeqAccess<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { fn drop(&mut self) { self.map.de.start_replay(self.checkpoint); } } -impl<'de, 'd, 'm, R, E> SeqAccess<'de> for MapValueSeqAccess<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, EF> SeqAccess<'de> for MapValueSeqAccess<'de, 'e, 'd, 'm, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -1051,19 +1039,17 @@ where /// [specification]: https://www.w3.org/TR/xmlschema11-2/#boolean /// [`deserialize_tuple`]: #method.deserialize_tuple /// [`deserialize_struct`]: #method.deserialize_struct -struct ElementDeserializer<'de, 'd, R, E> +struct ElementDeserializer<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { start: BytesStart<'de>, - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, EF>, } -impl<'de, 'd, R, E> ElementDeserializer<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> ElementDeserializer<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Returns a next string as concatenated content of consequent [`Text`] and /// [`CData`] events, used inside [`deserialize_primitives!()`]. 
@@ -1076,10 +1062,9 @@ where } } -impl<'de, 'd, R, E> de::Deserializer<'de> for ElementDeserializer<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> de::Deserializer<'de> for ElementDeserializer<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -1167,10 +1152,9 @@ where } } -impl<'de, 'd, R, E> de::EnumAccess<'de> for ElementDeserializer<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> de::EnumAccess<'de> for ElementDeserializer<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; type Variant = Self; @@ -1184,10 +1168,9 @@ where } } -impl<'de, 'd, R, E> de::VariantAccess<'de> for ElementDeserializer<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> de::VariantAccess<'de> for ElementDeserializer<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; diff --git a/src/de/mod.rs b/src/de/mod.rs index de2206a78..3ccd37d17 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -125,8 +125,8 @@ //! //! //! -//! [text]: Event::Text -//! [CDATA]: Event::CData +//! [text]: crate::events::Event::Text +//! [CDATA]: crate::events::Event::CData //! //! //! @@ -2098,25 +2098,21 @@ macro_rules! 
deserialize_primitives { mod attributes; mod key; mod map; -mod resolver; mod simple_type; mod text; mod var; pub use self::attributes::AttributesDeserializer; -pub use self::resolver::{EntityResolver, PredefinedEntityResolver}; pub use self::simple_type::SimpleTypeDeserializer; pub use crate::errors::serialize::DeError; -use crate::XmlVersion; use crate::{ de::map::ElementMapAccess, encoding::Decoder, errors::Error, - escape::{parse_number, EscapeError}, - events::{BytesCData, BytesEnd, BytesRef, BytesStart, BytesText, Event}, + events::{BytesCData, BytesEnd, BytesStart, BytesText}, name::QName, - reader::NsReader, + reader::{EntityResolverFactory, NsReader, PredefinedEntityResolver, XmlEvent, XmlReader}, }; use serde::de::{ self, Deserialize, DeserializeOwned, DeserializeSeed, IntoDeserializer, SeqAccess, Visitor, @@ -2148,17 +2144,17 @@ const fn is_non_whitespace(ch: char) -> bool { /// Internally text is stored in `Cow`. Cloning of text is cheap while it /// is borrowed and makes copies of data when it is owned. /// -/// [`Text`]: Event::Text -/// [`CData`]: Event::CData -/// [`Comment`]: Event::Comment -/// [`PI`]: Event::PI +/// [`Text`]: crate::events::Event::Text +/// [`CData`]: crate::events::Event::CData +/// [`Comment`]: crate::events::Event::Comment +/// [`PI`]: crate::events::Event::PI #[derive(Clone, Debug, PartialEq, Eq)] pub struct Text<'a> { /// Untrimmed text after concatenating content of all /// [`Text`] and [`CData`] events /// - /// [`Text`]: Event::Text - /// [`CData`]: Event::CData + /// [`Text`]: crate::events::Event::Text + /// [`CData`]: crate::events::Event::CData text: Cow<'a, str>, /// A range into `text` which contains data after trimming content: Range, @@ -2270,10 +2266,10 @@ pub enum DeEvent<'a> { /// events. _Consequent_ means that events should follow each other or be /// delimited only by (any count of) [`Comment`] or [`PI`] events. 
/// - /// [`Text`]: Event::Text - /// [`CData`]: Event::CData - /// [`Comment`]: Event::Comment - /// [`PI`]: Event::PI + /// [`Text`]: crate::events::Event::Text + /// [`CData`]: crate::events::Event::CData + /// [`Comment`]: crate::events::Event::Comment + /// [`PI`]: crate::events::Event::PI Text(Text<'a>), /// End of XML document. Eof, @@ -2290,8 +2286,8 @@ pub enum DeEvent<'a> { /// end spaces we should lookahead by one deserializer event (i. e. skip all /// comments and processing instructions). /// -/// [`Text`]: Event::Text -/// [`CData`]: Event::CData +/// [`Text`]: crate::events::Event::Text +/// [`CData`]: crate::events::Event::CData #[derive(Clone, Debug, PartialEq, Eq)] pub enum PayloadEvent<'a> { /// Start tag (with attributes) ``. @@ -2302,60 +2298,30 @@ pub enum PayloadEvent<'a> { Text(BytesText<'a>), /// Unescaped character data stored in ``. CData(BytesCData<'a>), - /// Document type definition data (DTD) stored in ``. - DocType(BytesText<'a>), - /// Reference `&ref;` in the textual data. - GeneralRef(BytesRef<'a>), /// End of XML document. Eof, } -impl<'a> PayloadEvent<'a> { - /// Ensures that all data is owned to extend the object's lifetime if necessary. - #[inline] - fn into_owned(self) -> PayloadEvent<'static> { - match self { - PayloadEvent::Start(e) => PayloadEvent::Start(e.into_owned()), - PayloadEvent::End(e) => PayloadEvent::End(e.into_owned()), - PayloadEvent::Text(e) => PayloadEvent::Text(e.into_owned()), - PayloadEvent::CData(e) => PayloadEvent::CData(e.into_owned()), - PayloadEvent::DocType(e) => PayloadEvent::DocType(e.into_owned()), - PayloadEvent::GeneralRef(e) => PayloadEvent::GeneralRef(e.into_owned()), - PayloadEvent::Eof => PayloadEvent::Eof, - } - } -} - /// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s. /// [`PayloadEvent::Text`] events, that followed by any event except /// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end. 
-struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolver> { +struct LookaheadReader<'i, 'e, EF: EntityResolverFactory<'i> = PredefinedEntityResolver> { /// A source of low-level XML events - reader: R, + reader: XmlReader<'i, 'e, EF>, /// Intermediate event, that could be returned by the next call to `next()`. /// If that is the `Text` event then leading spaces already trimmed, but /// trailing spaces is not. Before the event will be returned, trimming of /// the spaces could be necessary lookahead: Result, DeError>, - - /// Used to resolve unknown entities that would otherwise cause the parser - /// to return an [`EscapeError::UnrecognizedEntity`] error. - /// - /// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity - entity_resolver: E, } -impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { - fn new(mut reader: R, entity_resolver: E) -> Self { +impl<'i, 'e, EF: EntityResolverFactory<'i>> LookaheadReader<'i, 'e, EF> { + fn new(mut reader: XmlReader<'i, 'e, EF>) -> Self { // Lookahead by one event immediately, so we do not need to check in the // loop if we need lookahead or not let lookahead = reader.next(); - Self { - reader, - lookahead, - entity_resolver, - } + Self { reader, lookahead } } /// Returns `true` if all events was consumed @@ -2375,7 +2341,7 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { // If next event is a text or CDATA, we should not trim trailing spaces !matches!( self.lookahead, - Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_) | PayloadEvent::GeneralRef(_)) + Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_)) ) } @@ -2398,10 +2364,9 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { PayloadEvent::CData(e) => result .to_mut() .push_str(&e.xml_content(self.reader.xml_version())?), - PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?, - // SAFETY: current_event_is_last_text checks that event is Text, 
CData or GeneralRef - _ => unreachable!("Only `Text`, `CData` or `GeneralRef` events can come here"), + // SAFETY: current_event_is_last_text checks that event is Text or CData + _ => unreachable!("Only `Text` or `CData` events can come here"), } } Ok(DeEvent::Text(Text::new(result))) @@ -2417,38 +2382,11 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { PayloadEvent::CData(e) => { self.drain_text(e.xml_content(self.reader.xml_version())?) } - PayloadEvent::DocType(e) => { - self.entity_resolver - .capture(e) - .map_err(|err| DeError::Custom(format!("cannot parse DTD: {}", err)))?; - continue; - } - PayloadEvent::GeneralRef(e) => { - let mut text = String::new(); - self.resolve_reference(&mut text, e)?; - self.drain_text(text.into()) - } PayloadEvent::Eof => Ok(DeEvent::Eof), }; } } - fn resolve_reference(&mut self, result: &mut String, event: BytesRef) -> Result<(), DeError> { - let len = event.len(); - let reference = self.decoder().decode(&event)?; - - if let Some(num) = reference.strip_prefix('#') { - let codepoint = parse_number(num).map_err(EscapeError::InvalidCharRef)?; - result.push_str(codepoint.encode_utf8(&mut [0u8; 4])); - return Ok(()); - } - if let Some(value) = self.entity_resolver.resolve(reference.as_ref()) { - result.push_str(value); - return Ok(()); - } - Err(EscapeError::UnrecognizedEntity(0..len, reference.to_string()).into()) - } - #[inline] fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { match self.lookahead { @@ -2517,12 +2455,9 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// /// A structure that deserializes XML into Rust values. 
-pub struct Deserializer<'de, R, E: EntityResolver = PredefinedEntityResolver> -where - R: XmlRead<'de>, -{ +pub struct Deserializer<'de, 'e, EF: EntityResolverFactory<'de> = PredefinedEntityResolver> { /// An XML reader that streams events into this deserializer - reader: XmlReader<'de, R, E>, + reader: LookaheadReader<'de, 'e, EF>, /// When deserializing sequences sometimes we have to skip unwanted events. /// That events should be stored and then replayed. This is a replay buffer, @@ -2555,20 +2490,71 @@ where key_buf: String, } -impl<'de, R, E> Deserializer<'de, R, E> +impl<'de, 'e, EF> Deserializer<'de, 'e, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { + /// Create a new deserializer that will borrow data from the specified string + /// and use the specified entity resolver. + pub fn from_str_with_resolver(source: &'de str, entity_resolver_factory: EF) -> Self { + Self::borrowing_with_resolver(NsReader::from_str(source), entity_resolver_factory) + } + + /// Create a new deserializer that will borrow data from the specified preconfigured + /// reader and use the specified entity resolver. + /// + /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. + /// + /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements + pub fn borrowing_with_resolver( + mut reader: NsReader<&'de [u8]>, + entity_resolver_factory: EF, + ) -> Self { + let config = reader.config_mut(); + config.expand_empty_elements = true; + + Self::new(XmlReader::borrowed_ns(reader, entity_resolver_factory)) + } + + /// Create a new deserializer that will copy data from the specified reader + /// into internal buffer and use the specified entity resolver. + /// + /// If you already have a string use [`Self::from_str`] instead, because it + /// will borrow instead of copy. If you have `&[u8]` which is known to represent + /// UTF-8, you can decode it first before using [`from_str`]. 
+ pub fn with_resolver(reader: R, entity_resolver_factory: EF) -> Self + where + R: BufRead + 'de, + { + let boxed: Box = Box::new(reader); + Self::buffering_with_resolver(NsReader::from_reader(boxed), entity_resolver_factory) + } + + /// Create new deserializer that will copy data from the specified preconfigured reader + /// into internal buffer and use the specified entity resolver. + /// + /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. + /// + /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements + pub fn buffering_with_resolver( + mut reader: NsReader>, + entity_resolver_factory: EF, + ) -> Self { + let config = reader.config_mut(); + config.expand_empty_elements = true; + + Self::new(XmlReader::buffered_ns(reader, entity_resolver_factory)) + } + /// Create an XML deserializer from one of the possible quick_xml input sources. /// /// Typically it is more convenient to use one of these methods instead: /// /// - [`Deserializer::from_str`] /// - [`Deserializer::from_reader`] - fn new(reader: R, entity_resolver: E) -> Self { + fn new(reader: XmlReader<'de, 'e, EF>) -> Self { Self { - reader: XmlReader::new(reader, entity_resolver), + reader: LookaheadReader::new(reader), #[cfg(feature = "overlapped-lists")] read: VecDeque::new(), @@ -2604,7 +2590,7 @@ where /// # use pretty_assertions::assert_eq; /// use serde::Deserialize; /// use quick_xml::de::Deserializer; - /// use quick_xml::NsReader; + /// use quick_xml::reader::XmlReader; /// /// #[derive(Deserialize)] /// struct SomeStruct { @@ -2621,12 +2607,12 @@ where /// let err = SomeStruct::deserialize(&mut de); /// assert!(err.is_err()); /// - /// let reader: &NsReader<_> = de.get_ref().get_ref(); + /// let reader: &XmlReader<_> = de.get_ref(); /// /// assert_eq!(reader.error_position(), 28); /// assert_eq!(reader.buffer_position(), 41); /// ``` - pub const fn get_ref(&self) -> &R { + pub const fn get_ref(&self) -> &XmlReader<'de, 'e, EF> { 
&self.reader.reader } @@ -2876,8 +2862,8 @@ where /// |[`DeEvent::Text`] |`text content` or `` (probably mixed)|Returns event content unchanged, expects the `` after that /// |[`DeEvent::Eof`] | |Emits [`InvalidXml(IllFormed(MissingEndTag))`](DeError::InvalidXml) /// - /// [`Text`]: Event::Text - /// [`CData`]: Event::CData + /// [`Text`]: crate::events::Event::Text + /// [`CData`]: crate::events::Event::CData fn read_string_impl(&mut self, allow_start: bool) -> Result, DeError> { match self.next()? { // Reached by doc tests only: this file, lines 979 and 996 @@ -3006,7 +2992,7 @@ where } } -impl<'de> Deserializer<'de, SliceReader<'de>> { +impl<'de, 'e> Deserializer<'de, 'e> { /// Create a new deserializer that will borrow data from the specified string. /// /// Deserializer created with this method will not resolve custom entities. @@ -3053,42 +3039,7 @@ impl<'de> Deserializer<'de, SliceReader<'de>> { pub fn borrowing(reader: NsReader<&'de [u8]>) -> Self { Self::borrowing_with_resolver(reader, PredefinedEntityResolver) } -} - -impl<'de, E> Deserializer<'de, SliceReader<'de>, E> -where - E: EntityResolver, -{ - /// Create a new deserializer that will borrow data from the specified string - /// and use the specified entity resolver. - pub fn from_str_with_resolver(source: &'de str, entity_resolver: E) -> Self { - Self::borrowing_with_resolver(NsReader::from_str(source), entity_resolver) - } - - /// Create a new deserializer that will borrow data from the specified preconfigured - /// reader and use the specified entity resolver. - /// - /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. 
- /// - /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements - pub fn borrowing_with_resolver(mut reader: NsReader<&'de [u8]>, entity_resolver: E) -> Self { - let config = reader.config_mut(); - config.expand_empty_elements = true; - - Self::new( - SliceReader { - reader, - version: XmlVersion::V1_0, - }, - entity_resolver, - ) - } -} -impl<'de, R> Deserializer<'de, IoReader> -where - R: BufRead, -{ /// Create a new deserializer that will copy data from the specified reader /// into internal buffer. /// @@ -3097,7 +3048,10 @@ where /// UTF-8, you can decode it first before using [`from_str`]. /// /// Deserializer created with this method will not resolve custom entities. - pub fn from_reader(reader: R) -> Self { + pub fn from_reader(reader: R) -> Self + where + R: BufRead + 'de, + { Self::with_resolver(reader, PredefinedEntityResolver) } @@ -3115,18 +3069,22 @@ where /// # use quick_xml::de::Deserializer; /// # use quick_xml::NsReader; /// # use serde::Deserialize; - /// # + /// use std::io::{BufRead, Cursor}; + /// /// #[derive(Deserialize, PartialEq, Debug)] /// struct Object { /// tag: String, /// } /// - /// let mut reader = NsReader::from_str(" test "); + /// let boxed: Box = Box::new(Cursor::new(" test ")); + /// let mut reader = NsReader::from_reader(boxed); /// - /// let mut de = Deserializer::buffering(reader.clone()); + /// let mut de = Deserializer::buffering(reader); /// let obj = Object::deserialize(&mut de).unwrap(); /// assert_eq!(obj, Object { tag: " test ".to_string() }); /// + /// let boxed: Box = Box::new(Cursor::new(" test ")); + /// let mut reader = NsReader::from_reader(boxed); /// reader.config_mut().trim_text(true); /// /// let mut de = Deserializer::buffering(reader); @@ -3136,62 +3094,14 @@ where /// /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements #[inline] - pub fn buffering(reader: NsReader) -> Self { + pub fn buffering(reader: NsReader>) -> Self { 
Self::buffering_with_resolver(reader, PredefinedEntityResolver) } } -impl<'de, R, E> Deserializer<'de, IoReader, E> +impl<'de, 'e, EF> de::Deserializer<'de> for &mut Deserializer<'de, 'e, EF> where - R: BufRead, - E: EntityResolver, -{ - /// Create a new deserializer that will copy data from the specified reader - /// into internal buffer and use the specified entity resolver. - /// - /// If you already have a string use [`Self::from_str`] instead, because it - /// will borrow instead of copy. If you have `&[u8]` which is known to represent - /// UTF-8, you can decode it first before using [`from_str`]. - pub fn with_resolver(reader: R, entity_resolver: E) -> Self { - let mut reader = NsReader::from_reader(reader); - let config = reader.config_mut(); - config.expand_empty_elements = true; - - Self::new( - IoReader { - reader, - buf: Vec::new(), - version: XmlVersion::V1_0, - }, - entity_resolver, - ) - } - - /// Create new deserializer that will copy data from the specified preconfigured reader - /// into internal buffer and use the specified entity resolver. - /// - /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. - /// - /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements - pub fn buffering_with_resolver(mut reader: NsReader, entity_resolver: E) -> Self { - let config = reader.config_mut(); - config.expand_empty_elements = true; - - Self::new( - IoReader { - reader, - buf: Vec::new(), - version: XmlVersion::V1_0, - }, - entity_resolver, - ) - } -} - -impl<'de, R, E> de::Deserializer<'de> for &mut Deserializer<'de, R, E> -where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -3328,10 +3238,9 @@ where /// /// Technically, multiple top-level elements violates XML rule of only one top-level /// element, but we consider this as several concatenated XML documents. 
-impl<'de, R, E> SeqAccess<'de> for &mut Deserializer<'de, R, E> +impl<'de, 'e, EF> SeqAccess<'de> for &mut Deserializer<'de, 'e, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -3355,10 +3264,9 @@ where } } -impl<'de, R, E> IntoDeserializer<'de, DeError> for &mut Deserializer<'de, R, E> +impl<'de, 'e, EF> IntoDeserializer<'de, DeError> for &mut Deserializer<'de, 'e, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Deserializer = Self; @@ -3370,222 +3278,39 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Converts raw reader's event into a payload event. -/// Returns `None`, if event should be skipped. -#[inline(always)] -fn skip_uninterested<'a>(event: Event<'a>) -> Option> { - let event = match event { - Event::DocType(e) => PayloadEvent::DocType(e), - Event::Start(e) => PayloadEvent::Start(e), - Event::End(e) => PayloadEvent::End(e), - Event::Eof => PayloadEvent::Eof, - - // Do not trim next text event after Text, CDATA or reference event - Event::CData(e) => PayloadEvent::CData(e), - Event::Text(e) => PayloadEvent::Text(e), - Event::GeneralRef(e) => PayloadEvent::GeneralRef(e), - - _ => return None, - }; - Some(event) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Trait used by the deserializer for iterating over input. This is manually -/// "specialized" for iterating over `&[u8]`. -/// -/// You do not need to implement this trait, it is needed to abstract from -/// [borrowing](SliceReader) and [copying](IoReader) data sources and reuse code in -/// deserializer -pub trait XmlRead<'i> { - /// Return an input-borrowing event. - fn next(&mut self) -> Result, DeError>; - - /// Skips until end element is found. Unlike `next()` it will not allocate - /// when it cannot satisfy the lifetime. 
- fn read_to_end(&mut self, name: QName) -> Result<(), DeError>; - - /// Return an XML version of the source. - fn xml_version(&self) -> XmlVersion; - - /// A copy of the reader's decoder used to decode strings. - fn decoder(&self) -> Decoder; - - /// Checks if the `start` tag has a [`xsi:nil`] attribute. This method ignores - /// any errors in attributes. - /// - /// [`xsi:nil`]: https://www.w3.org/TR/xmlschema-1/#xsi_nil - fn has_nil_attr(&self, start: &BytesStart) -> bool; -} - -/// XML input source that reads from a std::io input stream. -/// -/// You cannot create it, it is created automatically when you call -/// [`Deserializer::from_reader`] -pub struct IoReader { - reader: NsReader, - buf: Vec, - version: XmlVersion, -} - -impl IoReader { - /// Returns the underlying XML reader. - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use serde::Deserialize; - /// use std::io::Cursor; - /// use quick_xml::de::Deserializer; - /// use quick_xml::NsReader; - /// - /// #[derive(Deserialize)] - /// struct SomeStruct { - /// field1: String, - /// field2: String, - /// } - /// - /// // Try to deserialize from broken XML - /// let mut de = Deserializer::from_reader(Cursor::new( - /// "" - /// // 0 ^= 28 ^= 41 - /// )); - /// - /// let err = SomeStruct::deserialize(&mut de); - /// assert!(err.is_err()); - /// - /// let reader: &NsReader> = de.get_ref().get_ref(); - /// - /// assert_eq!(reader.error_position(), 28); - /// assert_eq!(reader.buffer_position(), 41); - /// ``` - pub const fn get_ref(&self) -> &NsReader { - &self.reader - } -} - -impl<'i, R: BufRead> XmlRead<'i> for IoReader { - fn next(&mut self) -> Result, DeError> { - loop { - self.buf.clear(); - - let event = self.reader.read_event_into(&mut self.buf)?; - if let Event::Decl(e) = &event { - self.version = e.xml_version()?; - } - if let Some(event) = skip_uninterested(event) { - return Ok(event.into_owned()); - } - } - } - - fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { - match 
self.reader.read_to_end_into(name, &mut self.buf) { - Err(e) => Err(e.into()), - Ok(_) => Ok(()), - } - } - - #[inline] - fn xml_version(&self) -> XmlVersion { - self.version - } - - #[inline] - fn decoder(&self) -> Decoder { - self.reader.decoder() - } - - fn has_nil_attr(&self, start: &BytesStart) -> bool { - start.attributes().has_nil(self.reader.resolver()) - } -} - -/// XML input source that reads from a slice of bytes and can borrow from it. -/// -/// You cannot create it, it is created automatically when you call -/// [`Deserializer::from_str`]. -pub struct SliceReader<'de> { - reader: NsReader<&'de [u8]>, - version: XmlVersion, -} - -impl<'de> SliceReader<'de> { - /// Returns the underlying XML reader. - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use serde::Deserialize; - /// use quick_xml::de::Deserializer; - /// use quick_xml::NsReader; - /// - /// #[derive(Deserialize)] - /// struct SomeStruct { - /// field1: String, - /// field2: String, - /// } - /// - /// // Try to deserialize from broken XML - /// let mut de = Deserializer::from_str( - /// "" - /// // 0 ^= 28 ^= 41 - /// ); - /// - /// let err = SomeStruct::deserialize(&mut de); - /// assert!(err.is_err()); - /// - /// let reader: &NsReader<&[u8]> = de.get_ref().get_ref(); - /// - /// assert_eq!(reader.error_position(), 28); - /// assert_eq!(reader.buffer_position(), 41); - /// ``` - pub const fn get_ref(&self) -> &NsReader<&'de [u8]> { - &self.reader - } -} - -impl<'de> XmlRead<'de> for SliceReader<'de> { +impl<'de, 'e, EF> XmlReader<'de, 'e, EF> +where + EF: EntityResolverFactory<'de>, +{ fn next(&mut self) -> Result, DeError> { loop { - let event = self.reader.read_event()?; - if let Event::Decl(e) = &event { - self.version = e.xml_version()?; - } - if let Some(event) = skip_uninterested(event) { - return Ok(event); - } - } - } - - fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { - match self.reader.read_to_end(name) { - Err(e) => Err(e.into()), - Ok(_) => Ok(()), 
+ let event = match self.read_event()? { + XmlEvent::Start(e) => PayloadEvent::Start(e), + XmlEvent::End(e) => PayloadEvent::End(e), + XmlEvent::Eof => PayloadEvent::Eof, + + // Do not trim next text event after Text or CDATA + XmlEvent::CData(e) => PayloadEvent::CData(e), + XmlEvent::Text(e) => PayloadEvent::Text(e), + + // XmlEvent::Empty is not produced, because it is expanded into Start+End + // Skip XmlEvent::PI + _ => continue, + }; + return Ok(event); } } - - #[inline] - fn xml_version(&self) -> XmlVersion { - self.version - } - - #[inline] - fn decoder(&self) -> Decoder { - self.reader.decoder() - } - - fn has_nil_attr(&self, start: &BytesStart) -> bool { - start.attributes().has_nil(self.reader.resolver()) - } } +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod tests { use super::*; use crate::errors::IllFormedError; use pretty_assertions::assert_eq; - fn make_de<'de>(source: &'de str) -> Deserializer<'de, SliceReader<'de>> { + fn make_de<'de, 'e>(source: &'de str) -> Deserializer<'de, 'e> { dbg!(source); Deserializer::from_str(source) } @@ -4150,36 +3875,6 @@ mod tests { } } - #[test] - fn borrowing_reader_parity() { - let s = r#" - Some text - - - "#; - - let mut reader1 = IoReader { - reader: NsReader::from_reader(s.as_bytes()), - buf: Vec::new(), - version: XmlVersion::V1_0, - }; - let mut reader2 = SliceReader { - reader: NsReader::from_str(s), - version: XmlVersion::V1_0, - }; - - loop { - let event1 = reader1.next().unwrap(); - let event2 = reader2.next().unwrap(); - - if let (PayloadEvent::Eof, PayloadEvent::Eof) = (&event1, &event2) { - break; - } - - assert_eq!(event1, event2); - } - } - #[test] fn borrowing_reader_events() { let s = r#" @@ -4189,13 +3884,10 @@ mod tests { "#; - let mut reader = SliceReader { - reader: NsReader::from_str(s), - version: XmlVersion::V1_0, - }; + let mut reader = NsReader::from_str(s); + reader.config_mut().expand_empty_elements = true; - let
config = reader.reader.config_mut(); - config.expand_empty_elements = true; + let mut reader = XmlReader::borrowed_ns(reader, PredefinedEntityResolver); let mut events = Vec::new(); diff --git a/src/de/resolver.rs b/src/de/resolver.rs deleted file mode 100644 index 5efc0117c..000000000 --- a/src/de/resolver.rs +++ /dev/null @@ -1,115 +0,0 @@ -//! Entity resolver module - -use std::convert::Infallible; -use std::error::Error; - -use crate::escape::resolve_predefined_entity; -use crate::events::BytesText; - -/// Used to resolve unknown entities while parsing -/// -/// # Example -/// -/// ``` -/// # use serde::Deserialize; -/// # use pretty_assertions::assert_eq; -/// use regex::bytes::Regex; -/// use std::collections::BTreeMap; -/// use std::string::FromUtf8Error; -/// use quick_xml::de::{Deserializer, EntityResolver}; -/// use quick_xml::events::BytesText; -/// -/// struct DocTypeEntityResolver { -/// re: Regex, -/// map: BTreeMap, -/// } -/// -/// impl Default for DocTypeEntityResolver { -/// fn default() -> Self { -/// Self { -/// // We do not focus on true parsing in this example -/// // You should use special libraries to parse DTD -/// re: Regex::new(r#""#).unwrap(), -/// map: BTreeMap::new(), -/// } -/// } -/// } -/// -/// impl EntityResolver for DocTypeEntityResolver { -/// type Error = FromUtf8Error; -/// -/// fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> { -/// for cap in self.re.captures_iter(&doctype) { -/// self.map.insert( -/// String::from_utf8(cap[1].to_vec())?, -/// String::from_utf8(cap[2].to_vec())?, -/// ); -/// } -/// Ok(()) -/// } -/// -/// fn resolve(&self, entity: &str) -> Option<&str> { -/// self.map.get(entity).map(|s| s.as_str()) -/// } -/// } -/// -/// let xml_reader = br#" -/// ]> -/// -/// &e1; -/// -/// "#.as_ref(); -/// -/// let mut de = Deserializer::with_resolver( -/// xml_reader, -/// DocTypeEntityResolver::default(), -/// ); -/// let data: BTreeMap = BTreeMap::deserialize(&mut de).unwrap(); -/// -/// 
assert_eq!(data.get("entity_one"), Some(&"entity 1".to_string())); -/// ``` -pub trait EntityResolver { - /// The error type that represents DTD parse error - type Error: Error; - - /// Called on contents of [`Event::DocType`] to capture declared entities. - /// Can be called multiple times, for each parsed `` declaration. - /// - /// [`Event::DocType`]: crate::events::Event::DocType - fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error>; - - /// Called when an entity needs to be resolved. - /// - /// `None` is returned if a suitable value can not be found. - /// In that case an [`EscapeError::UnrecognizedEntity`] will be returned by - /// a deserializer. - /// - /// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity - fn resolve(&self, entity: &str) -> Option<&str>; -} - -/// An [`EntityResolver`] that resolves only predefined entities: -/// -/// | Entity | Resolution -/// |--------|------------ -/// |`<` | `<` -/// |`>` | `>` -/// |`&` | `&` -/// |`'`| `'` -/// |`"`| `"` -#[derive(Default, Copy, Clone)] -pub struct PredefinedEntityResolver; - -impl EntityResolver for PredefinedEntityResolver { - type Error = Infallible; - - #[inline] - fn capture(&mut self, _doctype: BytesText) -> Result<(), Self::Error> { - Ok(()) - } - - #[inline] - fn resolve(&self, entity: &str) -> Option<&str> { - resolve_predefined_entity(entity) - } -} diff --git a/src/de/var.rs b/src/de/var.rs index e64e29f85..a7d79199a 100644 --- a/src/de/var.rs +++ b/src/de/var.rs @@ -1,40 +1,37 @@ use crate::{ de::key::QNameDeserializer, de::map::ElementMapAccess, - de::resolver::EntityResolver, de::simple_type::SimpleTypeDeserializer, - de::{DeEvent, Deserializer, XmlRead, TEXT_KEY}, + de::{DeEvent, Deserializer, TEXT_KEY}, errors::serialize::DeError, + reader::EntityResolverFactory, }; use serde::de::value::BorrowedStrDeserializer; use serde::de::{self, DeserializeSeed, Deserializer as _, Visitor}; /// An enum access -pub struct EnumAccess<'de, 'd, R, 
E> +pub struct EnumAccess<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, EF>, } -impl<'de, 'd, R, E> EnumAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> EnumAccess<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { - pub fn new(de: &'d mut Deserializer<'de, R, E>) -> Self { + pub fn new(de: &'d mut Deserializer<'de, 'e, EF>) -> Self { EnumAccess { de } } } -impl<'de, 'd, R, E> de::EnumAccess<'de> for EnumAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> de::EnumAccess<'de> for EnumAccess<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; - type Variant = VariantAccess<'de, 'd, R, E>; + type Variant = VariantAccess<'de, 'e, 'd, EF>; fn variant_seed(self, seed: V) -> Result<(V::Value, Self::Variant), Self::Error> where @@ -61,21 +58,19 @@ where } } -pub struct VariantAccess<'de, 'd, R, E> +pub struct VariantAccess<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, EF>, /// `true` if variant should be deserialized from a textual content /// and `false` if from tag is_text: bool, } -impl<'de, 'd, R, E> de::VariantAccess<'de> for VariantAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, EF> de::VariantAccess<'de> for VariantAccess<'de, 'e, 'd, EF> where - R: XmlRead<'de>, - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; diff --git a/src/errors.rs b/src/errors.rs index 9002f0477..49e18ae64 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -206,6 +206,13 @@ pub enum Error { Escape(EscapeError), /// Parsed XML has some namespace-related problems Namespace(NamespaceError), + /// The error returned by [`EntityResolver::capture`](crate::reader::EntityResolver::capture). 
+ DoctypeParse(Arc), + /// Entity reference was not resolved to the entity; [`EntityResolver::resolve`] returned `None`. + /// Contains the name of entity without `&` and `;`. + /// + /// [`EntityResolver::resolve`]: crate::reader::EntityResolver::resolve + UnrecognizedGeneralEntity(String), } impl Error { @@ -284,6 +291,8 @@ impl fmt::Display for Error { Self::Encoding(e) => e.fmt(f), Self::Escape(e) => e.fmt(f), Self::Namespace(e) => e.fmt(f), + Self::DoctypeParse(e) => write!(f, "cannot parse DTD: {}", e), + Self::UnrecognizedGeneralEntity(e) => write!(f, "unrecognized general entity `{}`", e), } } } @@ -298,6 +307,8 @@ impl std::error::Error for Error { Self::Encoding(e) => Some(e), Self::Escape(e) => Some(e), Self::Namespace(e) => Some(e), + Self::DoctypeParse(e) => Some(e), + Self::UnrecognizedGeneralEntity(_) => None, } } } diff --git a/src/lib.rs b/src/lib.rs index bd37f91d0..7a434855e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,8 @@ //! The user has to explicitly _ask_ for the next XML event, similar to a database cursor. //! This is achieved by the following two structs: //! +//! - [`XmlReader`]: A high level XML pull-reader which resolves entities and can process several +//! XML sources if you provide them. //! - [`Reader`]: A low level XML pull-reader where buffer allocation/clearing is left to user. //! - [`Writer`]: A XML writer. Can be nested with readers if you want to transform XMLs. //! @@ -27,7 +29,8 @@ //! //! # Examples //! -//! - For a reading example see [`Reader`] +//! - For a reading example see [`XmlReader`] +//! - For a low-level reading example see [`Reader`] //! - For a writing example see [`Writer`] //! //! 
# Features @@ -78,7 +81,7 @@ pub use crate::encoding::Decoder; #[cfg(feature = "serialize")] pub use crate::errors::serialize::{DeError, SeError}; pub use crate::errors::{Error, Result}; -pub use crate::reader::{NsReader, Reader}; +pub use crate::reader::{NsReader, Reader, XmlReader}; pub use crate::writer::{ElementWriter, Writer}; /// Version of XML standard diff --git a/src/reader/event.rs b/src/reader/event.rs new file mode 100644 index 000000000..893b17b49 --- /dev/null +++ b/src/reader/event.rs @@ -0,0 +1,44 @@ +use crate::events::{BytesCData, BytesEnd, BytesPI, BytesStart, BytesText}; + +/// Event emitted by [`Reader::read_event`]. +/// +/// # Lifetime +/// +/// The `'i` lifetime of this struct is the lifetime of data that may be borrowed +/// from the XML input (when reader of the main document reads from `&[u8]` or `&str`). +/// +/// [`Reader::read_event`]: crate::reader::Reader::read_event +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum Event<'i> { + /// Empty element tag (with attributes) ``. + Empty(BytesStart<'i>), + /// Start tag (with attributes) ``. + Start(BytesStart<'i>), + /// End tag ``. + End(BytesEnd<'i>), + /// Character data between `Start` and `End` element. + Text(BytesText<'i>), + /// CData ``. + CData(BytesCData<'i>), + /// Processing instruction ``. + PI(BytesPI<'i>), + /// End of XML document. + Eof, +} + +impl<'i> Event<'i> { + /// Ensures that all data is owned to extend the object's lifetime if necessary. 
+ #[inline] + pub fn into_owned(self) -> Event<'static> { + match self { + Self::Empty(e) => Event::Empty(e.into_owned()), + Self::Start(e) => Event::Start(e.into_owned()), + Self::End(e) => Event::End(e.into_owned()), + Self::Text(e) => Event::Text(e.into_owned()), + Self::CData(e) => Event::CData(e.into_owned()), + Self::PI(e) => Event::PI(e.into_owned()), + Self::Eof => Event::Eof, + } + } +} diff --git a/src/reader/mod.rs b/src/reader/mod.rs index b8a569b2f..adee92ee3 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -2,14 +2,21 @@ #[cfg(feature = "encoding")] use encoding_rs::Encoding; -use std::io; +use std::borrow::Cow; +use std::collections::VecDeque; +use std::fmt; +use std::io::{self, BufRead, Cursor}; use std::ops::Range; +use std::sync::Arc; use crate::encoding::Decoder; use crate::errors::{Error, IllFormedError, SyntaxError}; -use crate::events::{BytesRef, Event}; +use crate::escape::{parse_number, EscapeError}; +use crate::events::{BytesRef, BytesStart, BytesText, Event}; +use crate::name::{NamespaceResolver, QName}; use crate::parser::{DtdParser, ElementParser, Parser, PiParser}; use crate::reader::state::ReaderState; +use crate::XmlVersion; /// A struct that holds a parser configuration. /// @@ -250,6 +257,14 @@ impl Default for Config { //////////////////////////////////////////////////////////////////////////////////////////////////// +mod event; +mod resolver; + +pub use event::Event as XmlEvent; +pub use resolver::{ + EntityResolver, EntityResolverFactory, PredefinedEntityResolver, ReplacementText, +}; + macro_rules! 
read_event_impl { ( $self:ident, $buf:ident, @@ -1228,6 +1243,492 @@ impl BangType { //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Result of reading event by the underlying reader +enum ReadEvent<'i> { + /// Upper-level reader should skip event and request another one from the underlying reader + Skip, + Event(XmlEvent<'i>), + ExternalEntity(Reader>), +} + +impl<'i> fmt::Debug for ReadEvent<'i> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Skip => f.write_str("Skip"), + Self::Event(r) => write!(f, "Event({:?})", r), + Self::ExternalEntity(r) => write!(f, "ExternalEntity({:p})", &r), + } + } +} + +enum EntityReader<'i, 'e> { + /// Reader of internal entity, i.e. the entity defined in the same source as + /// a main document, that returns borrowed events. + InternalBorrowed(Reader<&'i [u8]>), + /// Reader of internal entity, i.e. the entity defined in the same source as + /// a main document, that returns owned events. + InternalOwned(Reader>), + /// Reader of external entity, i.e. the entity defined in the different from the main document + /// source. 
+ External(Reader>), +} + +impl<'i, 'e> fmt::Debug for EntityReader<'i, 'e> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InternalBorrowed(r) => r.fmt(f), + Self::InternalOwned(r) => write!(f, "InternalOwned({:p})", &r), + Self::External(r) => write!(f, "External({:p})", &r), + } + } +} + +impl<'i, 'e> EntityReader<'i, 'e> { + const fn config(&self) -> &Config { + match self { + Self::InternalBorrowed(r) => r.config(), + Self::InternalOwned(r) => r.config(), + Self::External(r) => r.config(), + } + } + + const fn decoder(&self) -> Decoder { + match self { + Self::InternalBorrowed(r) => r.decoder(), + Self::InternalOwned(r) => r.decoder(), + Self::External(r) => r.decoder(), + } + } + + const fn buffer_position(&self) -> u64 { + match self { + Self::InternalBorrowed(r) => r.buffer_position(), + Self::InternalOwned(r) => r.buffer_position(), + Self::External(r) => r.buffer_position(), + } + } + + const fn error_position(&self) -> u64 { + match self { + Self::InternalBorrowed(r) => r.error_position(), + Self::InternalOwned(r) => r.error_position(), + Self::External(r) => r.error_position(), + } + } + + fn read_event(&mut self, buf: &mut Vec) -> Result, Error> { + match self { + Self::InternalBorrowed(r) => r.read_event(), + Self::InternalOwned(r) => Ok(r.read_event_into(buf)?.into_owned()), + Self::External(r) => Ok(r.read_event_into(buf)?.into_owned()), + } + } + + fn read_to_end(&mut self, end: QName, buf: &mut Vec) -> Result { + match self { + EntityReader::InternalBorrowed(r) => r.read_to_end(end), + EntityReader::InternalOwned(r) => r.read_to_end_into(end, buf), + EntityReader::External(r) => r.read_to_end_into(end, buf), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// One single parse unit, for example, a file. In XML specification it is called +/// _entity_, but we avoid calling it that as it may lead to confusion. 
Under _entity_ +/// XML consumers usually mean the thing that the specification calls an "entity reference". +/// +/// Quote from the [specification]: +/// +/// > Each XML document has both a logical and a physical structure. +/// > Physically, the document is composed of units called **entities**. +/// > An entity may refer to other entities to cause their inclusion in the document. +/// > A document begins in a "root" or document entity. Logically, the document +/// > is composed of declarations, elements, comments, character references, +/// > and processing instructions, all of which are indicated in the document +/// > by explicit markup. The logical and physical structures MUST nest properly, +/// > as described in 4.3.2 Well-Formed Parsed Entities. +/// +/// [Also]: +/// > An XML document may consist of one or many storage units. These are called +/// > _entities_; they all have content and are all (except for the document entity +/// > and the external DTD subset) identified by entity **name**. +/// +/// # Lifetimes +/// The `'i` lifetime stands for "input" and is a lifetime of a document entity, +/// i.e. the source which the end-user requested to parse. +/// +/// The `'e` lifetime stands for "external" and it is a lifetime of an external source +/// which the end-user requested to parse. +/// +/// [specification]: https://www.w3.org/TR/xml11/#sec-documents +/// [Also]: https://www.w3.org/TR/xml11/#dt-entity +#[derive(Debug)] +struct StorageUnit<'i, 'e, E> { + /// Readers used to produce events from this entity. + parts: VecDeque>, + + /// Version of XML standard used by this storage unit. + version: XmlVersion, + + /// A buffer to manage namespaces + ns_resolver: NamespaceResolver, + /// We cannot pop data from the namespace stack until the returned `Empty` or `End` + /// event is processed by the user, so we only mark that we should do that + /// in the next [`Self::read_event()`] call.
+ pending_ns_pop: bool, + + /// Used to resolve unknown entities that would otherwise cause the parser + /// to return an [`Error::UnrecognizedGeneralEntity`] error. + entity_resolver: E, +} +impl<'i, 'e, E> StorageUnit<'i, 'e, E> +where + E: EntityResolver<'i>, +{ + fn new(part: EntityReader<'i, 'e>, entity_resolver: E) -> Self { + Self { + parts: VecDeque::from([part]), + version: XmlVersion::V1_0, + ns_resolver: NamespaceResolver::default(), + pending_ns_pop: false, + entity_resolver, + } + } + + fn read_event_impl(&mut self, buf: &mut Vec) -> Result, Error> { + while let Some(part) = self.parts.back_mut() { + let event = match part.read_event(buf)? { + Event::Decl(e) => { + self.version = e.xml_version()?; + ReadEvent::Skip + } + Event::Comment(_) => ReadEvent::Skip, + + Event::DocType(doctype) => { + self.entity_resolver + .capture(doctype) + .map_err(|e| Error::DoctypeParse(Arc::new(e)))?; + ReadEvent::Skip + } + Event::GeneralRef(e) => { + let reference = part.decoder().decode(&e)?; + if let Some(num) = reference.strip_prefix('#') { + let codepoint = parse_number(num).map_err(EscapeError::InvalidCharRef)?; + let mut bytes = [0u8; 4]; + let text = BytesText::wrap( + codepoint.encode_utf8(&mut bytes).as_bytes(), + Decoder::utf8(), + ); + return Ok(ReadEvent::Event(XmlEvent::Text(text.into_owned()))); + } + match self.entity_resolver.resolve(reference.as_ref()) { + Some(ReplacementText::Internal(Cow::Borrowed(entity))) => { + let mut nested = Reader::from_reader(entity); + *nested.config_mut() = part.config().clone(); + self.parts.push_back(EntityReader::InternalBorrowed(nested)); + continue; + } + Some(ReplacementText::Internal(Cow::Owned(entity))) => { + let boxed: Box = Box::new(Cursor::new(entity)); + let mut nested = Reader::from_reader(boxed); + *nested.config_mut() = part.config().clone(); + self.parts.push_back(EntityReader::InternalOwned(nested)); + continue; + } + Some(ReplacementText::External(source)) => { + let mut external = 
Reader::from_reader(source); + *external.config_mut() = part.config().clone(); + ReadEvent::ExternalEntity(external) + } + _ => return Err(Error::UnrecognizedGeneralEntity(reference.into_owned())), + } + } + + Event::Empty(e) => ReadEvent::Event(XmlEvent::Empty(e)), + Event::Start(e) => ReadEvent::Event(XmlEvent::Start(e)), + Event::End(e) => ReadEvent::Event(XmlEvent::End(e)), + Event::Text(e) => ReadEvent::Event(XmlEvent::Text(e)), + Event::CData(e) => ReadEvent::Event(XmlEvent::CData(e)), + Event::PI(e) => ReadEvent::Event(XmlEvent::PI(e)), + Event::Eof => { + self.parts.pop_back(); + continue; + } + }; + return Ok(event); + } + Ok(ReadEvent::Event(XmlEvent::Eof)) + } + + fn read_event(&mut self, buf: &mut Vec) -> Result, Error> { + self.pop(); + let event = self.read_event_impl(buf); + self.process_event(event) + } + + #[inline] + fn pop(&mut self) { + if self.pending_ns_pop { + self.ns_resolver.pop(); + self.pending_ns_pop = false; + } + } + + #[inline] + fn process_event( + &mut self, + event: Result, Error>, + ) -> Result, Error> { + match event { + Ok(ReadEvent::Event(XmlEvent::Start(e))) => { + self.ns_resolver.push(&e)?; + Ok(ReadEvent::Event(XmlEvent::Start(e))) + } + Ok(ReadEvent::Event(XmlEvent::Empty(e))) => { + self.ns_resolver.push(&e)?; + // notify next `read_event()` invocation that it needs to pop this + // namespace scope + self.pending_ns_pop = true; + Ok(ReadEvent::Event(XmlEvent::Empty(e))) + } + Ok(ReadEvent::Event(XmlEvent::End(e))) => { + // notify next `read_event()` invocation that it needs to pop this + // namespace scope + self.pending_ns_pop = true; + Ok(ReadEvent::Event(XmlEvent::End(e))) + } + e => e, + } + } + + fn read_to_end(&mut self, end: QName, buf: &mut Vec) -> Result { + // FIXME: this is incorrect, because entity reference does not obligated + // to properly nested XML tree + if let Some(part) = self.parts.back_mut() { + part.read_to_end(end, buf)?; + // Because we found the end tag and consume it, we should pop any 
namespaces that + // was started by the Start event + self.pop(); + return Ok(true); + } + Ok(false) + } + + fn decoder(&self) -> Decoder { + match self.parts.back() { + Some(part) => part.decoder(), + // Does not matter what decoder to use when all events exhausted + None => Decoder::utf8(), + } + } + + fn has_nil_attr(&self, start: &BytesStart) -> bool { + start.attributes().has_nil(&self.ns_resolver) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// High-level XML reader which automatically resolves entity references (`&...;`) +/// and can stream events from several physical documents (storage units, called _entities_ +/// in the XML [specification]). +/// +/// # Lifetimes +/// The `'i` lifetime stands for "input" and is a lifetime of a document entity, +/// i.e. the source which the end-user requested to parse, _from which events may borrow_. +/// +/// The `'e` lifetime stands for "external" and it is a lifetime of _any_ entity, +/// that parser may parse, from which events will not borrow data. +/// +/// # Type parameter +/// `EF`: the factory of general entity resolvers. Resolvers are used to resolve unknown entities that would +/// otherwise cause the parser to return an [`Error::UnrecognizedGeneralEntity`] error. +/// +/// Note that resolvers created by the same factory are used to resolve entity references both in the initial +/// document and in any other documents loaded due to entity resolution. +/// +/// [specification]: https://www.w3.org/TR/xml11/#sec-documents +#[derive(Debug)] +pub struct XmlReader<'i, 'e, EF = PredefinedEntityResolver> +where + EF: EntityResolverFactory<'i>, +{ + /// Stack of things that represents individual storage units, such as files. + /// The first element is the initial unit, representing the document which the user + /// wants to parse; other readers are created for each resolved external entity + /// (entity, defined in another storage unit).
+ units: VecDeque>, + + /// Buffer to which external readers will read data. After reading each event + /// data is copied to the event data and buffer is cleared + buffer: Vec, + + entity_resolver_factory: EF, +} + +impl<'i, 'e, EF> XmlReader<'i, 'e, EF> +where + EF: EntityResolverFactory<'i>, +{ + fn new(part: EntityReader<'i, 'e>, mut entity_resolver_factory: EF) -> Self { + let resolver = entity_resolver_factory.new_resolver(); + Self { + units: VecDeque::from([StorageUnit::new(part, resolver)]), + buffer: Vec::new(), + entity_resolver_factory, + } + } + + /// Creates new `Reader` from low-level reader and entity resolver, which would + /// borrow event data from the source when event represent piece of original document. + /// Events from other entities (documents loaded during entity resolution) would + /// own data. + /// + /// For each resolved entity a new [`Reader`] would be created to read entity + /// data. That reader will receive a copy of configuration that would set for + /// `reader`. If `entity_resolver` returns [`ReplacementText::Internal`], then events + /// from that entity would also borrow from the source, otherwise they will + /// maintain an own copy of data. + pub fn borrowed(reader: Reader<&'i [u8]>, entity_resolver_factory: EF) -> Self { + Self::new( + EntityReader::InternalBorrowed(reader), + entity_resolver_factory, + ) + } + + /// Creates new `Reader` from low-level reader and entity resolver, where all + /// events would store its own copy of data. + /// + /// For each resolved entity a new [`Reader`] would be created to read entity + /// data. That reader will receive a copy of configuration that would set for + /// `reader`. + pub fn buffered(reader: Reader>, entity_resolver_factory: EF) -> Self { + Self::new(EntityReader::InternalOwned(reader), entity_resolver_factory) + } + + /// The same, as [`borrowed`](Self::borrowed), but creates from namespace-aware reader. + /// The state of the reader will be preserved. 
+ pub fn borrowed_ns(reader: NsReader<&'i [u8]>, mut entity_resolver_factory: EF) -> Self { + let resolver = entity_resolver_factory.new_resolver(); + Self { + units: VecDeque::from([reader.to_borrowed_storage_unit(resolver)]), + buffer: Vec::new(), + entity_resolver_factory, + } + } + + /// The same, as [`buffered`](Self::buffered), but creates from namespace-aware reader. + /// The state of the reader will be preserved. + pub fn buffered_ns( + reader: NsReader>, + mut entity_resolver_factory: EF, + ) -> Self { + let resolver = entity_resolver_factory.new_resolver(); + Self { + units: VecDeque::from([reader.to_buffered_storage_unit(resolver)]), + buffer: Vec::new(), + entity_resolver_factory, + } + } + + /// Returns event which, if possible, would borrow from the source and contains + /// a copy of data if borrowing is impossible (for example, event from another + /// document resolved by entity reference). + pub fn read_event(&mut self) -> Result, Error> { + while let Some(unit) = self.units.back_mut() { + self.buffer.clear(); + match unit.read_event(&mut self.buffer)? { + ReadEvent::ExternalEntity(reader) => { + self.units.push_back(StorageUnit::new( + EntityReader::External(reader), + self.entity_resolver_factory.new_resolver(), + )); + continue; + } + ReadEvent::Event(XmlEvent::Eof) => { + self.units.pop_back(); + continue; + } + ReadEvent::Event(event) => return Ok(event), + _ => continue, + } + } + Ok(XmlEvent::Eof) + } + + /// Returns a storage of namespace bindings associated with this reader. + /// + /// Note, that this object may change after reading new event, if new event + /// will be from the new storage unit. That is possible only if custom + /// [`EntityResolver`] is used. + #[inline] + pub fn resolver(&self) -> &NamespaceResolver { + // SAFETY: At least one storage unit should always be there + &self.units.back().unwrap().ns_resolver + } + + /// Reads until end element is found. 
This function is supposed to be called + /// after you already read a [`Event::Start`] event. + /// + /// Unlike [`Reader::read_to_end`] this method does not return span because + /// there might not be continuos space that is occupied by the XML tree. + pub fn read_to_end(&mut self, end: QName) -> Result<(), Error> { + // FIXME: this is incorrect, because entity reference does not obligated + // to properly nested XML tree + if let Some(unit) = self.units.back_mut() { + unit.read_to_end(end, &mut self.buffer)?; + return Ok(()); + } + Err(Error::missed_end(end, Decoder::utf8())) + } + + /// Note: version can be changed after reading new event, because new event + /// could be produced from another document due to entity resolution. + pub fn xml_version(&self) -> XmlVersion { + match self.units.back() { + Some(unit) => unit.version, + // If there no units we assume default XML version + None => XmlVersion::V1_0, + } + } + + /// Note: decoder can be changed after reading new event, because new event + /// could be produced from another document due to entity resolution. + pub fn decoder(&self) -> Decoder { + match self.units.back() { + Some(unit) => unit.decoder(), + // Does not matter what decoder to use when all events exhausted + None => Decoder::utf8(), + } + } + + /// Checks if the `start` tag has a [`xsi:nil`] attribute. This method ignores + /// any errors in attributes. + /// + /// [`xsi:nil`]: https://www.w3.org/TR/xmlschema-1/#xsi_nil + pub fn has_nil_attr(&self, start: &BytesStart) -> bool { + match self.units.back() { + Some(unit) => unit.has_nil_attr(start), + None => false, + } + } +} + +/// This is an implementation for reading from a `&[u8]` as underlying byte stream. +/// This implementation supports not using an intermediate buffer as the byte slice +/// itself can be used to borrow from. +impl<'i, 'e> XmlReader<'i, 'e, PredefinedEntityResolver> { + /// Creates an XML reader from a string slice. 
+ #[allow(clippy::should_implement_trait)] + pub fn from_str(source: &'i str) -> Self { + Self::borrowed(Reader::from_str(source), PredefinedEntityResolver) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod test { /// Checks the internal implementation of the various reader methods diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 46858cc85..ceca86eb1 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -4,6 +4,7 @@ //! [qualified names]: https://www.w3.org/TR/xml-names11/#dt-qualname //! [expanded names]: https://www.w3.org/TR/xml-names11/#dt-expname +use std::collections::VecDeque; use std::fs::File; use std::io::{BufRead, BufReader}; use std::ops::Deref; @@ -12,7 +13,8 @@ use std::path::Path; use crate::errors::Result; use crate::events::{BytesText, Event}; use crate::name::{NamespaceResolver, QName, ResolveResult}; -use crate::reader::{Config, Reader, Span, XmlSource}; +use crate::reader::{Config, EntityReader, EntityResolver, Reader, Span, StorageUnit, XmlSource}; +use crate::XmlVersion; /// A low level encoding-agnostic XML event reader that performs namespace resolution. /// @@ -751,6 +753,42 @@ impl<'i> NsReader<&'i [u8]> { self.ns_resolver.pop(); Ok(result) } + + /// Converts this reader with its state into the storage unit for the [`XmlReader`](super::XmlReader). + pub(super) fn to_borrowed_storage_unit<'e, E>( + self, + entity_resolver: E, + ) -> StorageUnit<'i, 'e, E> + where + E: EntityResolver<'i>, + { + StorageUnit { + parts: VecDeque::from([EntityReader::InternalBorrowed(self.reader)]), + version: XmlVersion::V1_0, + ns_resolver: self.ns_resolver, + pending_ns_pop: self.pending_pop, + entity_resolver, + } + } +} + +impl<'i> NsReader> { + /// Converts this reader with its state into the storage unit for the [`XmlReader`](super::XmlReader). 
+ pub(super) fn to_buffered_storage_unit<'e, E>( + self, + entity_resolver: E, + ) -> StorageUnit<'i, 'e, E> + where + E: EntityResolver<'i>, + { + StorageUnit { + parts: VecDeque::from([EntityReader::InternalOwned(self.reader)]), + version: XmlVersion::V1_0, + ns_resolver: self.ns_resolver, + pending_ns_pop: self.pending_pop, + entity_resolver, + } + } } impl Deref for NsReader { diff --git a/src/reader/resolver.rs b/src/reader/resolver.rs new file mode 100644 index 000000000..5cfbde1df --- /dev/null +++ b/src/reader/resolver.rs @@ -0,0 +1,155 @@ +use std::borrow::Cow; +use std::convert::Infallible; +use std::error::Error; +use std::fmt; +use std::io::BufRead; + +use crate::events::BytesText; +use crate::utils::Bytes; + +/// [Replacement text] of the resolved entity reference (`&...;`). +/// +/// [Replacement text]: https://www.w3.org/TR/xml11/#dt-repltext +pub enum ReplacementText<'i, 'e> { + /// Referenced entity inside the same document in the internal DTD. + Internal(Cow<'i, [u8]>), + /// Referenced entity inside the other document which will be read from + /// the specified source. + External(Box), +} +impl<'i, 'e> fmt::Debug for ReplacementText<'i, 'e> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Internal(e) => Bytes(e).fmt(f), + Self::External(e) => write!(f, "", &e), + } + } +} + +/// Used to create entity resolver for each physical document (storage unit or an _[entity]_) +/// that would be parsed by the reader. +/// +/// [entity]: https://www.w3.org/TR/xml11/#sec-documents +pub trait EntityResolverFactory<'i> { + /// The error type that represents DTD parse error. + type CaptureError: Error + 'static; + /// Type that holds state for each entity, for example, for each file, which + /// forms the whole logical structure of the XML document. + type Resolver: EntityResolver<'i, CaptureError = Self::CaptureError>; + + /// Creates state for the new entity parser. 
+ fn new_resolver(&mut self) -> Self::Resolver; +} + +/// Used to resolve unknown [general entities] (`&...;`) while parsing. +/// +/// Note, that this trait is not used to resolve _[parameter entities]_ (`%...;`), they are resolved +/// inside implementation of this trait. Parameter entities cannot be used outside of the `` +/// declaration, so no need to resolve them in the document. +/// +/// # Example +/// +/// That example is taken from the XML specification. Suppose that we have the following DTD: +/// ```xml +/// +/// +/// +/// ``` +/// Here we have two defined _internal general entities_ (`rights` and `book`), which may be used +/// everything in the document below their definition point (including the DOCTYPE declaration) and +/// one _parameter entity_ (`pub`), which may be used only inside DOCTYPE declaration below it +/// definition point. The literal values and replacement texts for those entities are: +/// +/// |Entity|Literal value |Replacement text +/// |------|-----------------------------|------------------------------------ +/// |pub |`Éditions Gallimard` |`Éditions Gallimard` +/// |rights|`All rights reserved` |`All rights reserved` +/// |book |`© 1947 %pub;. &rights;`|`© 1947 Éditions Gallimard. &rights;` +/// +/// Implementation of the `EntityResolver` must return the _replacement text_ from the +/// [resolve](Self::resolve) method. 
To follow XML specification, that means, that the +/// following must be done over the text that was captured in the [capture](Self::capture) method: +/// - EOLs must be normalized according to the XML version for which this resolver was created +/// - any parameter entity references should be resolved: they should be replaced by their's +/// replacement text +/// - any character references should be expanded into the corresponding characters +/// - any references to the other general entities (`&...;`) should be left as is +/// +/// If the implementation will not parse DTD and just provide values for the general entity +/// references (which usually custom resolvers will do), then just know, that any returned +/// text will be considered as a replacement text as required by the XML specification. +/// One consequence of this: if you want to have literal `<` and `&` characters in the text, +/// you should use escape form of them, either as character reference or as entity reference. +/// Otherwise they will be considered as part of the markup. +/// +/// [general entities]: https://www.w3.org/TR/xml11/#gen-entity +/// [parameter entities]: https://www.w3.org/TR/xml11/#dt-PE +pub trait EntityResolver<'i> { + /// The error type that represents DTD parse error. + type CaptureError: Error + 'static; + + /// Called on contents of [`Event::DocType`] to capture declared entities. + /// Can be called multiple times, for each parsed `` declaration. + /// + /// [`Event::DocType`]: crate::reader::Event::DocType + fn capture(&mut self, doctype: BytesText<'i>) -> Result<(), Self::CaptureError>; + + /// Called when an entity needs to be resolved. Returns entity's [replacement text]. + /// + /// `None` is returned if a suitable value can not be found. + /// In that case an [`Error::UnrecognizedGeneralEntity`] will be returned by a reader. 
+ /// + /// [replacement text]: https://www.w3.org/TR/xml11/#dt-repltext + /// [`Error::UnrecognizedGeneralEntity`]: crate::errors::Error::UnrecognizedGeneralEntity + fn resolve<'e>(&self, entity: &str) -> Option>; +} + +/// An [`EntityResolver`] that resolves only predefined entities, as defined in [specification]: +/// +/// | Entity | Resolution +/// |--------|------------ +/// |`<` | `<` (note: not `<`) +/// |`>` | `>` +/// |`&` | `&` (note: not `&`) +/// |`'`| `'` +/// |`"`| `"` +/// +/// This is the default resolver for reader and deserializer. +/// +/// [specification]: https://www.w3.org/TR/xml11/#sec-predefined-ent +#[derive(Default, Debug, Copy, Clone)] +pub struct PredefinedEntityResolver; + +impl<'i> EntityResolverFactory<'i> for PredefinedEntityResolver { + type CaptureError = Infallible; + type Resolver = Self; + + #[inline] + fn new_resolver(&mut self) -> Self::Resolver { + *self + } +} + +impl<'i> EntityResolver<'i> for PredefinedEntityResolver { + type CaptureError = Infallible; + + #[inline] + fn capture(&mut self, _doctype: BytesText<'i>) -> Result<(), Self::CaptureError> { + Ok(()) + } + + #[inline] + fn resolve<'e>(&self, entity: &str) -> Option> { + let replacement_text = match entity { + "lt" => "<", + "gt" => ">", + "amp" => "&", + "apos" => "'", + "quot" => "\"", + _ => return None, + }; + Some(ReplacementText::Internal(Cow::Borrowed( + replacement_text.as_bytes(), + ))) + } +} diff --git a/tests/serde-de-references.rs b/tests/serde-de-references.rs new file mode 100644 index 000000000..548b122b5 --- /dev/null +++ b/tests/serde-de-references.rs @@ -0,0 +1,94 @@ +use std::borrow::Cow; +use std::convert::Infallible; + +use quick_xml::de::Deserializer; +use quick_xml::events::BytesText; +use quick_xml::reader::{EntityResolver, EntityResolverFactory, ReplacementText}; + +use pretty_assertions::assert_eq; +use serde::Deserialize; + +#[derive(Clone, Copy)] +struct TestEntityResolver { + capture_called: bool, +} + +impl<'i> 
EntityResolverFactory<'i> for TestEntityResolver { + type CaptureError = Infallible; + type Resolver = Self; + + fn new_resolver(&mut self) -> Self::Resolver { + *self + } +} + +impl<'i> EntityResolver<'i> for TestEntityResolver { + type CaptureError = Infallible; + + fn capture(&mut self, _doctype: BytesText) -> Result<(), Self::CaptureError> { + self.capture_called = true; + Ok(()) + } + + fn resolve<'e>(&self, entity: &str) -> Option> { + assert!( + self.capture_called, + "`EntityResolver::capture` should be called before `EntityResolver::resolve(\"{}\")`", + entity, + ); + + match dbg!(entity) { + "text" => Some(ReplacementText::Internal(Cow::Borrowed( + b" ", + ))), + _ => Some(ReplacementText::Internal(Cow::Borrowed( + b" + &text; + + ", + ))), + } + } +} + +#[derive(Debug, PartialEq, Deserialize)] +struct Root { + child1: Child1, + child2: (), +} + +#[derive(Debug, PartialEq, Deserialize)] +struct Child1 { + #[serde(rename = "@attribute")] + attribute: String, + + #[serde(rename = "$text")] + text: String, +} + +#[test] +fn entities() { + let mut de = Deserializer::from_str_with_resolver( + " + + &entity; + ", + TestEntityResolver { + capture_called: false, + }, + ); + + let data = Root::deserialize(&mut de).unwrap(); + + de.check_eof_reached(); + assert_eq!( + data, + Root { + child1: Child1 { + attribute: "".to_string(), + text: " second text ".to_string(), + }, + child2: (), + } + ); +} diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 7f1850a3a..10a6e1be7 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -1,4 +1,3 @@ -use quick_xml::de::Deserializer; use quick_xml::utils::{ByteBuf, Bytes}; use quick_xml::DeError; @@ -1827,75 +1826,6 @@ mod borrow { } } -/// Test for entity resolver -mod resolve { - use super::*; - use pretty_assertions::assert_eq; - use quick_xml::de::EntityResolver; - use quick_xml::events::BytesText; - use std::collections::BTreeMap; - use std::convert::Infallible; - use std::iter::FromIterator; - - struct 
TestEntityResolver { - capture_called: bool, - } - - impl EntityResolver for TestEntityResolver { - type Error = Infallible; - - fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> { - self.capture_called = true; - - assert_eq!(doctype.as_ref(), br#"dict[ ]"#); - - Ok(()) - } - - fn resolve(&self, entity: &str) -> Option<&str> { - assert!( - self.capture_called, - "`EntityResolver::capture` should be called before `EntityResolver::resolve`" - ); - match entity { - "t1" => Some("test_one"), - "t2" => Some("test_two"), - _ => None, - } - } - } - - #[test] - fn resolve_custom_entity() { - let resolver = TestEntityResolver { - capture_called: false, - }; - let mut de = Deserializer::with_resolver( - br#" - ]> - - - &t1; - &t2; - non-entity - - "# - .as_ref(), - resolver, - ); - - let data: BTreeMap = BTreeMap::deserialize(&mut de).unwrap(); - assert_eq!( - data, - BTreeMap::from_iter([ - (String::from("entity_one"), String::from("test_one")), - (String::from("entity_two"), String::from("test_two")), - (String::from("entity_three"), String::from("non-entity")), - ]) - ); - } -} - /// Tests for https://github.com/tafia/quick-xml/pull/603. /// /// According to comments,