diff --git a/Cargo.toml b/Cargo.toml
index f7944de1f..1a839ca9f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -233,6 +233,11 @@ name = "serde-de-enum"
required-features = ["serialize"]
path = "tests/serde-de-enum.rs"
+[[test]]
+name = "serde-de-references"
+required-features = ["serialize"]
+path = "tests/serde-de-references.rs"
+
[[test]]
name = "serde-de-seq"
required-features = ["serialize"]
diff --git a/Changelog.md b/Changelog.md
index 56b6e81fb..c4f0bdf67 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -14,6 +14,10 @@
## Unreleased
+A new `XmlReader` type was added that automatically resolves general entity references.
+
+`quick_xml::de::resolver` was replaced by `quick_xml::resolver` module.
+
### New Features
- [#938]: Add new enumeration `XmlVersion` and typified getter `BytesDecl::xml_version()`.
@@ -33,6 +37,10 @@
Deprecated functions now behaves the same as newly added.
+- [#948]: Add `quick_xml::reader::EntityResolver` which is able to resolve external entities.
+- [#948]: Add `quick_xml::reader::XmlReader`, a new high-level reader which should be preferred
+ over the old `Reader`.
+
### Bug Fixes
- [#938]: Use correct rules for EOL normalization in `Deserializer` when parse XML 1.0 documents.
@@ -51,6 +59,7 @@
[#914]: https://github.com/tafia/quick-xml/pull/914
[#938]: https://github.com/tafia/quick-xml/pull/938
[#944]: https://github.com/tafia/quick-xml/pull/944
+[#948]: https://github.com/tafia/quick-xml/pull/948
## 0.39.2 -- 2026-02-20
diff --git a/compare/benches/low-level.rs b/compare/benches/low-level.rs
index 631fc0c68..cf475ab75 100644
--- a/compare/benches/low-level.rs
+++ b/compare/benches/low-level.rs
@@ -1,7 +1,7 @@
use criterion::{self, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use pretty_assertions::assert_eq;
use quick_xml::events::Event;
-use quick_xml::reader::Reader;
+use quick_xml::reader::{self, Reader, XmlReader};
use std::hint::black_box;
use xml::reader::{EventReader, XmlEvent};
@@ -94,6 +94,26 @@ fn low_level_comparison(c: &mut Criterion) {
},
);
+ group.bench_with_input(
+ BenchmarkId::new("quick_xml:reader", filename),
+ *data,
+ |b, input| {
+ b.iter(|| {
+ let mut reader = XmlReader::from_str(input);
+ // TODO: reader.config_mut().check_end_names = false;
+ let mut count = black_box(0);
+ loop {
+ match reader.read_event() {
+ Ok(reader::Event::Start(_)) | Ok(reader::Event::Empty(_)) => count += 1,
+ Ok(reader::Event::Eof) => break,
+ _ => (),
+ }
+ }
+ assert_eq!(count, total_tags, "Overall tag count in {}", filename);
+ })
+ },
+ );
+
group.bench_with_input(
BenchmarkId::new("maybe_xml:0.10", filename),
*data,
diff --git a/examples/high-level-entities.rs b/examples/high-level-entities.rs
new file mode 100644
index 000000000..f42c50257
--- /dev/null
+++ b/examples/high-level-entities.rs
@@ -0,0 +1,224 @@
+//! This example demonstrates how custom entities can be extracted from the DOCTYPE
+//! and shows usage of the high-level `XmlReader` API.
+//!
+//! NB: this example is deliberately kept simple:
+//! * the regex in this example is simple but brittle.
+
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::convert::Infallible;
+use std::fmt;
+use std::io::{BufRead, Cursor};
+
+use quick_xml::events::{BytesEnd, BytesStart, BytesText};
+use quick_xml::reader::{
+ EntityResolver, EntityResolverFactory, Reader, ReplacementText, XmlEvent, XmlReader,
+};
+use regex::bytes::Regex;
+
+use pretty_assertions::assert_eq;
+
+const XML1: &str = r#"
+
+" >
+ &element1; " >
+]>
+&element2;
+&external;
+"#;
+
+/// Additional document which in reality would be referenced by
+/// ``
+const XML2: &str = r#"
+
+text
+"#;
+
+struct MyResolver<'i> {
+ /// Map of captured internal _parsed general entities_. _Parsed_ means that
+ /// the value of the entity is parsed by the XML reader.
+ entities: HashMap, Cow<'i, [u8]>>,
+ /// In this example we use a simple regular expression to capture entities from the DTD.
+ /// In a real application you should use a DTD parser.
+ entity_re: Regex,
+}
+impl<'i> fmt::Debug for MyResolver<'i> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_map()
+ .entries(self.entities.iter().map(|(k, v)| {
+ (
+ std::str::from_utf8(k).unwrap(),
+ std::str::from_utf8(v).unwrap(),
+ )
+ }))
+ .finish()
+ }
+}
+
+impl<'i> MyResolver<'i> {
+ fn new() -> Result {
+ Ok(Self {
+ entities: Default::default(),
+ // Capture "name" and "content" from such string:
+ //
+ entity_re: Regex::new(r#""#)?,
+ })
+ }
+ fn capture_borrowed(&mut self, doctype: &'i [u8]) {
+ for cap in self.entity_re.captures_iter(doctype) {
+ self.entities.insert(
+ cap.get(1).unwrap().as_bytes().into(),
+ cap.get(2).unwrap().as_bytes().into(),
+ );
+ }
+ }
+ fn capture_owned(&mut self, doctype: Vec) {
+ for cap in self.entity_re.captures_iter(&doctype) {
+ self.entities.insert(
+ cap.get(1).unwrap().as_bytes().to_owned().into(),
+ cap.get(2).unwrap().as_bytes().to_owned().into(),
+ );
+ }
+ }
+}
+
+impl<'i> EntityResolverFactory<'i> for MyResolver<'i> {
+ type CaptureError = Infallible;
+ type Resolver = Self;
+
+ fn new_resolver(&mut self) -> Self::Resolver {
+ // The regex pattern is a constant known to be valid, so construction cannot fail
+ Self::new().unwrap()
+ }
+}
+
+impl<'i> EntityResolver<'i> for MyResolver<'i> {
+ type CaptureError = Infallible;
+
+ fn capture(&mut self, doctype: BytesText<'i>) -> Result<(), Self::CaptureError> {
+ dbg!(&doctype);
+ match doctype.into_inner() {
+ Cow::Borrowed(doctype) => self.capture_borrowed(doctype),
+ Cow::Owned(doctype) => self.capture_owned(doctype),
+ }
+ dbg!(self);
+ Ok(())
+ }
+
+ fn resolve<'e>(&self, entity: &str) -> Option> {
+ dbg!((entity, self));
+ if entity == "external" {
+ return Some(ReplacementText::External(Box::new(Cursor::new(
+ XML2.as_bytes(),
+ ))));
+ }
+ match self.entities.get(entity.as_bytes()) {
+ Some(replacement) => Some(ReplacementText::Internal(replacement.clone())),
+ None => None,
+ }
+ }
+}
+
+/// In this example the events will borrow from the first document
+fn borrowed() -> Result<(), Box> {
+ let mut reader = Reader::from_str(XML1);
+ reader.config_mut().trim_text(true);
+
+ let mut r = XmlReader::borrowed(reader, MyResolver::new()?);
+
+ assert_eq!(
+ r.read_event()?,
+ XmlEvent::Start(BytesStart::from_content(
+ r#"test label="Message: &text;""#,
+ 4
+ ))
+ );
+
+ //--------------------------------------------------------------------------
+ // This part was inserted into original document from entity defined in DTD
+ assert_eq!(r.read_event()?, XmlEvent::Start(BytesStart::new("a")));
+ assert_eq!(
+ r.read_event()?,
+ XmlEvent::Empty(BytesStart::from_content(
+ r#"dtd attr = 'Message: &text;'"#,
+ 3
+ ))
+ );
+ assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("a")));
+ //--------------------------------------------------------------------------
+
+ assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("test")));
+
+ //--------------------------------------------------------------------------
+ // Start of external document
+ assert_eq!(
+ r.read_event()?,
+ XmlEvent::Start(BytesStart::new("external"))
+ );
+ assert_eq!(r.read_event()?, XmlEvent::Text(BytesText::new("text")));
+ assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("external")));
+ //--------------------------------------------------------------------------
+
+ assert_eq!(r.read_event()?, XmlEvent::Eof);
+
+ Ok(())
+}
+
+/// In this example the events will always copy data
+fn buffered() -> Result<(), Box> {
+ let boxed: Box = Box::new(Cursor::new(XML1.as_bytes()));
+ let mut reader = Reader::from_reader(boxed);
+ reader.config_mut().trim_text(true);
+
+ let mut r = XmlReader::buffered(reader, MyResolver::new()?);
+
+ assert_eq!(
+ r.read_event()?,
+ XmlEvent::Start(BytesStart::from_content(
+ r#"test label="Message: &text;""#,
+ 4
+ ))
+ );
+
+ //--------------------------------------------------------------------------
+ // This part was inserted into original document from entity defined in DTD
+ assert_eq!(r.read_event()?, XmlEvent::Start(BytesStart::new("a")));
+ assert_eq!(
+ r.read_event()?,
+ XmlEvent::Empty(BytesStart::from_content(
+ r#"dtd attr = 'Message: &text;'"#,
+ 3
+ ))
+ );
+ assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("a")));
+ //--------------------------------------------------------------------------
+
+ assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("test")));
+
+ //--------------------------------------------------------------------------
+ // Start of external document
+ assert_eq!(
+ r.read_event()?,
+ XmlEvent::Start(BytesStart::new("external"))
+ );
+ assert_eq!(r.read_event()?, XmlEvent::Text(BytesText::new("text")));
+ assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("external")));
+ //--------------------------------------------------------------------------
+
+ assert_eq!(r.read_event()?, XmlEvent::Eof);
+
+ Ok(())
+}
+
+fn main() -> Result<(), Box> {
+ println!("{}", XML1);
+ // In this example the events will borrow from the first document
+ borrowed()?;
+
+ println!("----------------------------------------------------------------");
+ println!("{}", XML1);
+ // In this example the events will always copy data
+ buffered()?;
+ Ok(())
+}
diff --git a/examples/custom_entities.rs b/examples/low-level-entities.rs
similarity index 97%
rename from examples/custom_entities.rs
rename to examples/low-level-entities.rs
index ed8c082a2..61b5ea0bb 100644
--- a/examples/custom_entities.rs
+++ b/examples/low-level-entities.rs
@@ -16,7 +16,6 @@ use std::str::from_utf8;
use quick_xml::encoding::Decoder;
use quick_xml::errors::Error;
-use quick_xml::escape::EscapeError;
use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
use quick_xml::name::QName;
use quick_xml::reader::Reader;
@@ -107,11 +106,10 @@ impl<'i> MyReader<'i> {
}
}
- fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], EscapeError> {
+ fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], Error> {
match self.entities.get(entity) {
Some(replacement) => Ok(replacement),
- None => Err(EscapeError::UnrecognizedEntity(
- 0..0,
+ None => Err(Error::UnrecognizedGeneralEntity(
String::from_utf8_lossy(entity).into_owned(),
)),
}
diff --git a/src/de/map.rs b/src/de/map.rs
index 3d25a9411..d5b4cfb6e 100644
--- a/src/de/map.rs
+++ b/src/de/map.rs
@@ -2,15 +2,15 @@
use crate::{
de::key::QNameDeserializer,
- de::resolver::EntityResolver,
de::simple_type::SimpleTypeDeserializer,
de::text::TextDeserializer,
- de::{DeEvent, Deserializer, XmlRead, TEXT_KEY, VALUE_KEY},
+ de::{DeEvent, Deserializer, TEXT_KEY, VALUE_KEY},
errors::serialize::DeError,
errors::Error,
events::attributes::IterState,
events::BytesStart,
name::QName,
+ reader::EntityResolverFactory,
};
use serde::de::value::BorrowedStrDeserializer;
use serde::de::{self, DeserializeSeed, Deserializer as _, MapAccess, SeqAccess, Visitor};
@@ -166,14 +166,13 @@ enum ValueSource {
///
/// - `'d` lifetime represents a parent deserializer, which could own the data
/// buffer.
-pub(crate) struct ElementMapAccess<'de, 'd, R, E>
+pub(crate) struct ElementMapAccess<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
/// Tag -- owner of attributes
start: BytesStart<'de>,
- de: &'d mut Deserializer<'de, R, E>,
+ de: &'d mut Deserializer<'de, 'e, EF>,
/// State of the iterator over attributes. Contains the next position in the
/// inner `start` slice, from which next attribute should be parsed.
iter: IterState,
@@ -195,14 +194,13 @@ where
has_text_field: bool,
}
-impl<'de, 'd, R, E> ElementMapAccess<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> ElementMapAccess<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
/// Create a new ElementMapAccess
pub fn new(
- de: &'d mut Deserializer<'de, R, E>,
+ de: &'d mut Deserializer<'de, 'e, EF>,
start: BytesStart<'de>,
fields: &'static [&'static str],
) -> Self {
@@ -240,10 +238,9 @@ where
}
}
-impl<'de, 'd, R, E> MapAccess<'de> for ElementMapAccess<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> MapAccess<'de> for ElementMapAccess<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
@@ -446,14 +443,13 @@ where
///
/// [`deserialize_tuple`]: #method.deserialize_tuple
/// [`deserialize_struct`]: #method.deserialize_struct
-struct MapValueDeserializer<'de, 'd, 'm, R, E>
+struct MapValueDeserializer<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
/// Access to the map that created this deserializer. Gives access to the
/// context, such as list of fields, that current map known about.
- map: &'m mut ElementMapAccess<'de, 'd, R, E>,
+ map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>,
/// Whether this deserializer was created for deserialization from an element
/// with fixed name, or the elements with different names or even text are allowed.
///
@@ -531,10 +527,9 @@ where
fixed_name: bool,
}
-impl<'de, 'd, 'm, R, E> MapValueDeserializer<'de, 'd, 'm, R, E>
+impl<'de, 'e, 'd, 'm, EF> MapValueDeserializer<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
/// Returns a next string as concatenated content of consequent [`Text`] and
/// [`CData`] events, used inside [`deserialize_primitives!()`].
@@ -548,10 +543,9 @@ where
}
}
-impl<'de, 'd, 'm, R, E> de::Deserializer<'de> for MapValueDeserializer<'de, 'd, 'm, R, E>
+impl<'de, 'e, 'd, 'm, EF> de::Deserializer<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
@@ -685,13 +679,12 @@ where
}
}
-impl<'de, 'd, 'm, R, E> de::EnumAccess<'de> for MapValueDeserializer<'de, 'd, 'm, R, E>
+impl<'de, 'e, 'd, 'm, EF> de::EnumAccess<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
- type Variant = MapValueVariantAccess<'de, 'd, 'm, R, E>;
+ type Variant = MapValueVariantAccess<'de, 'e, 'd, 'm, EF>;
fn variant_seed(self, seed: V) -> Result<(V::Value, Self::Variant), Self::Error>
where
@@ -716,23 +709,21 @@ where
}
}
-struct MapValueVariantAccess<'de, 'd, 'm, R, E>
+struct MapValueVariantAccess<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
/// Access to the map that created this enum accessor. Gives access to the
/// context, such as list of fields, that current map known about.
- map: &'m mut ElementMapAccess<'de, 'd, R, E>,
+ map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>,
/// `true` if variant should be deserialized from a textual content
/// and `false` if from tag
is_text: bool,
}
-impl<'de, 'd, 'm, R, E> de::VariantAccess<'de> for MapValueVariantAccess<'de, 'd, 'm, R, E>
+impl<'de, 'e, 'd, 'm, EF> de::VariantAccess<'de> for MapValueVariantAccess<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
@@ -907,14 +898,13 @@ impl<'de> TagFilter<'de> {
///
/// [`Text`]: crate::events::Event::Text
/// [`CData`]: crate::events::Event::CData
-struct MapValueSeqAccess<'de, 'd, 'm, R, E>
+struct MapValueSeqAccess<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
/// Accessor to a map that creates this accessor and to a deserializer for
/// a sequence items.
- map: &'m mut ElementMapAccess<'de, 'd, R, E>,
+ map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>,
/// Filter that determines whether a tag is a part of this sequence.
///
/// When feature [`overlapped-lists`] is not activated, iteration will stop
@@ -934,20 +924,18 @@ where
}
#[cfg(feature = "overlapped-lists")]
-impl<'de, 'd, 'm, R, E> Drop for MapValueSeqAccess<'de, 'd, 'm, R, E>
+impl<'de, 'e, 'd, 'm, EF> Drop for MapValueSeqAccess<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
fn drop(&mut self) {
self.map.de.start_replay(self.checkpoint);
}
}
-impl<'de, 'd, 'm, R, E> SeqAccess<'de> for MapValueSeqAccess<'de, 'd, 'm, R, E>
+impl<'de, 'e, 'd, 'm, EF> SeqAccess<'de> for MapValueSeqAccess<'de, 'e, 'd, 'm, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
@@ -1051,19 +1039,17 @@ where
/// [specification]: https://www.w3.org/TR/xmlschema11-2/#boolean
/// [`deserialize_tuple`]: #method.deserialize_tuple
/// [`deserialize_struct`]: #method.deserialize_struct
-struct ElementDeserializer<'de, 'd, R, E>
+struct ElementDeserializer<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
start: BytesStart<'de>,
- de: &'d mut Deserializer<'de, R, E>,
+ de: &'d mut Deserializer<'de, 'e, EF>,
}
-impl<'de, 'd, R, E> ElementDeserializer<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> ElementDeserializer<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
/// Returns a next string as concatenated content of consequent [`Text`] and
/// [`CData`] events, used inside [`deserialize_primitives!()`].
@@ -1076,10 +1062,9 @@ where
}
}
-impl<'de, 'd, R, E> de::Deserializer<'de> for ElementDeserializer<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> de::Deserializer<'de> for ElementDeserializer<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
@@ -1167,10 +1152,9 @@ where
}
}
-impl<'de, 'd, R, E> de::EnumAccess<'de> for ElementDeserializer<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> de::EnumAccess<'de> for ElementDeserializer<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
type Variant = Self;
@@ -1184,10 +1168,9 @@ where
}
}
-impl<'de, 'd, R, E> de::VariantAccess<'de> for ElementDeserializer<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> de::VariantAccess<'de> for ElementDeserializer<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
diff --git a/src/de/mod.rs b/src/de/mod.rs
index de2206a78..3ccd37d17 100644
--- a/src/de/mod.rs
+++ b/src/de/mod.rs
@@ -125,8 +125,8 @@
//!
//!
//!
-//! [text]: Event::Text
-//! [CDATA]: Event::CData
+//! [text]: crate::events::Event::Text
+//! [CDATA]: crate::events::Event::CData
//!
//!
//!
@@ -2098,25 +2098,21 @@ macro_rules! deserialize_primitives {
mod attributes;
mod key;
mod map;
-mod resolver;
mod simple_type;
mod text;
mod var;
pub use self::attributes::AttributesDeserializer;
-pub use self::resolver::{EntityResolver, PredefinedEntityResolver};
pub use self::simple_type::SimpleTypeDeserializer;
pub use crate::errors::serialize::DeError;
-use crate::XmlVersion;
use crate::{
de::map::ElementMapAccess,
encoding::Decoder,
errors::Error,
- escape::{parse_number, EscapeError},
- events::{BytesCData, BytesEnd, BytesRef, BytesStart, BytesText, Event},
+ events::{BytesCData, BytesEnd, BytesStart, BytesText},
name::QName,
- reader::NsReader,
+ reader::{EntityResolverFactory, NsReader, PredefinedEntityResolver, XmlEvent, XmlReader},
};
use serde::de::{
self, Deserialize, DeserializeOwned, DeserializeSeed, IntoDeserializer, SeqAccess, Visitor,
@@ -2148,17 +2144,17 @@ const fn is_non_whitespace(ch: char) -> bool {
/// Internally text is stored in `Cow`. Cloning of text is cheap while it
/// is borrowed and makes copies of data when it is owned.
///
-/// [`Text`]: Event::Text
-/// [`CData`]: Event::CData
-/// [`Comment`]: Event::Comment
-/// [`PI`]: Event::PI
+/// [`Text`]: crate::events::Event::Text
+/// [`CData`]: crate::events::Event::CData
+/// [`Comment`]: crate::events::Event::Comment
+/// [`PI`]: crate::events::Event::PI
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Text<'a> {
/// Untrimmed text after concatenating content of all
/// [`Text`] and [`CData`] events
///
- /// [`Text`]: Event::Text
- /// [`CData`]: Event::CData
+ /// [`Text`]: crate::events::Event::Text
+ /// [`CData`]: crate::events::Event::CData
text: Cow<'a, str>,
/// A range into `text` which contains data after trimming
content: Range,
@@ -2270,10 +2266,10 @@ pub enum DeEvent<'a> {
/// events. _Consequent_ means that events should follow each other or be
/// delimited only by (any count of) [`Comment`] or [`PI`] events.
///
- /// [`Text`]: Event::Text
- /// [`CData`]: Event::CData
- /// [`Comment`]: Event::Comment
- /// [`PI`]: Event::PI
+ /// [`Text`]: crate::events::Event::Text
+ /// [`CData`]: crate::events::Event::CData
+ /// [`Comment`]: crate::events::Event::Comment
+ /// [`PI`]: crate::events::Event::PI
Text(Text<'a>),
/// End of XML document.
Eof,
@@ -2290,8 +2286,8 @@ pub enum DeEvent<'a> {
/// end spaces we should lookahead by one deserializer event (i. e. skip all
/// comments and processing instructions).
///
-/// [`Text`]: Event::Text
-/// [`CData`]: Event::CData
+/// [`Text`]: crate::events::Event::Text
+/// [`CData`]: crate::events::Event::CData
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum PayloadEvent<'a> {
/// Start tag (with attributes) ``.
@@ -2302,60 +2298,30 @@ pub enum PayloadEvent<'a> {
Text(BytesText<'a>),
/// Unescaped character data stored in ``.
CData(BytesCData<'a>),
- /// Document type definition data (DTD) stored in ``.
- DocType(BytesText<'a>),
- /// Reference `&ref;` in the textual data.
- GeneralRef(BytesRef<'a>),
/// End of XML document.
Eof,
}
-impl<'a> PayloadEvent<'a> {
- /// Ensures that all data is owned to extend the object's lifetime if necessary.
- #[inline]
- fn into_owned(self) -> PayloadEvent<'static> {
- match self {
- PayloadEvent::Start(e) => PayloadEvent::Start(e.into_owned()),
- PayloadEvent::End(e) => PayloadEvent::End(e.into_owned()),
- PayloadEvent::Text(e) => PayloadEvent::Text(e.into_owned()),
- PayloadEvent::CData(e) => PayloadEvent::CData(e.into_owned()),
- PayloadEvent::DocType(e) => PayloadEvent::DocType(e.into_owned()),
- PayloadEvent::GeneralRef(e) => PayloadEvent::GeneralRef(e.into_owned()),
- PayloadEvent::Eof => PayloadEvent::Eof,
- }
- }
-}
-
/// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s.
/// [`PayloadEvent::Text`] events, that followed by any event except
/// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end.
-struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolver> {
+struct LookaheadReader<'i, 'e, EF: EntityResolverFactory<'i> = PredefinedEntityResolver> {
/// A source of low-level XML events
- reader: R,
+ reader: XmlReader<'i, 'e, EF>,
/// Intermediate event, that could be returned by the next call to `next()`.
/// If that is the `Text` event then leading spaces already trimmed, but
/// trailing spaces is not. Before the event will be returned, trimming of
/// the spaces could be necessary
lookahead: Result, DeError>,
-
- /// Used to resolve unknown entities that would otherwise cause the parser
- /// to return an [`EscapeError::UnrecognizedEntity`] error.
- ///
- /// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity
- entity_resolver: E,
}
-impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
- fn new(mut reader: R, entity_resolver: E) -> Self {
+impl<'i, 'e, EF: EntityResolverFactory<'i>> LookaheadReader<'i, 'e, EF> {
+ fn new(mut reader: XmlReader<'i, 'e, EF>) -> Self {
// Lookahead by one event immediately, so we do not need to check in the
// loop if we need lookahead or not
let lookahead = reader.next();
- Self {
- reader,
- lookahead,
- entity_resolver,
- }
+ Self { reader, lookahead }
}
/// Returns `true` if all events was consumed
@@ -2375,7 +2341,7 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
// If next event is a text or CDATA, we should not trim trailing spaces
!matches!(
self.lookahead,
- Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_) | PayloadEvent::GeneralRef(_))
+ Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_))
)
}
@@ -2398,10 +2364,9 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
PayloadEvent::CData(e) => result
.to_mut()
.push_str(&e.xml_content(self.reader.xml_version())?),
- PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?,
- // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef
- _ => unreachable!("Only `Text`, `CData` or `GeneralRef` events can come here"),
+ // SAFETY: current_event_is_last_text checks that event is Text or CData
+ _ => unreachable!("Only `Text` or `CData` events can come here"),
}
}
Ok(DeEvent::Text(Text::new(result)))
@@ -2417,38 +2382,11 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
PayloadEvent::CData(e) => {
self.drain_text(e.xml_content(self.reader.xml_version())?)
}
- PayloadEvent::DocType(e) => {
- self.entity_resolver
- .capture(e)
- .map_err(|err| DeError::Custom(format!("cannot parse DTD: {}", err)))?;
- continue;
- }
- PayloadEvent::GeneralRef(e) => {
- let mut text = String::new();
- self.resolve_reference(&mut text, e)?;
- self.drain_text(text.into())
- }
PayloadEvent::Eof => Ok(DeEvent::Eof),
};
}
}
- fn resolve_reference(&mut self, result: &mut String, event: BytesRef) -> Result<(), DeError> {
- let len = event.len();
- let reference = self.decoder().decode(&event)?;
-
- if let Some(num) = reference.strip_prefix('#') {
- let codepoint = parse_number(num).map_err(EscapeError::InvalidCharRef)?;
- result.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
- return Ok(());
- }
- if let Some(value) = self.entity_resolver.resolve(reference.as_ref()) {
- result.push_str(value);
- return Ok(());
- }
- Err(EscapeError::UnrecognizedEntity(0..len, reference.to_string()).into())
- }
-
#[inline]
fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
match self.lookahead {
@@ -2517,12 +2455,9 @@ where
////////////////////////////////////////////////////////////////////////////////////////////////////
/// A structure that deserializes XML into Rust values.
-pub struct Deserializer<'de, R, E: EntityResolver = PredefinedEntityResolver>
-where
- R: XmlRead<'de>,
-{
+pub struct Deserializer<'de, 'e, EF: EntityResolverFactory<'de> = PredefinedEntityResolver> {
/// An XML reader that streams events into this deserializer
- reader: XmlReader<'de, R, E>,
+ reader: LookaheadReader<'de, 'e, EF>,
/// When deserializing sequences sometimes we have to skip unwanted events.
/// That events should be stored and then replayed. This is a replay buffer,
@@ -2555,20 +2490,71 @@ where
key_buf: String,
}
-impl<'de, R, E> Deserializer<'de, R, E>
+impl<'de, 'e, EF> Deserializer<'de, 'e, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
+ /// Create a new deserializer that will borrow data from the specified string
+ /// and use the specified entity resolver.
+ pub fn from_str_with_resolver(source: &'de str, entity_resolver_factory: EF) -> Self {
+ Self::borrowing_with_resolver(NsReader::from_str(source), entity_resolver_factory)
+ }
+
+ /// Create a new deserializer that will borrow data from the specified preconfigured
+ /// reader and use the specified entity resolver.
+ ///
+ /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`.
+ ///
+ /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements
+ pub fn borrowing_with_resolver(
+ mut reader: NsReader<&'de [u8]>,
+ entity_resolver_factory: EF,
+ ) -> Self {
+ let config = reader.config_mut();
+ config.expand_empty_elements = true;
+
+ Self::new(XmlReader::borrowed_ns(reader, entity_resolver_factory))
+ }
+
+ /// Create a new deserializer that will copy data from the specified reader
+ /// into internal buffer and use the specified entity resolver.
+ ///
+ /// If you already have a string use [`Self::from_str`] instead, because it
+ /// will borrow instead of copying. If you have `&[u8]` which is known to represent
+ /// UTF-8, you can decode it first before using [`from_str`].
+ pub fn with_resolver(reader: R, entity_resolver_factory: EF) -> Self
+ where
+ R: BufRead + 'de,
+ {
+ let boxed: Box = Box::new(reader);
+ Self::buffering_with_resolver(NsReader::from_reader(boxed), entity_resolver_factory)
+ }
+
+ /// Create a new deserializer that will copy data from the specified preconfigured reader
+ /// into internal buffer and use the specified entity resolver.
+ ///
+ /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`.
+ ///
+ /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements
+ pub fn buffering_with_resolver(
+ mut reader: NsReader>,
+ entity_resolver_factory: EF,
+ ) -> Self {
+ let config = reader.config_mut();
+ config.expand_empty_elements = true;
+
+ Self::new(XmlReader::buffered_ns(reader, entity_resolver_factory))
+ }
+
/// Create an XML deserializer from one of the possible quick_xml input sources.
///
/// Typically it is more convenient to use one of these methods instead:
///
/// - [`Deserializer::from_str`]
/// - [`Deserializer::from_reader`]
- fn new(reader: R, entity_resolver: E) -> Self {
+ fn new(reader: XmlReader<'de, 'e, EF>) -> Self {
Self {
- reader: XmlReader::new(reader, entity_resolver),
+ reader: LookaheadReader::new(reader),
#[cfg(feature = "overlapped-lists")]
read: VecDeque::new(),
@@ -2604,7 +2590,7 @@ where
/// # use pretty_assertions::assert_eq;
/// use serde::Deserialize;
/// use quick_xml::de::Deserializer;
- /// use quick_xml::NsReader;
+ /// use quick_xml::reader::XmlReader;
///
/// #[derive(Deserialize)]
/// struct SomeStruct {
@@ -2621,12 +2607,12 @@ where
/// let err = SomeStruct::deserialize(&mut de);
/// assert!(err.is_err());
///
- /// let reader: &NsReader<_> = de.get_ref().get_ref();
+ /// let reader: &XmlReader<_> = de.get_ref();
///
/// assert_eq!(reader.error_position(), 28);
/// assert_eq!(reader.buffer_position(), 41);
/// ```
- pub const fn get_ref(&self) -> &R {
+ pub const fn get_ref(&self) -> &XmlReader<'de, 'e, EF> {
&self.reader.reader
}
@@ -2876,8 +2862,8 @@ where
/// |[`DeEvent::Text`] |`text content` or `` (probably mixed)|Returns event content unchanged, expects the `` after that
/// |[`DeEvent::Eof`] | |Emits [`InvalidXml(IllFormed(MissingEndTag))`](DeError::InvalidXml)
///
- /// [`Text`]: Event::Text
- /// [`CData`]: Event::CData
+ /// [`Text`]: crate::events::Event::Text
+ /// [`CData`]: crate::events::Event::CData
fn read_string_impl(&mut self, allow_start: bool) -> Result, DeError> {
match self.next()? {
// Reached by doc tests only: this file, lines 979 and 996
@@ -3006,7 +2992,7 @@ where
}
}
-impl<'de> Deserializer<'de, SliceReader<'de>> {
+impl<'de, 'e> Deserializer<'de, 'e> {
/// Create a new deserializer that will borrow data from the specified string.
///
/// Deserializer created with this method will not resolve custom entities.
@@ -3053,42 +3039,7 @@ impl<'de> Deserializer<'de, SliceReader<'de>> {
pub fn borrowing(reader: NsReader<&'de [u8]>) -> Self {
Self::borrowing_with_resolver(reader, PredefinedEntityResolver)
}
-}
-
-impl<'de, E> Deserializer<'de, SliceReader<'de>, E>
-where
- E: EntityResolver,
-{
- /// Create a new deserializer that will borrow data from the specified string
- /// and use the specified entity resolver.
- pub fn from_str_with_resolver(source: &'de str, entity_resolver: E) -> Self {
- Self::borrowing_with_resolver(NsReader::from_str(source), entity_resolver)
- }
-
- /// Create a new deserializer that will borrow data from the specified preconfigured
- /// reader and use the specified entity resolver.
- ///
- /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`.
- ///
- /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements
- pub fn borrowing_with_resolver(mut reader: NsReader<&'de [u8]>, entity_resolver: E) -> Self {
- let config = reader.config_mut();
- config.expand_empty_elements = true;
-
- Self::new(
- SliceReader {
- reader,
- version: XmlVersion::V1_0,
- },
- entity_resolver,
- )
- }
-}
-impl<'de, R> Deserializer<'de, IoReader>
-where
- R: BufRead,
-{
/// Create a new deserializer that will copy data from the specified reader
/// into internal buffer.
///
@@ -3097,7 +3048,10 @@ where
/// UTF-8, you can decode it first before using [`from_str`].
///
/// Deserializer created with this method will not resolve custom entities.
- pub fn from_reader(reader: R) -> Self {
+ pub fn from_reader(reader: R) -> Self
+ where
+ R: BufRead + 'de,
+ {
Self::with_resolver(reader, PredefinedEntityResolver)
}
@@ -3115,18 +3069,22 @@ where
/// # use quick_xml::de::Deserializer;
/// # use quick_xml::NsReader;
/// # use serde::Deserialize;
- /// #
+ /// use std::io::{BufRead, Cursor};
+ ///
/// #[derive(Deserialize, PartialEq, Debug)]
/// struct Object {
/// tag: String,
/// }
///
- /// let mut reader = NsReader::from_str(" test ");
+ /// let boxed: Box = Box::new(Cursor::new(" test "));
+ /// let mut reader = NsReader::from_reader(boxed);
///
- /// let mut de = Deserializer::buffering(reader.clone());
+ /// let mut de = Deserializer::buffering(reader);
/// let obj = Object::deserialize(&mut de).unwrap();
/// assert_eq!(obj, Object { tag: " test ".to_string() });
///
+ /// let boxed: Box = Box::new(Cursor::new(" test "));
+ /// let mut reader = NsReader::from_reader(boxed);
/// reader.config_mut().trim_text(true);
///
/// let mut de = Deserializer::buffering(reader);
@@ -3136,62 +3094,14 @@ where
///
/// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements
#[inline]
- pub fn buffering(reader: NsReader) -> Self {
+ pub fn buffering(reader: NsReader>) -> Self {
Self::buffering_with_resolver(reader, PredefinedEntityResolver)
}
}
-impl<'de, R, E> Deserializer<'de, IoReader, E>
+impl<'de, 'e, EF> de::Deserializer<'de> for &mut Deserializer<'de, 'e, EF>
where
- R: BufRead,
- E: EntityResolver,
-{
- /// Create a new deserializer that will copy data from the specified reader
- /// into internal buffer and use the specified entity resolver.
- ///
- /// If you already have a string use [`Self::from_str`] instead, because it
- /// will borrow instead of copy. If you have `&[u8]` which is known to represent
- /// UTF-8, you can decode it first before using [`from_str`].
- pub fn with_resolver(reader: R, entity_resolver: E) -> Self {
- let mut reader = NsReader::from_reader(reader);
- let config = reader.config_mut();
- config.expand_empty_elements = true;
-
- Self::new(
- IoReader {
- reader,
- buf: Vec::new(),
- version: XmlVersion::V1_0,
- },
- entity_resolver,
- )
- }
-
- /// Create new deserializer that will copy data from the specified preconfigured reader
- /// into internal buffer and use the specified entity resolver.
- ///
- /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`.
- ///
- /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements
- pub fn buffering_with_resolver(mut reader: NsReader, entity_resolver: E) -> Self {
- let config = reader.config_mut();
- config.expand_empty_elements = true;
-
- Self::new(
- IoReader {
- reader,
- buf: Vec::new(),
- version: XmlVersion::V1_0,
- },
- entity_resolver,
- )
- }
-}
-
-impl<'de, R, E> de::Deserializer<'de> for &mut Deserializer<'de, R, E>
-where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
@@ -3328,10 +3238,9 @@ where
///
/// Technically, multiple top-level elements violates XML rule of only one top-level
/// element, but we consider this as several concatenated XML documents.
-impl<'de, R, E> SeqAccess<'de> for &mut Deserializer<'de, R, E>
+impl<'de, 'e, EF> SeqAccess<'de> for &mut Deserializer<'de, 'e, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
@@ -3355,10 +3264,9 @@ where
}
}
-impl<'de, R, E> IntoDeserializer<'de, DeError> for &mut Deserializer<'de, R, E>
+impl<'de, 'e, EF> IntoDeserializer<'de, DeError> for &mut Deserializer<'de, 'e, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Deserializer = Self;
@@ -3370,222 +3278,39 @@ where
////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Converts raw reader's event into a payload event.
-/// Returns `None`, if event should be skipped.
-#[inline(always)]
-fn skip_uninterested<'a>(event: Event<'a>) -> Option> {
- let event = match event {
- Event::DocType(e) => PayloadEvent::DocType(e),
- Event::Start(e) => PayloadEvent::Start(e),
- Event::End(e) => PayloadEvent::End(e),
- Event::Eof => PayloadEvent::Eof,
-
- // Do not trim next text event after Text, CDATA or reference event
- Event::CData(e) => PayloadEvent::CData(e),
- Event::Text(e) => PayloadEvent::Text(e),
- Event::GeneralRef(e) => PayloadEvent::GeneralRef(e),
-
- _ => return None,
- };
- Some(event)
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Trait used by the deserializer for iterating over input. This is manually
-/// "specialized" for iterating over `&[u8]`.
-///
-/// You do not need to implement this trait, it is needed to abstract from
-/// [borrowing](SliceReader) and [copying](IoReader) data sources and reuse code in
-/// deserializer
-pub trait XmlRead<'i> {
- /// Return an input-borrowing event.
- fn next(&mut self) -> Result, DeError>;
-
- /// Skips until end element is found. Unlike `next()` it will not allocate
- /// when it cannot satisfy the lifetime.
- fn read_to_end(&mut self, name: QName) -> Result<(), DeError>;
-
- /// Return an XML version of the source.
- fn xml_version(&self) -> XmlVersion;
-
- /// A copy of the reader's decoder used to decode strings.
- fn decoder(&self) -> Decoder;
-
- /// Checks if the `start` tag has a [`xsi:nil`] attribute. This method ignores
- /// any errors in attributes.
- ///
- /// [`xsi:nil`]: https://www.w3.org/TR/xmlschema-1/#xsi_nil
- fn has_nil_attr(&self, start: &BytesStart) -> bool;
-}
-
-/// XML input source that reads from a std::io input stream.
-///
-/// You cannot create it, it is created automatically when you call
-/// [`Deserializer::from_reader`]
-pub struct IoReader {
- reader: NsReader,
- buf: Vec,
- version: XmlVersion,
-}
-
-impl IoReader {
- /// Returns the underlying XML reader.
- ///
- /// ```
- /// # use pretty_assertions::assert_eq;
- /// use serde::Deserialize;
- /// use std::io::Cursor;
- /// use quick_xml::de::Deserializer;
- /// use quick_xml::NsReader;
- ///
- /// #[derive(Deserialize)]
- /// struct SomeStruct {
- /// field1: String,
- /// field2: String,
- /// }
- ///
- /// // Try to deserialize from broken XML
- /// let mut de = Deserializer::from_reader(Cursor::new(
- /// ""
- /// // 0 ^= 28 ^= 41
- /// ));
- ///
- /// let err = SomeStruct::deserialize(&mut de);
- /// assert!(err.is_err());
- ///
- /// let reader: &NsReader> = de.get_ref().get_ref();
- ///
- /// assert_eq!(reader.error_position(), 28);
- /// assert_eq!(reader.buffer_position(), 41);
- /// ```
- pub const fn get_ref(&self) -> &NsReader {
- &self.reader
- }
-}
-
-impl<'i, R: BufRead> XmlRead<'i> for IoReader {
- fn next(&mut self) -> Result, DeError> {
- loop {
- self.buf.clear();
-
- let event = self.reader.read_event_into(&mut self.buf)?;
- if let Event::Decl(e) = &event {
- self.version = e.xml_version()?;
- }
- if let Some(event) = skip_uninterested(event) {
- return Ok(event.into_owned());
- }
- }
- }
-
- fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
- match self.reader.read_to_end_into(name, &mut self.buf) {
- Err(e) => Err(e.into()),
- Ok(_) => Ok(()),
- }
- }
-
- #[inline]
- fn xml_version(&self) -> XmlVersion {
- self.version
- }
-
- #[inline]
- fn decoder(&self) -> Decoder {
- self.reader.decoder()
- }
-
- fn has_nil_attr(&self, start: &BytesStart) -> bool {
- start.attributes().has_nil(self.reader.resolver())
- }
-}
-
-/// XML input source that reads from a slice of bytes and can borrow from it.
-///
-/// You cannot create it, it is created automatically when you call
-/// [`Deserializer::from_str`].
-pub struct SliceReader<'de> {
- reader: NsReader<&'de [u8]>,
- version: XmlVersion,
-}
-
-impl<'de> SliceReader<'de> {
- /// Returns the underlying XML reader.
- ///
- /// ```
- /// # use pretty_assertions::assert_eq;
- /// use serde::Deserialize;
- /// use quick_xml::de::Deserializer;
- /// use quick_xml::NsReader;
- ///
- /// #[derive(Deserialize)]
- /// struct SomeStruct {
- /// field1: String,
- /// field2: String,
- /// }
- ///
- /// // Try to deserialize from broken XML
- /// let mut de = Deserializer::from_str(
- /// ""
- /// // 0 ^= 28 ^= 41
- /// );
- ///
- /// let err = SomeStruct::deserialize(&mut de);
- /// assert!(err.is_err());
- ///
- /// let reader: &NsReader<&[u8]> = de.get_ref().get_ref();
- ///
- /// assert_eq!(reader.error_position(), 28);
- /// assert_eq!(reader.buffer_position(), 41);
- /// ```
- pub const fn get_ref(&self) -> &NsReader<&'de [u8]> {
- &self.reader
- }
-}
-
-impl<'de> XmlRead<'de> for SliceReader<'de> {
+impl<'de, 'e, EF> XmlReader<'de, 'e, EF>
+where
+ EF: EntityResolverFactory<'de>,
+{
fn next(&mut self) -> Result, DeError> {
loop {
- let event = self.reader.read_event()?;
- if let Event::Decl(e) = &event {
- self.version = e.xml_version()?;
- }
- if let Some(event) = skip_uninterested(event) {
- return Ok(event);
- }
- }
- }
-
- fn read_to_end(&mut self, name: QName) -> Result<(), DeError> {
- match self.reader.read_to_end(name) {
- Err(e) => Err(e.into()),
- Ok(_) => Ok(()),
+ let event = match self.read_event()? {
+ XmlEvent::Start(e) => PayloadEvent::Start(e),
+ XmlEvent::End(e) => PayloadEvent::End(e),
+ XmlEvent::Eof => PayloadEvent::Eof,
+
+ // Do not trim next text event after Text or CDATA
+ XmlEvent::CData(e) => PayloadEvent::CData(e),
+ XmlEvent::Text(e) => PayloadEvent::Text(e),
+
+ // XmlEvent::Empty is not produced, because it is expanded into Start+End
+ // Skip XmlEvent::PI
+ _ => continue,
+ };
+ return Ok(event);
}
}
-
- #[inline]
- fn xml_version(&self) -> XmlVersion {
- self.version
- }
-
- #[inline]
- fn decoder(&self) -> Decoder {
- self.reader.decoder()
- }
-
- fn has_nil_attr(&self, start: &BytesStart) -> bool {
- start.attributes().has_nil(self.reader.resolver())
- }
}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
#[cfg(test)]
mod tests {
use super::*;
use crate::errors::IllFormedError;
use pretty_assertions::assert_eq;
- fn make_de<'de>(source: &'de str) -> Deserializer<'de, SliceReader<'de>> {
+ fn make_de<'de, 'e>(source: &'de str) -> Deserializer<'de, 'e> {
dbg!(source);
Deserializer::from_str(source)
}
@@ -4150,36 +3875,6 @@ mod tests {
}
}
- #[test]
- fn borrowing_reader_parity() {
- let s = r#"
- - Some text
-
-
- "#;
-
- let mut reader1 = IoReader {
- reader: NsReader::from_reader(s.as_bytes()),
- buf: Vec::new(),
- version: XmlVersion::V1_0,
- };
- let mut reader2 = SliceReader {
- reader: NsReader::from_str(s),
- version: XmlVersion::V1_0,
- };
-
- loop {
- let event1 = reader1.next().unwrap();
- let event2 = reader2.next().unwrap();
-
- if let (PayloadEvent::Eof, PayloadEvent::Eof) = (&event1, &event2) {
- break;
- }
-
- assert_eq!(event1, event2);
- }
- }
-
#[test]
fn borrowing_reader_events() {
let s = r#"
@@ -4189,13 +3884,10 @@ mod tests {
"#;
- let mut reader = SliceReader {
- reader: NsReader::from_str(s),
- version: XmlVersion::V1_0,
- };
+ let mut reader = NsReader::from_str(s);
+ reader.config_mut().expand_empty_elements = true;
- let config = reader.reader.config_mut();
- config.expand_empty_elements = true;
+ let mut reader = XmlReader::borrowed_ns(reader, PredefinedEntityResolver);
let mut events = Vec::new();
diff --git a/src/de/resolver.rs b/src/de/resolver.rs
deleted file mode 100644
index 5efc0117c..000000000
--- a/src/de/resolver.rs
+++ /dev/null
@@ -1,115 +0,0 @@
-//! Entity resolver module
-
-use std::convert::Infallible;
-use std::error::Error;
-
-use crate::escape::resolve_predefined_entity;
-use crate::events::BytesText;
-
-/// Used to resolve unknown entities while parsing
-///
-/// # Example
-///
-/// ```
-/// # use serde::Deserialize;
-/// # use pretty_assertions::assert_eq;
-/// use regex::bytes::Regex;
-/// use std::collections::BTreeMap;
-/// use std::string::FromUtf8Error;
-/// use quick_xml::de::{Deserializer, EntityResolver};
-/// use quick_xml::events::BytesText;
-///
-/// struct DocTypeEntityResolver {
-/// re: Regex,
-/// map: BTreeMap,
-/// }
-///
-/// impl Default for DocTypeEntityResolver {
-/// fn default() -> Self {
-/// Self {
-/// // We do not focus on true parsing in this example
-/// // You should use special libraries to parse DTD
-/// re: Regex::new(r#""#).unwrap(),
-/// map: BTreeMap::new(),
-/// }
-/// }
-/// }
-///
-/// impl EntityResolver for DocTypeEntityResolver {
-/// type Error = FromUtf8Error;
-///
-/// fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> {
-/// for cap in self.re.captures_iter(&doctype) {
-/// self.map.insert(
-/// String::from_utf8(cap[1].to_vec())?,
-/// String::from_utf8(cap[2].to_vec())?,
-/// );
-/// }
-/// Ok(())
-/// }
-///
-/// fn resolve(&self, entity: &str) -> Option<&str> {
-/// self.map.get(entity).map(|s| s.as_str())
-/// }
-/// }
-///
-/// let xml_reader = br#"
-/// ]>
-///
-/// &e1;
-///
-/// "#.as_ref();
-///
-/// let mut de = Deserializer::with_resolver(
-/// xml_reader,
-/// DocTypeEntityResolver::default(),
-/// );
-/// let data: BTreeMap = BTreeMap::deserialize(&mut de).unwrap();
-///
-/// assert_eq!(data.get("entity_one"), Some(&"entity 1".to_string()));
-/// ```
-pub trait EntityResolver {
- /// The error type that represents DTD parse error
- type Error: Error;
-
- /// Called on contents of [`Event::DocType`] to capture declared entities.
- /// Can be called multiple times, for each parsed `` declaration.
- ///
- /// [`Event::DocType`]: crate::events::Event::DocType
- fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error>;
-
- /// Called when an entity needs to be resolved.
- ///
- /// `None` is returned if a suitable value can not be found.
- /// In that case an [`EscapeError::UnrecognizedEntity`] will be returned by
- /// a deserializer.
- ///
- /// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity
- fn resolve(&self, entity: &str) -> Option<&str>;
-}
-
-/// An [`EntityResolver`] that resolves only predefined entities:
-///
-/// | Entity | Resolution
-/// |--------|------------
-/// |`<` | `<`
-/// |`>` | `>`
-/// |`&` | `&`
-/// |`'`| `'`
-/// |`"`| `"`
-#[derive(Default, Copy, Clone)]
-pub struct PredefinedEntityResolver;
-
-impl EntityResolver for PredefinedEntityResolver {
- type Error = Infallible;
-
- #[inline]
- fn capture(&mut self, _doctype: BytesText) -> Result<(), Self::Error> {
- Ok(())
- }
-
- #[inline]
- fn resolve(&self, entity: &str) -> Option<&str> {
- resolve_predefined_entity(entity)
- }
-}
diff --git a/src/de/var.rs b/src/de/var.rs
index e64e29f85..a7d79199a 100644
--- a/src/de/var.rs
+++ b/src/de/var.rs
@@ -1,40 +1,37 @@
use crate::{
de::key::QNameDeserializer,
de::map::ElementMapAccess,
- de::resolver::EntityResolver,
de::simple_type::SimpleTypeDeserializer,
- de::{DeEvent, Deserializer, XmlRead, TEXT_KEY},
+ de::{DeEvent, Deserializer, TEXT_KEY},
errors::serialize::DeError,
+ reader::EntityResolverFactory,
};
use serde::de::value::BorrowedStrDeserializer;
use serde::de::{self, DeserializeSeed, Deserializer as _, Visitor};
/// An enum access
-pub struct EnumAccess<'de, 'd, R, E>
+pub struct EnumAccess<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
- de: &'d mut Deserializer<'de, R, E>,
+ de: &'d mut Deserializer<'de, 'e, EF>,
}
-impl<'de, 'd, R, E> EnumAccess<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> EnumAccess<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
- pub fn new(de: &'d mut Deserializer<'de, R, E>) -> Self {
+ pub fn new(de: &'d mut Deserializer<'de, 'e, EF>) -> Self {
EnumAccess { de }
}
}
-impl<'de, 'd, R, E> de::EnumAccess<'de> for EnumAccess<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> de::EnumAccess<'de> for EnumAccess<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
- type Variant = VariantAccess<'de, 'd, R, E>;
+ type Variant = VariantAccess<'de, 'e, 'd, EF>;
fn variant_seed(self, seed: V) -> Result<(V::Value, Self::Variant), Self::Error>
where
@@ -61,21 +58,19 @@ where
}
}
-pub struct VariantAccess<'de, 'd, R, E>
+pub struct VariantAccess<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
- de: &'d mut Deserializer<'de, R, E>,
+ de: &'d mut Deserializer<'de, 'e, EF>,
/// `true` if variant should be deserialized from a textual content
/// and `false` if from tag
is_text: bool,
}
-impl<'de, 'd, R, E> de::VariantAccess<'de> for VariantAccess<'de, 'd, R, E>
+impl<'de, 'e, 'd, EF> de::VariantAccess<'de> for VariantAccess<'de, 'e, 'd, EF>
where
- R: XmlRead<'de>,
- E: EntityResolver,
+ EF: EntityResolverFactory<'de>,
{
type Error = DeError;
diff --git a/src/errors.rs b/src/errors.rs
index 9002f0477..49e18ae64 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -206,6 +206,13 @@ pub enum Error {
Escape(EscapeError),
/// Parsed XML has some namespace-related problems
Namespace(NamespaceError),
+ /// The error returned by [`EntityResolver::capture`](crate::reader::EntityResolver::capture).
+ DoctypeParse(Arc),
+ /// Entity reference was not resolved to the entity; [`EntityResolver::resolve`] returned `None`.
+ /// Contains the name of entity without `&` and `;`.
+ ///
+ /// [`EntityResolver::resolve`]: crate::reader::EntityResolver::resolve
+ UnrecognizedGeneralEntity(String),
}
impl Error {
@@ -284,6 +291,8 @@ impl fmt::Display for Error {
Self::Encoding(e) => e.fmt(f),
Self::Escape(e) => e.fmt(f),
Self::Namespace(e) => e.fmt(f),
+ Self::DoctypeParse(e) => write!(f, "cannot parse DTD: {}", e),
+ Self::UnrecognizedGeneralEntity(e) => write!(f, "unrecognized general entity `{}`", e),
}
}
}
@@ -298,6 +307,8 @@ impl std::error::Error for Error {
Self::Encoding(e) => Some(e),
Self::Escape(e) => Some(e),
Self::Namespace(e) => Some(e),
+ Self::DoctypeParse(e) => Some(e),
+ Self::UnrecognizedGeneralEntity(_) => None,
}
}
}
diff --git a/src/lib.rs b/src/lib.rs
index bd37f91d0..7a434855e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -10,6 +10,8 @@
//! The user has to explicitly _ask_ for the next XML event, similar to a database cursor.
//! This is achieved by the following two structs:
//!
+//! - [`XmlReader`]: A high level XML pull-reader which resolves entities and can process several
+//! XML sources if you provide them.
//! - [`Reader`]: A low level XML pull-reader where buffer allocation/clearing is left to user.
//! - [`Writer`]: A XML writer. Can be nested with readers if you want to transform XMLs.
//!
@@ -27,7 +29,8 @@
//!
//! # Examples
//!
-//! - For a reading example see [`Reader`]
+//! - For a reading example see [`XmlReader`]
+//! - For a low-level reading example see [`Reader`]
//! - For a writing example see [`Writer`]
//!
//! # Features
@@ -78,7 +81,7 @@ pub use crate::encoding::Decoder;
#[cfg(feature = "serialize")]
pub use crate::errors::serialize::{DeError, SeError};
pub use crate::errors::{Error, Result};
-pub use crate::reader::{NsReader, Reader};
+pub use crate::reader::{NsReader, Reader, XmlReader};
pub use crate::writer::{ElementWriter, Writer};
/// Version of XML standard
diff --git a/src/reader/event.rs b/src/reader/event.rs
new file mode 100644
index 000000000..893b17b49
--- /dev/null
+++ b/src/reader/event.rs
@@ -0,0 +1,44 @@
+use crate::events::{BytesCData, BytesEnd, BytesPI, BytesStart, BytesText};
+
+/// Event emitted by [`XmlReader::read_event`].
+///
+/// # Lifetime
+///
+/// The `'i` lifetime of this enum is the lifetime of data that may be borrowed
+/// from the XML input (when reader of the main document reads from `&[u8]` or `&str`).
+///
+/// [`XmlReader::read_event`]: crate::reader::XmlReader::read_event
+#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
+pub enum Event<'i> {
+ /// Empty element tag (with attributes) `<tag attr="value" />`.
+ Empty(BytesStart<'i>),
+ /// Start tag (with attributes) `<tag attr="value">`.
+ Start(BytesStart<'i>),
+ /// End tag `</tag>`.
+ End(BytesEnd<'i>),
+ /// Character data between `Start` and `End` element.
+ Text(BytesText<'i>),
+ /// CData `<![CDATA[...]]>`.
+ CData(BytesCData<'i>),
+ /// Processing instruction `<?...?>`.
+ PI(BytesPI<'i>),
+ /// End of XML document.
+ Eof,
+}
+
+impl<'i> Event<'i> {
+ /// Ensures that all data is owned to extend the object's lifetime if necessary.
+ #[inline]
+ pub fn into_owned(self) -> Event<'static> {
+ match self {
+ Self::Empty(e) => Event::Empty(e.into_owned()),
+ Self::Start(e) => Event::Start(e.into_owned()),
+ Self::End(e) => Event::End(e.into_owned()),
+ Self::Text(e) => Event::Text(e.into_owned()),
+ Self::CData(e) => Event::CData(e.into_owned()),
+ Self::PI(e) => Event::PI(e.into_owned()),
+ Self::Eof => Event::Eof,
+ }
+ }
+}
diff --git a/src/reader/mod.rs b/src/reader/mod.rs
index b8a569b2f..adee92ee3 100644
--- a/src/reader/mod.rs
+++ b/src/reader/mod.rs
@@ -2,14 +2,21 @@
#[cfg(feature = "encoding")]
use encoding_rs::Encoding;
-use std::io;
+use std::borrow::Cow;
+use std::collections::VecDeque;
+use std::fmt;
+use std::io::{self, BufRead, Cursor};
use std::ops::Range;
+use std::sync::Arc;
use crate::encoding::Decoder;
use crate::errors::{Error, IllFormedError, SyntaxError};
-use crate::events::{BytesRef, Event};
+use crate::escape::{parse_number, EscapeError};
+use crate::events::{BytesRef, BytesStart, BytesText, Event};
+use crate::name::{NamespaceResolver, QName};
use crate::parser::{DtdParser, ElementParser, Parser, PiParser};
use crate::reader::state::ReaderState;
+use crate::XmlVersion;
/// A struct that holds a parser configuration.
///
@@ -250,6 +257,14 @@ impl Default for Config {
////////////////////////////////////////////////////////////////////////////////////////////////////
+mod event;
+mod resolver;
+
+pub use event::Event as XmlEvent;
+pub use resolver::{
+ EntityResolver, EntityResolverFactory, PredefinedEntityResolver, ReplacementText,
+};
+
macro_rules! read_event_impl {
(
$self:ident, $buf:ident,
@@ -1228,6 +1243,492 @@ impl BangType {
////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Result of reading event by the underlying reader
+enum ReadEvent<'i> {
+ /// Upper-level reader should skip event and request another one from the underlying reader
+ Skip,
+ Event(XmlEvent<'i>),
+ ExternalEntity(Reader>),
+}
+
+/// Debug representation of [`ReadEvent`].
+///
+/// `Skip` and `Event` are printed by value; `ExternalEntity` is printed as the
+/// address of the nested reader.
+impl<'i> fmt::Debug for ReadEvent<'i> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ Self::Skip => f.write_str("Skip"),
+ Self::Event(r) => write!(f, "Event({:?})", r),
+ // `r` is already a reference to the reader; formatting `&r` with `{:p}`
+ // would print the address of this temporary binding instead
+ Self::ExternalEntity(r) => write!(f, "ExternalEntity({:p})", r),
+ }
+ }
+}
+
+enum EntityReader<'i, 'e> {
+ /// Reader of internal entity, i.e. the entity defined in the same source as
+ /// a main document, that returns borrowed events.
+ InternalBorrowed(Reader<&'i [u8]>),
+ /// Reader of internal entity, i.e. the entity defined in the same source as
+ /// a main document, that returns owned events.
+ InternalOwned(Reader>),
+ /// Reader of an external entity, i.e. an entity defined in a source different from
+ /// that of the main document.
+ External(Reader>),
+}
+
+/// Debug representation of [`EntityReader`].
+///
+/// The borrowing reader delegates to its own `Debug` implementation; the other
+/// two variants are printed as the address of the nested reader.
+impl<'i, 'e> fmt::Debug for EntityReader<'i, 'e> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ Self::InternalBorrowed(r) => r.fmt(f),
+ // `r` is already a reference to the reader; formatting `&r` with `{:p}`
+ // would print the address of this temporary binding instead
+ Self::InternalOwned(r) => write!(f, "InternalOwned({:p})", r),
+ Self::External(r) => write!(f, "External({:p})", r),
+ }
+ }
+}
+
+impl<'i, 'e> EntityReader<'i, 'e> {
+ const fn config(&self) -> &Config {
+ match self {
+ Self::InternalBorrowed(r) => r.config(),
+ Self::InternalOwned(r) => r.config(),
+ Self::External(r) => r.config(),
+ }
+ }
+
+ const fn decoder(&self) -> Decoder {
+ match self {
+ Self::InternalBorrowed(r) => r.decoder(),
+ Self::InternalOwned(r) => r.decoder(),
+ Self::External(r) => r.decoder(),
+ }
+ }
+
+ const fn buffer_position(&self) -> u64 {
+ match self {
+ Self::InternalBorrowed(r) => r.buffer_position(),
+ Self::InternalOwned(r) => r.buffer_position(),
+ Self::External(r) => r.buffer_position(),
+ }
+ }
+
+ const fn error_position(&self) -> u64 {
+ match self {
+ Self::InternalBorrowed(r) => r.error_position(),
+ Self::InternalOwned(r) => r.error_position(),
+ Self::External(r) => r.error_position(),
+ }
+ }
+
+ fn read_event(&mut self, buf: &mut Vec) -> Result, Error> {
+ match self {
+ Self::InternalBorrowed(r) => r.read_event(),
+ Self::InternalOwned(r) => Ok(r.read_event_into(buf)?.into_owned()),
+ Self::External(r) => Ok(r.read_event_into(buf)?.into_owned()),
+ }
+ }
+
+ fn read_to_end(&mut self, end: QName, buf: &mut Vec) -> Result {
+ match self {
+ EntityReader::InternalBorrowed(r) => r.read_to_end(end),
+ EntityReader::InternalOwned(r) => r.read_to_end_into(end, buf),
+ EntityReader::External(r) => r.read_to_end_into(end, buf),
+ }
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// One single parse unit, for example, a file. In the XML specification it is called
+/// an _entity_, but we avoid calling it that as it may lead to confusion. By _entity_
+/// XML consumers usually mean the thing that the specification calls an "entity reference".
+///
+/// Cite from the [specification]:
+///
+/// > Each XML document has both a logical and a physical structure.
+/// > Physically, the document is composed of units called **entities**.
+/// > An entity may refer to other entities to cause their inclusion in the document.
+/// > A document begins in a "root" or document entity. Logically, the document
+/// > is composed of declarations, elements, comments, character references,
+/// > and processing instructions, all of which are indicated in the document
+/// > by explicit markup. The logical and physical structures MUST nest properly,
+/// > as described in 4.3.2 Well-Formed Parsed Entities.
+///
+/// [Also]:
+/// > An XML document may consist of one or many storage units. These are called
+/// > _entities_; they all have content and are all (except for the document entity
+/// > and the external DTD subset) identified by entity **name**.
+///
+/// # Lifetimes
+/// The `'i` lifetime stands for "input" and is a lifetime of a document entity,
+/// i.e. the source which the end-user requested to parse.
+///
+/// The `'e` lifetime stands for "external" and it is a lifetime of an external source
+/// which the end-user requested to parse.
+///
+/// [specification]: https://www.w3.org/TR/xml11/#sec-documents
+/// [Also]: https://www.w3.org/TR/xml11/#dt-entity
+#[derive(Debug)]
+struct StorageUnit<'i, 'e, E> {
+ /// Readers used to produce events from this entity.
+ parts: VecDeque>,
+
+ /// Version of XML standard used by this storage unit.
+ version: XmlVersion,
+
+ /// A buffer to manage namespaces
+ ns_resolver: NamespaceResolver,
+ /// We cannot pop data from the namespace stack until the returned `Empty` or `End`
+ /// event has been processed by the user, so we only mark that we should do that
+ /// in the next [`Self::read_event()`] call.
+ pending_ns_pop: bool,
+
+ /// Used to resolve unknown entities that would otherwise cause the parser
+ /// to return an [`Error::UnrecognizedGeneralEntity`] error.
+ entity_resolver: E,
+}
+impl<'i, 'e, E> StorageUnit<'i, 'e, E>
+where
+ E: EntityResolver<'i>,
+{
+ fn new(part: EntityReader<'i, 'e>, entity_resolver: E) -> Self {
+ Self {
+ parts: VecDeque::from([part]),
+ version: XmlVersion::V1_0,
+ ns_resolver: NamespaceResolver::default(),
+ pending_ns_pop: false,
+ entity_resolver,
+ }
+ }
+
+ fn read_event_impl(&mut self, buf: &mut Vec) -> Result, Error> {
+ while let Some(part) = self.parts.back_mut() {
+ let event = match part.read_event(buf)? {
+ Event::Decl(e) => {
+ self.version = e.xml_version()?;
+ ReadEvent::Skip
+ }
+ Event::Comment(_) => ReadEvent::Skip,
+
+ Event::DocType(doctype) => {
+ self.entity_resolver
+ .capture(doctype)
+ .map_err(|e| Error::DoctypeParse(Arc::new(e)))?;
+ ReadEvent::Skip
+ }
+ Event::GeneralRef(e) => {
+ let reference = part.decoder().decode(&e)?;
+ if let Some(num) = reference.strip_prefix('#') {
+ let codepoint = parse_number(num).map_err(EscapeError::InvalidCharRef)?;
+ let mut bytes = [0u8; 4];
+ let text = BytesText::wrap(
+ codepoint.encode_utf8(&mut bytes).as_bytes(),
+ Decoder::utf8(),
+ );
+ return Ok(ReadEvent::Event(XmlEvent::Text(text.into_owned())));
+ }
+ match self.entity_resolver.resolve(reference.as_ref()) {
+ Some(ReplacementText::Internal(Cow::Borrowed(entity))) => {
+ let mut nested = Reader::from_reader(entity);
+ *nested.config_mut() = part.config().clone();
+ self.parts.push_back(EntityReader::InternalBorrowed(nested));
+ continue;
+ }
+ Some(ReplacementText::Internal(Cow::Owned(entity))) => {
+ let boxed: Box = Box::new(Cursor::new(entity));
+ let mut nested = Reader::from_reader(boxed);
+ *nested.config_mut() = part.config().clone();
+ self.parts.push_back(EntityReader::InternalOwned(nested));
+ continue;
+ }
+ Some(ReplacementText::External(source)) => {
+ let mut external = Reader::from_reader(source);
+ *external.config_mut() = part.config().clone();
+ ReadEvent::ExternalEntity(external)
+ }
+ _ => return Err(Error::UnrecognizedGeneralEntity(reference.into_owned())),
+ }
+ }
+
+ Event::Empty(e) => ReadEvent::Event(XmlEvent::Empty(e)),
+ Event::Start(e) => ReadEvent::Event(XmlEvent::Start(e)),
+ Event::End(e) => ReadEvent::Event(XmlEvent::End(e)),
+ Event::Text(e) => ReadEvent::Event(XmlEvent::Text(e)),
+ Event::CData(e) => ReadEvent::Event(XmlEvent::CData(e)),
+ Event::PI(e) => ReadEvent::Event(XmlEvent::PI(e)),
+ Event::Eof => {
+ self.parts.pop_back();
+ continue;
+ }
+ };
+ return Ok(event);
+ }
+ Ok(ReadEvent::Event(XmlEvent::Eof))
+ }
+
+ fn read_event(&mut self, buf: &mut Vec) -> Result, Error> {
+ self.pop();
+ let event = self.read_event_impl(buf);
+ self.process_event(event)
+ }
+
+ #[inline]
+ fn pop(&mut self) {
+ if self.pending_ns_pop {
+ self.ns_resolver.pop();
+ self.pending_ns_pop = false;
+ }
+ }
+
+ #[inline]
+ fn process_event(
+ &mut self,
+ event: Result, Error>,
+ ) -> Result, Error> {
+ match event {
+ Ok(ReadEvent::Event(XmlEvent::Start(e))) => {
+ self.ns_resolver.push(&e)?;
+ Ok(ReadEvent::Event(XmlEvent::Start(e)))
+ }
+ Ok(ReadEvent::Event(XmlEvent::Empty(e))) => {
+ self.ns_resolver.push(&e)?;
+ // notify next `read_event()` invocation that it needs to pop this
+ // namespace scope
+ self.pending_ns_pop = true;
+ Ok(ReadEvent::Event(XmlEvent::Empty(e)))
+ }
+ Ok(ReadEvent::Event(XmlEvent::End(e))) => {
+ // notify next `read_event()` invocation that it needs to pop this
+ // namespace scope
+ self.pending_ns_pop = true;
+ Ok(ReadEvent::Event(XmlEvent::End(e)))
+ }
+ e => e,
+ }
+ }
+
+ /// Skips events of the current part until the end tag matching `end` is consumed.
+ ///
+ /// Returns `Ok(true)` if there was a part to read from and the end tag was found,
+ /// `Ok(false)` if all parts of this storage unit are already exhausted.
+ /// Errors of the underlying reader are propagated.
+ fn read_to_end(&mut self, end: QName, buf: &mut Vec) -> Result {
+ // Settle the pop deferred by the previously returned `Empty` or `End` event,
+ // exactly as `read_event()` does before reading anything new
+ self.pop();
+ // FIXME: this is incorrect, because an entity reference is not obliged
+ // to contain a properly nested XML tree
+ if let Some(part) = self.parts.back_mut() {
+ part.read_to_end(end, buf)?;
+ // The end tag was found and consumed, and nobody can observe it anymore,
+ // so the namespace scope opened by the matching `Start` event must be
+ // popped right now. `self.pop()` would be a no-op here, because
+ // `pending_ns_pop` is not set by `Start` events.
+ self.ns_resolver.pop();
+ return Ok(true);
+ }
+ Ok(false)
+ }
+
+ fn decoder(&self) -> Decoder {
+ match self.parts.back() {
+ Some(part) => part.decoder(),
+ // Does not matter what decoder to use when all events exhausted
+ None => Decoder::utf8(),
+ }
+ }
+
+ fn has_nil_attr(&self, start: &BytesStart) -> bool {
+ start.attributes().has_nil(&self.ns_resolver)
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// High-level XML reader which automatically resolves entity references (`&...;`)
+/// and can stream events from several physical documents (storage units, called _entities_
+/// in the XML [specification]).
+///
+/// # Lifetimes
+/// The `'i` lifetime stands for "input" and is the lifetime of the document entity,
+/// i.e. the source which the end-user requested to parse, _from which events may borrow_.
+///
+/// The `'e` lifetime stands for "external" and is the lifetime of _any_ entity
+/// that the parser may parse, from which events will not borrow data.
+///
+/// # Type parameter
+/// `EF`: the factory of general entity resolvers. A resolver is used to resolve unknown
+/// entities that would otherwise cause the parser to return an
+/// [`Error::UnrecognizedGeneralEntity`] error.
+///
+/// Note that the same factory is used to create the entity resolver for the initial
+/// document and for any other document loaded due to entity resolution.
+///
+/// [specification]: https://www.w3.org/TR/xml11/#sec-documents
+#[derive(Debug)]
+pub struct XmlReader<'i, 'e, EF = PredefinedEntityResolver>
+where
+ EF: EntityResolverFactory<'i>,
+{
+ /// Stack of things that represent individual storage units, such as files.
+ /// The first element is the initial unit, representing the document which the user
+ /// wants to parse; the others are created for each resolved external entity
+ /// (an entity defined in another storage unit).
+ units: VecDeque>,
+
+ /// Buffer to which external readers will read data. After reading each event
+ /// data is copied to the event data and buffer is cleared
+ buffer: Vec,
+
+ /// Factory that creates an entity resolver for every new storage unit.
+ entity_resolver_factory: EF,
+}
+
+impl<'i, 'e, EF> XmlReader<'i, 'e, EF>
+where
+ EF: EntityResolverFactory<'i>,
+{
+    /// Creates a reader whose only storage unit reads from `part`. The entity
+    /// resolver for that unit is requested from `entity_resolver_factory`, which
+    /// is then stored to create resolvers for further units.
+    fn new(part: EntityReader<'i, 'e>, mut entity_resolver_factory: EF) -> Self {
+        let document = StorageUnit::new(part, entity_resolver_factory.new_resolver());
+        Self {
+            units: VecDeque::from([document]),
+            buffer: Vec::new(),
+            entity_resolver_factory,
+        }
+    }
+
+    /// Creates a new `XmlReader` from the low-level reader and entity resolver factory.
+    /// Events borrow data from the source when they represent a piece of the original
+    /// document. Events from other entities (documents loaded during entity resolution)
+    /// own their data.
+    ///
+    /// For each resolved entity a new [`Reader`] would be created to read entity
+    /// data. That reader will receive a copy of the configuration that is set for
+    /// `reader`. If the entity resolver returns [`ReplacementText::Internal`], then events
+    /// from that entity would also borrow from the source, otherwise they will
+    /// maintain their own copy of data.
+    pub fn borrowed(reader: Reader<&'i [u8]>, entity_resolver_factory: EF) -> Self {
+        let document = EntityReader::InternalBorrowed(reader);
+        Self::new(document, entity_resolver_factory)
+    }
+
+ /// Creates a new `XmlReader` from the low-level reader and entity resolver factory,
+ /// where all events store their own copy of data.
+ ///
+ /// For each resolved entity a new [`Reader`] would be created to read entity
+ /// data. That reader will receive a copy of the configuration that is set for
+ /// `reader`.
+ pub fn buffered(reader: Reader>, entity_resolver_factory: EF) -> Self {
+ Self::new(EntityReader::InternalOwned(reader), entity_resolver_factory)
+ }
+
+    /// The same as [`borrowed`](Self::borrowed), but creates the reader from
+    /// a namespace-aware reader. The state of the reader will be preserved.
+    pub fn borrowed_ns(reader: NsReader<&'i [u8]>, mut entity_resolver_factory: EF) -> Self {
+        let document = reader.to_borrowed_storage_unit(entity_resolver_factory.new_resolver());
+        Self {
+            units: VecDeque::from([document]),
+            buffer: Vec::new(),
+            entity_resolver_factory,
+        }
+    }
+
+ /// The same as [`buffered`](Self::buffered), but creates the reader from
+ /// a namespace-aware reader. The state of the reader will be preserved.
+ pub fn buffered_ns(
+ reader: NsReader>,
+ mut entity_resolver_factory: EF,
+ ) -> Self {
+ // The resolver for the document unit is created by the factory, which is
+ // then stored to create resolvers for further units
+ let resolver = entity_resolver_factory.new_resolver();
+ Self {
+ units: VecDeque::from([reader.to_buffered_storage_unit(resolver)]),
+ buffer: Vec::new(),
+ entity_resolver_factory,
+ }
+ }
+
+    /// Returns the next event. The event borrows from the source if possible and
+    /// contains a copy of data if borrowing is impossible (for example, an event from
+    /// another document resolved by an entity reference).
+    ///
+    /// When a storage unit created for an external entity is exhausted, it is removed
+    /// and reading continues from the unit that referenced it. The initial unit
+    /// (the document itself) is never removed, so [`resolver`](Self::resolver) can
+    /// be called even after `Eof` was returned.
+    ///
+    /// # Errors
+    /// Returns any error produced by the storage unit that is currently being read.
+    pub fn read_event(&mut self) -> Result, Error> {
+        while let Some(unit) = self.units.back_mut() {
+            self.buffer.clear();
+            match unit.read_event(&mut self.buffer)? {
+                ReadEvent::ExternalEntity(reader) => {
+                    // Events of the external entity are read from its own unit
+                    // with its own entity resolver
+                    self.units.push_back(StorageUnit::new(
+                        EntityReader::External(reader),
+                        self.entity_resolver_factory.new_resolver(),
+                    ));
+                    continue;
+                }
+                ReadEvent::Event(XmlEvent::Eof) => {
+                    // The last remaining unit is the document itself. It must stay
+                    // in place, because `resolver()` relies on at least one unit
+                    // being always available
+                    if self.units.len() == 1 {
+                        return Ok(XmlEvent::Eof);
+                    }
+                    // An external entity is exhausted, return to the unit that
+                    // referenced it
+                    self.units.pop_back();
+                    continue;
+                }
+                ReadEvent::Event(event) => return Ok(event),
+                _ => continue,
+            }
+        }
+        Ok(XmlEvent::Eof)
+    }
+
+    /// Returns the storage of namespace bindings of the storage unit that is
+    /// currently being read.
+    ///
+    /// Note, that this object may change after reading a new event, if the new event
+    /// comes from a new storage unit. That is possible only if a custom
+    /// [`EntityResolver`] is used.
+    ///
+    /// # Panics
+    /// Panics if the reader has no storage units.
+    #[inline]
+    pub fn resolver(&self) -> &NamespaceResolver {
+        // SAFETY: At least one storage unit should always be there
+        let current = self.units.back().unwrap();
+        &current.ns_resolver
+    }
+
+    /// Reads until the end element is found. This function is supposed to be called
+    /// after you already read a [`Event::Start`] event.
+    ///
+    /// Unlike [`Reader::read_to_end`] this method does not return a span because
+    /// there might not be a continuous space that is occupied by the XML tree.
+    ///
+    /// # Errors
+    /// Returns an error if the current storage unit fails to read to the `end` tag,
+    /// and a "missed end" error if there is nothing left to read from.
+    pub fn read_to_end(&mut self, end: QName) -> Result<(), Error> {
+        // FIXME: this is incorrect, because entity reference does not obligated
+        // to properly nested XML tree
+        if let Some(unit) = self.units.back_mut() {
+            // `false` means that the unit had nothing to read from, so the end tag
+            // was not found and that must not be reported as a success
+            if unit.read_to_end(end, &mut self.buffer)? {
+                return Ok(());
+            }
+        }
+        Err(Error::missed_end(end, Decoder::utf8()))
+    }
+
+    /// Returns the XML version of the storage unit that is currently being read,
+    /// or XML 1.0 if there are no units.
+    ///
+    /// Note: version can be changed after reading new event, because new event
+    /// could be produced from another document due to entity resolution.
+    pub fn xml_version(&self) -> XmlVersion {
+        // If there no units we assume default XML version
+        self.units
+            .back()
+            .map_or(XmlVersion::V1_0, |unit| unit.version)
+    }
+
+    /// Returns the decoder of the storage unit that is currently being read,
+    /// or the UTF-8 decoder if there are no units.
+    ///
+    /// Note: decoder can be changed after reading new event, because new event
+    /// could be produced from another document due to entity resolution.
+    pub fn decoder(&self) -> Decoder {
+        // Does not matter what decoder to use when all events exhausted
+        self.units
+            .back()
+            .map_or_else(Decoder::utf8, |unit| unit.decoder())
+    }
+
+    /// Checks if the `start` tag has a [`xsi:nil`] attribute. This method ignores
+    /// any errors in attributes.
+    ///
+    /// Returns `false` if there are no storage units to check against.
+    ///
+    /// [`xsi:nil`]: https://www.w3.org/TR/xmlschema-1/#xsi_nil
+    pub fn has_nil_attr(&self, start: &BytesStart) -> bool {
+        self.units
+            .back()
+            .map_or(false, |unit| unit.has_nil_attr(start))
+    }
+}
+
+/// Constructors of the reader with the default entity resolver that read from
+/// a borrowed string. No intermediate buffer is required for the document itself,
+/// because events can borrow directly from the input.
+impl<'i, 'e> XmlReader<'i, 'e, PredefinedEntityResolver> {
+    /// Creates an XML reader from a string slice.
+    // `FromStr` cannot be implemented, because the result borrows from `source`
+    #[allow(clippy::should_implement_trait)]
+    pub fn from_str(source: &'i str) -> Self {
+        let reader = Reader::from_str(source);
+        Self::borrowed(reader, PredefinedEntityResolver)
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
#[cfg(test)]
mod test {
/// Checks the internal implementation of the various reader methods
diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs
index 46858cc85..ceca86eb1 100644
--- a/src/reader/ns_reader.rs
+++ b/src/reader/ns_reader.rs
@@ -4,6 +4,7 @@
//! [qualified names]: https://www.w3.org/TR/xml-names11/#dt-qualname
//! [expanded names]: https://www.w3.org/TR/xml-names11/#dt-expname
+use std::collections::VecDeque;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::ops::Deref;
@@ -12,7 +13,8 @@ use std::path::Path;
use crate::errors::Result;
use crate::events::{BytesText, Event};
use crate::name::{NamespaceResolver, QName, ResolveResult};
-use crate::reader::{Config, Reader, Span, XmlSource};
+use crate::reader::{Config, EntityReader, EntityResolver, Reader, Span, StorageUnit, XmlSource};
+use crate::XmlVersion;
/// A low level encoding-agnostic XML event reader that performs namespace resolution.
///
@@ -751,6 +753,42 @@ impl<'i> NsReader<&'i [u8]> {
self.ns_resolver.pop();
Ok(result)
}
+
+    /// Converts this reader with its state into the storage unit for the [`XmlReader`](super::XmlReader).
+    ///
+    /// The underlying reader becomes the only part of the unit; the namespace
+    /// bindings and the pending pop flag are moved to the unit as is.
+    // NOTE(review): the version is always set to XML 1.0 here, regardless of what
+    // the reader has already parsed — confirm that this is intended
+    pub(super) fn to_borrowed_storage_unit<'e, E>(
+        self,
+        entity_resolver: E,
+    ) -> StorageUnit<'i, 'e, E>
+    where
+        E: EntityResolver<'i>,
+    {
+        let document = EntityReader::InternalBorrowed(self.reader);
+        StorageUnit {
+            parts: VecDeque::from([document]),
+            version: XmlVersion::V1_0,
+            ns_resolver: self.ns_resolver,
+            pending_ns_pop: self.pending_pop,
+            entity_resolver,
+        }
+    }
+}
+
+impl<'i> NsReader> {
+ /// Converts this reader with its state into the storage unit for the [`XmlReader`](super::XmlReader).
+ ///
+ /// The underlying reader becomes the only part of the unit; the namespace
+ /// bindings and the pending pop flag are moved to the unit as is.
+ pub(super) fn to_buffered_storage_unit<'e, E>(
+ self,
+ entity_resolver: E,
+ ) -> StorageUnit<'i, 'e, E>
+ where
+ E: EntityResolver<'i>,
+ {
+ StorageUnit {
+ parts: VecDeque::from([EntityReader::InternalOwned(self.reader)]),
+ // NOTE(review): the version is always set to XML 1.0 here, regardless of what
+ // the reader has already parsed — confirm that this is intended
+ version: XmlVersion::V1_0,
+ ns_resolver: self.ns_resolver,
+ pending_ns_pop: self.pending_pop,
+ entity_resolver,
+ }
+ }
 }
impl Deref for NsReader {
diff --git a/src/reader/resolver.rs b/src/reader/resolver.rs
new file mode 100644
index 000000000..5cfbde1df
--- /dev/null
+++ b/src/reader/resolver.rs
@@ -0,0 +1,155 @@
+use std::borrow::Cow;
+use std::convert::Infallible;
+use std::error::Error;
+use std::fmt;
+use std::io::BufRead;
+
+use crate::events::BytesText;
+use crate::utils::Bytes;
+
+/// [Replacement text] of the resolved entity reference (`&...;`).
+///
+/// [Replacement text]: https://www.w3.org/TR/xml11/#dt-repltext
+pub enum ReplacementText<'i, 'e> {
+ /// The replacement text is available as bytes, for example, for an entity
+ /// declared in the internal DTD of the same document.
+ Internal(Cow<'i, [u8]>),
+ /// The referenced entity is located in another document which will be read
+ /// from the specified source.
+ External(Box),
+}
+/// Manual `Debug` implementation: internal replacement text is formatted through
+/// the `Bytes` wrapper, external sources are formatted with `write!`.
+impl<'i, 'e> fmt::Debug for ReplacementText<'i, 'e> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match self {
+ Self::Internal(e) => Bytes(e).fmt(f),
+ // NOTE(review): the format string has no placeholder for `e` here —
+ // verify it against the original source
+ Self::External(e) => write!(f, "", &e),
+ }
+ }
+}
+
+/// Used to create an entity resolver for each physical document (storage unit or an _[entity]_)
+/// that would be parsed by the reader.
+///
+/// [entity]: https://www.w3.org/TR/xml11/#sec-documents
+pub trait EntityResolverFactory<'i> {
+ /// The error type that represents a DTD parse error. It is the same as
+ /// the error type of the created resolvers.
+ type CaptureError: Error + 'static;
+ /// Type that holds state for each entity, for example, for each file, which
+ /// forms the whole logical structure of the XML document.
+ type Resolver: EntityResolver<'i, CaptureError = Self::CaptureError>;
+
+ /// Creates state for the new entity parser.
+ fn new_resolver(&mut self) -> Self::Resolver;
+}
+
+/// Used to resolve unknown [general entities] (`&...;`) while parsing.
+///
+/// Note, that this trait is not used to resolve _[parameter entities]_ (`%...;`), they are resolved
+/// inside implementation of this trait. Parameter entities cannot be used outside of the `<!DOCTYPE>`
+/// declaration, so no need to resolve them in the document.
+///
+/// # Example
+///
+/// That example is taken from the XML specification. Suppose that we have the following DTD:
+/// ```xml
+/// <!ENTITY % pub    "&#xc9;ditions Gallimard" >
+/// <!ENTITY   rights "All rights reserved" >
+/// <!ENTITY   book   "&#xA9; 1947 %pub;. &rights;" >
+/// ```
+/// Here we have two defined _internal general entities_ (`rights` and `book`), which may be used
+/// everywhere in the document below their definition point (including the DOCTYPE declaration) and
+/// one _parameter entity_ (`pub`), which may be used only inside the DOCTYPE declaration below its
+/// definition point. The literal values and replacement texts for those entities are:
+///
+/// |Entity|Literal value                |Replacement text
+/// |------|-----------------------------|------------------------------------
+/// |pub   |`&#xc9;ditions Gallimard`    |`Éditions Gallimard`
+/// |rights|`All rights reserved`        |`All rights reserved`
+/// |book  |`&#xA9; 1947 %pub;. &rights;`|`© 1947 Éditions Gallimard. &rights;`
+///
+/// Implementation of the `EntityResolver` must return the _replacement text_ from the
+/// [resolve](Self::resolve) method. To follow XML specification, that means, that the
+/// following must be done over the text that was captured in the [capture](Self::capture) method:
+/// - EOLs must be normalized according to the XML version for which this resolver was created
+/// - any parameter entity references should be resolved: they should be replaced by their
+/// replacement text
+/// - any character references should be expanded into the corresponding characters
+/// - any references to the other general entities (`&...;`) should be left as is
+///
+/// If the implementation will not parse DTD and just provide values for the general entity
+/// references (which usually custom resolvers will do), then just know, that any returned
+/// text will be considered as a replacement text as required by the XML specification.
+/// One consequence of this: if you want to have literal `<` and `&` characters in the text,
+/// you should use escape form of them, either as character reference or as entity reference.
+/// Otherwise they will be considered as part of the markup.
+///
+/// [general entities]: https://www.w3.org/TR/xml11/#gen-entity
+/// [parameter entities]: https://www.w3.org/TR/xml11/#dt-PE
+pub trait EntityResolver<'i> {
+ /// The error type that represents DTD parse error.
+ type CaptureError: Error + 'static;
+
+ /// Called on contents of [`Event::DocType`] to capture declared entities.
+ /// Can be called multiple times, for each parsed `<!DOCTYPE>` declaration.
+ ///
+ /// [`Event::DocType`]: crate::reader::Event::DocType
+ fn capture(&mut self, doctype: BytesText<'i>) -> Result<(), Self::CaptureError>;
+
+ /// Called when an entity needs to be resolved. Returns entity's [replacement text].
+ ///
+ /// `None` is returned if a suitable value can not be found.
+ /// In that case an [`Error::UnrecognizedGeneralEntity`] will be returned by a reader.
+ ///
+ /// [replacement text]: https://www.w3.org/TR/xml11/#dt-repltext
+ /// [`Error::UnrecognizedGeneralEntity`]: crate::errors::Error::UnrecognizedGeneralEntity
+ fn resolve<'e>(&self, entity: &str) -> Option>;
+}
+
+/// An [`EntityResolver`] that resolves only predefined entities, as defined in [specification]:
+///
+/// | Entity | Resolution
+/// |--------|------------
+/// |`&lt;`  | `&#60;` (note: not `<`)
+/// |`&gt;`  | `>`
+/// |`&amp;` | `&#38;` (note: not `&`)
+/// |`&apos;`| `'`
+/// |`&quot;`| `"`
+///
+/// The replacement texts of `lt` and `amp` are character references, because
+/// a replacement text is parsed as a part of the document and literal `<` and `&`
+/// would be considered as a start of markup.
+///
+/// This is the default resolver for reader and deserializer.
+///
+/// [specification]: https://www.w3.org/TR/xml11/#sec-predefined-ent
+#[derive(Default, Debug, Copy, Clone)]
+pub struct PredefinedEntityResolver;
+
+/// The resolver is stateless, so it serves as a factory of itself.
+impl<'i> EntityResolverFactory<'i> for PredefinedEntityResolver {
+    type CaptureError = Infallible;
+    type Resolver = Self;
+
+    #[inline]
+    fn new_resolver(&mut self) -> Self::Resolver {
+        *self
+    }
+}
+
+impl<'i> EntityResolver<'i> for PredefinedEntityResolver {
+    type CaptureError = Infallible;
+
+    /// Ignores the DTD: only predefined entities are known to this resolver.
+    #[inline]
+    fn capture(&mut self, _doctype: BytesText<'i>) -> Result<(), Self::CaptureError> {
+        Ok(())
+    }
+
+    /// Returns the replacement text of one of the five predefined entities
+    /// and `None` for any other name.
+    #[inline]
+    fn resolve<'e>(&self, entity: &str) -> Option> {
+        let replacement_text = match entity {
+            // `<` and `&` must stay escaped in the replacement text, otherwise
+            // they would be recognized as markup when the text is parsed
+            "lt" => "&#60;",
+            "gt" => ">",
+            "amp" => "&#38;",
+            "apos" => "'",
+            "quot" => "\"",
+            _ => return None,
+        };
+        Some(ReplacementText::Internal(Cow::Borrowed(
+            replacement_text.as_bytes(),
+        )))
+    }
+}
diff --git a/tests/serde-de-references.rs b/tests/serde-de-references.rs
new file mode 100644
index 000000000..548b122b5
--- /dev/null
+++ b/tests/serde-de-references.rs
@@ -0,0 +1,94 @@
+use std::borrow::Cow;
+use std::convert::Infallible;
+
+use quick_xml::de::Deserializer;
+use quick_xml::events::BytesText;
+use quick_xml::reader::{EntityResolver, EntityResolverFactory, ReplacementText};
+
+use pretty_assertions::assert_eq;
+use serde::Deserialize;
+
+/// Entity resolver for the test which remembers whether `capture` was called.
+#[derive(Clone, Copy)]
+struct TestEntityResolver {
+ /// Set to `true` by `capture` and checked by `resolve`.
+ capture_called: bool,
+}
+
+/// The test resolver serves as a factory of itself: each new resolver is a copy
+/// of the current state of the factory.
+impl<'i> EntityResolverFactory<'i> for TestEntityResolver {
+    type CaptureError = Infallible;
+    type Resolver = Self;
+
+    fn new_resolver(&mut self) -> Self::Resolver {
+        Self {
+            capture_called: self.capture_called,
+        }
+    }
+}
+
+impl<'i> EntityResolver<'i> for TestEntityResolver {
+ type CaptureError = Infallible;
+
+ /// Records that the DTD was seen; the content of the DTD is ignored.
+ fn capture(&mut self, _doctype: BytesText) -> Result<(), Self::CaptureError> {
+ self.capture_called = true;
+ Ok(())
+ }
+
+ /// Resolves any entity name: `text` gets its own replacement text, all other
+ /// names get a replacement text that in turn references `&text;`.
+ ///
+ /// Panics if `capture` was not called before.
+ fn resolve<'e>(&self, entity: &str) -> Option> {
+ assert!(
+ self.capture_called,
+ "`EntityResolver::capture` should be called before `EntityResolver::resolve(\"{}\")`",
+ entity,
+ );
+
+ // NOTE(review): `dbg!` looks like a debugging leftover — consider removing it
+ match dbg!(entity) {
+ "text" => Some(ReplacementText::Internal(Cow::Borrowed(
+ b" ",
+ ))),
+ _ => Some(ReplacementText::Internal(Cow::Borrowed(
+ b"
+ &text;
+
+ ",
+ ))),
+ }
+ }
+}
+
+/// The root element of the test document with two child elements.
+#[derive(Debug, PartialEq, Deserialize)]
+struct Root {
+ child1: Child1,
+ child2: (),
+}
+
+/// The first child of [`Root`]: an element with one attribute and text content.
+#[derive(Debug, PartialEq, Deserialize)]
+struct Child1 {
+ /// Mapped from the XML attribute named `attribute`.
+ #[serde(rename = "@attribute")]
+ attribute: String,
+
+ /// Mapped from the text content of the element.
+ #[serde(rename = "$text")]
+ text: String,
+}
+
+/// Checks that the deserializer created with a custom resolver expands the
+/// `&entity;` reference of the input and produces the expected [`Root`].
+#[test]
+fn entities() {
+ let mut de = Deserializer::from_str_with_resolver(
+ "
+
+ &entity;
+ ",
+ TestEntityResolver {
+ capture_called: false,
+ },
+ );
+
+ let data = Root::deserialize(&mut de).unwrap();
+
+ // The whole input must be consumed by the deserialization
+ de.check_eof_reached();
+ assert_eq!(
+ data,
+ Root {
+ child1: Child1 {
+ attribute: "".to_string(),
+ text: " second text ".to_string(),
+ },
+ child2: (),
+ }
+ );
+}
diff --git a/tests/serde-de.rs b/tests/serde-de.rs
index 7f1850a3a..10a6e1be7 100644
--- a/tests/serde-de.rs
+++ b/tests/serde-de.rs
@@ -1,4 +1,3 @@
-use quick_xml::de::Deserializer;
use quick_xml::utils::{ByteBuf, Bytes};
use quick_xml::DeError;
@@ -1827,75 +1826,6 @@ mod borrow {
}
}
-/// Test for entity resolver
-mod resolve {
- use super::*;
- use pretty_assertions::assert_eq;
- use quick_xml::de::EntityResolver;
- use quick_xml::events::BytesText;
- use std::collections::BTreeMap;
- use std::convert::Infallible;
- use std::iter::FromIterator;
-
- struct TestEntityResolver {
- capture_called: bool,
- }
-
- impl EntityResolver for TestEntityResolver {
- type Error = Infallible;
-
- fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> {
- self.capture_called = true;
-
- assert_eq!(doctype.as_ref(), br#"dict[ ]"#);
-
- Ok(())
- }
-
- fn resolve(&self, entity: &str) -> Option<&str> {
- assert!(
- self.capture_called,
- "`EntityResolver::capture` should be called before `EntityResolver::resolve`"
- );
- match entity {
- "t1" => Some("test_one"),
- "t2" => Some("test_two"),
- _ => None,
- }
- }
- }
-
- #[test]
- fn resolve_custom_entity() {
- let resolver = TestEntityResolver {
- capture_called: false,
- };
- let mut de = Deserializer::with_resolver(
- br#"
- ]>
-
-
- &t1;
- &t2;
- non-entity
-
- "#
- .as_ref(),
- resolver,
- );
-
- let data: BTreeMap = BTreeMap::deserialize(&mut de).unwrap();
- assert_eq!(
- data,
- BTreeMap::from_iter([
- (String::from("entity_one"), String::from("test_one")),
- (String::from("entity_two"), String::from("test_two")),
- (String::from("entity_three"), String::from("non-entity")),
- ])
- );
- }
-}
-
/// Tests for https://github.com/tafia/quick-xml/pull/603.
///
/// According to comments,