From e8e249561da9d2fdaf6063b1884226c9117521b0 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 23 Nov 2024 00:29:52 +0500 Subject: [PATCH 01/11] Implement a high-level XmlReader which is able to parse XML consisted of several XML sources TODO: remaining questions about namespace resolving --- Changelog.md | 7 + compare/benches/low-level.rs | 22 ++- src/errors.rs | 11 ++ src/lib.rs | 7 +- src/reader/event.rs | 44 +++++ src/reader/mod.rs | 339 ++++++++++++++++++++++++++++++++++- src/reader/resolver.rs | 155 ++++++++++++++++ 7 files changed, 580 insertions(+), 5 deletions(-) create mode 100644 src/reader/event.rs create mode 100644 src/reader/resolver.rs diff --git a/Changelog.md b/Changelog.md index 56b6e81f..21ddbb7f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -14,6 +14,8 @@ ## Unreleased +A new `XmlReader` type was added that automatically resolves general entity references. + ### New Features - [#938]: Add new enumeration `XmlVersion` and typified getter `BytesDecl::xml_version()`. @@ -33,6 +35,10 @@ Deprecated functions now behaves the same as newly added. +- [#948]: Add `quick_xml::reader::EntityResolver` which is able to resolve external entities. +- [#948]: Add `quick_xml::reader::XmlReader`, a new high-level reader which should be preferred + over the old `Reader`. + ### Bug Fixes - [#938]: Use correct rules for EOL normalization in `Deserializer` when parse XML 1.0 documents. 
@@ -51,6 +57,7 @@ [#914]: https://github.com/tafia/quick-xml/pull/914 [#938]: https://github.com/tafia/quick-xml/pull/938 [#944]: https://github.com/tafia/quick-xml/pull/944 +[#948]: https://github.com/tafia/quick-xml/pull/948 ## 0.39.2 -- 2026-02-20 diff --git a/compare/benches/low-level.rs b/compare/benches/low-level.rs index 631fc0c6..cf475ab7 100644 --- a/compare/benches/low-level.rs +++ b/compare/benches/low-level.rs @@ -1,7 +1,7 @@ use criterion::{self, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use pretty_assertions::assert_eq; use quick_xml::events::Event; -use quick_xml::reader::Reader; +use quick_xml::reader::{self, Reader, XmlReader}; use std::hint::black_box; use xml::reader::{EventReader, XmlEvent}; @@ -94,6 +94,26 @@ fn low_level_comparison(c: &mut Criterion) { }, ); + group.bench_with_input( + BenchmarkId::new("quick_xml:reader", filename), + *data, + |b, input| { + b.iter(|| { + let mut reader = XmlReader::from_str(input); + // TODO: reader.config_mut().check_end_names = false; + let mut count = black_box(0); + loop { + match reader.read_event() { + Ok(reader::Event::Start(_)) | Ok(reader::Event::Empty(_)) => count += 1, + Ok(reader::Event::Eof) => break, + _ => (), + } + } + assert_eq!(count, total_tags, "Overall tag count in {}", filename); + }) + }, + ); + group.bench_with_input( BenchmarkId::new("maybe_xml:0.10", filename), *data, diff --git a/src/errors.rs b/src/errors.rs index 9002f047..49e18ae6 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -206,6 +206,13 @@ pub enum Error { Escape(EscapeError), /// Parsed XML has some namespace-related problems Namespace(NamespaceError), + /// The error returned by [`EntityResolver::capture`](crate::reader::EntityResolver::capture). + DoctypeParse(Arc), + /// Entity reference was not resolved to the entity; [`EntityResolver::resolve`] returned `None`. + /// Contains the name of entity without `&` and `;`. 
+ /// + /// [`EntityResolver::resolve`]: crate::reader::EntityResolver::resolve + UnrecognizedGeneralEntity(String), } impl Error { @@ -284,6 +291,8 @@ impl fmt::Display for Error { Self::Encoding(e) => e.fmt(f), Self::Escape(e) => e.fmt(f), Self::Namespace(e) => e.fmt(f), + Self::DoctypeParse(e) => write!(f, "cannot parse DTD: {}", e), + Self::UnrecognizedGeneralEntity(e) => write!(f, "unrecognized general entity `{}`", e), } } } @@ -298,6 +307,8 @@ impl std::error::Error for Error { Self::Encoding(e) => Some(e), Self::Escape(e) => Some(e), Self::Namespace(e) => Some(e), + Self::DoctypeParse(e) => Some(e), + Self::UnrecognizedGeneralEntity(_) => None, } } } diff --git a/src/lib.rs b/src/lib.rs index bd37f91d..7a434855 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,8 @@ //! The user has to explicitly _ask_ for the next XML event, similar to a database cursor. //! This is achieved by the following two structs: //! +//! - [`XmlReader`]: A high level XML pull-reader which resolves entities and can process several +//! XML sources if you provide them. //! - [`Reader`]: A low level XML pull-reader where buffer allocation/clearing is left to user. //! - [`Writer`]: A XML writer. Can be nested with readers if you want to transform XMLs. //! @@ -27,7 +29,8 @@ //! //! # Examples //! -//! - For a reading example see [`Reader`] +//! - For a reading example see [`XmlReader`] +//! - For a low-level reading example see [`Reader`] //! - For a writing example see [`Writer`] //! //! 
# Features @@ -78,7 +81,7 @@ pub use crate::encoding::Decoder; #[cfg(feature = "serialize")] pub use crate::errors::serialize::{DeError, SeError}; pub use crate::errors::{Error, Result}; -pub use crate::reader::{NsReader, Reader}; +pub use crate::reader::{NsReader, Reader, XmlReader}; pub use crate::writer::{ElementWriter, Writer}; /// Version of XML standard diff --git a/src/reader/event.rs b/src/reader/event.rs new file mode 100644 index 00000000..893b17b4 --- /dev/null +++ b/src/reader/event.rs @@ -0,0 +1,44 @@ +use crate::events::{BytesCData, BytesEnd, BytesPI, BytesStart, BytesText}; + +/// Event emitted by [`XmlReader::read_event`]. +/// +/// # Lifetime +/// +/// The `'i` lifetime of this enum is the lifetime of data that may be borrowed +/// from the XML input (when reader of the main document reads from `&[u8]` or `&str`). +/// +/// [`XmlReader::read_event`]: crate::reader::XmlReader::read_event +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum Event<'i> { + /// Empty element tag (with attributes) `<tag attr="value" />`. + Empty(BytesStart<'i>), + /// Start tag (with attributes) `<tag attr="value">`. + Start(BytesStart<'i>), + /// End tag `</tag>`. + End(BytesEnd<'i>), + /// Character data between `Start` and `End` element. + Text(BytesText<'i>), + /// CData `<![CDATA[...]]>`. + CData(BytesCData<'i>), + /// Processing instruction `<?...?>`. + PI(BytesPI<'i>), + /// End of XML document. + Eof, +} + +impl<'i> Event<'i> { + /// Ensures that all data is owned to extend the object's lifetime if necessary. 
+ #[inline] + pub fn into_owned(self) -> Event<'static> { + match self { + Self::Empty(e) => Event::Empty(e.into_owned()), + Self::Start(e) => Event::Start(e.into_owned()), + Self::End(e) => Event::End(e.into_owned()), + Self::Text(e) => Event::Text(e.into_owned()), + Self::CData(e) => Event::CData(e.into_owned()), + Self::PI(e) => Event::PI(e.into_owned()), + Self::Eof => Event::Eof, + } + } +} diff --git a/src/reader/mod.rs b/src/reader/mod.rs index b8a569b2..87a335b8 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -2,14 +2,20 @@ #[cfg(feature = "encoding")] use encoding_rs::Encoding; -use std::io; +use std::borrow::Cow; +use std::collections::VecDeque; +use std::fmt; +use std::io::{self, BufRead, Cursor}; use std::ops::Range; +use std::sync::Arc; use crate::encoding::Decoder; use crate::errors::{Error, IllFormedError, SyntaxError}; -use crate::events::{BytesRef, Event}; +use crate::escape::{parse_number, EscapeError}; +use crate::events::{BytesRef, BytesText, Event}; use crate::parser::{DtdParser, ElementParser, Parser, PiParser}; use crate::reader::state::ReaderState; +use crate::XmlVersion; /// A struct that holds a parser configuration. /// @@ -250,6 +256,14 @@ impl Default for Config { //////////////////////////////////////////////////////////////////////////////////////////////////// +mod event; +mod resolver; + +pub use event::Event as XmlEvent; +pub use resolver::{ + EntityResolver, EntityResolverFactory, PredefinedEntityResolver, ReplacementText, +}; + macro_rules! 
read_event_impl { ( $self:ident, $buf:ident, @@ -1228,6 +1242,327 @@ impl BangType { //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Result of reading event by the underlying reader +enum ReadEvent<'i> { + /// Upper-level reader should skip event and request another one from the underlying reader + Skip, + Event(XmlEvent<'i>), + ExternalEntity(Reader>), +} + +impl<'i> fmt::Debug for ReadEvent<'i> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Skip => f.write_str("Skip"), + Self::Event(r) => write!(f, "Event({:?})", r), + Self::ExternalEntity(r) => write!(f, "ExternalEntity({:p})", &r), + } + } +} + +enum EntityReader<'i, 'e> { + /// Reader of internal entity, i.e. the entity defined in the same source as + /// a main document, that returns borrowed events. + InternalBorrowed(Reader<&'i [u8]>), + /// Reader of internal entity, i.e. the entity defined in the same source as + /// a main document, that returns owned events. + InternalOwned(Reader>), + /// Reader of external entity, i.e. the entity defined in the different from the main document + /// source. 
+ External(Reader>), +} + +impl<'i, 'e> fmt::Debug for EntityReader<'i, 'e> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InternalBorrowed(r) => r.fmt(f), + Self::InternalOwned(r) => write!(f, "InternalOwned({:p})", &r), + Self::External(r) => write!(f, "External({:p})", &r), + } + } +} + +impl<'i, 'e> EntityReader<'i, 'e> { + const fn config(&self) -> &Config { + match self { + Self::InternalBorrowed(r) => r.config(), + Self::InternalOwned(r) => r.config(), + Self::External(r) => r.config(), + } + } + + const fn decoder(&self) -> Decoder { + match self { + Self::InternalBorrowed(r) => r.decoder(), + Self::InternalOwned(r) => r.decoder(), + Self::External(r) => r.decoder(), + } + } + + const fn buffer_position(&self) -> u64 { + match self { + Self::InternalBorrowed(r) => r.buffer_position(), + Self::InternalOwned(r) => r.buffer_position(), + Self::External(r) => r.buffer_position(), + } + } + + const fn error_position(&self) -> u64 { + match self { + Self::InternalBorrowed(r) => r.error_position(), + Self::InternalOwned(r) => r.error_position(), + Self::External(r) => r.error_position(), + } + } + + fn read_event(&mut self, buf: &mut Vec) -> Result, Error> { + match self { + Self::InternalBorrowed(r) => r.read_event(), + Self::InternalOwned(r) => Ok(r.read_event_into(buf)?.into_owned()), + Self::External(r) => Ok(r.read_event_into(buf)?.into_owned()), + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// One single parse unit, for example, a file. In XML specification it is called +/// _entity_, but we avoid calling it that as it may lead to confusion. Under _entity_ +/// XML consumers usually mean thing that in specification called "entity reference". +/// +/// Cite from the [specification]: +/// +/// > Each XML document has both a logical and a physical structure. +/// > Physically, the document is composed of units called **entities**. 
+/// > An entity may refer to other entities to cause their inclusion in the document. +/// > A document begins in a "root" or document entity. Logically, the document +/// > is composed of declarations, elements, comments, character references, +/// > and processing instructions, all of which are indicated in the document +/// > by explicit markup. The logical and physical structures MUST nest properly, +/// > as described in 4.3.2 Well-Formed Parsed Entities. +/// +/// [Also]: +/// > An XML document may consist of one or many storage units. These are called +/// > _entities_; they all have content and are all (except for the document entity +/// > and the external DTD subset) identified by entity **name**. +/// +/// # Lifetimes +/// The `'i` lifetime stands for "input" and is a lifetime of a document entity, +/// i.e. the source which the end-user requested to parse. +/// +/// The `'e` lifetime stands for "external" and it is a lifetime of an external source +/// which the end-user requested to parse. +/// +/// [specification]: https://www.w3.org/TR/xml11/#sec-documents +/// [Also]: https://www.w3.org/TR/xml11/#dt-entity +#[derive(Debug)] +struct StorageUnit<'i, 'e, E> { + /// Readers used to produce events from this entity. + parts: VecDeque>, + + /// Version of XML standard used by this storage unit. + version: XmlVersion, + + /// Used to resolve unknown entities that would otherwise cause the parser + /// to return an [`Error::UnrecognizedGeneralEntity`] error. + entity_resolver: E, +} +impl<'i, 'e, E> StorageUnit<'i, 'e, E> +where + E: EntityResolver<'i>, +{ + fn new(part: EntityReader<'i, 'e>, entity_resolver: E) -> Self { + Self { + parts: VecDeque::from([part]), + version: XmlVersion::V1_0, + entity_resolver, + } + } + + fn read_event(&mut self, buf: &mut Vec) -> Result, Error> { + while let Some(part) = self.parts.back_mut() { + let event = match part.read_event(buf)? 
{ + Event::Decl(e) => { + self.version = e.xml_version()?; + ReadEvent::Skip + } + Event::Comment(_) => ReadEvent::Skip, + + Event::DocType(doctype) => { + self.entity_resolver + .capture(doctype) + .map_err(|e| Error::DoctypeParse(Arc::new(e)))?; + ReadEvent::Skip + } + Event::GeneralRef(e) => { + let reference = part.decoder().decode(&e)?; + if let Some(num) = reference.strip_prefix('#') { + let codepoint = parse_number(num).map_err(EscapeError::InvalidCharRef)?; + let mut bytes = [0u8; 4]; + let text = BytesText::wrap( + codepoint.encode_utf8(&mut bytes).as_bytes(), + Decoder::utf8(), + ); + return Ok(ReadEvent::Event(XmlEvent::Text(text.into_owned()))); + } + match self.entity_resolver.resolve(reference.as_ref()) { + Some(ReplacementText::Internal(Cow::Borrowed(entity))) => { + let mut nested = Reader::from_reader(entity); + *nested.config_mut() = part.config().clone(); + self.parts.push_back(EntityReader::InternalBorrowed(nested)); + continue; + } + Some(ReplacementText::Internal(Cow::Owned(entity))) => { + let boxed: Box = Box::new(Cursor::new(entity)); + let mut nested = Reader::from_reader(boxed); + *nested.config_mut() = part.config().clone(); + self.parts.push_back(EntityReader::InternalOwned(nested)); + continue; + } + Some(ReplacementText::External(source)) => { + let mut external = Reader::from_reader(source); + *external.config_mut() = part.config().clone(); + ReadEvent::ExternalEntity(external) + } + _ => return Err(Error::UnrecognizedGeneralEntity(reference.into_owned())), + } + } + + Event::Empty(e) => ReadEvent::Event(XmlEvent::Empty(e)), + Event::Start(e) => ReadEvent::Event(XmlEvent::Start(e)), + Event::End(e) => ReadEvent::Event(XmlEvent::End(e)), + Event::Text(e) => ReadEvent::Event(XmlEvent::Text(e)), + Event::CData(e) => ReadEvent::Event(XmlEvent::CData(e)), + Event::PI(e) => ReadEvent::Event(XmlEvent::PI(e)), + Event::Eof => { + self.parts.pop_back(); + continue; + } + }; + return Ok(event); + } + Ok(ReadEvent::Event(XmlEvent::Eof)) + } +} 
+ +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// High-level XML reader which automatically resolves entity references (`&...;`) +/// and can stream events from several physical documents (storage units, called _entities_ +/// in the XML [specification]). +/// +/// # Lifetimes +/// The `'i` lifetime stands for "input" and is a lifetime of a document entity, +/// i.e. the source which the end-user requested to parse, _from which events may borrow_. +/// +/// The `'e` lifetime stands for "external" and it is a lifetime of _any_ entity, +/// that parser may parse, from which events will not borrow data. +/// +/// # Type parameter +/// `EF`: the factory of general entity resolvers. They resolve entities that would +/// otherwise cause the parser to return an [`Error::UnrecognizedGeneralEntity`] error. +/// +/// Note that the same factory creates the resolver for the initial document +/// and for any other document loaded due to entity resolution. +/// +/// [specification]: https://www.w3.org/TR/xml11/#sec-documents +#[derive(Debug)] +pub struct XmlReader<'i, 'e, EF = PredefinedEntityResolver> +where + EF: EntityResolverFactory<'i>, +{ + /// Stack of things that represent individual storage units, such as files. + /// The first element is the initial unit, representing the document which the user + /// wants to parse; the other units are created for each resolved external entity + /// (entity, defined in another storage unit). + units: VecDeque>, + + /// Buffer to which external readers will read data. 
After reading each event + /// data is copied to the event data and buffer is cleared + buffer: Vec, + + entity_resolver_factory: EF, +} + +impl<'i, 'e, EF> XmlReader<'i, 'e, EF> +where + EF: EntityResolverFactory<'i>, +{ + fn new(part: EntityReader<'i, 'e>, mut entity_resolver_factory: EF) -> Self { + let resolver = entity_resolver_factory.new_resolver(); + Self { + units: VecDeque::from([StorageUnit::new(part, resolver)]), + buffer: Vec::new(), + entity_resolver_factory, + } + } + + /// Creates new `XmlReader` from low-level reader and entity resolver factory, which would + /// borrow event data from the source when event represents a piece of the original document. + /// Events from other entities (documents loaded during entity resolution) would + /// own data. + /// + /// For each resolved entity a new [`Reader`] would be created to read entity + /// data. That reader will receive a copy of the configuration that is set for + /// `reader`. If the resolver returns borrowed [`ReplacementText::Internal`], then events + /// from that entity would also borrow from the source, otherwise they will + /// maintain their own copy of data. + pub fn borrowed(reader: Reader<&'i [u8]>, entity_resolver_factory: EF) -> Self { + Self::new( + EntityReader::InternalBorrowed(reader), + entity_resolver_factory, + ) + } + + /// Creates new `XmlReader` from low-level reader and entity resolver factory, where all + /// events would store their own copy of data. + /// + /// For each resolved entity a new [`Reader`] would be created to read entity + /// data. That reader will receive a copy of the configuration that is set for + /// `reader`. + pub fn buffered(reader: Reader>, entity_resolver_factory: EF) -> Self { + Self::new(EntityReader::InternalOwned(reader), entity_resolver_factory) + } + + /// Returns event which, if possible, would borrow from the source and contains + /// a copy of data if borrowing is impossible (for example, event from another + /// document resolved by entity reference). 
+ pub fn read_event(&mut self) -> Result, Error> { + while let Some(unit) = self.units.back_mut() { + self.buffer.clear(); + match unit.read_event(&mut self.buffer)? { + ReadEvent::ExternalEntity(reader) => { + self.units.push_back(StorageUnit::new( + EntityReader::External(reader), + self.entity_resolver_factory.new_resolver(), + )); + continue; + } + ReadEvent::Event(XmlEvent::Eof) => { + self.units.pop_back(); + continue; + } + ReadEvent::Event(event) => return Ok(event), + _ => continue, + } + } + Ok(XmlEvent::Eof) + } +} + +/// This is an implementation for reading from a `&[u8]` as underlying byte stream. +/// This implementation supports not using an intermediate buffer as the byte slice +/// itself can be used to borrow from. +impl<'i, 'e> XmlReader<'i, 'e, PredefinedEntityResolver> { + /// Creates an XML reader from a string slice. + #[allow(clippy::should_implement_trait)] + pub fn from_str(source: &'i str) -> Self { + Self::borrowed(Reader::from_str(source), PredefinedEntityResolver) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod test { /// Checks the internal implementation of the various reader methods diff --git a/src/reader/resolver.rs b/src/reader/resolver.rs new file mode 100644 index 00000000..5cfbde1d --- /dev/null +++ b/src/reader/resolver.rs @@ -0,0 +1,155 @@ +use std::borrow::Cow; +use std::convert::Infallible; +use std::error::Error; +use std::fmt; +use std::io::BufRead; + +use crate::events::BytesText; +use crate::utils::Bytes; + +/// [Replacement text] of the resolved entity reference (`&...;`). +/// +/// [Replacement text]: https://www.w3.org/TR/xml11/#dt-repltext +pub enum ReplacementText<'i, 'e> { + /// Referenced entity inside the same document in the internal DTD. + Internal(Cow<'i, [u8]>), + /// Referenced entity inside the other document which will be read from + /// the specified source. 
+ External(Box), +} +impl<'i, 'e> fmt::Debug for ReplacementText<'i, 'e> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Internal(e) => Bytes(e).fmt(f), + Self::External(e) => write!(f, "", &e), + } + } +} + +/// Used to create entity resolver for each physical document (storage unit or an _[entity]_) +/// that would be parsed by the reader. +/// +/// [entity]: https://www.w3.org/TR/xml11/#sec-documents +pub trait EntityResolverFactory<'i> { + /// The error type that represents DTD parse error. + type CaptureError: Error + 'static; + /// Type that holds state for each entity, for example, for each file, which + /// forms the whole logical structure of the XML document. + type Resolver: EntityResolver<'i, CaptureError = Self::CaptureError>; + + /// Creates state for the new entity parser. + fn new_resolver(&mut self) -> Self::Resolver; +} + +/// Used to resolve unknown [general entities] (`&...;`) while parsing. +/// +/// Note, that this trait is not used to resolve _[parameter entities]_ (`%...;`), they are resolved +/// inside implementation of this trait. Parameter entities cannot be used outside of the `<!DOCTYPE>` +/// declaration, so no need to resolve them in the document. +/// +/// # Example +/// +/// That example is taken from the XML specification. Suppose that we have the following DTD: +/// ```xml +/// <!ENTITY % pub    "&#xc9;ditions Gallimard" > +/// <!ENTITY   rights "All rights reserved" > +/// <!ENTITY   book   "&#xA9; 1947 %pub;. &rights;" > +/// ``` +/// Here we have two defined _internal general entities_ (`rights` and `book`), which may be used +/// everywhere in the document below their definition point (including the DOCTYPE declaration) and +/// one _parameter entity_ (`pub`), which may be used only inside DOCTYPE declaration below its +/// definition point. 
The literal values and replacement texts for those entities are: +/// +/// |Entity|Literal value |Replacement text +/// |------|-----------------------------|------------------------------------ +/// |pub |`&#xc9;ditions Gallimard` |`Éditions Gallimard` +/// |rights|`All rights reserved` |`All rights reserved` +/// |book |`&#xA9; 1947 %pub;. &rights;`|`© 1947 Éditions Gallimard. &rights;` +/// +/// Implementation of the `EntityResolver` must return the _replacement text_ from the +/// [resolve](Self::resolve) method. To follow XML specification, that means, that the +/// following must be done over the text that was captured in the [capture](Self::capture) method: +/// - EOLs must be normalized according to the XML version for which this resolver was created +/// - any parameter entity references should be resolved: they should be replaced by their +/// replacement text +/// - any character references should be expanded into the corresponding characters +/// - any references to the other general entities (`&...;`) should be left as is +/// +/// If the implementation will not parse DTD and just provide values for the general entity +/// references (which usually custom resolvers will do), then just know, that any returned +/// text will be considered as a replacement text as required by the XML specification. +/// One consequence of this: if you want to have literal `<` and `&` characters in the text, +/// you should use escape form of them, either as character reference or as entity reference. +/// Otherwise they will be considered as part of the markup. +/// +/// [general entities]: https://www.w3.org/TR/xml11/#gen-entity +/// [parameter entities]: https://www.w3.org/TR/xml11/#dt-PE +pub trait EntityResolver<'i> { + /// The error type that represents DTD parse error. + type CaptureError: Error + 'static; + + /// Called on contents of [`Event::DocType`] to capture declared entities. + /// Can be called multiple times, for each parsed `<!DOCTYPE>` declaration. 
+ /// + /// [`Event::DocType`]: crate::reader::Event::DocType + fn capture(&mut self, doctype: BytesText<'i>) -> Result<(), Self::CaptureError>; + + /// Called when an entity needs to be resolved. Returns entity's [replacement text]. + /// + /// `None` is returned if a suitable value can not be found. + /// In that case an [`Error::UnrecognizedGeneralEntity`] will be returned by a reader. + /// + /// [replacement text]: https://www.w3.org/TR/xml11/#dt-repltext + /// [`Error::UnrecognizedGeneralEntity`]: crate::errors::Error::UnrecognizedGeneralEntity + fn resolve<'e>(&self, entity: &str) -> Option>; +} + +/// An [`EntityResolver`] that resolves only predefined entities, as defined in [specification]: +/// +/// | Entity | Resolution +/// |--------|------------ +/// |`<` | `<` (note: not `<`) +/// |`>` | `>` +/// |`&` | `&` (note: not `&`) +/// |`'`| `'` +/// |`"`| `"` +/// +/// This is the default resolver for reader and deserializer. +/// +/// [specification]: https://www.w3.org/TR/xml11/#sec-predefined-ent +#[derive(Default, Debug, Copy, Clone)] +pub struct PredefinedEntityResolver; + +impl<'i> EntityResolverFactory<'i> for PredefinedEntityResolver { + type CaptureError = Infallible; + type Resolver = Self; + + #[inline] + fn new_resolver(&mut self) -> Self::Resolver { + *self + } +} + +impl<'i> EntityResolver<'i> for PredefinedEntityResolver { + type CaptureError = Infallible; + + #[inline] + fn capture(&mut self, _doctype: BytesText<'i>) -> Result<(), Self::CaptureError> { + Ok(()) + } + + #[inline] + fn resolve<'e>(&self, entity: &str) -> Option> { + let replacement_text = match entity { + "lt" => "<", + "gt" => ">", + "amp" => "&", + "apos" => "'", + "quot" => "\"", + _ => return None, + }; + Some(ReplacementText::Internal(Cow::Borrowed( + replacement_text.as_bytes(), + ))) + } +} From 4a12a7f8dbeba084211876bdaa9becb6f7fc41e5 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 15 Nov 2025 23:51:36 +0500 Subject: [PATCH 02/11] Support namespaces in the new 
XmlReader --- src/reader/mod.rs | 89 ++++++++++++++++++++++++++++++++++++++++- src/reader/ns_reader.rs | 40 +++++++++++++++++- 2 files changed, 127 insertions(+), 2 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 87a335b8..5d5dc7db 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -13,6 +13,7 @@ use crate::encoding::Decoder; use crate::errors::{Error, IllFormedError, SyntaxError}; use crate::escape::{parse_number, EscapeError}; use crate::events::{BytesRef, BytesText, Event}; +use crate::name::NamespaceResolver; use crate::parser::{DtdParser, ElementParser, Parser, PiParser}; use crate::reader::state::ReaderState; use crate::XmlVersion; @@ -1363,6 +1364,13 @@ struct StorageUnit<'i, 'e, E> { /// Version of XML standard used by this storage unit. version: XmlVersion, + /// A buffer to manage namespaces + ns_resolver: NamespaceResolver, + /// We cannot pop data from the namespace stack until returned `Empty` or `End` + /// event will be processed by the user, so we only mark that we should that + /// in the next [`Self::read_event()`] call. + pending_ns_pop: bool, + /// Used to resolve unknown entities that would otherwise cause the parser /// to return an [`Error::UnrecognizedGeneralEntity`] error. entity_resolver: E, @@ -1375,11 +1383,13 @@ where Self { parts: VecDeque::from([part]), version: XmlVersion::V1_0, + ns_resolver: NamespaceResolver::default(), + pending_ns_pop: false, entity_resolver, } } - fn read_event(&mut self, buf: &mut Vec) -> Result, Error> { + fn read_event_impl(&mut self, buf: &mut Vec) -> Result, Error> { while let Some(part) = self.parts.back_mut() { let event = match part.read_event(buf)? 
{ Event::Decl(e) => { @@ -1443,6 +1453,47 @@ where } Ok(ReadEvent::Event(XmlEvent::Eof)) } + + fn read_event(&mut self, buf: &mut Vec) -> Result, Error> { + self.pop(); + let event = self.read_event_impl(buf); + self.process_event(event) + } + + #[inline] + fn pop(&mut self) { + if self.pending_ns_pop { + self.ns_resolver.pop(); + self.pending_ns_pop = false; + } + } + + #[inline] + fn process_event( + &mut self, + event: Result, Error>, + ) -> Result, Error> { + match event { + Ok(ReadEvent::Event(XmlEvent::Start(e))) => { + self.ns_resolver.push(&e)?; + Ok(ReadEvent::Event(XmlEvent::Start(e))) + } + Ok(ReadEvent::Event(XmlEvent::Empty(e))) => { + self.ns_resolver.push(&e)?; + // notify next `read_event()` invocation that it needs to pop this + // namespace scope + self.pending_ns_pop = true; + Ok(ReadEvent::Event(XmlEvent::Empty(e))) + } + Ok(ReadEvent::Event(XmlEvent::End(e))) => { + // notify next `read_event()` invocation that it needs to pop this + // namespace scope + self.pending_ns_pop = true; + Ok(ReadEvent::Event(XmlEvent::End(e))) + } + e => e, + } + } } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1524,6 +1575,31 @@ where Self::new(EntityReader::InternalOwned(reader), entity_resolver_factory) } + /// The same, as [`borrowed`](Self::borrowed), but creates from namespace-aware reader. + /// The state of the reader will be preserved. + pub fn borrowed_ns(reader: NsReader<&'i [u8]>, mut entity_resolver_factory: EF) -> Self { + let resolver = entity_resolver_factory.new_resolver(); + Self { + units: VecDeque::from([reader.to_borrowed_storage_unit(resolver)]), + buffer: Vec::new(), + entity_resolver_factory, + } + } + + /// The same, as [`buffered`](Self::buffered), but creates from namespace-aware reader. + /// The state of the reader will be preserved. 
+ pub fn buffered_ns( + reader: NsReader>, + mut entity_resolver_factory: EF, + ) -> Self { + let resolver = entity_resolver_factory.new_resolver(); + Self { + units: VecDeque::from([reader.to_buffered_storage_unit(resolver)]), + buffer: Vec::new(), + entity_resolver_factory, + } + } + /// Returns event which, if possible, would borrow from the source and contains /// a copy of data if borrowing is impossible (for example, event from another /// document resolved by entity reference). @@ -1548,6 +1624,17 @@ where } Ok(XmlEvent::Eof) } + + /// Returns a storage of namespace bindings associated with this reader. + /// + /// Note, that this object may change after reading new event, if new event + /// will be from the new storage unit. That is possible only if custom + /// [`EntityResolver`] is used. + #[inline] + pub fn resolver(&self) -> &NamespaceResolver { + // FIXME(review): `read_event()` pops the last unit before returning the final `Eof`, so this `unwrap()` panics afterwards + &self.units.back().unwrap().ns_resolver + } } /// This is an implementation for reading from a `&[u8]` as underlying byte stream. diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 46858cc8..ceca86eb 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -4,6 +4,7 @@ //! [qualified names]: https://www.w3.org/TR/xml-names11/#dt-qualname //! [expanded names]: https://www.w3.org/TR/xml-names11/#dt-expname +use std::collections::VecDeque; use std::fs::File; use std::io::{BufRead, BufReader}; use std::ops::Deref; @@ -12,7 +13,8 @@ use std::path::Path; use crate::errors::Result; use crate::events::{BytesText, Event}; use crate::name::{NamespaceResolver, QName, ResolveResult}; -use crate::reader::{Config, EntityReader, EntityResolver, Reader, Span, XmlSource}; +use crate::reader::{Config, EntityReader, EntityResolver, Reader, Span, StorageUnit, XmlSource}; +use crate::XmlVersion; /// A low level encoding-agnostic XML event reader that performs namespace resolution. 
/// @@ -751,6 +753,42 @@ impl<'i> NsReader<&'i [u8]> { self.ns_resolver.pop(); Ok(result) } + + /// Converts this reader with its state into the storage unit for the [`XmlReader`](super::XmlReader). + pub(super) fn to_borrowed_storage_unit<'e, E>( + self, + entity_resolver: E, + ) -> StorageUnit<'i, 'e, E> + where + E: EntityResolver<'i>, + { + StorageUnit { + parts: VecDeque::from([EntityReader::InternalBorrowed(self.reader)]), + version: XmlVersion::V1_0, + ns_resolver: self.ns_resolver, + pending_ns_pop: self.pending_pop, + entity_resolver, + } + } +} + +impl<'i> NsReader> { + /// Converts this reader with its state into the storage unit for the [`XmlReader`](super::XmlReader). + pub(super) fn to_buffered_storage_unit<'e, E>( + self, + entity_resolver: E, + ) -> StorageUnit<'i, 'e, E> + where + E: EntityResolver<'i>, + { + StorageUnit { + parts: VecDeque::from([EntityReader::InternalOwned(self.reader)]), + version: XmlVersion::V1_0, + ns_resolver: self.ns_resolver, + pending_ns_pop: self.pending_pop, + entity_resolver, + } + } } impl Deref for NsReader { From 2bb5ff1e8b17d03300049497a82d7b495bd3b993 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 23 Jun 2024 02:15:33 +0500 Subject: [PATCH 03/11] Add a new example which demonstrates the new XmlReader API --- examples/high-level-entities.rs | 224 ++++++++++++++++++ ...stom_entities.rs => low-level-entities.rs} | 6 +- 2 files changed, 226 insertions(+), 4 deletions(-) create mode 100644 examples/high-level-entities.rs rename examples/{custom_entities.rs => low-level-entities.rs} (97%) diff --git a/examples/high-level-entities.rs b/examples/high-level-entities.rs new file mode 100644 index 00000000..f42c5025 --- /dev/null +++ b/examples/high-level-entities.rs @@ -0,0 +1,224 @@ +//! This example demonstrate how custom entities can be extracted from the DOCTYPE +//! and usage of the high-level `Reader` API. +//! +//! NB: this example is deliberately kept simple: +//! * the regex in this example is simple but brittle. 
+ +use std::borrow::Cow; +use std::collections::HashMap; +use std::convert::Infallible; +use std::fmt; +use std::io::{BufRead, Cursor}; + +use quick_xml::events::{BytesEnd, BytesStart, BytesText}; +use quick_xml::reader::{ + EntityResolver, EntityResolverFactory, Reader, ReplacementText, XmlEvent, XmlReader, +}; +use regex::bytes::Regex; + +use pretty_assertions::assert_eq; + +const XML1: &str = r#" + +" > + &element1; " > +]> +&element2; +&external; +"#; + +/// Additional document which in reality would be referenced by +/// `` +const XML2: &str = r#" + +text +"#; + +struct MyResolver<'i> { + /// Map of captured internal _parsed general entities_. _Parsed_ means that + /// value of the entity is parsed by XML reader. + entities: HashMap, Cow<'i, [u8]>>, + /// In this example we use simple regular expression to capture entities from DTD. + /// In real application you should use DTD parser. + entity_re: Regex, +} +impl<'i> fmt::Debug for MyResolver<'i> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_map() + .entries(self.entities.iter().map(|(k, v)| { + ( + std::str::from_utf8(k).unwrap(), + std::str::from_utf8(v).unwrap(), + ) + })) + .finish() + } +} + +impl<'i> MyResolver<'i> { + fn new() -> Result { + Ok(Self { + entities: Default::default(), + // Capture "name" and "content" from such string: + // + entity_re: Regex::new(r#""#)?, + }) + } + fn capture_borrowed(&mut self, doctype: &'i [u8]) { + for cap in self.entity_re.captures_iter(doctype) { + self.entities.insert( + cap.get(1).unwrap().as_bytes().into(), + cap.get(2).unwrap().as_bytes().into(), + ); + } + } + fn capture_owned(&mut self, doctype: Vec) { + for cap in self.entity_re.captures_iter(&doctype) { + self.entities.insert( + cap.get(1).unwrap().as_bytes().to_owned().into(), + cap.get(2).unwrap().as_bytes().to_owned().into(), + ); + } + } +} + +impl<'i> EntityResolverFactory<'i> for MyResolver<'i> { + type CaptureError = Infallible; + type Resolver = Self; + + fn 
new_resolver(&mut self) -> Self::Resolver { + // We use valid regex so cannot fail + Self::new().unwrap() + } +} + +impl<'i> EntityResolver<'i> for MyResolver<'i> { + type CaptureError = Infallible; + + fn capture(&mut self, doctype: BytesText<'i>) -> Result<(), Self::CaptureError> { + dbg!(&doctype); + match doctype.into_inner() { + Cow::Borrowed(doctype) => self.capture_borrowed(doctype), + Cow::Owned(doctype) => self.capture_owned(doctype), + } + dbg!(self); + Ok(()) + } + + fn resolve<'e>(&self, entity: &str) -> Option> { + dbg!((entity, self)); + if entity == "external" { + return Some(ReplacementText::External(Box::new(Cursor::new( + XML2.as_bytes(), + )))); + } + match self.entities.get(entity.as_bytes()) { + Some(replacement) => Some(ReplacementText::Internal(replacement.clone())), + None => None, + } + } +} + +/// In this example the events will borrow from the first document +fn borrowed() -> Result<(), Box> { + let mut reader = Reader::from_str(XML1); + reader.config_mut().trim_text(true); + + let mut r = XmlReader::borrowed(reader, MyResolver::new()?); + + assert_eq!( + r.read_event()?, + XmlEvent::Start(BytesStart::from_content( + r#"test label="Message: &text;""#, + 4 + )) + ); + + //-------------------------------------------------------------------------- + // This part was inserted into original document from entity defined in DTD + assert_eq!(r.read_event()?, XmlEvent::Start(BytesStart::new("a"))); + assert_eq!( + r.read_event()?, + XmlEvent::Empty(BytesStart::from_content( + r#"dtd attr = 'Message: &text;'"#, + 3 + )) + ); + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("a"))); + //-------------------------------------------------------------------------- + + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("test"))); + + //-------------------------------------------------------------------------- + // Start of external document + assert_eq!( + r.read_event()?, + XmlEvent::Start(BytesStart::new("external")) + ); + 
assert_eq!(r.read_event()?, XmlEvent::Text(BytesText::new("text"))); + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("external"))); + //-------------------------------------------------------------------------- + + assert_eq!(r.read_event()?, XmlEvent::Eof); + + Ok(()) +} + +/// In this example the events will always copy data +fn buffered() -> Result<(), Box> { + let boxed: Box = Box::new(Cursor::new(XML1.as_bytes())); + let mut reader = Reader::from_reader(boxed); + reader.config_mut().trim_text(true); + + let mut r = XmlReader::buffered(reader, MyResolver::new()?); + + assert_eq!( + r.read_event()?, + XmlEvent::Start(BytesStart::from_content( + r#"test label="Message: &text;""#, + 4 + )) + ); + + //-------------------------------------------------------------------------- + // This part was inserted into original document from entity defined in DTD + assert_eq!(r.read_event()?, XmlEvent::Start(BytesStart::new("a"))); + assert_eq!( + r.read_event()?, + XmlEvent::Empty(BytesStart::from_content( + r#"dtd attr = 'Message: &text;'"#, + 3 + )) + ); + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("a"))); + //-------------------------------------------------------------------------- + + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("test"))); + + //-------------------------------------------------------------------------- + // Start of external document + assert_eq!( + r.read_event()?, + XmlEvent::Start(BytesStart::new("external")) + ); + assert_eq!(r.read_event()?, XmlEvent::Text(BytesText::new("text"))); + assert_eq!(r.read_event()?, XmlEvent::End(BytesEnd::new("external"))); + //-------------------------------------------------------------------------- + + assert_eq!(r.read_event()?, XmlEvent::Eof); + + Ok(()) +} + +fn main() -> Result<(), Box> { + println!("{}", XML1); + // In this example the events will borrow from the first document + borrowed()?; + + println!("----------------------------------------------------------------"); + 
println!("{}", XML1); + // In this example the events will always copy data + buffered()?; + Ok(()) +} diff --git a/examples/custom_entities.rs b/examples/low-level-entities.rs similarity index 97% rename from examples/custom_entities.rs rename to examples/low-level-entities.rs index ed8c082a..61b5ea0b 100644 --- a/examples/custom_entities.rs +++ b/examples/low-level-entities.rs @@ -16,7 +16,6 @@ use std::str::from_utf8; use quick_xml::encoding::Decoder; use quick_xml::errors::Error; -use quick_xml::escape::EscapeError; use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event}; use quick_xml::name::QName; use quick_xml::reader::Reader; @@ -107,11 +106,10 @@ impl<'i> MyReader<'i> { } } - fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], EscapeError> { + fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], Error> { match self.entities.get(entity) { Some(replacement) => Ok(replacement), - None => Err(EscapeError::UnrecognizedEntity( - 0..0, + None => Err(Error::UnrecognizedGeneralEntity( String::from_utf8_lossy(entity).into_owned(), )), } From 465924f69856d278e6a198804a849fa5872f3a21 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 13 Dec 2025 21:00:45 +0500 Subject: [PATCH 04/11] Rename XmlReader to LookaheadReader to avoid name clash in the next commit --- src/de/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index de2206a7..e9bc0773 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2329,7 +2329,7 @@ impl<'a> PayloadEvent<'a> { /// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s. /// [`PayloadEvent::Text`] events, that followed by any event except /// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end. 
-struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolver> { +struct LookaheadReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolver> { /// A source of low-level XML events reader: R, /// Intermediate event, that could be returned by the next call to `next()`. @@ -2345,7 +2345,7 @@ struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolve entity_resolver: E, } -impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { +impl<'i, R: XmlRead<'i>, E: EntityResolver> LookaheadReader<'i, R, E> { fn new(mut reader: R, entity_resolver: E) -> Self { // Lookahead by one event immediately, so we do not need to check in the // loop if we need lookahead or not @@ -2522,7 +2522,7 @@ where R: XmlRead<'de>, { /// An XML reader that streams events into this deserializer - reader: XmlReader<'de, R, E>, + reader: LookaheadReader<'de, R, E>, /// When deserializing sequences sometimes we have to skip unwanted events. /// That events should be stored and then replayed. 
This is a replay buffer, @@ -2568,7 +2568,7 @@ where /// - [`Deserializer::from_reader`] fn new(reader: R, entity_resolver: E) -> Self { Self { - reader: XmlReader::new(reader, entity_resolver), + reader: LookaheadReader::new(reader, entity_resolver), #[cfg(feature = "overlapped-lists")] read: VecDeque::new(), From 042527d31925f543118e41dc07de1a146e726cc1 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 23 Nov 2024 21:56:39 +0500 Subject: [PATCH 05/11] Implement XmlRead for new XmlReader --- src/de/mod.rs | 45 ++++++++++++++++++++++++- src/reader/mod.rs | 83 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 125 insertions(+), 3 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index e9bc0773..8dab1939 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2116,7 +2116,7 @@ use crate::{ escape::{parse_number, EscapeError}, events::{BytesCData, BytesEnd, BytesRef, BytesStart, BytesText, Event}, name::QName, - reader::NsReader, + reader::{NsReader, XmlEvent, XmlReader}, }; use serde::de::{ self, Deserialize, DeserializeOwned, DeserializeSeed, IntoDeserializer, SeqAccess, Visitor, @@ -3579,6 +3579,49 @@ impl<'de> XmlRead<'de> for SliceReader<'de> { } } +impl<'de, 'e> XmlRead<'de> for XmlReader<'de, 'e> { + fn next(&mut self) -> Result, DeError> { + loop { + let event = match self.read_event()? 
{ + XmlEvent::Start(e) => PayloadEvent::Start(e), + XmlEvent::End(e) => PayloadEvent::End(e), + XmlEvent::Eof => PayloadEvent::Eof, + + // Do not trim next text event after Text or CDATA + XmlEvent::CData(e) => PayloadEvent::CData(e), + XmlEvent::Text(e) => PayloadEvent::Text(e), + + // XmlEvent::Empty doesn't produced, because it is expanded into Start+End + // Skip XmlEvent::PI + _ => continue, + }; + return Ok(event); + } + } + + fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { + match self.read_to_end(name) { + Err(e) => Err(e.into()), + Ok(_) => Ok(()), + } + } + + #[inline] + fn xml_version(&self) -> XmlVersion { + self.xml_version() + } + + fn decoder(&self) -> Decoder { + self.decoder() + } + + fn has_nil_attr(&self, start: &BytesStart) -> bool { + self.has_nil_attr(start) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod tests { use super::*; diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 5d5dc7db..adee92ee 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -12,8 +12,8 @@ use std::sync::Arc; use crate::encoding::Decoder; use crate::errors::{Error, IllFormedError, SyntaxError}; use crate::escape::{parse_number, EscapeError}; -use crate::events::{BytesRef, BytesText, Event}; -use crate::name::NamespaceResolver; +use crate::events::{BytesRef, BytesStart, BytesText, Event}; +use crate::name::{NamespaceResolver, QName}; use crate::parser::{DtdParser, ElementParser, Parser, PiParser}; use crate::reader::state::ReaderState; use crate::XmlVersion; @@ -1323,6 +1323,14 @@ impl<'i, 'e> EntityReader<'i, 'e> { Self::External(r) => Ok(r.read_event_into(buf)?.into_owned()), } } + + fn read_to_end(&mut self, end: QName, buf: &mut Vec) -> Result { + match self { + EntityReader::InternalBorrowed(r) => r.read_to_end(end), + EntityReader::InternalOwned(r) => r.read_to_end_into(end, buf), + EntityReader::External(r) => r.read_to_end_into(end, buf), + } + } } 
//////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1494,6 +1502,31 @@ where e => e, } } + + fn read_to_end(&mut self, end: QName, buf: &mut Vec) -> Result { + // FIXME: this is incorrect, because entity reference does not obligated + // to properly nested XML tree + if let Some(part) = self.parts.back_mut() { + part.read_to_end(end, buf)?; + // Because we found the end tag and consume it, we should pop any namespaces that + // was started by the Start event + self.pop(); + return Ok(true); + } + Ok(false) + } + + fn decoder(&self) -> Decoder { + match self.parts.back() { + Some(part) => part.decoder(), + // Does not matter what decoder to use when all events exhausted + None => Decoder::utf8(), + } + } + + fn has_nil_attr(&self, start: &BytesStart) -> bool { + start.attributes().has_nil(&self.ns_resolver) + } } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1635,6 +1668,52 @@ where // SAFETY: At least one storage unit should always be there &self.units.back().unwrap().ns_resolver } + + /// Reads until end element is found. This function is supposed to be called + /// after you already read a [`Event::Start`] event. + /// + /// Unlike [`Reader::read_to_end`] this method does not return span because + /// there might not be continuos space that is occupied by the XML tree. + pub fn read_to_end(&mut self, end: QName) -> Result<(), Error> { + // FIXME: this is incorrect, because entity reference does not obligated + // to properly nested XML tree + if let Some(unit) = self.units.back_mut() { + unit.read_to_end(end, &mut self.buffer)?; + return Ok(()); + } + Err(Error::missed_end(end, Decoder::utf8())) + } + + /// Note: version can be changed after reading new event, because new event + /// could be produced from another document due to entity resolution. 
+ pub fn xml_version(&self) -> XmlVersion { + match self.units.back() { + Some(unit) => unit.version, + // If there no units we assume default XML version + None => XmlVersion::V1_0, + } + } + + /// Note: decoder can be changed after reading new event, because new event + /// could be produced from another document due to entity resolution. + pub fn decoder(&self) -> Decoder { + match self.units.back() { + Some(unit) => unit.decoder(), + // Does not matter what decoder to use when all events exhausted + None => Decoder::utf8(), + } + } + + /// Checks if the `start` tag has a [`xsi:nil`] attribute. This method ignores + /// any errors in attributes. + /// + /// [`xsi:nil`]: https://www.w3.org/TR/xmlschema-1/#xsi_nil + pub fn has_nil_attr(&self, start: &BytesStart) -> bool { + match self.units.back() { + Some(unit) => unit.has_nil_attr(start), + None => false, + } + } } /// This is an implementation for reading from a `&[u8]` as underlying byte stream. From 582f63ce5201b0d2eda0c9f446189dac41d841df Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 23 Nov 2024 22:26:04 +0500 Subject: [PATCH 06/11] Replace SliceReader and IoReader with the implementation that uses the new XmlReader Now deserializing from BufRead requires 'static type (is that still true?) failures: serde-de: resolve::resolve_custom_entity --doc: src\de\mod.rs - de::Deserializer<'de,R,E>::get_ref (line 2580) src\de\resolver.rs - de::resolver::EntityResolver (line 13) --- src/de/mod.rs | 341 ++++++++------------------------------------------ 1 file changed, 51 insertions(+), 290 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index 8dab1939..231875c8 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -125,8 +125,8 @@ //! //! //! -//! [text]: Event::Text -//! [CDATA]: Event::CData +//! [text]: crate::events::Event::Text +//! [CDATA]: crate::events::Event::CData //! //! //! 
@@ -2114,7 +2114,7 @@ use crate::{ encoding::Decoder, errors::Error, escape::{parse_number, EscapeError}, - events::{BytesCData, BytesEnd, BytesRef, BytesStart, BytesText, Event}, + events::{BytesCData, BytesEnd, BytesRef, BytesStart, BytesText}, name::QName, reader::{NsReader, XmlEvent, XmlReader}, }; @@ -2148,17 +2148,17 @@ const fn is_non_whitespace(ch: char) -> bool { /// Internally text is stored in `Cow`. Cloning of text is cheap while it /// is borrowed and makes copies of data when it is owned. /// -/// [`Text`]: Event::Text -/// [`CData`]: Event::CData -/// [`Comment`]: Event::Comment -/// [`PI`]: Event::PI +/// [`Text`]: crate::events::Event::Text +/// [`CData`]: crate::events::Event::CData +/// [`Comment`]: crate::events::Event::Comment +/// [`PI`]: crate::events::Event::PI #[derive(Clone, Debug, PartialEq, Eq)] pub struct Text<'a> { /// Untrimmed text after concatenating content of all /// [`Text`] and [`CData`] events /// - /// [`Text`]: Event::Text - /// [`CData`]: Event::CData + /// [`Text`]: crate::events::Event::Text + /// [`CData`]: crate::events::Event::CData text: Cow<'a, str>, /// A range into `text` which contains data after trimming content: Range, @@ -2270,10 +2270,10 @@ pub enum DeEvent<'a> { /// events. _Consequent_ means that events should follow each other or be /// delimited only by (any count of) [`Comment`] or [`PI`] events. /// - /// [`Text`]: Event::Text - /// [`CData`]: Event::CData - /// [`Comment`]: Event::Comment - /// [`PI`]: Event::PI + /// [`Text`]: crate::events::Event::Text + /// [`CData`]: crate::events::Event::CData + /// [`Comment`]: crate::events::Event::Comment + /// [`PI`]: crate::events::Event::PI Text(Text<'a>), /// End of XML document. Eof, @@ -2290,8 +2290,8 @@ pub enum DeEvent<'a> { /// end spaces we should lookahead by one deserializer event (i. e. skip all /// comments and processing instructions). 
/// -/// [`Text`]: Event::Text -/// [`CData`]: Event::CData +/// [`Text`]: crate::events::Event::Text +/// [`CData`]: crate::events::Event::CData #[derive(Clone, Debug, PartialEq, Eq)] pub enum PayloadEvent<'a> { /// Start tag (with attributes) ``. @@ -2310,22 +2310,6 @@ pub enum PayloadEvent<'a> { Eof, } -impl<'a> PayloadEvent<'a> { - /// Ensures that all data is owned to extend the object's lifetime if necessary. - #[inline] - fn into_owned(self) -> PayloadEvent<'static> { - match self { - PayloadEvent::Start(e) => PayloadEvent::Start(e.into_owned()), - PayloadEvent::End(e) => PayloadEvent::End(e.into_owned()), - PayloadEvent::Text(e) => PayloadEvent::Text(e.into_owned()), - PayloadEvent::CData(e) => PayloadEvent::CData(e.into_owned()), - PayloadEvent::DocType(e) => PayloadEvent::DocType(e.into_owned()), - PayloadEvent::GeneralRef(e) => PayloadEvent::GeneralRef(e.into_owned()), - PayloadEvent::Eof => PayloadEvent::Eof, - } - } -} - /// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s. /// [`PayloadEvent::Text`] events, that followed by any event except /// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end. 
@@ -2604,7 +2588,7 @@ where /// # use pretty_assertions::assert_eq; /// use serde::Deserialize; /// use quick_xml::de::Deserializer; - /// use quick_xml::NsReader; + /// use quick_xml::reader::XmlReader; /// /// #[derive(Deserialize)] /// struct SomeStruct { @@ -2621,7 +2605,7 @@ where /// let err = SomeStruct::deserialize(&mut de); /// assert!(err.is_err()); /// - /// let reader: &NsReader<_> = de.get_ref().get_ref(); + /// let reader: &XmlReader<_> = de.get_ref(); /// /// assert_eq!(reader.error_position(), 28); /// assert_eq!(reader.buffer_position(), 41); @@ -2876,8 +2860,8 @@ where /// |[`DeEvent::Text`] |`text content` or `` (probably mixed)|Returns event content unchanged, expects the `` after that /// |[`DeEvent::Eof`] | |Emits [`InvalidXml(IllFormed(MissingEndTag))`](DeError::InvalidXml) /// - /// [`Text`]: Event::Text - /// [`CData`]: Event::CData + /// [`Text`]: crate::events::Event::Text + /// [`CData`]: crate::events::Event::CData fn read_string_impl(&mut self, allow_start: bool) -> Result, DeError> { match self.next()? { // Reached by doc tests only: this file, lines 979 and 996 @@ -3006,7 +2990,7 @@ where } } -impl<'de> Deserializer<'de, SliceReader<'de>> { +impl<'de, 'e> Deserializer<'de, XmlReader<'de, 'e>> { /// Create a new deserializer that will borrow data from the specified string. /// /// Deserializer created with this method will not resolve custom entities. 
@@ -3055,7 +3039,7 @@ impl<'de> Deserializer<'de, SliceReader<'de>> { } } -impl<'de, E> Deserializer<'de, SliceReader<'de>, E> +impl<'de, 'e, E> Deserializer<'de, XmlReader<'de, 'e>, E> where E: EntityResolver, { @@ -3076,19 +3060,13 @@ where config.expand_empty_elements = true; Self::new( - SliceReader { - reader, - version: XmlVersion::V1_0, - }, + XmlReader::borrowed_ns(reader, crate::reader::PredefinedEntityResolver), entity_resolver, ) } } -impl<'de, R> Deserializer<'de, IoReader> -where - R: BufRead, -{ +impl<'de, 'e> Deserializer<'de, XmlReader<'de, 'e>> { /// Create a new deserializer that will copy data from the specified reader /// into internal buffer. /// @@ -3097,7 +3075,10 @@ where /// UTF-8, you can decode it first before using [`from_str`]. /// /// Deserializer created with this method will not resolve custom entities. - pub fn from_reader(reader: R) -> Self { + pub fn from_reader(reader: R) -> Self + where + R: BufRead + 'de, + { Self::with_resolver(reader, PredefinedEntityResolver) } @@ -3115,18 +3096,22 @@ where /// # use quick_xml::de::Deserializer; /// # use quick_xml::NsReader; /// # use serde::Deserialize; - /// # + /// use std::io::{BufRead, Cursor}; + /// /// #[derive(Deserialize, PartialEq, Debug)] /// struct Object { /// tag: String, /// } /// - /// let mut reader = NsReader::from_str(" test "); + /// let boxed: Box = Box::new(Cursor::new(" test ")); + /// let mut reader = NsReader::from_reader(boxed); /// - /// let mut de = Deserializer::buffering(reader.clone()); + /// let mut de = Deserializer::buffering(reader); /// let obj = Object::deserialize(&mut de).unwrap(); /// assert_eq!(obj, Object { tag: " test ".to_string() }); /// + /// let boxed: Box = Box::new(Cursor::new(" test ")); + /// let mut reader = NsReader::from_reader(boxed); /// reader.config_mut().trim_text(true); /// /// let mut de = Deserializer::buffering(reader); @@ -3136,14 +3121,13 @@ where /// /// [`Config::expand_empty_elements`]: 
crate::reader::Config::expand_empty_elements #[inline] - pub fn buffering(reader: NsReader) -> Self { + pub fn buffering(reader: NsReader>) -> Self { Self::buffering_with_resolver(reader, PredefinedEntityResolver) } } -impl<'de, R, E> Deserializer<'de, IoReader, E> +impl<'de, 'e, E> Deserializer<'de, XmlReader<'de, 'e>, E> where - R: BufRead, E: EntityResolver, { /// Create a new deserializer that will copy data from the specified reader @@ -3152,19 +3136,12 @@ where /// If you already have a string use [`Self::from_str`] instead, because it /// will borrow instead of copy. If you have `&[u8]` which is known to represent /// UTF-8, you can decode it first before using [`from_str`]. - pub fn with_resolver(reader: R, entity_resolver: E) -> Self { - let mut reader = NsReader::from_reader(reader); - let config = reader.config_mut(); - config.expand_empty_elements = true; - - Self::new( - IoReader { - reader, - buf: Vec::new(), - version: XmlVersion::V1_0, - }, - entity_resolver, - ) + pub fn with_resolver(reader: R, entity_resolver: E) -> Self + where + R: BufRead + 'de, + { + let boxed: Box = Box::new(reader); + Self::buffering_with_resolver(NsReader::from_reader(boxed), entity_resolver) } /// Create new deserializer that will copy data from the specified preconfigured reader @@ -3173,16 +3150,15 @@ where /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. 
/// /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements - pub fn buffering_with_resolver(mut reader: NsReader, entity_resolver: E) -> Self { + pub fn buffering_with_resolver( + mut reader: NsReader>, + entity_resolver: E, + ) -> Self { let config = reader.config_mut(); config.expand_empty_elements = true; Self::new( - IoReader { - reader, - buf: Vec::new(), - version: XmlVersion::V1_0, - }, + XmlReader::buffered_ns(reader, crate::reader::PredefinedEntityResolver), entity_resolver, ) } @@ -3370,28 +3346,6 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Converts raw reader's event into a payload event. -/// Returns `None`, if event should be skipped. -#[inline(always)] -fn skip_uninterested<'a>(event: Event<'a>) -> Option> { - let event = match event { - Event::DocType(e) => PayloadEvent::DocType(e), - Event::Start(e) => PayloadEvent::Start(e), - Event::End(e) => PayloadEvent::End(e), - Event::Eof => PayloadEvent::Eof, - - // Do not trim next text event after Text, CDATA or reference event - Event::CData(e) => PayloadEvent::CData(e), - Event::Text(e) => PayloadEvent::Text(e), - Event::GeneralRef(e) => PayloadEvent::GeneralRef(e), - - _ => return None, - }; - Some(event) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - /// Trait used by the deserializer for iterating over input. This is manually /// "specialized" for iterating over `&[u8]`. /// @@ -3419,166 +3373,6 @@ pub trait XmlRead<'i> { fn has_nil_attr(&self, start: &BytesStart) -> bool; } -/// XML input source that reads from a std::io input stream. -/// -/// You cannot create it, it is created automatically when you call -/// [`Deserializer::from_reader`] -pub struct IoReader { - reader: NsReader, - buf: Vec, - version: XmlVersion, -} - -impl IoReader { - /// Returns the underlying XML reader. 
- /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use serde::Deserialize; - /// use std::io::Cursor; - /// use quick_xml::de::Deserializer; - /// use quick_xml::NsReader; - /// - /// #[derive(Deserialize)] - /// struct SomeStruct { - /// field1: String, - /// field2: String, - /// } - /// - /// // Try to deserialize from broken XML - /// let mut de = Deserializer::from_reader(Cursor::new( - /// "" - /// // 0 ^= 28 ^= 41 - /// )); - /// - /// let err = SomeStruct::deserialize(&mut de); - /// assert!(err.is_err()); - /// - /// let reader: &NsReader> = de.get_ref().get_ref(); - /// - /// assert_eq!(reader.error_position(), 28); - /// assert_eq!(reader.buffer_position(), 41); - /// ``` - pub const fn get_ref(&self) -> &NsReader { - &self.reader - } -} - -impl<'i, R: BufRead> XmlRead<'i> for IoReader { - fn next(&mut self) -> Result, DeError> { - loop { - self.buf.clear(); - - let event = self.reader.read_event_into(&mut self.buf)?; - if let Event::Decl(e) = &event { - self.version = e.xml_version()?; - } - if let Some(event) = skip_uninterested(event) { - return Ok(event.into_owned()); - } - } - } - - fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { - match self.reader.read_to_end_into(name, &mut self.buf) { - Err(e) => Err(e.into()), - Ok(_) => Ok(()), - } - } - - #[inline] - fn xml_version(&self) -> XmlVersion { - self.version - } - - #[inline] - fn decoder(&self) -> Decoder { - self.reader.decoder() - } - - fn has_nil_attr(&self, start: &BytesStart) -> bool { - start.attributes().has_nil(self.reader.resolver()) - } -} - -/// XML input source that reads from a slice of bytes and can borrow from it. -/// -/// You cannot create it, it is created automatically when you call -/// [`Deserializer::from_str`]. -pub struct SliceReader<'de> { - reader: NsReader<&'de [u8]>, - version: XmlVersion, -} - -impl<'de> SliceReader<'de> { - /// Returns the underlying XML reader. 
- /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use serde::Deserialize; - /// use quick_xml::de::Deserializer; - /// use quick_xml::NsReader; - /// - /// #[derive(Deserialize)] - /// struct SomeStruct { - /// field1: String, - /// field2: String, - /// } - /// - /// // Try to deserialize from broken XML - /// let mut de = Deserializer::from_str( - /// "" - /// // 0 ^= 28 ^= 41 - /// ); - /// - /// let err = SomeStruct::deserialize(&mut de); - /// assert!(err.is_err()); - /// - /// let reader: &NsReader<&[u8]> = de.get_ref().get_ref(); - /// - /// assert_eq!(reader.error_position(), 28); - /// assert_eq!(reader.buffer_position(), 41); - /// ``` - pub const fn get_ref(&self) -> &NsReader<&'de [u8]> { - &self.reader - } -} - -impl<'de> XmlRead<'de> for SliceReader<'de> { - fn next(&mut self) -> Result, DeError> { - loop { - let event = self.reader.read_event()?; - if let Event::Decl(e) = &event { - self.version = e.xml_version()?; - } - if let Some(event) = skip_uninterested(event) { - return Ok(event); - } - } - } - - fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { - match self.reader.read_to_end(name) { - Err(e) => Err(e.into()), - Ok(_) => Ok(()), - } - } - - #[inline] - fn xml_version(&self) -> XmlVersion { - self.version - } - - #[inline] - fn decoder(&self) -> Decoder { - self.reader.decoder() - } - - fn has_nil_attr(&self, start: &BytesStart) -> bool { - start.attributes().has_nil(self.reader.resolver()) - } -} - impl<'de, 'e> XmlRead<'de> for XmlReader<'de, 'e> { fn next(&mut self) -> Result, DeError> { loop { @@ -3628,7 +3422,7 @@ mod tests { use crate::errors::IllFormedError; use pretty_assertions::assert_eq; - fn make_de<'de>(source: &'de str) -> Deserializer<'de, SliceReader<'de>> { + fn make_de<'de, 'e>(source: &'de str) -> Deserializer<'de, XmlReader<'de, 'e>> { dbg!(source); Deserializer::from_str(source) } @@ -4193,36 +3987,6 @@ mod tests { } } - #[test] - fn borrowing_reader_parity() { - let s = r#" - Some text - - - 
"#; - - let mut reader1 = IoReader { - reader: NsReader::from_reader(s.as_bytes()), - buf: Vec::new(), - version: XmlVersion::V1_0, - }; - let mut reader2 = SliceReader { - reader: NsReader::from_str(s), - version: XmlVersion::V1_0, - }; - - loop { - let event1 = reader1.next().unwrap(); - let event2 = reader2.next().unwrap(); - - if let (PayloadEvent::Eof, PayloadEvent::Eof) = (&event1, &event2) { - break; - } - - assert_eq!(event1, event2); - } - } - #[test] fn borrowing_reader_events() { let s = r#" @@ -4232,13 +3996,10 @@ mod tests { "#; - let mut reader = SliceReader { - reader: NsReader::from_str(s), - version: XmlVersion::V1_0, - }; + let mut reader = NsReader::from_str(s); + reader.config_mut().expand_empty_elements = true; - let config = reader.reader.config_mut(); - config.expand_empty_elements = true; + let mut reader = XmlReader::borrowed_ns(reader, crate::reader::PredefinedEntityResolver); let mut events = Vec::new(); From 7e3b4f17c63af290b6ca0dc1bc11a52b89cf8d43 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 23 Nov 2024 23:02:02 +0500 Subject: [PATCH 07/11] Get rid of XmlRead trait, because now it have only one implementation --- src/de/map.rs | 67 ++++++++++++++----------------------- src/de/mod.rs | 91 ++++++++++----------------------------------------- src/de/var.rs | 25 ++++++-------- 3 files changed, 53 insertions(+), 130 deletions(-) diff --git a/src/de/map.rs b/src/de/map.rs index 3d25a941..49b21c33 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -5,7 +5,7 @@ use crate::{ de::resolver::EntityResolver, de::simple_type::SimpleTypeDeserializer, de::text::TextDeserializer, - de::{DeEvent, Deserializer, XmlRead, TEXT_KEY, VALUE_KEY}, + de::{DeEvent, Deserializer, TEXT_KEY, VALUE_KEY}, errors::serialize::DeError, errors::Error, events::attributes::IterState, @@ -166,14 +166,13 @@ enum ValueSource { /// /// - `'d` lifetime represents a parent deserializer, which could own the data /// buffer. 
-pub(crate) struct ElementMapAccess<'de, 'd, R, E> +pub(crate) struct ElementMapAccess<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { /// Tag -- owner of attributes start: BytesStart<'de>, - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, E>, /// State of the iterator over attributes. Contains the next position in the /// inner `start` slice, from which next attribute should be parsed. iter: IterState, @@ -195,14 +194,13 @@ where has_text_field: bool, } -impl<'de, 'd, R, E> ElementMapAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, E> ElementMapAccess<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { /// Create a new ElementMapAccess pub fn new( - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, E>, start: BytesStart<'de>, fields: &'static [&'static str], ) -> Self { @@ -240,9 +238,8 @@ where } } -impl<'de, 'd, R, E> MapAccess<'de> for ElementMapAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, E> MapAccess<'de> for ElementMapAccess<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; @@ -446,14 +443,13 @@ where /// /// [`deserialize_tuple`]: #method.deserialize_tuple /// [`deserialize_struct`]: #method.deserialize_struct -struct MapValueDeserializer<'de, 'd, 'm, R, E> +struct MapValueDeserializer<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { /// Access to the map that created this deserializer. Gives access to the /// context, such as list of fields, that current map known about. - map: &'m mut ElementMapAccess<'de, 'd, R, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, E>, /// Whether this deserializer was created for deserialization from an element /// with fixed name, or the elements with different names or even text are allowed. 
/// @@ -531,9 +527,8 @@ where fixed_name: bool, } -impl<'de, 'd, 'm, R, E> MapValueDeserializer<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, E> MapValueDeserializer<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { /// Returns a next string as concatenated content of consequent [`Text`] and @@ -548,9 +543,8 @@ where } } -impl<'de, 'd, 'm, R, E> de::Deserializer<'de> for MapValueDeserializer<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, E> de::Deserializer<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; @@ -685,13 +679,12 @@ where } } -impl<'de, 'd, 'm, R, E> de::EnumAccess<'de> for MapValueDeserializer<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, E> de::EnumAccess<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; - type Variant = MapValueVariantAccess<'de, 'd, 'm, R, E>; + type Variant = MapValueVariantAccess<'de, 'e, 'd, 'm, E>; fn variant_seed(self, seed: V) -> Result<(V::Value, Self::Variant), Self::Error> where @@ -716,22 +709,20 @@ where } } -struct MapValueVariantAccess<'de, 'd, 'm, R, E> +struct MapValueVariantAccess<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { /// Access to the map that created this enum accessor. Gives access to the /// context, such as list of fields, that current map known about. 
- map: &'m mut ElementMapAccess<'de, 'd, R, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, E>, /// `true` if variant should be deserialized from a textual content /// and `false` if from tag is_text: bool, } -impl<'de, 'd, 'm, R, E> de::VariantAccess<'de> for MapValueVariantAccess<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, E> de::VariantAccess<'de> for MapValueVariantAccess<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; @@ -907,14 +898,13 @@ impl<'de> TagFilter<'de> { /// /// [`Text`]: crate::events::Event::Text /// [`CData`]: crate::events::Event::CData -struct MapValueSeqAccess<'de, 'd, 'm, R, E> +struct MapValueSeqAccess<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { /// Accessor to a map that creates this accessor and to a deserializer for /// a sequence items. - map: &'m mut ElementMapAccess<'de, 'd, R, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, E>, /// Filter that determines whether a tag is a part of this sequence. 
/// /// When feature [`overlapped-lists`] is not activated, iteration will stop @@ -934,9 +924,8 @@ where } #[cfg(feature = "overlapped-lists")] -impl<'de, 'd, 'm, R, E> Drop for MapValueSeqAccess<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, E> Drop for MapValueSeqAccess<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { fn drop(&mut self) { @@ -944,9 +933,8 @@ where } } -impl<'de, 'd, 'm, R, E> SeqAccess<'de> for MapValueSeqAccess<'de, 'd, 'm, R, E> +impl<'de, 'e, 'd, 'm, E> SeqAccess<'de> for MapValueSeqAccess<'de, 'e, 'd, 'm, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; @@ -1051,18 +1039,16 @@ where /// [specification]: https://www.w3.org/TR/xmlschema11-2/#boolean /// [`deserialize_tuple`]: #method.deserialize_tuple /// [`deserialize_struct`]: #method.deserialize_struct -struct ElementDeserializer<'de, 'd, R, E> +struct ElementDeserializer<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { start: BytesStart<'de>, - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, E>, } -impl<'de, 'd, R, E> ElementDeserializer<'de, 'd, R, E> +impl<'de, 'e, 'd, E> ElementDeserializer<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { /// Returns a next string as concatenated content of consequent [`Text`] and @@ -1076,9 +1062,8 @@ where } } -impl<'de, 'd, R, E> de::Deserializer<'de> for ElementDeserializer<'de, 'd, R, E> +impl<'de, 'e, 'd, E> de::Deserializer<'de> for ElementDeserializer<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; @@ -1167,9 +1152,8 @@ where } } -impl<'de, 'd, R, E> de::EnumAccess<'de> for ElementDeserializer<'de, 'd, R, E> +impl<'de, 'e, 'd, E> de::EnumAccess<'de> for ElementDeserializer<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; @@ -1184,9 +1168,8 @@ where } } -impl<'de, 'd, R, E> de::VariantAccess<'de> for ElementDeserializer<'de, 'd, R, E> +impl<'de, 'e, 'd, E> de::VariantAccess<'de> for 
ElementDeserializer<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; diff --git a/src/de/mod.rs b/src/de/mod.rs index 231875c8..5feacda7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2107,7 +2107,6 @@ pub use self::attributes::AttributesDeserializer; pub use self::resolver::{EntityResolver, PredefinedEntityResolver}; pub use self::simple_type::SimpleTypeDeserializer; pub use crate::errors::serialize::DeError; -use crate::XmlVersion; use crate::{ de::map::ElementMapAccess, @@ -2313,9 +2312,9 @@ pub enum PayloadEvent<'a> { /// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s. /// [`PayloadEvent::Text`] events, that followed by any event except /// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end. -struct LookaheadReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolver> { +struct LookaheadReader<'i, 'e, E: EntityResolver = PredefinedEntityResolver> { /// A source of low-level XML events - reader: R, + reader: XmlReader<'i, 'e>, /// Intermediate event, that could be returned by the next call to `next()`. /// If that is the `Text` event then leading spaces already trimmed, but /// trailing spaces is not. Before the event will be returned, trimming of @@ -2329,8 +2328,8 @@ struct LookaheadReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityR entity_resolver: E, } -impl<'i, R: XmlRead<'i>, E: EntityResolver> LookaheadReader<'i, R, E> { - fn new(mut reader: R, entity_resolver: E) -> Self { +impl<'i, 'e, E: EntityResolver> LookaheadReader<'i, 'e, E> { + fn new(mut reader: XmlReader<'i, 'e>, entity_resolver: E) -> Self { // Lookahead by one event immediately, so we do not need to check in the // loop if we need lookahead or not let lookahead = reader.next(); @@ -2501,12 +2500,9 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// /// A structure that deserializes XML into Rust values. 
-pub struct Deserializer<'de, R, E: EntityResolver = PredefinedEntityResolver> -where - R: XmlRead<'de>, -{ +pub struct Deserializer<'de, 'e, E: EntityResolver = PredefinedEntityResolver> { /// An XML reader that streams events into this deserializer - reader: LookaheadReader<'de, R, E>, + reader: LookaheadReader<'de, 'e, E>, /// When deserializing sequences sometimes we have to skip unwanted events. /// That events should be stored and then replayed. This is a replay buffer, @@ -2539,9 +2535,8 @@ where key_buf: String, } -impl<'de, R, E> Deserializer<'de, R, E> +impl<'de, 'e, E> Deserializer<'de, 'e, E> where - R: XmlRead<'de>, E: EntityResolver, { /// Create an XML deserializer from one of the possible quick_xml input sources. @@ -2550,7 +2545,7 @@ where /// /// - [`Deserializer::from_str`] /// - [`Deserializer::from_reader`] - fn new(reader: R, entity_resolver: E) -> Self { + fn new(reader: XmlReader<'de, 'e>, entity_resolver: E) -> Self { Self { reader: LookaheadReader::new(reader, entity_resolver), @@ -2610,7 +2605,7 @@ where /// assert_eq!(reader.error_position(), 28); /// assert_eq!(reader.buffer_position(), 41); /// ``` - pub const fn get_ref(&self) -> &R { + pub const fn get_ref(&self) -> &XmlReader<'de, 'e> { &self.reader.reader } @@ -2990,7 +2985,7 @@ where } } -impl<'de, 'e> Deserializer<'de, XmlReader<'de, 'e>> { +impl<'de, 'e> Deserializer<'de, 'e> { /// Create a new deserializer that will borrow data from the specified string. /// /// Deserializer created with this method will not resolve custom entities. @@ -3039,7 +3034,7 @@ impl<'de, 'e> Deserializer<'de, XmlReader<'de, 'e>> { } } -impl<'de, 'e, E> Deserializer<'de, XmlReader<'de, 'e>, E> +impl<'de, 'e, E> Deserializer<'de, 'e, E> where E: EntityResolver, { @@ -3066,7 +3061,7 @@ where } } -impl<'de, 'e> Deserializer<'de, XmlReader<'de, 'e>> { +impl<'de, 'e> Deserializer<'de, 'e> { /// Create a new deserializer that will copy data from the specified reader /// into internal buffer. 
/// @@ -3126,7 +3121,7 @@ impl<'de, 'e> Deserializer<'de, XmlReader<'de, 'e>> { } } -impl<'de, 'e, E> Deserializer<'de, XmlReader<'de, 'e>, E> +impl<'de, 'e, E> Deserializer<'de, 'e, E> where E: EntityResolver, { @@ -3164,9 +3159,8 @@ where } } -impl<'de, R, E> de::Deserializer<'de> for &mut Deserializer<'de, R, E> +impl<'de, 'e, E> de::Deserializer<'de> for &mut Deserializer<'de, 'e, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; @@ -3304,9 +3298,8 @@ where /// /// Technically, multiple top-level elements violates XML rule of only one top-level /// element, but we consider this as several concatenated XML documents. -impl<'de, R, E> SeqAccess<'de> for &mut Deserializer<'de, R, E> +impl<'de, 'e, E> SeqAccess<'de> for &mut Deserializer<'de, 'e, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; @@ -3331,9 +3324,8 @@ where } } -impl<'de, R, E> IntoDeserializer<'de, DeError> for &mut Deserializer<'de, R, E> +impl<'de, 'e, E> IntoDeserializer<'de, DeError> for &mut Deserializer<'de, 'e, E> where - R: XmlRead<'de>, E: EntityResolver, { type Deserializer = Self; @@ -3346,34 +3338,7 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Trait used by the deserializer for iterating over input. This is manually -/// "specialized" for iterating over `&[u8]`. -/// -/// You do not need to implement this trait, it is needed to abstract from -/// [borrowing](SliceReader) and [copying](IoReader) data sources and reuse code in -/// deserializer -pub trait XmlRead<'i> { - /// Return an input-borrowing event. - fn next(&mut self) -> Result, DeError>; - - /// Skips until end element is found. Unlike `next()` it will not allocate - /// when it cannot satisfy the lifetime. - fn read_to_end(&mut self, name: QName) -> Result<(), DeError>; - - /// Return an XML version of the source. - fn xml_version(&self) -> XmlVersion; - - /// A copy of the reader's decoder used to decode strings. 
- fn decoder(&self) -> Decoder; - - /// Checks if the `start` tag has a [`xsi:nil`] attribute. This method ignores - /// any errors in attributes. - /// - /// [`xsi:nil`]: https://www.w3.org/TR/xmlschema-1/#xsi_nil - fn has_nil_attr(&self, start: &BytesStart) -> bool; -} - -impl<'de, 'e> XmlRead<'de> for XmlReader<'de, 'e> { +impl<'de, 'e> XmlReader<'de, 'e> { fn next(&mut self) -> Result, DeError> { loop { let event = match self.read_event()? { @@ -3392,26 +3357,6 @@ impl<'de, 'e> XmlRead<'de> for XmlReader<'de, 'e> { return Ok(event); } } - - fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { - match self.read_to_end(name) { - Err(e) => Err(e.into()), - Ok(_) => Ok(()), - } - } - - #[inline] - fn xml_version(&self) -> XmlVersion { - self.xml_version() - } - - fn decoder(&self) -> Decoder { - self.decoder() - } - - fn has_nil_attr(&self, start: &BytesStart) -> bool { - self.has_nil_attr(start) - } } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -3422,7 +3367,7 @@ mod tests { use crate::errors::IllFormedError; use pretty_assertions::assert_eq; - fn make_de<'de, 'e>(source: &'de str) -> Deserializer<'de, XmlReader<'de, 'e>> { + fn make_de<'de, 'e>(source: &'de str) -> Deserializer<'de, 'e> { dbg!(source); Deserializer::from_str(source) } diff --git a/src/de/var.rs b/src/de/var.rs index e64e29f8..a2df2edd 100644 --- a/src/de/var.rs +++ b/src/de/var.rs @@ -3,38 +3,35 @@ use crate::{ de::map::ElementMapAccess, de::resolver::EntityResolver, de::simple_type::SimpleTypeDeserializer, - de::{DeEvent, Deserializer, XmlRead, TEXT_KEY}, + de::{DeEvent, Deserializer, TEXT_KEY}, errors::serialize::DeError, }; use serde::de::value::BorrowedStrDeserializer; use serde::de::{self, DeserializeSeed, Deserializer as _, Visitor}; /// An enum access -pub struct EnumAccess<'de, 'd, R, E> +pub struct EnumAccess<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { - de: &'d mut Deserializer<'de, R, E>, + de: &'d 
mut Deserializer<'de, 'e, E>, } -impl<'de, 'd, R, E> EnumAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, E> EnumAccess<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { - pub fn new(de: &'d mut Deserializer<'de, R, E>) -> Self { + pub fn new(de: &'d mut Deserializer<'de, 'e, E>) -> Self { EnumAccess { de } } } -impl<'de, 'd, R, E> de::EnumAccess<'de> for EnumAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, E> de::EnumAccess<'de> for EnumAccess<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; - type Variant = VariantAccess<'de, 'd, R, E>; + type Variant = VariantAccess<'de, 'e, 'd, E>; fn variant_seed(self, seed: V) -> Result<(V::Value, Self::Variant), Self::Error> where @@ -61,20 +58,18 @@ where } } -pub struct VariantAccess<'de, 'd, R, E> +pub struct VariantAccess<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { - de: &'d mut Deserializer<'de, R, E>, + de: &'d mut Deserializer<'de, 'e, E>, /// `true` if variant should be deserialized from a textual content /// and `false` if from tag is_text: bool, } -impl<'de, 'd, R, E> de::VariantAccess<'de> for VariantAccess<'de, 'd, R, E> +impl<'de, 'e, 'd, E> de::VariantAccess<'de> for VariantAccess<'de, 'e, 'd, E> where - R: XmlRead<'de>, E: EntityResolver, { type Error = DeError; From 0512d793f2879999314bac5abf302393f2c44fc4 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 24 Nov 2024 00:34:00 +0500 Subject: [PATCH 08/11] Get rid of unused PayloadEvent::DocType and PayloadEvent::GeneralRef The new XmlReader captures DTD and resolves references, so that events never produced --- src/de/mod.rs | 41 ++++------------------------------------- 1 file changed, 4 insertions(+), 37 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index 5feacda7..29edb9a7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2112,8 +2112,7 @@ use crate::{ de::map::ElementMapAccess, encoding::Decoder, errors::Error, - escape::{parse_number, EscapeError}, - events::{BytesCData, BytesEnd, BytesRef, 
BytesStart, BytesText}, + events::{BytesCData, BytesEnd, BytesStart, BytesText}, name::QName, reader::{NsReader, XmlEvent, XmlReader}, }; @@ -2301,10 +2300,6 @@ pub enum PayloadEvent<'a> { Text(BytesText<'a>), /// Unescaped character data stored in ``. CData(BytesCData<'a>), - /// Document type definition data (DTD) stored in ``. - DocType(BytesText<'a>), - /// Reference `&ref;` in the textual data. - GeneralRef(BytesRef<'a>), /// End of XML document. Eof, } @@ -2358,7 +2353,7 @@ impl<'i, 'e, E: EntityResolver> LookaheadReader<'i, 'e, E> { // If next event is a text or CDATA, we should not trim trailing spaces !matches!( self.lookahead, - Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_) | PayloadEvent::GeneralRef(_)) + Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_)) ) } @@ -2381,10 +2376,9 @@ impl<'i, 'e, E: EntityResolver> LookaheadReader<'i, 'e, E> { PayloadEvent::CData(e) => result .to_mut() .push_str(&e.xml_content(self.reader.xml_version())?), - PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?, - // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef - _ => unreachable!("Only `Text`, `CData` or `GeneralRef` events can come here"), + // SAFETY: current_event_is_last_text checks that event is Text or CData + _ => unreachable!("Only `Text` or `CData` events can come here"), } } Ok(DeEvent::Text(Text::new(result))) @@ -2400,38 +2394,11 @@ impl<'i, 'e, E: EntityResolver> LookaheadReader<'i, 'e, E> { PayloadEvent::CData(e) => { self.drain_text(e.xml_content(self.reader.xml_version())?) 
} - PayloadEvent::DocType(e) => { - self.entity_resolver - .capture(e) - .map_err(|err| DeError::Custom(format!("cannot parse DTD: {}", err)))?; - continue; - } - PayloadEvent::GeneralRef(e) => { - let mut text = String::new(); - self.resolve_reference(&mut text, e)?; - self.drain_text(text.into()) - } PayloadEvent::Eof => Ok(DeEvent::Eof), }; } } - fn resolve_reference(&mut self, result: &mut String, event: BytesRef) -> Result<(), DeError> { - let len = event.len(); - let reference = self.decoder().decode(&event)?; - - if let Some(num) = reference.strip_prefix('#') { - let codepoint = parse_number(num).map_err(EscapeError::InvalidCharRef)?; - result.push_str(codepoint.encode_utf8(&mut [0u8; 4])); - return Ok(()); - } - if let Some(value) = self.entity_resolver.resolve(reference.as_ref()) { - result.push_str(value); - return Ok(()); - } - Err(EscapeError::UnrecognizedEntity(0..len, reference.to_string()).into()) - } - #[inline] fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { match self.lookahead { From 9a8f79bc7b3151eced701aeba5eed21fd7d3cda2 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 24 Nov 2024 00:59:57 +0500 Subject: [PATCH 09/11] Get rid of de::resolver module, replace it with reader::resolver Fixed: resolve::resolve_custom_entity --- Changelog.md | 2 + src/de/map.rs | 84 ++++++++++++++++----------------- src/de/mod.rs | 90 +++++++++++++++-------------------- src/de/resolver.rs | 115 --------------------------------------------- src/de/var.rs | 30 ++++++------ tests/serde-de.rs | 25 +++++++--- 6 files changed, 116 insertions(+), 230 deletions(-) delete mode 100644 src/de/resolver.rs diff --git a/Changelog.md b/Changelog.md index 21ddbb7f..c4f0bdf6 100644 --- a/Changelog.md +++ b/Changelog.md @@ -16,6 +16,8 @@ The new `XmlReader` type was added that is automatically resolves general entity references. +`quick_xml::de::resolver` was replaced by `quick_xml::resolver` module. 
+ ### New Features - [#938]: Add new enumeration `XmlVersion` and typified getter `BytesDecl::xml_version()`. diff --git a/src/de/map.rs b/src/de/map.rs index 49b21c33..d5b4cfb6 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -2,7 +2,6 @@ use crate::{ de::key::QNameDeserializer, - de::resolver::EntityResolver, de::simple_type::SimpleTypeDeserializer, de::text::TextDeserializer, de::{DeEvent, Deserializer, TEXT_KEY, VALUE_KEY}, @@ -11,6 +10,7 @@ use crate::{ events::attributes::IterState, events::BytesStart, name::QName, + reader::EntityResolverFactory, }; use serde::de::value::BorrowedStrDeserializer; use serde::de::{self, DeserializeSeed, Deserializer as _, MapAccess, SeqAccess, Visitor}; @@ -166,13 +166,13 @@ enum ValueSource { /// /// - `'d` lifetime represents a parent deserializer, which could own the data /// buffer. -pub(crate) struct ElementMapAccess<'de, 'e, 'd, E> +pub(crate) struct ElementMapAccess<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Tag -- owner of attributes start: BytesStart<'de>, - de: &'d mut Deserializer<'de, 'e, E>, + de: &'d mut Deserializer<'de, 'e, EF>, /// State of the iterator over attributes. Contains the next position in the /// inner `start` slice, from which next attribute should be parsed. 
iter: IterState, @@ -194,13 +194,13 @@ where has_text_field: bool, } -impl<'de, 'e, 'd, E> ElementMapAccess<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> ElementMapAccess<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Create a new ElementMapAccess pub fn new( - de: &'d mut Deserializer<'de, 'e, E>, + de: &'d mut Deserializer<'de, 'e, EF>, start: BytesStart<'de>, fields: &'static [&'static str], ) -> Self { @@ -238,9 +238,9 @@ where } } -impl<'de, 'e, 'd, E> MapAccess<'de> for ElementMapAccess<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> MapAccess<'de> for ElementMapAccess<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -443,13 +443,13 @@ where /// /// [`deserialize_tuple`]: #method.deserialize_tuple /// [`deserialize_struct`]: #method.deserialize_struct -struct MapValueDeserializer<'de, 'e, 'd, 'm, E> +struct MapValueDeserializer<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Access to the map that created this deserializer. Gives access to the /// context, such as list of fields, that current map known about. - map: &'m mut ElementMapAccess<'de, 'e, 'd, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>, /// Whether this deserializer was created for deserialization from an element /// with fixed name, or the elements with different names or even text are allowed. /// @@ -527,9 +527,9 @@ where fixed_name: bool, } -impl<'de, 'e, 'd, 'm, E> MapValueDeserializer<'de, 'e, 'd, 'm, E> +impl<'de, 'e, 'd, 'm, EF> MapValueDeserializer<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Returns a next string as concatenated content of consequent [`Text`] and /// [`CData`] events, used inside [`deserialize_primitives!()`]. 
@@ -543,9 +543,9 @@ where } } -impl<'de, 'e, 'd, 'm, E> de::Deserializer<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, E> +impl<'de, 'e, 'd, 'm, EF> de::Deserializer<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -679,12 +679,12 @@ where } } -impl<'de, 'e, 'd, 'm, E> de::EnumAccess<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, E> +impl<'de, 'e, 'd, 'm, EF> de::EnumAccess<'de> for MapValueDeserializer<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; - type Variant = MapValueVariantAccess<'de, 'e, 'd, 'm, E>; + type Variant = MapValueVariantAccess<'de, 'e, 'd, 'm, EF>; fn variant_seed(self, seed: V) -> Result<(V::Value, Self::Variant), Self::Error> where @@ -709,21 +709,21 @@ where } } -struct MapValueVariantAccess<'de, 'e, 'd, 'm, E> +struct MapValueVariantAccess<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Access to the map that created this enum accessor. Gives access to the /// context, such as list of fields, that current map known about. 
- map: &'m mut ElementMapAccess<'de, 'e, 'd, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>, /// `true` if variant should be deserialized from a textual content /// and `false` if from tag is_text: bool, } -impl<'de, 'e, 'd, 'm, E> de::VariantAccess<'de> for MapValueVariantAccess<'de, 'e, 'd, 'm, E> +impl<'de, 'e, 'd, 'm, EF> de::VariantAccess<'de> for MapValueVariantAccess<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -898,13 +898,13 @@ impl<'de> TagFilter<'de> { /// /// [`Text`]: crate::events::Event::Text /// [`CData`]: crate::events::Event::CData -struct MapValueSeqAccess<'de, 'e, 'd, 'm, E> +struct MapValueSeqAccess<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Accessor to a map that creates this accessor and to a deserializer for /// a sequence items. - map: &'m mut ElementMapAccess<'de, 'e, 'd, E>, + map: &'m mut ElementMapAccess<'de, 'e, 'd, EF>, /// Filter that determines whether a tag is a part of this sequence. 
/// /// When feature [`overlapped-lists`] is not activated, iteration will stop @@ -924,18 +924,18 @@ where } #[cfg(feature = "overlapped-lists")] -impl<'de, 'e, 'd, 'm, E> Drop for MapValueSeqAccess<'de, 'e, 'd, 'm, E> +impl<'de, 'e, 'd, 'm, EF> Drop for MapValueSeqAccess<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { fn drop(&mut self) { self.map.de.start_replay(self.checkpoint); } } -impl<'de, 'e, 'd, 'm, E> SeqAccess<'de> for MapValueSeqAccess<'de, 'e, 'd, 'm, E> +impl<'de, 'e, 'd, 'm, EF> SeqAccess<'de> for MapValueSeqAccess<'de, 'e, 'd, 'm, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -1039,17 +1039,17 @@ where /// [specification]: https://www.w3.org/TR/xmlschema11-2/#boolean /// [`deserialize_tuple`]: #method.deserialize_tuple /// [`deserialize_struct`]: #method.deserialize_struct -struct ElementDeserializer<'de, 'e, 'd, E> +struct ElementDeserializer<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { start: BytesStart<'de>, - de: &'d mut Deserializer<'de, 'e, E>, + de: &'d mut Deserializer<'de, 'e, EF>, } -impl<'de, 'e, 'd, E> ElementDeserializer<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> ElementDeserializer<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Returns a next string as concatenated content of consequent [`Text`] and /// [`CData`] events, used inside [`deserialize_primitives!()`]. 
@@ -1062,9 +1062,9 @@ where } } -impl<'de, 'e, 'd, E> de::Deserializer<'de> for ElementDeserializer<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> de::Deserializer<'de> for ElementDeserializer<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -1152,9 +1152,9 @@ where } } -impl<'de, 'e, 'd, E> de::EnumAccess<'de> for ElementDeserializer<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> de::EnumAccess<'de> for ElementDeserializer<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; type Variant = Self; @@ -1168,9 +1168,9 @@ where } } -impl<'de, 'e, 'd, E> de::VariantAccess<'de> for ElementDeserializer<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> de::VariantAccess<'de> for ElementDeserializer<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; diff --git a/src/de/mod.rs b/src/de/mod.rs index 29edb9a7..c3a07bef 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2098,13 +2098,11 @@ macro_rules! deserialize_primitives { mod attributes; mod key; mod map; -mod resolver; mod simple_type; mod text; mod var; pub use self::attributes::AttributesDeserializer; -pub use self::resolver::{EntityResolver, PredefinedEntityResolver}; pub use self::simple_type::SimpleTypeDeserializer; pub use crate::errors::serialize::DeError; @@ -2114,7 +2112,7 @@ use crate::{ errors::Error, events::{BytesCData, BytesEnd, BytesStart, BytesText}, name::QName, - reader::{NsReader, XmlEvent, XmlReader}, + reader::{EntityResolverFactory, NsReader, PredefinedEntityResolver, XmlEvent, XmlReader}, }; use serde::de::{ self, Deserialize, DeserializeOwned, DeserializeSeed, IntoDeserializer, SeqAccess, Visitor, @@ -2307,33 +2305,23 @@ pub enum PayloadEvent<'a> { /// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s. 
/// [`PayloadEvent::Text`] events, that followed by any event except /// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end. -struct LookaheadReader<'i, 'e, E: EntityResolver = PredefinedEntityResolver> { +struct LookaheadReader<'i, 'e, EF: EntityResolverFactory<'i> = PredefinedEntityResolver> { /// A source of low-level XML events - reader: XmlReader<'i, 'e>, + reader: XmlReader<'i, 'e, EF>, /// Intermediate event, that could be returned by the next call to `next()`. /// If that is the `Text` event then leading spaces already trimmed, but /// trailing spaces is not. Before the event will be returned, trimming of /// the spaces could be necessary lookahead: Result, DeError>, - - /// Used to resolve unknown entities that would otherwise cause the parser - /// to return an [`EscapeError::UnrecognizedEntity`] error. - /// - /// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity - entity_resolver: E, } -impl<'i, 'e, E: EntityResolver> LookaheadReader<'i, 'e, E> { - fn new(mut reader: XmlReader<'i, 'e>, entity_resolver: E) -> Self { +impl<'i, 'e, EF: EntityResolverFactory<'i>> LookaheadReader<'i, 'e, EF> { + fn new(mut reader: XmlReader<'i, 'e, EF>) -> Self { // Lookahead by one event immediately, so we do not need to check in the // loop if we need lookahead or not let lookahead = reader.next(); - Self { - reader, - lookahead, - entity_resolver, - } + Self { reader, lookahead } } /// Returns `true` if all events was consumed @@ -2467,9 +2455,9 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// /// A structure that deserializes XML into Rust values. 
-pub struct Deserializer<'de, 'e, E: EntityResolver = PredefinedEntityResolver> { +pub struct Deserializer<'de, 'e, EF: EntityResolverFactory<'de> = PredefinedEntityResolver> { /// An XML reader that streams events into this deserializer - reader: LookaheadReader<'de, 'e, E>, + reader: LookaheadReader<'de, 'e, EF>, /// When deserializing sequences sometimes we have to skip unwanted events. /// That events should be stored and then replayed. This is a replay buffer, @@ -2502,9 +2490,9 @@ pub struct Deserializer<'de, 'e, E: EntityResolver = PredefinedEntityResolver> { key_buf: String, } -impl<'de, 'e, E> Deserializer<'de, 'e, E> +impl<'de, 'e, EF> Deserializer<'de, 'e, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Create an XML deserializer from one of the possible quick_xml input sources. /// @@ -2512,9 +2500,9 @@ where /// /// - [`Deserializer::from_str`] /// - [`Deserializer::from_reader`] - fn new(reader: XmlReader<'de, 'e>, entity_resolver: E) -> Self { + fn new(reader: XmlReader<'de, 'e, EF>) -> Self { Self { - reader: LookaheadReader::new(reader, entity_resolver), + reader: LookaheadReader::new(reader), #[cfg(feature = "overlapped-lists")] read: VecDeque::new(), @@ -2572,7 +2560,7 @@ where /// assert_eq!(reader.error_position(), 28); /// assert_eq!(reader.buffer_position(), 41); /// ``` - pub const fn get_ref(&self) -> &XmlReader<'de, 'e> { + pub const fn get_ref(&self) -> &XmlReader<'de, 'e, EF> { &self.reader.reader } @@ -3001,14 +2989,14 @@ impl<'de, 'e> Deserializer<'de, 'e> { } } -impl<'de, 'e, E> Deserializer<'de, 'e, E> +impl<'de, 'e, EF> Deserializer<'de, 'e, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Create a new deserializer that will borrow data from the specified string /// and use the specified entity resolver. 
- pub fn from_str_with_resolver(source: &'de str, entity_resolver: E) -> Self { - Self::borrowing_with_resolver(NsReader::from_str(source), entity_resolver) + pub fn from_str_with_resolver(source: &'de str, entity_resolver_factory: EF) -> Self { + Self::borrowing_with_resolver(NsReader::from_str(source), entity_resolver_factory) } /// Create a new deserializer that will borrow data from the specified preconfigured @@ -3017,14 +3005,14 @@ where /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. /// /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements - pub fn borrowing_with_resolver(mut reader: NsReader<&'de [u8]>, entity_resolver: E) -> Self { + pub fn borrowing_with_resolver( + mut reader: NsReader<&'de [u8]>, + entity_resolver_factory: EF, + ) -> Self { let config = reader.config_mut(); config.expand_empty_elements = true; - Self::new( - XmlReader::borrowed_ns(reader, crate::reader::PredefinedEntityResolver), - entity_resolver, - ) + Self::new(XmlReader::borrowed_ns(reader, entity_resolver_factory)) } } @@ -3088,9 +3076,9 @@ impl<'de, 'e> Deserializer<'de, 'e> { } } -impl<'de, 'e, E> Deserializer<'de, 'e, E> +impl<'de, 'e, EF> Deserializer<'de, 'e, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { /// Create a new deserializer that will copy data from the specified reader /// into internal buffer and use the specified entity resolver. @@ -3098,12 +3086,12 @@ where /// If you already have a string use [`Self::from_str`] instead, because it /// will borrow instead of copy. If you have `&[u8]` which is known to represent /// UTF-8, you can decode it first before using [`from_str`]. 
- pub fn with_resolver(reader: R, entity_resolver: E) -> Self + pub fn with_resolver(reader: R, entity_resolver_factory: EF) -> Self where R: BufRead + 'de, { let boxed: Box = Box::new(reader); - Self::buffering_with_resolver(NsReader::from_reader(boxed), entity_resolver) + Self::buffering_with_resolver(NsReader::from_reader(boxed), entity_resolver_factory) } /// Create new deserializer that will copy data from the specified preconfigured reader @@ -3114,21 +3102,18 @@ where /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements pub fn buffering_with_resolver( mut reader: NsReader>, - entity_resolver: E, + entity_resolver_factory: EF, ) -> Self { let config = reader.config_mut(); config.expand_empty_elements = true; - Self::new( - XmlReader::buffered_ns(reader, crate::reader::PredefinedEntityResolver), - entity_resolver, - ) + Self::new(XmlReader::buffered_ns(reader, entity_resolver_factory)) } } -impl<'de, 'e, E> de::Deserializer<'de> for &mut Deserializer<'de, 'e, E> +impl<'de, 'e, EF> de::Deserializer<'de> for &mut Deserializer<'de, 'e, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -3265,9 +3250,9 @@ where /// /// Technically, multiple top-level elements violates XML rule of only one top-level /// element, but we consider this as several concatenated XML documents. 
-impl<'de, 'e, E> SeqAccess<'de> for &mut Deserializer<'de, 'e, E> +impl<'de, 'e, EF> SeqAccess<'de> for &mut Deserializer<'de, 'e, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; @@ -3291,9 +3276,9 @@ where } } -impl<'de, 'e, E> IntoDeserializer<'de, DeError> for &mut Deserializer<'de, 'e, E> +impl<'de, 'e, EF> IntoDeserializer<'de, DeError> for &mut Deserializer<'de, 'e, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Deserializer = Self; @@ -3305,7 +3290,10 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// -impl<'de, 'e> XmlReader<'de, 'e> { +impl<'de, 'e, EF> XmlReader<'de, 'e, EF> +where + EF: EntityResolverFactory<'de>, +{ fn next(&mut self) -> Result, DeError> { loop { let event = match self.read_event()? { @@ -3911,7 +3899,7 @@ mod tests { let mut reader = NsReader::from_str(s); reader.config_mut().expand_empty_elements = true; - let mut reader = XmlReader::borrowed_ns(reader, crate::reader::PredefinedEntityResolver); + let mut reader = XmlReader::borrowed_ns(reader, PredefinedEntityResolver); let mut events = Vec::new(); diff --git a/src/de/resolver.rs b/src/de/resolver.rs deleted file mode 100644 index 5efc0117..00000000 --- a/src/de/resolver.rs +++ /dev/null @@ -1,115 +0,0 @@ -//! 
Entity resolver module - -use std::convert::Infallible; -use std::error::Error; - -use crate::escape::resolve_predefined_entity; -use crate::events::BytesText; - -/// Used to resolve unknown entities while parsing -/// -/// # Example -/// -/// ``` -/// # use serde::Deserialize; -/// # use pretty_assertions::assert_eq; -/// use regex::bytes::Regex; -/// use std::collections::BTreeMap; -/// use std::string::FromUtf8Error; -/// use quick_xml::de::{Deserializer, EntityResolver}; -/// use quick_xml::events::BytesText; -/// -/// struct DocTypeEntityResolver { -/// re: Regex, -/// map: BTreeMap, -/// } -/// -/// impl Default for DocTypeEntityResolver { -/// fn default() -> Self { -/// Self { -/// // We do not focus on true parsing in this example -/// // You should use special libraries to parse DTD -/// re: Regex::new(r#""#).unwrap(), -/// map: BTreeMap::new(), -/// } -/// } -/// } -/// -/// impl EntityResolver for DocTypeEntityResolver { -/// type Error = FromUtf8Error; -/// -/// fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> { -/// for cap in self.re.captures_iter(&doctype) { -/// self.map.insert( -/// String::from_utf8(cap[1].to_vec())?, -/// String::from_utf8(cap[2].to_vec())?, -/// ); -/// } -/// Ok(()) -/// } -/// -/// fn resolve(&self, entity: &str) -> Option<&str> { -/// self.map.get(entity).map(|s| s.as_str()) -/// } -/// } -/// -/// let xml_reader = br#" -/// ]> -/// -/// &e1; -/// -/// "#.as_ref(); -/// -/// let mut de = Deserializer::with_resolver( -/// xml_reader, -/// DocTypeEntityResolver::default(), -/// ); -/// let data: BTreeMap = BTreeMap::deserialize(&mut de).unwrap(); -/// -/// assert_eq!(data.get("entity_one"), Some(&"entity 1".to_string())); -/// ``` -pub trait EntityResolver { - /// The error type that represents DTD parse error - type Error: Error; - - /// Called on contents of [`Event::DocType`] to capture declared entities. - /// Can be called multiple times, for each parsed `` declaration. 
- /// - /// [`Event::DocType`]: crate::events::Event::DocType - fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error>; - - /// Called when an entity needs to be resolved. - /// - /// `None` is returned if a suitable value can not be found. - /// In that case an [`EscapeError::UnrecognizedEntity`] will be returned by - /// a deserializer. - /// - /// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity - fn resolve(&self, entity: &str) -> Option<&str>; -} - -/// An [`EntityResolver`] that resolves only predefined entities: -/// -/// | Entity | Resolution -/// |--------|------------ -/// |`<` | `<` -/// |`>` | `>` -/// |`&` | `&` -/// |`'`| `'` -/// |`"`| `"` -#[derive(Default, Copy, Clone)] -pub struct PredefinedEntityResolver; - -impl EntityResolver for PredefinedEntityResolver { - type Error = Infallible; - - #[inline] - fn capture(&mut self, _doctype: BytesText) -> Result<(), Self::Error> { - Ok(()) - } - - #[inline] - fn resolve(&self, entity: &str) -> Option<&str> { - resolve_predefined_entity(entity) - } -} diff --git a/src/de/var.rs b/src/de/var.rs index a2df2edd..a7d79199 100644 --- a/src/de/var.rs +++ b/src/de/var.rs @@ -1,37 +1,37 @@ use crate::{ de::key::QNameDeserializer, de::map::ElementMapAccess, - de::resolver::EntityResolver, de::simple_type::SimpleTypeDeserializer, de::{DeEvent, Deserializer, TEXT_KEY}, errors::serialize::DeError, + reader::EntityResolverFactory, }; use serde::de::value::BorrowedStrDeserializer; use serde::de::{self, DeserializeSeed, Deserializer as _, Visitor}; /// An enum access -pub struct EnumAccess<'de, 'e, 'd, E> +pub struct EnumAccess<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { - de: &'d mut Deserializer<'de, 'e, E>, + de: &'d mut Deserializer<'de, 'e, EF>, } -impl<'de, 'e, 'd, E> EnumAccess<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> EnumAccess<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { - pub fn new(de: &'d mut 
Deserializer<'de, 'e, E>) -> Self { + pub fn new(de: &'d mut Deserializer<'de, 'e, EF>) -> Self { EnumAccess { de } } } -impl<'de, 'e, 'd, E> de::EnumAccess<'de> for EnumAccess<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> de::EnumAccess<'de> for EnumAccess<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; - type Variant = VariantAccess<'de, 'e, 'd, E>; + type Variant = VariantAccess<'de, 'e, 'd, EF>; fn variant_seed(self, seed: V) -> Result<(V::Value, Self::Variant), Self::Error> where @@ -58,19 +58,19 @@ where } } -pub struct VariantAccess<'de, 'e, 'd, E> +pub struct VariantAccess<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { - de: &'d mut Deserializer<'de, 'e, E>, + de: &'d mut Deserializer<'de, 'e, EF>, /// `true` if variant should be deserialized from a textual content /// and `false` if from tag is_text: bool, } -impl<'de, 'e, 'd, E> de::VariantAccess<'de> for VariantAccess<'de, 'e, 'd, E> +impl<'de, 'e, 'd, EF> de::VariantAccess<'de> for VariantAccess<'de, 'e, 'd, EF> where - E: EntityResolver, + EF: EntityResolverFactory<'de>, { type Error = DeError; diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 7f1850a3..860241b2 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -1831,20 +1831,31 @@ mod borrow { mod resolve { use super::*; use pretty_assertions::assert_eq; - use quick_xml::de::EntityResolver; use quick_xml::events::BytesText; + use quick_xml::reader::{EntityResolver, EntityResolverFactory, ReplacementText}; + use std::borrow::Cow; use std::collections::BTreeMap; use std::convert::Infallible; use std::iter::FromIterator; + #[derive(Clone, Copy)] struct TestEntityResolver { capture_called: bool, } - impl EntityResolver for TestEntityResolver { - type Error = Infallible; + impl<'i> EntityResolverFactory<'i> for TestEntityResolver { + type CaptureError = Infallible; + type Resolver = Self; - fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> { + 
fn new_resolver(&mut self) -> Self::Resolver { + *self + } + } + + impl<'i> EntityResolver<'i> for TestEntityResolver { + type CaptureError = Infallible; + + fn capture(&mut self, doctype: BytesText) -> Result<(), Self::CaptureError> { self.capture_called = true; assert_eq!(doctype.as_ref(), br#"dict[ ]"#); @@ -1852,14 +1863,14 @@ mod resolve { Ok(()) } - fn resolve(&self, entity: &str) -> Option<&str> { + fn resolve<'e>(&self, entity: &str) -> Option> { assert!( self.capture_called, "`EntityResolver::capture` should be called before `EntityResolver::resolve`" ); match entity { - "t1" => Some("test_one"), - "t2" => Some("test_two"), + "t1" => Some(ReplacementText::Internal(Cow::Borrowed(b"test_one"))), + "t2" => Some(ReplacementText::Internal(Cow::Borrowed(b"test_two"))), _ => None, } } From cb8d12aef1c62ee7ee0add6bece4fcce51f2873c Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 20 Jun 2024 00:29:24 +0500 Subject: [PATCH 10/11] Add test to ensure that entities which expand to XML fragments are parsed Remove previous test because it is covered by the new one --- Cargo.toml | 5 ++ tests/serde-de-references.rs | 94 ++++++++++++++++++++++++++++++++++++ tests/serde-de.rs | 81 ------------------------------- 3 files changed, 99 insertions(+), 81 deletions(-) create mode 100644 tests/serde-de-references.rs diff --git a/Cargo.toml b/Cargo.toml index f7944de1..1a839ca9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -233,6 +233,11 @@ name = "serde-de-enum" required-features = ["serialize"] path = "tests/serde-de-enum.rs" +[[test]] +name = "serde-de-references" +required-features = ["serialize"] +path = "tests/serde-de-references.rs" + [[test]] name = "serde-de-seq" required-features = ["serialize"] diff --git a/tests/serde-de-references.rs b/tests/serde-de-references.rs new file mode 100644 index 00000000..548b122b --- /dev/null +++ b/tests/serde-de-references.rs @@ -0,0 +1,94 @@ +use std::borrow::Cow; +use std::convert::Infallible; + +use quick_xml::de::Deserializer; 
+use quick_xml::events::BytesText; +use quick_xml::reader::{EntityResolver, EntityResolverFactory, ReplacementText}; + +use pretty_assertions::assert_eq; +use serde::Deserialize; + +#[derive(Clone, Copy)] +struct TestEntityResolver { + capture_called: bool, +} + +impl<'i> EntityResolverFactory<'i> for TestEntityResolver { + type CaptureError = Infallible; + type Resolver = Self; + + fn new_resolver(&mut self) -> Self::Resolver { + *self + } +} + +impl<'i> EntityResolver<'i> for TestEntityResolver { + type CaptureError = Infallible; + + fn capture(&mut self, _doctype: BytesText) -> Result<(), Self::CaptureError> { + self.capture_called = true; + Ok(()) + } + + fn resolve<'e>(&self, entity: &str) -> Option> { + assert!( + self.capture_called, + "`EntityResolver::capture` should be called before `EntityResolver::resolve(\"{}\")`", + entity, + ); + + match dbg!(entity) { + "text" => Some(ReplacementText::Internal(Cow::Borrowed( + b" ", + ))), + _ => Some(ReplacementText::Internal(Cow::Borrowed( + b" + &text; + + ", + ))), + } + } +} + +#[derive(Debug, PartialEq, Deserialize)] +struct Root { + child1: Child1, + child2: (), +} + +#[derive(Debug, PartialEq, Deserialize)] +struct Child1 { + #[serde(rename = "@attribute")] + attribute: String, + + #[serde(rename = "$text")] + text: String, +} + +#[test] +fn entities() { + let mut de = Deserializer::from_str_with_resolver( + " + + &entity; + ", + TestEntityResolver { + capture_called: false, + }, + ); + + let data = Root::deserialize(&mut de).unwrap(); + + de.check_eof_reached(); + assert_eq!( + data, + Root { + child1: Child1 { + attribute: "".to_string(), + text: " second text ".to_string(), + }, + child2: (), + } + ); +} diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 860241b2..10a6e1be 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -1,4 +1,3 @@ -use quick_xml::de::Deserializer; use quick_xml::utils::{ByteBuf, Bytes}; use quick_xml::DeError; @@ -1827,86 +1826,6 @@ mod borrow { } } -/// Test for entity 
resolver -mod resolve { - use super::*; - use pretty_assertions::assert_eq; - use quick_xml::events::BytesText; - use quick_xml::reader::{EntityResolver, EntityResolverFactory, ReplacementText}; - use std::borrow::Cow; - use std::collections::BTreeMap; - use std::convert::Infallible; - use std::iter::FromIterator; - - #[derive(Clone, Copy)] - struct TestEntityResolver { - capture_called: bool, - } - - impl<'i> EntityResolverFactory<'i> for TestEntityResolver { - type CaptureError = Infallible; - type Resolver = Self; - - fn new_resolver(&mut self) -> Self::Resolver { - *self - } - } - - impl<'i> EntityResolver<'i> for TestEntityResolver { - type CaptureError = Infallible; - - fn capture(&mut self, doctype: BytesText) -> Result<(), Self::CaptureError> { - self.capture_called = true; - - assert_eq!(doctype.as_ref(), br#"dict[ ]"#); - - Ok(()) - } - - fn resolve<'e>(&self, entity: &str) -> Option> { - assert!( - self.capture_called, - "`EntityResolver::capture` should be called before `EntityResolver::resolve`" - ); - match entity { - "t1" => Some(ReplacementText::Internal(Cow::Borrowed(b"test_one"))), - "t2" => Some(ReplacementText::Internal(Cow::Borrowed(b"test_two"))), - _ => None, - } - } - } - - #[test] - fn resolve_custom_entity() { - let resolver = TestEntityResolver { - capture_called: false, - }; - let mut de = Deserializer::with_resolver( - br#" - ]> - - - &t1; - &t2; - non-entity - - "# - .as_ref(), - resolver, - ); - - let data: BTreeMap = BTreeMap::deserialize(&mut de).unwrap(); - assert_eq!( - data, - BTreeMap::from_iter([ - (String::from("entity_one"), String::from("test_one")), - (String::from("entity_two"), String::from("test_two")), - (String::from("entity_three"), String::from("non-entity")), - ]) - ); - } -} - /// Tests for https://github.com/tafia/quick-xml/pull/603. 
/// /// According to comments, From 3b10a1cd25b27f455ed35e237fa8da1fdc92418b Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 24 Nov 2024 01:45:23 +0500 Subject: [PATCH 11/11] Merge impl blocks with same bounds After removing `R` generic, some impl blocks began to have the same bounds --- src/de/mod.rs | 116 ++++++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index c3a07bef..3ccd37d1 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2494,6 +2494,58 @@ impl<'de, 'e, EF> Deserializer<'de, 'e, EF> where EF: EntityResolverFactory<'de>, { + /// Create a new deserializer that will borrow data from the specified string + /// and use the specified entity resolver. + pub fn from_str_with_resolver(source: &'de str, entity_resolver_factory: EF) -> Self { + Self::borrowing_with_resolver(NsReader::from_str(source), entity_resolver_factory) + } + + /// Create a new deserializer that will borrow data from the specified preconfigured + /// reader and use the specified entity resolver. + /// + /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. + /// + /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements + pub fn borrowing_with_resolver( + mut reader: NsReader<&'de [u8]>, + entity_resolver_factory: EF, + ) -> Self { + let config = reader.config_mut(); + config.expand_empty_elements = true; + + Self::new(XmlReader::borrowed_ns(reader, entity_resolver_factory)) + } + + /// Create a new deserializer that will copy data from the specified reader + /// into internal buffer and use the specified entity resolver. + /// + /// If you already have a string use [`Self::from_str`] instead, because it + /// will borrow instead of copy. If you have `&[u8]` which is known to represent + /// UTF-8, you can decode it first before using [`from_str`]. 
+ pub fn with_resolver(reader: R, entity_resolver_factory: EF) -> Self + where + R: BufRead + 'de, + { + let boxed: Box = Box::new(reader); + Self::buffering_with_resolver(NsReader::from_reader(boxed), entity_resolver_factory) + } + + /// Create new deserializer that will copy data from the specified preconfigured reader + /// into internal buffer and use the specified entity resolver. + /// + /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. + /// + /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements + pub fn buffering_with_resolver( + mut reader: NsReader>, + entity_resolver_factory: EF, + ) -> Self { + let config = reader.config_mut(); + config.expand_empty_elements = true; + + Self::new(XmlReader::buffered_ns(reader, entity_resolver_factory)) + } + /// Create an XML deserializer from one of the possible quick_xml input sources. /// /// Typically it is more convenient to use one of these methods instead: @@ -2987,36 +3039,7 @@ impl<'de, 'e> Deserializer<'de, 'e> { pub fn borrowing(reader: NsReader<&'de [u8]>) -> Self { Self::borrowing_with_resolver(reader, PredefinedEntityResolver) } -} - -impl<'de, 'e, EF> Deserializer<'de, 'e, EF> -where - EF: EntityResolverFactory<'de>, -{ - /// Create a new deserializer that will borrow data from the specified string - /// and use the specified entity resolver. - pub fn from_str_with_resolver(source: &'de str, entity_resolver_factory: EF) -> Self { - Self::borrowing_with_resolver(NsReader::from_str(source), entity_resolver_factory) - } - /// Create a new deserializer that will borrow data from the specified preconfigured - /// reader and use the specified entity resolver. - /// - /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. 
- /// - /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements - pub fn borrowing_with_resolver( - mut reader: NsReader<&'de [u8]>, - entity_resolver_factory: EF, - ) -> Self { - let config = reader.config_mut(); - config.expand_empty_elements = true; - - Self::new(XmlReader::borrowed_ns(reader, entity_resolver_factory)) - } -} - -impl<'de, 'e> Deserializer<'de, 'e> { /// Create a new deserializer that will copy data from the specified reader /// into internal buffer. /// @@ -3076,41 +3099,6 @@ impl<'de, 'e> Deserializer<'de, 'e> { } } -impl<'de, 'e, EF> Deserializer<'de, 'e, EF> -where - EF: EntityResolverFactory<'de>, -{ - /// Create a new deserializer that will copy data from the specified reader - /// into internal buffer and use the specified entity resolver. - /// - /// If you already have a string use [`Self::from_str`] instead, because it - /// will borrow instead of copy. If you have `&[u8]` which is known to represent - /// UTF-8, you can decode it first before using [`from_str`]. - pub fn with_resolver(reader: R, entity_resolver_factory: EF) -> Self - where - R: BufRead + 'de, - { - let boxed: Box = Box::new(reader); - Self::buffering_with_resolver(NsReader::from_reader(boxed), entity_resolver_factory) - } - - /// Create new deserializer that will copy data from the specified preconfigured reader - /// into internal buffer and use the specified entity resolver. - /// - /// Note, that config option [`Config::expand_empty_elements`] will be set to `true`. - /// - /// [`Config::expand_empty_elements`]: crate::reader::Config::expand_empty_elements - pub fn buffering_with_resolver( - mut reader: NsReader>, - entity_resolver_factory: EF, - ) -> Self { - let config = reader.config_mut(); - config.expand_empty_elements = true; - - Self::new(XmlReader::buffered_ns(reader, entity_resolver_factory)) - } -} - impl<'de, 'e, EF> de::Deserializer<'de> for &mut Deserializer<'de, 'e, EF> where EF: EntityResolverFactory<'de>,