Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ mod comment;
mod dtd;
mod element;
mod pi;
mod start_element;

pub use comment::CommentParser;
pub(crate) use dtd::DtdParser;
pub use element::ElementParser;
pub use pi::PiParser;
pub use start_element::StartElementParser;

/// Used to decouple reading of data from data source and parsing XML structure from it.
/// This is a state preserved between getting chunks of bytes from the reader.
Expand Down
242 changes: 242 additions & 0 deletions src/parser/start_element.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
//! Contains a parser for the start tag of an XML element.

use crate::errors::SyntaxError;

/// A parser that searches for the `>` symbol that ends a start tag, ignoring
/// any `>` found inside quoted regions, and that tracks the length of the
/// tag name while doing so.
///
/// The parser considers two quoted regions: a double-quoted (`"..."`) and
/// a single-quoted (`'...'`) region. Matches found inside those regions are not
/// considered as results. Each region starts and ends with its quote symbol,
/// which cannot be escaped (but can be encoded as an XML character entity or
/// named entity; that encoding does not contain literal quotes).
///
/// To use the parser, create an instance and [`feed`] data into it, starting
/// right after the opening `<` (the parser assumes that `<` is already consumed).
/// After a successful search the parser returns [`Some`] with the length
/// of the element name and the position of the found `>` within the chunk
/// passed to that call. If the search is unsuccessful, [`None`] is returned.
/// You typically expect the search to succeed eventually, so you should feed
/// new data until you get a result.
///
/// NOTE: after a successful match the parser is not reset to the initial
/// state and should not be used anymore. Create a new parser if you want to
/// perform a new search.
///
/// # Example
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use quick_xml::parser::StartElementParser;
///
/// let mut parser = StartElementParser::default();
///
/// // Parse `<my-element with = 'some > inside'>and the text follow...`
/// // split into three chunks. The leading `<` must already be consumed.
/// assert_eq!(parser.feed(b"my-element"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with = 'some >"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some((10, 8)));
/// //                       ^       ^
/// //                       0       8
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum StartElementParser {
/// The initial state, inside the tag name.
/// Contains the length of the tag name seen so far (accumulated over all
/// chunks fed until now).
Tag(usize),
/// The name was completely parsed. Now look for the `>`.
/// Contains the final length of the tag name and the current quote state.
Attributes(usize, AttributeParser),
}

/// The internal quote-tracking state of the attribute part of the parser.
///
/// A `>` is only accepted as the end of the tag while the state is `Outside`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AttributeParser {
/// The initial state, not within `'...'` or `"..."`.
Outside,
/// Inside a single-quoted region (`'...'`); only `'` leaves this state.
SingleQ,
/// Inside a double-quoted region (`"..."`); only `"` leaves this state.
DoubleQ,
}

impl StartElementParser {
    /// Advances the parser over the next chunk of input.
    ///
    /// The leading `<` of the tag is assumed to be consumed already.
    ///
    /// Returns `Some((name_len, pos))` as soon as a `>` outside of quotes is
    /// found, where `name_len` is the length of the tag name (accumulated over
    /// all calls so far) and `pos` is the index of that `>` inside `bytes`.
    /// Returns `None` if `bytes` does not contain such a `>`; this implies that
    /// the whole chunk was processed and the state was saved for the next call.
    #[inline]
    pub fn feed(&mut self, bytes: &[u8]) -> Option<(usize, usize)> {
        // Determine the name length, the offset in `bytes` from which the
        // attribute scan starts, and the quote state that scan starts in.
        let (name_len, scan_from, mut quote_state) = match *self {
            Self::Attributes(len, state) => (len, 0, state),
            Self::Tag(seen) => {
                // The name ends at the first whitespace, `/` or `>`.
                let delimiter = bytes
                    .iter()
                    .position(|&b| matches!(b, b' ' | b'\r' | b'\n' | b'\t' | b'/' | b'>'));

                match delimiter {
                    // Still inside the name: remember how much of it was seen.
                    None => {
                        *self = Self::Tag(seen + bytes.len());
                        return None;
                    }
                    // Tag without attributes: done immediately.
                    Some(at) if bytes[at] == b'>' => return Some((seen + at, at)),
                    Some(at) => {
                        // TODO(flxbe): Somehow make sure, that the only expect a '>' after the '/'.
                        let total = seen + at;
                        *self = Self::Attributes(total, AttributeParser::Outside);
                        (total, at, AttributeParser::Outside)
                    }
                }
            }
        };

        let rest = &bytes[scan_from..];
        for offset in memchr::memchr3_iter(b'>', b'\'', b'"', rest) {
            quote_state = match (quote_state, rest[offset]) {
                // `>` terminates the tag only when we are not inside quotes
                (AttributeParser::Outside, b'>') => return Some((name_len, scan_from + offset)),
                (AttributeParser::Outside, b'\'') => AttributeParser::SingleQ,
                (AttributeParser::Outside, b'"') => AttributeParser::DoubleQ,

                // a quoted region is closed only by the quote that opened it
                (AttributeParser::SingleQ, b'\'') => AttributeParser::Outside,
                (AttributeParser::DoubleQ, b'"') => AttributeParser::Outside,

                // anything else leaves the state untouched
                (unchanged, _) => unchanged,
            };
        }

        *self = Self::Attributes(name_len, quote_state);
        None
    }

    /// Returns the `SyntaxError` that describes an EOF hit in the current state:
    /// an unclosed quoted attribute value if the input ended inside quotes,
    /// otherwise an unclosed tag. `_content` is currently unused.
    #[inline]
    pub fn eof_error(self, _content: &[u8]) -> SyntaxError {
        match self {
            Self::Attributes(_, AttributeParser::SingleQ) => {
                SyntaxError::UnclosedSingleQuotedAttributeValue
            }
            Self::Attributes(_, AttributeParser::DoubleQ) => {
                SyntaxError::UnclosedDoubleQuotedAttributeValue
            }
            Self::Tag(_) | Self::Attributes(_, AttributeParser::Outside) => {
                SyntaxError::UnclosedTag
            }
        }
    }
}

impl Default for StartElementParser {
#[inline]
fn default() -> Self {
Self::Tag(0)
}
}

#[test]
fn parse_all() {
    use pretty_assertions::assert_eq;

    // Complete start tags (without the leading `<`) that all carry the
    // three-byte name `tag` and end with `>` as their very last byte.
    let inputs: [&[u8]; 4] = [
        b"tag key='value' key=\"value\">",
        b"tag>",
        b"tag />",
        b"tag/>",
    ];

    for input in inputs {
        let mut parser = StartElementParser::default();

        // A fresh parser fed the whole tag at once must report the name
        // length and the position of the closing `>`.
        assert_eq!(parser.feed(input), Some((3, input.len() - 1)));
    }
}

// Feeds a start tag to the parser in small chunks and checks both the return
// value of `feed` and the internal parser state after every step.
#[test]
fn parse_internal_state() {
use pretty_assertions::assert_eq;

let mut parser = StartElementParser::default();
// Empty input must not change the initial state
assert_eq!(parser.feed(b""), None);
assert_eq!(parser, StartElementParser::Tag(0));

// start feeding the tag
assert_eq!(parser.feed(b"tag"), None);
assert_eq!(parser, StartElementParser::Tag(3));

// Finish the tag parsing after seeing some whitespace
assert_eq!(parser.feed(b" "), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::Outside)
);

// Remain in state when no progress is made
assert_eq!(parser.feed(b""), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::Outside)
);
assert_eq!(parser.feed(b"some random content"), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::Outside)
);

// Handle single quote
assert_eq!(parser.feed(b"\'"), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::SingleQ)
);

// Remain in state when no progress is made: inside single quotes neither
// `"` nor `>` has any effect
assert_eq!(parser.feed(b""), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::SingleQ)
);
assert_eq!(parser.feed(b"some random content \">"), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::SingleQ)
);

// Close single quote
assert_eq!(parser.feed(b"'"), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::Outside)
);

// Handle double quote
assert_eq!(parser.feed(b"\""), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::DoubleQ)
);

// Remain in state when no progress is made: inside double quotes neither
// `'` nor `>` has any effect
assert_eq!(parser.feed(b""), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::DoubleQ)
);
assert_eq!(parser.feed(b"some random content '>"), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::DoubleQ)
);

// Close double quote
assert_eq!(parser.feed(b"\""), None);
assert_eq!(
parser,
StartElementParser::Attributes(3, AttributeParser::Outside)
);

// Outside of quotes the `>` finally ends the tag; the position is relative
// to the chunk passed to this call
assert_eq!(parser.feed(b">"), Some((3, 0)));
}
45 changes: 44 additions & 1 deletion src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::path::Path;
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::parser::Parser;
use crate::parser::{Parser, StartElementParser};
use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
use crate::utils::is_whitespace;

Expand Down Expand Up @@ -316,6 +316,7 @@ macro_rules! impl_buffered_source {
// That method is called only when available buffer starts from '<'
// We need to consume it
self $(.$reader)? .consume(1);

let available = loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => n,
Expand All @@ -325,6 +326,48 @@ macro_rules! impl_buffered_source {
};
Ok(available.first().cloned())
}

/// Reads from the underlying reader into `buf` until a `StartElementParser`
/// reports the `>` that ends the start tag.
///
/// On success returns the length of the tag name and the bytes appended to
/// `buf` after the pushed `<`, i.e. the tag content without the final `>`.
/// If the reader is exhausted before `>` is found, returns the syntax error
/// reported by `StartElementParser::eof_error`. I/O errors other than
/// `Interrupted` are returned as `Error::Io`. On every exit path `position`
/// is increased by the value of `read`.
#[inline]
$($async)? fn read_start_element<'i>(&mut self, buf: &'i mut Vec<u8>, position: &mut u64) -> Result<(usize, &'i [u8])> {
let mut parser = StartElementParser::default();
// NOTE(review): starts at 1, presumably to account for the `<` consumed
// in peek_one() - confirm against the caller's offset bookkeeping
let mut read = 1;
// '<' was consumed in peek_one(), but not placed in buf
buf.push(b'<');

// The returned slice starts after the `<` pushed above
let start = buf.len();
loop {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
// An empty buffer means EOF: leave the loop and report a syntax error
Ok(n) if n.is_empty() => break,
Ok(n) => n,
// Interrupted reads are retried
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};

if let Some((name_len, consumed)) = parser.feed(available) {
// Copy everything up to, but not including, the `>`
buf.extend_from_slice(&available[..consumed]);

// +1 for `>` which we do not include
self $(.$reader)? .consume(consumed + 1);
read += consumed as u64 + 1;

*position += read;
return Ok((name_len, &buf[start..]));
}

// The `>` symbol not yet found, continue reading
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used as u64;
}

*position += read;
Err(Error::Syntax(parser.eof_error(&buf[start..])))
}
};
}

Expand Down
26 changes: 24 additions & 2 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -456,10 +456,10 @@ macro_rules! read_until_close {
},
// `<...` - opening or self-closed tag
Ok(Some(_)) => match $reader
.read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
.read_start_element($buf, &mut $self.state.offset)
$(.$await)?
{
Ok(bytes) => Ok($self.state.emit_start(bytes)),
Ok((name_len, bytes)) => Ok($self.state.emit_start(name_len, bytes)),
Err(e) => {
// We want to report error at `<`
$self.state.last_error_offset = start;
Expand Down Expand Up @@ -1136,6 +1136,28 @@ trait XmlSource<'r, B> {
/// Returns the next byte without consuming it, so that future `read_*` calls
/// will still include it. On EOF, returns `None`.
fn peek_one(&mut self) -> io::Result<Option<u8>>;

/// Reads input until the end of a start element is found.
///
/// This method expects that the start sequence of the element was already read.
///
/// Returns a tuple of the length of the tag name and a slice of the data
/// read up to the end of the start element.
///
/// If the input (`Self`) is exhausted before the end of the start element
/// is found, returns a `SyntaxError`.
///
/// # Parameters
/// - `buf`: Buffer that could be filled from an input (`Self`) and
/// from which [events] could borrow their data
/// - `position`: Will be increased by the amount of bytes consumed
///
/// [events]: crate::events::Event
fn read_start_element(
&mut self,
buf: B,
position: &mut u64,
) -> Result<(usize, &'r [u8]), Error>;
}

/// Possible elements started with `<!`
Expand Down
Loading