From 0a6ecd649848007d4c9222f69bae0f7dc0ae40fc Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 6 Jun 2024 21:49:18 +0500 Subject: [PATCH] Add reusable parser for XML element and use it internally --- Changelog.md | 2 + src/reader/async_tokio.rs | 4 +- src/reader/buffered_reader.rs | 13 ++-- src/reader/element.rs | 113 ++++++++++++++++++++++++++++++++++ src/reader/mod.rs | 36 +---------- src/reader/slice_reader.rs | 13 ++-- 6 files changed, 132 insertions(+), 49 deletions(-) create mode 100644 src/reader/element.rs diff --git a/Changelog.md b/Changelog.md index 08be8fdd..71cd2491 100644 --- a/Changelog.md +++ b/Changelog.md @@ -43,6 +43,7 @@ resolve predefined entities. - `quick_xml::escape::resolve_xml_entity` - `quick_xml::escape::resolve_html5_entity` - [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`. +- [#754]: Added parser for elements: `quick_xml::reader::ElementParser`. ### Bug Fixes @@ -101,6 +102,7 @@ resolve predefined entities. [#743]: https://github.com/tafia/quick-xml/pull/743 [#748]: https://github.com/tafia/quick-xml/pull/748 [#753]: https://github.com/tafia/quick-xml/pull/753 +[#754]: https://github.com/tafia/quick-xml/pull/754 [`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html [`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html [`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 1cdab220..7337540f 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -8,9 +8,7 @@ use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::{QName, ResolveResult}; use crate::reader::buffered_reader::impl_buffered_source; -use crate::reader::{ - is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span, -}; +use crate::reader::{is_whitespace, BangType, ElementParser, NsReader, ParseState, Reader, Span}; /// A 
struct for read XML asynchronously from an [`AsyncBufRead`]. /// diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 58bb24a2..182e83b7 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -8,7 +8,7 @@ use std::path::Path; use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, ElementParser, Reader, Span, XmlSource}; macro_rules! impl_buffered_source { ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { @@ -190,7 +190,7 @@ macro_rules! impl_buffered_source { buf: &'b mut Vec, position: &mut usize, ) -> Result<&'b [u8]> { - let mut state = ReadElementState::Elem; + let mut parser = ElementParser::default(); let mut read = 0; let start = buf.len(); @@ -198,11 +198,12 @@ macro_rules! impl_buffered_source { match self $(.$reader)? .fill_buf() $(.$await)? { Ok(n) if n.is_empty() => break, Ok(available) => { - if let Some((consumed, used)) = state.change(available) { - buf.extend_from_slice(consumed); + if let Some(used) = parser.feed(available) { + buf.extend_from_slice(&available[..used]); - self $(.$reader)? .consume(used); - read += used; + // +1 for `>` which we do not include + self $(.$reader)? .consume(used + 1); + read += used + 1; // Position now just after the `>` symbol *position += read; diff --git a/src/reader/element.rs b/src/reader/element.rs new file mode 100644 index 00000000..e5d14e7c --- /dev/null +++ b/src/reader/element.rs @@ -0,0 +1,113 @@ +//! Contains a parser for an XML element. + +/// A parser that search a `>` symbol in the slice outside of quoted regions. +/// +/// The parser considers two quoted regions: a double-quoted (`"..."`) and +/// a single-quoted (`'...'`) region. Matches found inside those regions are not +/// considered as results. 
Each region starts and ends by its quote symbol,
+/// which cannot be escaped (but can be encoded as XML character entity or named
+/// entity. Anyway, that encoding does not contain literal quotes).
+///
+/// To use a parser create an instance of parser and [`feed`] data into it.
+/// After successful search the parser will return [`Some`] with position of
+/// found symbol. If search is unsuccessful, a [`None`] will be returned. You
+/// typically would expect positive result of search, so that you should feed
+/// new data until you get it.
+///
+/// NOTE: after successful match the parser does not return to the initial
+/// state and should not be used anymore. Create a new parser if you want to perform
+/// new search.
+///
+/// # Example
+///
+/// ```
+/// # use quick_xml::reader::ElementParser;
+/// # use pretty_assertions::assert_eq;
+/// let mut parser = ElementParser::default();
+///
+/// // Parse `<my-element with = 'some > inside'>and the text follow...`
+/// // split into three chunks
+/// assert_eq!(parser.feed(b"<my-element"), None);
+/// // ...get new chunk of data
+/// assert_eq!(parser.feed(b" with = 'some >"), None);
+/// // ...get another chunk of data
+/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some(8));
+/// //                       ^       ^
+/// //                       0       8
+/// ```
+///
+/// [`feed`]: Self::feed()
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum ElementParser {
+    /// The initial state (inside element, but outside of attribute value).
+    Outside,
+    /// Inside a single-quoted region (`'...'`).
+    SingleQ,
+    /// Inside a double-quoted region (`"..."`).
+    DoubleQ,
+}
+
+impl ElementParser {
+    /// Returns the index of the closing `>` (not counted itself) or `None` if it was not found in `bytes`.
+    #[inline]
+    pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
+        for i in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) {
+            *self = match (*self, bytes[i]) {
+                // only allowed to match `>` while we are in state `Outside`
+                (Self::Outside, b'>') => return Some(i),
+                (Self::Outside, b'\'') => Self::SingleQ,
+                (Self::Outside, b'\"') => Self::DoubleQ,
+
+                // the only end_byte that gets us out is the same character
+                (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Outside,
+
+                // all other bytes: no state change
+                _ => continue,
+            };
+        }
+        None
+    }
+}
+
+impl Default for ElementParser {
+    #[inline]
+    fn default() -> Self {
+        Self::Outside
+    }
+}
+
+#[test]
+fn parse() {
+    use pretty_assertions::assert_eq;
+    use ElementParser::*;
+
+    /// Returns `Ok(pos)` with the position in the buffer where the element ends.
+    ///
+    /// Returns `Err(internal_state)` if parsing is not done yet.
+    fn parse_element(bytes: &[u8], mut parser: ElementParser) -> Result<usize, ElementParser> {
+        match parser.feed(bytes) {
+            Some(i) => Ok(i),
+            None => Err(parser),
+        }
+    }
+
+    assert_eq!(parse_element(b"", Outside), Err(Outside));
+    assert_eq!(parse_element(b"", SingleQ), Err(SingleQ));
+    assert_eq!(parse_element(b"", DoubleQ), Err(DoubleQ));
+
+    assert_eq!(parse_element(b"'", Outside), Err(SingleQ));
+    assert_eq!(parse_element(b"'", SingleQ), Err(Outside));
+    assert_eq!(parse_element(b"'", DoubleQ), Err(DoubleQ));
+
+    assert_eq!(parse_element(b"\"", Outside), Err(DoubleQ));
+    assert_eq!(parse_element(b"\"", SingleQ), Err(SingleQ));
+    assert_eq!(parse_element(b"\"", DoubleQ), Err(Outside));
+
+    assert_eq!(parse_element(b">", Outside), Ok(0));
+    assert_eq!(parse_element(b">", SingleQ), Err(SingleQ));
+    assert_eq!(parse_element(b">", DoubleQ), Err(DoubleQ));
+
+    assert_eq!(parse_element(b"''>", Outside), Ok(2));
+    assert_eq!(parse_element(b"''>", SingleQ), Err(SingleQ));
+    assert_eq!(parse_element(b"''>", DoubleQ), Err(DoubleQ));
+}
diff --git a/src/reader/mod.rs b/src/reader/mod.rs
index d5cfa237..97b21e06
100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -426,11 +426,13 @@ macro_rules! read_to_end { #[cfg(feature = "async-tokio")] mod async_tokio; mod buffered_reader; +mod element; mod ns_reader; mod pi; mod slice_reader; mod state; +pub use element::ElementParser; pub use ns_reader::NsReader; pub use pi::PiParser; @@ -986,40 +988,6 @@ impl BangType { } } -/// State machine for the [`XmlSource::read_element`] -#[derive(Clone, Copy)] -enum ReadElementState { - /// The initial state (inside element, but outside of attribute value) - Elem, - /// Inside a single-quoted attribute value - SingleQ, - /// Inside a double-quoted attribute value - DoubleQ, -} -impl ReadElementState { - /// Changes state by analyzing part of input. - /// Returns a tuple with part of chunk up to element closing symbol `>` - /// and a position after that symbol or `None` if such symbol was not found - #[inline(always)] - fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) { - *self = match (*self, chunk[i]) { - // only allowed to match `>` while we are in state `Elem` - (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)), - (Self::Elem, b'\'') => Self::SingleQ, - (Self::Elem, b'\"') => Self::DoubleQ, - - // the only end_byte that gets us out if the same character - (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem, - - // all other bytes: no state change - _ => *self, - }; - } - None - } -} - /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) const fn is_whitespace(b: u8) -> bool { diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index ad2925b3..e6e89175 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -12,7 +12,7 @@ use encoding_rs::{Encoding, UTF_8}; use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::QName; -use 
crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, ElementParser, PiParser, Reader, Span, XmlSource}; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. /// This implementation supports not using an intermediate buffer as the byte slice @@ -312,12 +312,13 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> { - let mut state = ReadElementState::Elem; + let mut parser = ElementParser::default(); - if let Some((bytes, i)) = state.change(self) { - // Position now just after the `>` symbol - *position += i; - *self = &self[i..]; + if let Some(i) = parser.feed(self) { + // +1 for `>` which we do not include + *position += i + 1; + let bytes = &self[..i]; + *self = &self[i + 1..]; return Ok(bytes); }