tafia · Mingun · Jul 7, 2024 · Jun 11, 2024 · Jun 21, 2024 · Jun 11, 2024
diff --git a/Changelog.md b/Changelog.md
@@ -13,12 +13,31 @@
 
 ## Unreleased
 
+### Significant changes
+
+References to entities (both predefined, such as `&lt;`, and user-defined) are now reported as a new
+`Event::GeneralRef`.
+The caller can parse the content of the entity and stream events from it, as required by the
+XML specification. See the updated `custom_entities` example!
+
 ### New Features
 
+- [#766]: Allow parsing resolved entities as XML fragments and streaming events from them.
+- [#766]: Added new event `Event::GeneralRef` with content of [general entity].
+- [#766]: Added new configuration option `allow_dangling_amp`, which allows
+  a `&` not followed by `;` in the textual data; some applications require this
+  for compatibility reasons.
+
 ### Bug Fixes
 
 ### Misc Changes
 
+- [#766]: `BytesText::unescape` and `BytesText::unescape_with` are replaced by `BytesText::decode`.
+  Text events no longer contain escaped parts; those are now reported as `Event::GeneralRef`.
+
+[#766]: https://github.com/tafia/quick-xml/pull/766
+[general entity]: https://www.w3.org/TR/xml11/#gen-entity
+
 
 ## 0.37.0 -- 2024-10-27
 

diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
@@ -54,7 +54,7 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> {
                 }
             }
             Event::Text(e) => {
-                criterion::black_box(e.unescape()?);
+                criterion::black_box(e.decode()?);
             }
             Event::CData(e) => {
                 criterion::black_box(e.into_inner());
@@ -79,7 +79,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
                 }
             }
             Event::Text(e) => {
-                criterion::black_box(e.unescape()?);
+                criterion::black_box(e.decode()?);
             }
             Event::CData(e) => {
                 criterion::black_box(e.into_inner());
@@ -105,7 +105,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
                 }
             }
             (resolved_ns, Event::Text(e)) => {
-                criterion::black_box(e.unescape()?);
+                criterion::black_box(e.decode()?);
                 criterion::black_box(resolved_ns);
             }
             (resolved_ns, Event::CData(e)) => {
@@ -133,7 +133,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> {
                 }
             }
             (resolved_ns, Event::Text(e)) => {
-                criterion::black_box(e.unescape()?);
+                criterion::black_box(e.decode()?);
                 criterion::black_box(resolved_ns);
             }
             (resolved_ns, Event::CData(e)) => {

diff --git a/benches/microbenches.rs b/benches/microbenches.rs
@@ -145,7 +145,7 @@ fn one_event(c: &mut Criterion) {
             config.trim_text(true);
             config.check_end_names = false;
             match r.read_event() {
-                Ok(Event::Comment(e)) => nbtxt += e.unescape().unwrap().len(),
+                Ok(Event::Comment(e)) => nbtxt += e.decode().unwrap().len(),
                 something_else => panic!("Did not expect {:?}", something_else),
             };
 

diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs
@@ -1,76 +1,207 @@
-//! This example demonstrate how custom entities can be extracted from the DOCTYPE!,
-//! and later use to decode text and attribute values.
+//! This example demonstrates how custom entities can be extracted from the DOCTYPE
+//! and later used to:
+//! - insert new pieces of the document (in the simplest case, only textual content)
+//! - decode attribute values
 //!
 //! NB: this example is deliberately kept simple:
 //! * it assumes that the XML file is UTF-8 encoded (custom_entities must only contain UTF-8 data)
 //! * it only handles internal entities;
 //! * the regex in this example is simple but brittle;
 //! * it does not support the use of entities in entity declaration.
 
-use std::collections::HashMap;
+use std::borrow::Cow;
+use std::collections::{HashMap, VecDeque};
+use std::str::from_utf8;
 
-use quick_xml::escape::resolve_predefined_entity;
-use quick_xml::events::Event;
+use quick_xml::encoding::Decoder;
+use quick_xml::errors::Error;
+use quick_xml::escape::EscapeError;
+use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
+use quick_xml::name::QName;
 use quick_xml::reader::Reader;
 use regex::bytes::Regex;
 
-const DATA: &str = r#"
+use pretty_assertions::assert_eq;
 
-    <?xml version="1.0"?>
-    <!DOCTYPE test [
-    <!ENTITY msg "hello world" >
-    ]>
-    <test label="&msg;">&msg;</test>
+struct MyReader<'i> {
+    /// Stack of readers: the first element is the initial reader, the others are
+    /// readers created for each resolved entity
+    readers: VecDeque<Reader<&'i [u8]>>,
+    /// Map of captured internal _parsed general entities_. _Parsed_ means that
+    /// the value of the entity is parsed by the XML reader
+    entities: HashMap<&'i [u8], &'i [u8]>,
+    /// In this example we use a simple regular expression to capture entities from the DTD.
+    /// In a real application you should use a DTD parser.
+    entity_re: Regex,
+}
+impl<'i> MyReader<'i> {
+    /// Creates a reader over `input` with text trimming enabled and an empty
+    /// entity map. Fails only if the entity regular expression does not compile.
+    fn new(input: &'i str) -> Result<Self, regex::Error> {
+        let mut reader = Reader::from_str(input);
+        reader.config_mut().trim_text(true);
 
-"#;
+        let mut readers = VecDeque::new();
+        readers.push_back(reader);
 
-fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let mut reader = Reader::from_str(DATA);
-    reader.config_mut().trim_text(true);
-
-    let mut custom_entities: HashMap<String, String> = HashMap::new();
-    let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
-
-    loop {
-        match reader.read_event() {
-            Ok(Event::DocType(ref e)) => {
-                for cap in entity_re.captures_iter(e) {
-                    custom_entities.insert(
-                        reader.decoder().decode(&cap[1])?.into_owned(),
-                        reader.decoder().decode(&cap[2])?.into_owned(),
-                    );
-                }
-            }
-            Ok(Event::Start(ref e)) => {
-                if let b"test" = e.name().as_ref() {
-                    let attributes = e
-                        .attributes()
-                        .map(|a| {
-                            a.unwrap()
-                                .decode_and_unescape_value_with(reader.decoder(), |ent| {
-                                    custom_entities.get(ent).map(|s| s.as_str())
-                                })
-                                .unwrap()
-                                .into_owned()
-                        })
-                        .collect::<Vec<_>>();
-                    println!("attributes values: {:?}", attributes);
+        // Capture "name" and "content" from such string:
+        // <!ENTITY name "content" >
+        let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
+        Ok(Self {
+            readers,
+            entities: HashMap::new(),
+            entity_re,
+        })
+    }
+
+    /// Reads the next event, transparently descending into referenced entities.
+    ///
+    /// `DocType` events are consumed to capture entity declarations, character
+    /// references are returned as `Text` events, and any other general reference
+    /// is replaced by the events parsed from the entity's replacement text.
+    /// Returns `Event::Eof` once every reader on the stack is exhausted.
+    fn read_event(&mut self) -> Result<Event<'i>, Error> {
+        loop {
+            if let Some(mut reader) = self.readers.pop_back() {
+                match reader.read_event()? {
+                    // Capture defined entities from the DTD inside document and skip that event
+                    Event::DocType(e) => {
+                        self.readers.push_back(reader);
+                        self.capture(e);
+                        continue;
+                    }
+                    // When entity is referenced, create new reader with the same settings as
+                    // the current reader has and push it to the top of stack. Then try to
+                    // read next event from it (on next iteration)
+                    Event::GeneralRef(e) => {
+                        if let Some(ch) = e.resolve_char_ref()? {
+                            self.readers.push_back(reader);
+                            return Ok(Event::Text(BytesText::from_escaped(ch.to_string())));
+                        }
+                        let mut r = Reader::from_reader(self.resolve(&e)?);
+                        *r.config_mut() = reader.config().clone();
+
+                        self.readers.push_back(reader);
+                        self.readers.push_back(r);
+                        continue;
+                    }
+                    // When reader is exhausted, do not return it to the stack
+                    Event::Eof => continue,
+
+                    // Return all other events to caller
+                    e => {
+                        self.readers.push_back(reader);
+                        return Ok(e);
+                    }
                 }
             }
-            Ok(Event::Text(ref e)) => {
-                println!(
-                    "text value: {}",
-                    e.unescape_with(|ent| match custom_entities.get(ent) {
-                        Some(s) => Some(s.as_str()),
-                        None => resolve_predefined_entity(ent),
-                    })
-                    .unwrap()
-                );
-            }
-            Ok(Event::Eof) => break,
-            Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
-            _ => (),
+            return Ok(Event::Eof);
         }
     }
+
+    /// Records every `<!ENTITY name "content" >` declaration found in `doctype`.
+    ///
+    /// In this example we use simple regular expression to capture entities from DTD.
+    /// In real application you should use DTD parser
+    fn capture(&mut self, doctype: BytesText<'i>) {
+        let doctype = match doctype.into_inner() {
+            Cow::Borrowed(doctype) => doctype,
+            Cow::Owned(_) => unreachable!("We are sure that event will be borrowed"),
+        };
+        for cap in self.entity_re.captures_iter(doctype) {
+            self.entities.insert(
+                cap.get(1).unwrap().as_bytes(),
+                cap.get(2).unwrap().as_bytes(),
+            );
+        }
+    }
+
+    /// Returns the replacement text captured for `entity`, or
+    /// `EscapeError::UnrecognizedEntity` if no such entity was declared.
+    fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], EscapeError> {
+        self.entities.get(entity).copied().ok_or_else(|| {
+            EscapeError::UnrecognizedEntity(0..0, String::from_utf8_lossy(entity).into_owned())
+        })
+    }
+
+    /// Looks up the replacement text of `entity` as a string slice, if it was captured.
+    fn get_entity(&self, entity: &str) -> Option<&'i str> {
+        self.entities
+            .get(entity.as_bytes())
+            // Captured values are slices of the `&str` passed to `new`, so they
+            // are expected to be valid UTF-8; a failure here would be a bug
+            .map(|value| from_utf8(value).expect("entity value is a slice of UTF-8 input"))
+    }
+
+    /// Returns the decoder of the reader currently on top of the stack.
+    ///
+    /// Panics if the stack is empty, i.e. after all readers were exhausted.
+    fn decoder(&self) -> Decoder {
+        self.readers.back().unwrap().decoder()
+    }
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut reader = MyReader::new(
+        r#"
+        <!DOCTYPE test [
+        <!ENTITY text "hello world" >
+        <!ENTITY element1 "<dtd attr = 'Message: &text;'/>" >
+        <!ENTITY element2 "<a>&element1;</a>" >
+        ]>
+        <test label="Message: &text;">&#39;&element2;&#x27;</test>
+        "#,
+    )?;
+
+    let event = reader.read_event()?;
+    assert_eq!(
+        event,
+        Event::Start(BytesStart::from_content(
+            r#"test label="Message: &text;""#,
+            4
+        ))
+    );
+    if let Event::Start(e) = event {
+        let mut attrs = e.attributes();
+
+        let label = attrs.next().unwrap()?;
+        assert_eq!(label.key, QName(b"label"));
+        assert_eq!(
+            label.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?,
+            "Message: hello world"
+        );
+
+        assert_eq!(attrs.next(), None);
+    }
+
+    // This is decoded decimal character reference &#39;
+    assert_eq!(
+        reader.read_event()?,
+        Event::Text(BytesText::from_escaped("'"))
+    );
+
+    //--------------------------------------------------------------------------
+    // This part was inserted into original document from entity defined in DTD
+
+    assert_eq!(reader.read_event()?, Event::Start(BytesStart::new("a")));
+    let event = reader.read_event()?;
+    assert_eq!(
+        event,
+        Event::Empty(BytesStart::from_content(
+            r#"dtd attr = 'Message: &text;'"#,
+            3
+        ))
+    );
+    // `<dtd .../>` is self-closed, so it is reported as `Empty`, not `Start`
+    if let Event::Empty(e) = event {
+        let mut attrs = e.attributes();
+
+        let attr = attrs.next().unwrap()?;
+        assert_eq!(attr.key, QName(b"attr"));
+        assert_eq!(
+            attr.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?,
+            "Message: hello world"
+        );
+
+        assert_eq!(attrs.next(), None);
+    }
+    assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("a")));
+    //--------------------------------------------------------------------------
+
+    // This is decoded hexadecimal character reference &#x27;
+    assert_eq!(
+        reader.read_event()?,
+        Event::Text(BytesText::from_escaped("'"))
+    );
+
+    assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("test")));
+    assert_eq!(reader.read_event()?, Event::Eof);
+
     Ok(())
 }
diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs
@@ -43,7 +43,7 @@ where
             | Ok(Event::Comment(ref e))
             | Ok(Event::DocType(ref e)) => {
                 debug_format!(e);
-                if let Err(err) = e.unescape() {
+                if let Err(err) = e.decode() {
                     debug_format!(err);
                     break;
                 }
@@ -55,6 +55,11 @@ where
                     break;
                 }
             }
+            Ok(Event::GeneralRef(ref e)) => {
+                debug_format!(e);
+                debug_format!(e.is_char_ref());
+                debug_format!(e.resolve_char_ref());
+            }
             Ok(Event::PI(ref e)) => {
                 debug_format!(e);
             }