From dbfa0bd3152e0f899868034a61fc81664a253fb4 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 12 Jun 2024 01:10:51 +0500 Subject: [PATCH] Replace `BytesText::unescape` and `unescape_with` by `decode` Text events produces by the Reader can not contain escaped data anymore, all such data is represented by the Event::GeneralRef --- Changelog.md | 2 ++ benches/macrobenches.rs | 8 ++++---- benches/microbenches.rs | 2 +- fuzz/fuzz_targets/fuzz_target_1.rs | 2 +- src/de/mod.rs | 6 ++---- src/events/mod.rs | 28 ++++------------------------ src/reader/async_tokio.rs | 6 +++--- src/reader/buffered_reader.rs | 2 +- src/reader/mod.rs | 2 +- src/reader/ns_reader.rs | 8 ++++---- src/reader/slice_reader.rs | 2 +- tests/encodings.rs | 2 +- tests/fuzzing.rs | 2 +- tests/reader.rs | 2 +- tests/roundtrip.rs | 2 +- 15 files changed, 28 insertions(+), 48 deletions(-) diff --git a/Changelog.md b/Changelog.md index de3ad58e..d16a9ca3 100644 --- a/Changelog.md +++ b/Changelog.md @@ -57,6 +57,8 @@ XML specification. See the updated `custom_entities` example! - [#826]: Removed `DeError::InvalidInt`, `DeError::InvalidFloat` and `DeError::InvalidBoolean`. Now the responsibility for returning the error lies with the visitor of the type. See rationale in https://github.com/serde-rs/serde/pull/2811 +- [#766]: `BytesText::unescape` and `BytesText::unescape_with` replaced by `BytesText::decode`. + Now Text events does not contain escaped parts which are reported as `Event::GeneralRef`. [#227]: https://github.com/tafia/quick-xml/issues/227 [#655]: https://github.com/tafia/quick-xml/issues/655 diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 2b882b12..a89c34e4 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -54,7 +54,7 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> { } } Event::Text(e) => { - criterion::black_box(e.unescape()?); + criterion::black_box(e.decode()?); } Event::CData(e) => { criterion::black_box(e.into_inner()); @@ -79,7 +79,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { } } Event::Text(e) => { - criterion::black_box(e.unescape()?); + criterion::black_box(e.decode()?); } Event::CData(e) => { criterion::black_box(e.into_inner()); @@ -105,7 +105,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { } } (resolved_ns, Event::Text(e)) => { - criterion::black_box(e.unescape()?); + criterion::black_box(e.decode()?); criterion::black_box(resolved_ns); } (resolved_ns, Event::CData(e)) => { @@ -133,7 +133,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { } } (resolved_ns, Event::Text(e)) => { - criterion::black_box(e.unescape()?); + criterion::black_box(e.decode()?); criterion::black_box(resolved_ns); } (resolved_ns, Event::CData(e)) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 2f4ece04..498ad7a2 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -145,7 +145,7 @@ fn one_event(c: &mut Criterion) { config.trim_text(true); config.check_end_names = false; match r.read_event() { - Ok(Event::Comment(e)) => nbtxt += e.unescape().unwrap().len(), + Ok(Event::Comment(e)) => nbtxt += e.decode().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index d13e6081..dbadfe2f 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -43,7 +43,7 @@ where | Ok(Event::Comment(ref e)) | Ok(Event::DocType(ref e)) => { debug_format!(e); - if let Err(err) = e.unescape() { + if let Err(err) = e.decode() { debug_format!(err); break; } diff --git a/src/de/mod.rs b/src/de/mod.rs index 484c31b0..a5bacfae 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2223,9 +2223,7 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { // FIXME: Actually, we should trim after decoding text, but now we trim before e.inplace_trim_end(); } - result - .to_mut() - .push_str(&e.unescape_with(|entity| self.entity_resolver.resolve(entity))?); + result.to_mut().push_str(&e.decode()?); } PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?), @@ -2247,7 +2245,7 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { // FIXME: Actually, we should trim after decoding text, but now we trim before continue; } - self.drain_text(e.unescape_with(|entity| self.entity_resolver.resolve(entity))?) + self.drain_text(e.decode()?) } PayloadEvent::CData(e) => self.drain_text(e.decode()?), PayloadEvent::DocType(e) => { diff --git a/src/events/mod.rs b/src/events/mod.rs index c274085a..e8b46f15 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -47,10 +47,7 @@ use std::str::from_utf8; use crate::encoding::{Decoder, EncodingError}; use crate::errors::{Error, IllFormedError}; -use crate::escape::{ - escape, minimal_escape, parse_number, partial_escape, resolve_predefined_entity, unescape_with, - EscapeError, -}; +use crate::escape::{escape, minimal_escape, parse_number, partial_escape, EscapeError}; use crate::name::{LocalName, QName}; #[cfg(feature = "serialize")] use crate::utils::CowRef; @@ -580,29 +577,12 @@ impl<'a> BytesText<'a> { } } - /// Decodes then unescapes the content of the event. - /// - /// This will allocate if the value contains any escape sequences or in - /// non-UTF-8 encoding. - pub fn unescape(&self) -> Result, Error> { - self.unescape_with(resolve_predefined_entity) - } - - /// Decodes then unescapes the content of the event with custom entities. + /// Decodes the content of the event. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. - pub fn unescape_with<'entity>( - &self, - resolve_entity: impl FnMut(&str) -> Option<&'entity str>, - ) -> Result, Error> { - let decoded = self.decoder.decode_cow(&self.content)?; - - match unescape_with(&decoded, resolve_entity)? { - // Because result is borrowed, no replacements was done and we can use original string - Cow::Borrowed(_) => Ok(decoded), - Cow::Owned(s) => Ok(s.into()), - } + pub fn decode(&self) -> Result, EncodingError> { + self.decoder.decode_cow(&self.content) } /// Removes leading XML whitespace bytes from text content. diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index a9237de0..a79ced82 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -103,7 +103,7 @@ impl Reader { /// loop { /// match reader.read_event_into_async(&mut buf).await { /// Ok(Event::Start(_)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), + /// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()), /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), /// Ok(Event::Eof) => break, /// _ => (), @@ -237,7 +237,7 @@ impl NsReader { /// } /// } /// Event::Text(e) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// Event::Eof => break, /// _ => (), @@ -373,7 +373,7 @@ impl NsReader { /// (_, Event::Start(_)) => unreachable!(), /// /// (_, Event::Text(e)) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// (_, Event::Eof) => break, /// _ => (), diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index f268448c..44930420 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -372,7 +372,7 @@ impl Reader { /// loop { /// match reader.read_event_into(&mut buf) { /// Ok(Event::Start(_)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), + /// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()), /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), /// Ok(Event::Eof) => break, /// _ => (), diff --git a/src/reader/mod.rs b/src/reader/mod.rs index cf806e3e..f95327fb 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -718,7 +718,7 @@ where /// _ => (), /// } /// } -/// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), +/// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()), /// /// // There are several other `Event`s we do not consider here /// _ => (), diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 07220815..d9f84f62 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -419,7 +419,7 @@ impl NsReader { /// } /// } /// Event::Text(e) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// Event::Eof => break, /// _ => (), @@ -478,7 +478,7 @@ impl NsReader { /// (_, Event::Start(_)) => unreachable!(), /// /// (_, Event::Text(e)) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// (_, Event::Eof) => break, /// _ => (), @@ -664,7 +664,7 @@ impl<'i> NsReader<&'i [u8]> { /// } /// } /// Event::Text(e) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// Event::Eof => break, /// _ => (), @@ -726,7 +726,7 @@ impl<'i> NsReader<&'i [u8]> { /// (_, Event::Start(_)) => unreachable!(), /// /// (_, Event::Text(e)) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// (_, Event::Eof) => break, /// _ => (), diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 37439597..c3b501ec 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -62,7 +62,7 @@ impl<'a> Reader<&'a [u8]> { /// loop { /// match reader.read_event().unwrap() { /// Event::Start(e) => count += 1, - /// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()), + /// Event::Text(e) => txt.push(e.decode().unwrap().into_owned()), /// Event::Eof => break, /// _ => (), /// } diff --git a/tests/encodings.rs b/tests/encodings.rs index 5f5676fa..7b64e167 100644 --- a/tests/encodings.rs +++ b/tests/encodings.rs @@ -37,7 +37,7 @@ fn test_koi8_r_encoding() { loop { match r.read_event_into(&mut buf) { Ok(Text(e)) => { - e.unescape().unwrap(); + e.decode().unwrap(); } Ok(Eof) => break, _ => (), diff --git a/tests/fuzzing.rs b/tests/fuzzing.rs index 2740763c..25cf6989 100644 --- a/tests/fuzzing.rs +++ b/tests/fuzzing.rs @@ -38,7 +38,7 @@ fn fuzz_101() { } } Ok(Event::Text(e)) => { - if e.unescape().is_err() { + if e.decode().is_err() { break; } } diff --git a/tests/reader.rs b/tests/reader.rs index e05166ec..fecdeabc 100644 --- a/tests/reader.rs +++ b/tests/reader.rs @@ -172,7 +172,7 @@ fn test_escaped_content() { "content unexpected: expecting 'test', got '{:?}'", from_utf8(&e) ); - match e.unescape() { + match e.decode() { Ok(c) => assert_eq!(c, "test"), Err(e) => panic!( "cannot escape content at position {}: {:?}", diff --git a/tests/roundtrip.rs b/tests/roundtrip.rs index 68726195..4fb9ec53 100644 --- a/tests/roundtrip.rs +++ b/tests/roundtrip.rs @@ -236,7 +236,7 @@ fn reescape_text() { match reader.read_event().unwrap() { Eof => break, Text(e) => { - let t = e.unescape().unwrap(); + let t = e.decode().unwrap(); assert!(writer.write_event(Text(BytesText::new(&t))).is_ok()); } e => assert!(writer.write_event(e).is_ok()),