-
Notifications
You must be signed in to change notification settings - Fork 238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Rework handling general entity references (&entity;)
#766
base: master
Are you sure you want to change the base?
Changes from all commits
a6d486e
dfea110
08ec03a
094a88e
dcc3a6c
0631d47
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,7 +54,7 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> { | |
} | ||
} | ||
Event::Text(e) => { | ||
criterion::black_box(e.unescape()?); | ||
criterion::black_box(e.decode()?); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. About And would there be any way to, say, return the original raw unexpanded XML between two tags from that wrapper, or would that be impossible without dropping to the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I think,
I think it could be possible to implement that, but only if someone requested it. I think that this is a niche feature. The API could be a special method that needs to be called instead of |
||
} | ||
Event::CData(e) => { | ||
criterion::black_box(e.into_inner()); | ||
|
@@ -79,7 +79,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { | |
} | ||
} | ||
Event::Text(e) => { | ||
criterion::black_box(e.unescape()?); | ||
criterion::black_box(e.decode()?); | ||
} | ||
Event::CData(e) => { | ||
criterion::black_box(e.into_inner()); | ||
|
@@ -105,7 +105,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { | |
} | ||
} | ||
(resolved_ns, Event::Text(e)) => { | ||
criterion::black_box(e.unescape()?); | ||
criterion::black_box(e.decode()?); | ||
criterion::black_box(resolved_ns); | ||
} | ||
(resolved_ns, Event::CData(e)) => { | ||
|
@@ -133,7 +133,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { | |
} | ||
} | ||
(resolved_ns, Event::Text(e)) => { | ||
criterion::black_box(e.unescape()?); | ||
criterion::black_box(e.decode()?); | ||
criterion::black_box(resolved_ns); | ||
} | ||
(resolved_ns, Event::CData(e)) => { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,76 +1,207 @@ | ||
//! This example demonstrate how custom entities can be extracted from the DOCTYPE!, | ||
//! and later use to decode text and attribute values. | ||
//! This example demonstrates how custom entities can be extracted from the DOCTYPE, | ||
//! and later used to: | ||
//! - insert new pieces of document (particular case - insert only textual content) | ||
//! - decode attribute values | ||
//! | ||
//! NB: this example is deliberately kept simple: | ||
//! * it assumes that the XML file is UTF-8 encoded (custom_entities must only contain UTF-8 data) | ||
//! * it only handles internal entities; | ||
//! * the regex in this example is simple but brittle; | ||
//! * it does not support the use of entities in entity declaration. | ||
|
||
use std::collections::HashMap; | ||
use std::borrow::Cow; | ||
use std::collections::{HashMap, VecDeque}; | ||
use std::str::from_utf8; | ||
|
||
use quick_xml::escape::resolve_predefined_entity; | ||
use quick_xml::events::Event; | ||
use quick_xml::encoding::Decoder; | ||
use quick_xml::errors::Error; | ||
use quick_xml::escape::EscapeError; | ||
use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event}; | ||
use quick_xml::name::QName; | ||
use quick_xml::reader::Reader; | ||
use regex::bytes::Regex; | ||
|
||
const DATA: &str = r#" | ||
use pretty_assertions::assert_eq; | ||
|
||
<?xml version="1.0"?> | ||
<!DOCTYPE test [ | ||
<!ENTITY msg "hello world" > | ||
]> | ||
<test label="&msg;">&msg;</test> | ||
struct MyReader<'i> { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You mentioned a future PR that would implement this functionality on There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My point just being that the current example as-is is a bit much to expect people to implement or copy and paste, and I just want to check my understanding that it's not a long-term solution. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have not really decided yet whether to leave this example as it is, to demonstrate how the reader stack can be implemented if for some reason the standard solution does not work, or to rewrite it to use a new API. If it remains as it is, then there will be a mention of the standard way |
||
/// Stack of readers, the first element is the initial reader, the other are | ||
/// readers created for each resolved entity | ||
readers: VecDeque<Reader<&'i [u8]>>, | ||
/// Map of captured internal _parsed general entities_. _Parsed_ means that | ||
/// value of the entity is parsed by XML reader | ||
entities: HashMap<&'i [u8], &'i [u8]>, | ||
/// In this example we use simple regular expression to capture entities from DTD. | ||
/// In real application you should use DTD parser. | ||
entity_re: Regex, | ||
} | ||
impl<'i> MyReader<'i> { | ||
fn new(input: &'i str) -> Result<Self, regex::Error> { | ||
let mut reader = Reader::from_str(input); | ||
reader.config_mut().trim_text(true); | ||
|
||
"#; | ||
let mut readers = VecDeque::new(); | ||
readers.push_back(reader); | ||
|
||
fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
let mut reader = Reader::from_str(DATA); | ||
reader.config_mut().trim_text(true); | ||
|
||
let mut custom_entities: HashMap<String, String> = HashMap::new(); | ||
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?; | ||
|
||
loop { | ||
match reader.read_event() { | ||
Ok(Event::DocType(ref e)) => { | ||
for cap in entity_re.captures_iter(e) { | ||
custom_entities.insert( | ||
reader.decoder().decode(&cap[1])?.into_owned(), | ||
reader.decoder().decode(&cap[2])?.into_owned(), | ||
); | ||
} | ||
} | ||
Ok(Event::Start(ref e)) => { | ||
if let b"test" = e.name().as_ref() { | ||
let attributes = e | ||
.attributes() | ||
.map(|a| { | ||
a.unwrap() | ||
.decode_and_unescape_value_with(reader.decoder(), |ent| { | ||
custom_entities.get(ent).map(|s| s.as_str()) | ||
}) | ||
.unwrap() | ||
.into_owned() | ||
}) | ||
.collect::<Vec<_>>(); | ||
println!("attributes values: {:?}", attributes); | ||
// Capture "name" and "content" from such string: | ||
// <!ENTITY name "content" > | ||
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?; | ||
Ok(Self { | ||
readers, | ||
entities: HashMap::new(), | ||
entity_re, | ||
}) | ||
} | ||
fn read_event(&mut self) -> Result<Event<'i>, Error> { | ||
loop { | ||
if let Some(mut reader) = self.readers.pop_back() { | ||
match dbg!(reader.read_event())? { | ||
// Capture defined entities from the DTD inside document and skip that event | ||
Event::DocType(e) => { | ||
self.readers.push_back(reader); | ||
self.capture(e); | ||
continue; | ||
} | ||
// When entity is referenced, create new reader with the same settings as | ||
// the current reader have and push it to the top of stack. Then try to | ||
// read next event from it (on next iteration) | ||
Event::GeneralRef(e) => { | ||
if let Some(ch) = e.resolve_char_ref()? { | ||
self.readers.push_back(reader); | ||
return Ok(Event::Text(BytesText::from_escaped(ch.to_string()))); | ||
} | ||
let mut r = Reader::from_reader(self.resolve(&e)?); | ||
*r.config_mut() = reader.config().clone(); | ||
|
||
self.readers.push_back(reader); | ||
self.readers.push_back(r); | ||
continue; | ||
} | ||
// When reader is exhausted, do not return it to the stack | ||
Event::Eof => continue, | ||
|
||
// Return all other events to caller | ||
e => { | ||
self.readers.push_back(reader); | ||
return Ok(e); | ||
} | ||
} | ||
} | ||
Ok(Event::Text(ref e)) => { | ||
println!( | ||
"text value: {}", | ||
e.unescape_with(|ent| match custom_entities.get(ent) { | ||
Some(s) => Some(s.as_str()), | ||
None => resolve_predefined_entity(ent), | ||
}) | ||
.unwrap() | ||
); | ||
} | ||
Ok(Event::Eof) => break, | ||
Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), | ||
_ => (), | ||
return Ok(Event::Eof); | ||
} | ||
} | ||
|
||
/// In this example we use simple regular expression to capture entities from DTD. | ||
/// In real application you should use DTD parser | ||
fn capture(&mut self, doctype: BytesText<'i>) { | ||
let doctype = match doctype.into_inner() { | ||
Cow::Borrowed(doctype) => doctype, | ||
Cow::Owned(_) => unreachable!("We are sure that event will be borrowed"), | ||
}; | ||
for cap in self.entity_re.captures_iter(doctype) { | ||
self.entities.insert( | ||
cap.get(1).unwrap().as_bytes(), | ||
cap.get(2).unwrap().as_bytes(), | ||
); | ||
} | ||
} | ||
|
||
fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], EscapeError> { | ||
match self.entities.get(entity) { | ||
Some(replacement) => Ok(replacement), | ||
None => Err(EscapeError::UnrecognizedEntity( | ||
0..0, | ||
String::from_utf8_lossy(entity).into_owned(), | ||
)), | ||
} | ||
} | ||
|
||
fn get_entity(&self, entity: &str) -> Option<&'i str> { | ||
self.entities | ||
.get(entity.as_bytes()) | ||
// SAFETY: We are sure that slices are correct UTF-8 because we get | ||
// them from rust string | ||
.map(|value| from_utf8(value).unwrap()) | ||
} | ||
|
||
fn decoder(&self) -> Decoder { | ||
self.readers.back().unwrap().decoder() | ||
} | ||
} | ||
|
||
fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
let mut reader = MyReader::new( | ||
r#" | ||
<!DOCTYPE test [ | ||
<!ENTITY text "hello world" > | ||
<!ENTITY element1 "<dtd attr = 'Message: &text;'/>" > | ||
<!ENTITY element2 "<a>&element1;</a>" > | ||
]> | ||
<test label="Message: &text;">'&element2;'</test> | ||
"#, | ||
)?; | ||
|
||
let event = reader.read_event()?; | ||
assert_eq!( | ||
event, | ||
Event::Start(BytesStart::from_content( | ||
r#"test label="Message: &text;""#, | ||
4 | ||
)) | ||
); | ||
if let Event::Start(e) = event { | ||
let mut attrs = e.attributes(); | ||
|
||
let label = attrs.next().unwrap()?; | ||
assert_eq!(label.key, QName(b"label")); | ||
assert_eq!( | ||
label.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, | ||
"Message: hello world" | ||
); | ||
|
||
assert_eq!(attrs.next(), None); | ||
} | ||
|
||
// This is decoded decimal character reference ' | ||
assert_eq!( | ||
reader.read_event()?, | ||
Event::Text(BytesText::from_escaped("'")) | ||
); | ||
|
||
//-------------------------------------------------------------------------- | ||
// This part was inserted into original document from entity defined in DTD | ||
|
||
assert_eq!(reader.read_event()?, Event::Start(BytesStart::new("a"))); | ||
let event = reader.read_event()?; | ||
assert_eq!( | ||
event, | ||
Event::Empty(BytesStart::from_content( | ||
r#"dtd attr = 'Message: &text;'"#, | ||
3 | ||
)) | ||
); | ||
if let Event::Start(e) = event { | ||
let mut attrs = e.attributes(); | ||
|
||
let attr = attrs.next().unwrap()?; | ||
assert_eq!(attr.key, QName(b"attr")); | ||
assert_eq!( | ||
attr.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, | ||
"Message: hello world" | ||
); | ||
|
||
assert_eq!(attrs.next(), None); | ||
} | ||
assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("a"))); | ||
//-------------------------------------------------------------------------- | ||
|
||
// This is decoded hexadecimal character reference ' | ||
assert_eq!( | ||
reader.read_event()?, | ||
Event::Text(BytesText::from_escaped("'")) | ||
); | ||
|
||
assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("test"))); | ||
assert_eq!(reader.read_event()?, Event::Eof); | ||
|
||
Ok(()) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Which applications?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant case from #719 here