Skip to content

Commit 7b9f455

Browse files
authored
Merge pull request #766 from Mingun/entity-ref
Rework handling general entity references (`&entity;`)
2 parents f9c4309 + 69d0020 commit 7b9f455

26 files changed

+1569
-146
lines changed

Changelog.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,36 @@
1313

1414
## Unreleased
1515

16+
### Significant changes
17+
18+
Now references to entities (both predefined, such as `&lt;`, and user-defined) are reported as a new
19+
`Event::GeneralRef`.
20+
The caller can parse the content of the entity and stream events from it, as required by the
21+
XML specification. See the updated `custom_entities` example!
22+
1623
### New Features
1724

1825
- [#863]: Add `Attributes::into_map_access(&str)` and `Attributes::into_deserializer()` when `serialize`
1926
feature is enabled. This allows deserializing serde types directly from attributes. Both methods
2027
return the same type, which implements serde's `Deserializer` and `MapAccess` traits.
28+
- [#766]: Allow to parse resolved entities as XML fragments and stream events from them.
29+
- [#766]: Added new event `Event::GeneralRef` with content of [general entity].
30+
- [#766]: Added new configuration option `allow_dangling_amp` which allows having
31+
a `&` not followed by `;` in the textual data which is required for some applications
32+
for compatibility reasons.
2133

2234
### Bug Fixes
2335

2436
### Misc Changes
2537

2638
- [#863]: Remove `From<QName<'a>> for BytesStart<'a>` because now `BytesStart` stores the
2739
encoding in which its data is encoded, but `QName` is a simple wrapper around byte slice.
40+
- [#766]: `BytesText::unescape` and `BytesText::unescape_with` were replaced by `BytesText::decode`.
41+
Now Text events do not contain escaped parts, which are reported as `Event::GeneralRef` instead.
2842

43+
[#766]: https://github.com/tafia/quick-xml/pull/766
2944
[#863]: https://github.com/tafia/quick-xml/pull/863
45+
[general entity]: https://www.w3.org/TR/xml11/#gen-entity
3046

3147

3248
## 0.37.5 -- 2025-04-27

benches/macrobenches.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> {
5454
}
5555
}
5656
Event::Text(e) => {
57-
criterion::black_box(e.unescape()?);
57+
criterion::black_box(e.decode()?);
5858
}
5959
Event::CData(e) => {
6060
criterion::black_box(e.into_inner());
@@ -79,7 +79,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
7979
}
8080
}
8181
Event::Text(e) => {
82-
criterion::black_box(e.unescape()?);
82+
criterion::black_box(e.decode()?);
8383
}
8484
Event::CData(e) => {
8585
criterion::black_box(e.into_inner());
@@ -105,7 +105,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
105105
}
106106
}
107107
(resolved_ns, Event::Text(e)) => {
108-
criterion::black_box(e.unescape()?);
108+
criterion::black_box(e.decode()?);
109109
criterion::black_box(resolved_ns);
110110
}
111111
(resolved_ns, Event::CData(e)) => {
@@ -133,7 +133,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> {
133133
}
134134
}
135135
(resolved_ns, Event::Text(e)) => {
136-
criterion::black_box(e.unescape()?);
136+
criterion::black_box(e.decode()?);
137137
criterion::black_box(resolved_ns);
138138
}
139139
(resolved_ns, Event::CData(e)) => {

benches/microbenches.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ fn one_event(c: &mut Criterion) {
145145
config.trim_text(true);
146146
config.check_end_names = false;
147147
match r.read_event() {
148-
Ok(Event::Comment(e)) => nbtxt += e.unescape().unwrap().len(),
148+
Ok(Event::Comment(e)) => nbtxt += e.decode().unwrap().len(),
149149
something_else => panic!("Did not expect {:?}", something_else),
150150
};
151151

examples/custom_entities.rs

Lines changed: 187 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,207 @@
1-
//! This example demonstrate how custom entities can be extracted from the DOCTYPE!,
2-
//! and later use to decode text and attribute values.
1+
//! This example demonstrates how custom entities can be extracted from the DOCTYPE,
2+
//! and later used to:
3+
//! - insert new pieces of document (particular case - insert only textual content)
4+
//! - decode attribute values
35
//!
46
//! NB: this example is deliberately kept simple:
57
//! * it assumes that the XML file is UTF-8 encoded (custom_entities must only contain UTF-8 data)
68
//! * it only handles internal entities;
79
//! * the regex in this example is simple but brittle;
810
//! * it does not support the use of entities in entity declaration.
911
10-
use std::collections::HashMap;
12+
use std::borrow::Cow;
13+
use std::collections::{HashMap, VecDeque};
14+
use std::str::from_utf8;
1115

12-
use quick_xml::escape::resolve_predefined_entity;
13-
use quick_xml::events::Event;
16+
use quick_xml::encoding::Decoder;
17+
use quick_xml::errors::Error;
18+
use quick_xml::escape::EscapeError;
19+
use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
20+
use quick_xml::name::QName;
1421
use quick_xml::reader::Reader;
1522
use regex::bytes::Regex;
1623

17-
const DATA: &str = r#"
24+
use pretty_assertions::assert_eq;
1825

19-
<?xml version="1.0"?>
20-
<!DOCTYPE test [
21-
<!ENTITY msg "hello world" >
22-
]>
23-
<test label="&msg;">&msg;</test>
26+
struct MyReader<'i> {
27+
/// Stack of readers, the first element is the initial reader, the other are
28+
/// readers created for each resolved entity
29+
readers: VecDeque<Reader<&'i [u8]>>,
30+
/// Map of captured internal _parsed general entities_. _Parsed_ means that
31+
/// value of the entity is parsed by XML reader
32+
entities: HashMap<&'i [u8], &'i [u8]>,
33+
/// In this example we use simple regular expression to capture entities from DTD.
34+
/// In real application you should use DTD parser.
35+
entity_re: Regex,
36+
}
37+
impl<'i> MyReader<'i> {
38+
fn new(input: &'i str) -> Result<Self, regex::Error> {
39+
let mut reader = Reader::from_str(input);
40+
reader.config_mut().trim_text(true);
2441

25-
"#;
42+
let mut readers = VecDeque::new();
43+
readers.push_back(reader);
2644

27-
fn main() -> Result<(), Box<dyn std::error::Error>> {
28-
let mut reader = Reader::from_str(DATA);
29-
reader.config_mut().trim_text(true);
30-
31-
let mut custom_entities: HashMap<String, String> = HashMap::new();
32-
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
33-
34-
loop {
35-
match reader.read_event() {
36-
Ok(Event::DocType(ref e)) => {
37-
for cap in entity_re.captures_iter(e) {
38-
custom_entities.insert(
39-
reader.decoder().decode(&cap[1])?.into_owned(),
40-
reader.decoder().decode(&cap[2])?.into_owned(),
41-
);
42-
}
43-
}
44-
Ok(Event::Start(ref e)) => {
45-
if let b"test" = e.name().as_ref() {
46-
let attributes = e
47-
.attributes()
48-
.map(|a| {
49-
a.unwrap()
50-
.decode_and_unescape_value_with(reader.decoder(), |ent| {
51-
custom_entities.get(ent).map(|s| s.as_str())
52-
})
53-
.unwrap()
54-
.into_owned()
55-
})
56-
.collect::<Vec<_>>();
57-
println!("attributes values: {:?}", attributes);
45+
// Capture "name" and "content" from such string:
46+
// <!ENTITY name "content" >
47+
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
48+
Ok(Self {
49+
readers,
50+
entities: HashMap::new(),
51+
entity_re,
52+
})
53+
}
54+
fn read_event(&mut self) -> Result<Event<'i>, Error> {
55+
loop {
56+
if let Some(mut reader) = self.readers.pop_back() {
57+
match dbg!(reader.read_event())? {
58+
// Capture defined entities from the DTD inside document and skip that event
59+
Event::DocType(e) => {
60+
self.readers.push_back(reader);
61+
self.capture(e);
62+
continue;
63+
}
64+
// When entity is referenced, create new reader with the same settings as
65+
// the current reader has and push it to the top of the stack. Then try to
66+
// read next event from it (on next iteration)
67+
Event::GeneralRef(e) => {
68+
if let Some(ch) = e.resolve_char_ref()? {
69+
self.readers.push_back(reader);
70+
return Ok(Event::Text(BytesText::from_escaped(ch.to_string())));
71+
}
72+
let mut r = Reader::from_reader(self.resolve(&e)?);
73+
*r.config_mut() = reader.config().clone();
74+
75+
self.readers.push_back(reader);
76+
self.readers.push_back(r);
77+
continue;
78+
}
79+
// When reader is exhausted, do not return it to the stack
80+
Event::Eof => continue,
81+
82+
// Return all other events to caller
83+
e => {
84+
self.readers.push_back(reader);
85+
return Ok(e);
86+
}
5887
}
5988
}
60-
Ok(Event::Text(ref e)) => {
61-
println!(
62-
"text value: {}",
63-
e.unescape_with(|ent| match custom_entities.get(ent) {
64-
Some(s) => Some(s.as_str()),
65-
None => resolve_predefined_entity(ent),
66-
})
67-
.unwrap()
68-
);
69-
}
70-
Ok(Event::Eof) => break,
71-
Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
72-
_ => (),
89+
return Ok(Event::Eof);
7390
}
7491
}
92+
93+
/// In this example we use simple regular expression to capture entities from DTD.
94+
/// In real application you should use DTD parser
95+
fn capture(&mut self, doctype: BytesText<'i>) {
96+
let doctype = match doctype.into_inner() {
97+
Cow::Borrowed(doctype) => doctype,
98+
Cow::Owned(_) => unreachable!("We are sure that event will be borrowed"),
99+
};
100+
for cap in self.entity_re.captures_iter(doctype) {
101+
self.entities.insert(
102+
cap.get(1).unwrap().as_bytes(),
103+
cap.get(2).unwrap().as_bytes(),
104+
);
105+
}
106+
}
107+
108+
fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], EscapeError> {
109+
match self.entities.get(entity) {
110+
Some(replacement) => Ok(replacement),
111+
None => Err(EscapeError::UnrecognizedEntity(
112+
0..0,
113+
String::from_utf8_lossy(entity).into_owned(),
114+
)),
115+
}
116+
}
117+
118+
fn get_entity(&self, entity: &str) -> Option<&'i str> {
119+
self.entities
120+
.get(entity.as_bytes())
121+
// SAFETY: We are sure that slices are correct UTF-8 because we get
122+
// them from rust string
123+
.map(|value| from_utf8(value).unwrap())
124+
}
125+
126+
fn decoder(&self) -> Decoder {
127+
self.readers.back().unwrap().decoder()
128+
}
129+
}
130+
131+
fn main() -> Result<(), Box<dyn std::error::Error>> {
132+
let mut reader = MyReader::new(
133+
r#"
134+
<!DOCTYPE test [
135+
<!ENTITY text "hello world" >
136+
<!ENTITY element1 "<dtd attr = 'Message: &text;'/>" >
137+
<!ENTITY element2 "<a>&element1;</a>" >
138+
]>
139+
<test label="Message: &text;">&#39;&element2;&#x27;</test>
140+
"#,
141+
)?;
142+
143+
let event = reader.read_event()?;
144+
assert_eq!(
145+
event,
146+
Event::Start(BytesStart::from_content(
147+
r#"test label="Message: &text;""#,
148+
4
149+
))
150+
);
151+
if let Event::Start(e) = event {
152+
let mut attrs = e.attributes();
153+
154+
let label = attrs.next().unwrap()?;
155+
assert_eq!(label.key, QName(b"label"));
156+
assert_eq!(
157+
label.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?,
158+
"Message: hello world"
159+
);
160+
161+
assert_eq!(attrs.next(), None);
162+
}
163+
164+
// This is decoded decimal character reference &#39;
165+
assert_eq!(
166+
reader.read_event()?,
167+
Event::Text(BytesText::from_escaped("'"))
168+
);
169+
170+
//--------------------------------------------------------------------------
171+
// This part was inserted into original document from entity defined in DTD
172+
173+
assert_eq!(reader.read_event()?, Event::Start(BytesStart::new("a")));
174+
let event = reader.read_event()?;
175+
assert_eq!(
176+
event,
177+
Event::Empty(BytesStart::from_content(
178+
r#"dtd attr = 'Message: &text;'"#,
179+
3
180+
))
181+
);
182+
if let Event::Start(e) = event {
183+
let mut attrs = e.attributes();
184+
185+
let attr = attrs.next().unwrap()?;
186+
assert_eq!(attr.key, QName(b"attr"));
187+
assert_eq!(
188+
attr.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?,
189+
"Message: hello world"
190+
);
191+
192+
assert_eq!(attrs.next(), None);
193+
}
194+
assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("a")));
195+
//--------------------------------------------------------------------------
196+
197+
// This is decoded hexadecimal character reference &#x27;
198+
assert_eq!(
199+
reader.read_event()?,
200+
Event::Text(BytesText::from_escaped("'"))
201+
);
202+
203+
assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("test")));
204+
assert_eq!(reader.read_event()?, Event::Eof);
205+
75206
Ok(())
76207
}

fuzz/fuzz_targets/fuzz_target_1.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ where
4343
| Ok(Event::Comment(ref e))
4444
| Ok(Event::DocType(ref e)) => {
4545
debug_format!(e);
46-
if let Err(err) = e.unescape() {
46+
if let Err(err) = e.decode() {
4747
debug_format!(err);
4848
break;
4949
}
@@ -55,6 +55,11 @@ where
5555
break;
5656
}
5757
}
58+
Ok(Event::GeneralRef(ref e)) => {
59+
debug_format!(e);
60+
debug_format!(e.is_char_ref());
61+
debug_format!(e.resolve_char_ref());
62+
}
5863
Ok(Event::PI(ref e)) => {
5964
debug_format!(e);
6065
}

0 commit comments

Comments
 (0)