|
1 |
| -//! This example demonstrate how custom entities can be extracted from the DOCTYPE!, |
2 |
| -//! and later use to decode text and attribute values. |
| 1 | +//! This example demonstrates how custom entities can be extracted from the DOCTYPE, |
| 2 | +//! and later used to: |
| 3 | +//! - insert new pieces of document (particular case - insert only textual content) |
| 4 | +//! - decode attribute values |
3 | 5 | //!
|
4 | 6 | //! NB: this example is deliberately kept simple:
|
5 | 7 | //! * it assumes that the XML file is UTF-8 encoded (custom_entities must only contain UTF-8 data)
|
6 | 8 | //! * it only handles internal entities;
|
7 | 9 | //! * the regex in this example is simple but brittle;
|
8 | 10 | //! * it does not support the use of entities in entity declaration.
|
9 | 11 |
|
10 |
| -use std::collections::HashMap; |
| 12 | +use std::borrow::Cow; |
| 13 | +use std::collections::{HashMap, VecDeque}; |
| 14 | +use std::str::from_utf8; |
11 | 15 |
|
12 |
| -use quick_xml::escape::resolve_predefined_entity; |
13 |
| -use quick_xml::events::Event; |
| 16 | +use quick_xml::encoding::Decoder; |
| 17 | +use quick_xml::errors::Error; |
| 18 | +use quick_xml::escape::EscapeError; |
| 19 | +use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event}; |
| 20 | +use quick_xml::name::QName; |
14 | 21 | use quick_xml::reader::Reader;
|
15 | 22 | use regex::bytes::Regex;
|
16 | 23 |
|
17 |
| -const DATA: &str = r#" |
| 24 | +use pretty_assertions::assert_eq; |
18 | 25 |
|
19 |
| - <?xml version="1.0"?> |
20 |
| - <!DOCTYPE test [ |
21 |
| - <!ENTITY msg "hello world" > |
22 |
| - ]> |
23 |
| - <test label="&msg;">&msg;</test> |
| 26 | +struct MyReader<'i> { |
| 27 | + /// Stack of readers, the first element is the initial reader, the other are |
| 28 | + /// readers created for each resolved entity |
| 29 | + readers: VecDeque<Reader<&'i [u8]>>, |
| 30 | + /// Map of captured internal _parsed general entities_. _Parsed_ means that |
| 31 | + /// value of the entity is parsed by XML reader |
| 32 | + entities: HashMap<&'i [u8], &'i [u8]>, |
| 33 | + /// In this example we use simple regular expression to capture entities from DTD. |
| 34 | + /// In real application you should use DTD parser. |
| 35 | + entity_re: Regex, |
| 36 | +} |
| 37 | +impl<'i> MyReader<'i> { |
| 38 | + fn new(input: &'i str) -> Result<Self, regex::Error> { |
| 39 | + let mut reader = Reader::from_str(input); |
| 40 | + reader.config_mut().trim_text(true); |
24 | 41 |
|
25 |
| -"#; |
| 42 | + let mut readers = VecDeque::new(); |
| 43 | + readers.push_back(reader); |
26 | 44 |
|
27 |
| -fn main() -> Result<(), Box<dyn std::error::Error>> { |
28 |
| - let mut reader = Reader::from_str(DATA); |
29 |
| - reader.config_mut().trim_text(true); |
30 |
| - |
31 |
| - let mut custom_entities: HashMap<String, String> = HashMap::new(); |
32 |
| - let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?; |
33 |
| - |
34 |
| - loop { |
35 |
| - match reader.read_event() { |
36 |
| - Ok(Event::DocType(ref e)) => { |
37 |
| - for cap in entity_re.captures_iter(e) { |
38 |
| - custom_entities.insert( |
39 |
| - reader.decoder().decode(&cap[1])?.into_owned(), |
40 |
| - reader.decoder().decode(&cap[2])?.into_owned(), |
41 |
| - ); |
42 |
| - } |
43 |
| - } |
44 |
| - Ok(Event::Start(ref e)) => { |
45 |
| - if let b"test" = e.name().as_ref() { |
46 |
| - let attributes = e |
47 |
| - .attributes() |
48 |
| - .map(|a| { |
49 |
| - a.unwrap() |
50 |
| - .decode_and_unescape_value_with(reader.decoder(), |ent| { |
51 |
| - custom_entities.get(ent).map(|s| s.as_str()) |
52 |
| - }) |
53 |
| - .unwrap() |
54 |
| - .into_owned() |
55 |
| - }) |
56 |
| - .collect::<Vec<_>>(); |
57 |
| - println!("attributes values: {:?}", attributes); |
| 45 | + // Capture "name" and "content" from such string: |
| 46 | + // <!ENTITY name "content" > |
| 47 | + let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?; |
| 48 | + Ok(Self { |
| 49 | + readers, |
| 50 | + entities: HashMap::new(), |
| 51 | + entity_re, |
| 52 | + }) |
| 53 | + } |
| 54 | + fn read_event(&mut self) -> Result<Event<'i>, Error> { |
| 55 | + loop { |
| 56 | + if let Some(mut reader) = self.readers.pop_back() { |
| 57 | + match dbg!(reader.read_event())? { |
| 58 | + // Capture defined entities from the DTD inside document and skip that event |
| 59 | + Event::DocType(e) => { |
| 60 | + self.readers.push_back(reader); |
| 61 | + self.capture(e); |
| 62 | + continue; |
| 63 | + } |
| 64 | + // When entity is referenced, create new reader with the same settings as |
| 65 | + // the current reader have and push it to the top of stack. Then try to |
| 66 | + // read next event from it (on next iteration) |
| 67 | + Event::GeneralRef(e) => { |
| 68 | + if let Some(ch) = e.resolve_char_ref()? { |
| 69 | + self.readers.push_back(reader); |
| 70 | + return Ok(Event::Text(BytesText::from_escaped(ch.to_string()))); |
| 71 | + } |
| 72 | + let mut r = Reader::from_reader(self.resolve(&e)?); |
| 73 | + *r.config_mut() = reader.config().clone(); |
| 74 | + |
| 75 | + self.readers.push_back(reader); |
| 76 | + self.readers.push_back(r); |
| 77 | + continue; |
| 78 | + } |
| 79 | + // When reader is exhausted, do not return it to the stack |
| 80 | + Event::Eof => continue, |
| 81 | + |
| 82 | + // Return all other events to caller |
| 83 | + e => { |
| 84 | + self.readers.push_back(reader); |
| 85 | + return Ok(e); |
| 86 | + } |
58 | 87 | }
|
59 | 88 | }
|
60 |
| - Ok(Event::Text(ref e)) => { |
61 |
| - println!( |
62 |
| - "text value: {}", |
63 |
| - e.unescape_with(|ent| match custom_entities.get(ent) { |
64 |
| - Some(s) => Some(s.as_str()), |
65 |
| - None => resolve_predefined_entity(ent), |
66 |
| - }) |
67 |
| - .unwrap() |
68 |
| - ); |
69 |
| - } |
70 |
| - Ok(Event::Eof) => break, |
71 |
| - Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), |
72 |
| - _ => (), |
| 89 | + return Ok(Event::Eof); |
73 | 90 | }
|
74 | 91 | }
|
| 92 | + |
| 93 | + /// In this example we use simple regular expression to capture entities from DTD. |
| 94 | + /// In real application you should use DTD parser |
| 95 | + fn capture(&mut self, doctype: BytesText<'i>) { |
| 96 | + let doctype = match doctype.into_inner() { |
| 97 | + Cow::Borrowed(doctype) => doctype, |
| 98 | + Cow::Owned(_) => unreachable!("We are sure that event will be borrowed"), |
| 99 | + }; |
| 100 | + for cap in self.entity_re.captures_iter(doctype) { |
| 101 | + self.entities.insert( |
| 102 | + cap.get(1).unwrap().as_bytes(), |
| 103 | + cap.get(2).unwrap().as_bytes(), |
| 104 | + ); |
| 105 | + } |
| 106 | + } |
| 107 | + |
| 108 | + fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], EscapeError> { |
| 109 | + match self.entities.get(entity) { |
| 110 | + Some(replacement) => Ok(replacement), |
| 111 | + None => Err(EscapeError::UnrecognizedEntity( |
| 112 | + 0..0, |
| 113 | + String::from_utf8_lossy(entity).into_owned(), |
| 114 | + )), |
| 115 | + } |
| 116 | + } |
| 117 | + |
| 118 | + fn get_entity(&self, entity: &str) -> Option<&'i str> { |
| 119 | + self.entities |
| 120 | + .get(entity.as_bytes()) |
| 121 | + // SAFETY: We are sure that slices are correct UTF-8 because we get |
| 122 | + // them from rust string |
| 123 | + .map(|value| from_utf8(value).unwrap()) |
| 124 | + } |
| 125 | + |
| 126 | + fn decoder(&self) -> Decoder { |
| 127 | + self.readers.back().unwrap().decoder() |
| 128 | + } |
| 129 | +} |
| 130 | + |
| 131 | +fn main() -> Result<(), Box<dyn std::error::Error>> { |
| 132 | + let mut reader = MyReader::new( |
| 133 | + r#" |
| 134 | + <!DOCTYPE test [ |
| 135 | + <!ENTITY text "hello world" > |
| 136 | + <!ENTITY element1 "<dtd attr = 'Message: &text;'/>" > |
| 137 | + <!ENTITY element2 "<a>&element1;</a>" > |
| 138 | + ]> |
| 139 | + <test label="Message: &text;">'&element2;'</test> |
| 140 | + "#, |
| 141 | + )?; |
| 142 | + |
| 143 | + let event = reader.read_event()?; |
| 144 | + assert_eq!( |
| 145 | + event, |
| 146 | + Event::Start(BytesStart::from_content( |
| 147 | + r#"test label="Message: &text;""#, |
| 148 | + 4 |
| 149 | + )) |
| 150 | + ); |
| 151 | + if let Event::Start(e) = event { |
| 152 | + let mut attrs = e.attributes(); |
| 153 | + |
| 154 | + let label = attrs.next().unwrap()?; |
| 155 | + assert_eq!(label.key, QName(b"label")); |
| 156 | + assert_eq!( |
| 157 | + label.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, |
| 158 | + "Message: hello world" |
| 159 | + ); |
| 160 | + |
| 161 | + assert_eq!(attrs.next(), None); |
| 162 | + } |
| 163 | + |
| 164 | + // This is decoded decimal character reference ' |
| 165 | + assert_eq!( |
| 166 | + reader.read_event()?, |
| 167 | + Event::Text(BytesText::from_escaped("'")) |
| 168 | + ); |
| 169 | + |
| 170 | + //-------------------------------------------------------------------------- |
| 171 | + // This part was inserted into original document from entity defined in DTD |
| 172 | + |
| 173 | + assert_eq!(reader.read_event()?, Event::Start(BytesStart::new("a"))); |
| 174 | + let event = reader.read_event()?; |
| 175 | + assert_eq!( |
| 176 | + event, |
| 177 | + Event::Empty(BytesStart::from_content( |
| 178 | + r#"dtd attr = 'Message: &text;'"#, |
| 179 | + 3 |
| 180 | + )) |
| 181 | + ); |
| 182 | + if let Event::Start(e) = event { |
| 183 | + let mut attrs = e.attributes(); |
| 184 | + |
| 185 | + let attr = attrs.next().unwrap()?; |
| 186 | + assert_eq!(attr.key, QName(b"attr")); |
| 187 | + assert_eq!( |
| 188 | + attr.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, |
| 189 | + "Message: hello world" |
| 190 | + ); |
| 191 | + |
| 192 | + assert_eq!(attrs.next(), None); |
| 193 | + } |
| 194 | + assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("a"))); |
| 195 | + //-------------------------------------------------------------------------- |
| 196 | + |
| 197 | + // This is decoded hexadecimal character reference ' |
| 198 | + assert_eq!( |
| 199 | + reader.read_event()?, |
| 200 | + Event::Text(BytesText::from_escaped("'")) |
| 201 | + ); |
| 202 | + |
| 203 | + assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("test"))); |
| 204 | + assert_eq!(reader.read_event()?, Event::Eof); |
| 205 | + |
75 | 206 | Ok(())
|
76 | 207 | }
|
0 commit comments