v2 / vlib / encoding / xml / parser.v
617 lines · 546 sloc · 14.99 KB · b7ddc7da4853de0167532f1f4066827e86c6a42d
Raw
1module xml
2
3import io
4import os
5import strings
6
// default_prolog_attributes is what parse_prolog falls back to when the XML
// declaration contains no attributes at all.
const default_prolog_attributes = {
	'version': '1.0'
	'encoding': 'UTF-8'
}
// default_string_builder_cap is the initial capacity passed to every
// strings.new_builder call in this file.
const default_string_builder_cap = 32

// Lengths of the DTD keywords. parse_element and parse_entity use them to
// slice past the keyword at the start of a declaration.
const element_len = '<!ELEMENT'.len
const entity_len = '<!ENTITY'.len

// Byte sequences compared against bytes read from the reader after a partial
// match: "OCTYPE" follows "<!D", "--" and "[C" follow "<!", "DATA" follows "<![C".
const doctype_chars = 'OCTYPE'.bytes()
const double_dash = '--'.bytes()
const c_tag = '[C'.bytes()
const data_chars = 'DATA'.bytes()

// The UTF-8 byte order mark (EF BB BF), split into its first byte and the two
// bytes that must follow it.
const byte_order_marking_first = u8(0xEF)
const byte_order_marking_bytes = [u8(0xBB), 0xBF]
23
24// Helper types to assist in parsing
25
// TextSpan is a half-open byte range [start, end) into a string being parsed.
struct TextSpan {
mut:
	start int
	end   int
}

// AttributeParserState records which part of a key="value" pair is being read.
enum AttributeParserState {
	// Reading the attribute name, up to the "=".
	key
	// The "=" has been seen; the opening quote must come next.
	eq
	// Inside the quoted value.
	value
}

// parse_attributes turns the attribute portion of a tag (for example
// `version="1.0" encoding='UTF-8'`) into a map from attribute name to value.
//
// A value is closed only by the same quote character that opened it, so the
// other quote character may appear inside the value (`title="it's"`).
//
// Errors are returned when:
// - the string contains "<",
// - "=" is followed by anything other than a quote,
// - the string ends right after "=" or inside an unclosed value.
fn parse_attributes(attribute_contents string) !map[string]string {
	if attribute_contents.contains_u8(`<`) {
		return error('Malformed XML. Found "<" in attribute string: "${attribute_contents}"')
	}
	mut attributes := map[string]string{}

	mut state := AttributeParserState.key
	mut key_span, mut value_span := TextSpan{}, TextSpan{}
	// The quote character that opened the value currently being read.
	mut opening_quote := u8(0)

	for index, ch in attribute_contents {
		match state {
			.key {
				match ch {
					`=` {
						state = AttributeParserState.eq
					}
					else {
						key_span.end++
					}
				}
			}
			.eq {
				match ch {
					`=` {
						return error('Duplicate "=" in attribute string: "${attribute_contents}"')
					}
					`'`, `"` {
						state = AttributeParserState.value
						opening_quote = ch
						value_span.start = index + 1
					}
					else {
						return error('Invalid character in attribute string: "${attribute_contents}"')
					}
				}
			}
			.value {
				// Only the matching quote ends the value; everything else belongs to it.
				if ch == opening_quote {
					state = AttributeParserState.key
					value_span.end = index
					attributes[attribute_contents[key_span.start..key_span.end].trim_space()] = attribute_contents[value_span.start..value_span.end]

					// The next key starts right after the closing quote.
					key_span.start = index + 1
					key_span.end = index + 1
				}
			}
		}
	}

	// Ending in any state other than .key means a pair was started but never finished.
	if state != .key {
		return error('Unterminated attribute in attribute string: "${attribute_contents}"')
	}

	return attributes
}
94
// parse_comment reads the body of an XML comment from the reader. The caller
// has already consumed the opening "<!--". Reading stops after the closing
// "-->", and the text in between is returned unmodified.
//
// A "--" that is not immediately followed by ">" is reported as an error.
fn parse_comment(mut reader io.Reader) !XMLComment {
	mut text := strings.new_builder(default_string_builder_cap)
	mut one_byte := [u8(0)]

	for {
		current := next_char(mut reader, mut one_byte)!
		if current != `-` {
			text.write_u8(current)
			continue
		}

		// A dash was read: look at the following byte to tell a lone dash from "--".
		following := next_char(mut reader, mut one_byte)!
		if following != `-` {
			text.write_u8(current)
			text.write_u8(following)
			continue
		}

		// "--" must be the start of the terminator.
		if next_char(mut reader, mut one_byte)! != `>` {
			return error('XML Comment not closed. Expected ">".')
		}
		break
	}

	return XMLComment{text.str()}
}
123
// CDATAParserState counts how many consecutive "]" characters were just read,
// so that the "]]>" terminator can be recognised.
enum CDATAParserState {
	normal
	single
	double
}

// parse_cdata reads a CDATA section from the reader. The caller has already
// consumed "<![CDATA", so the next significant character must be the "[" that
// opens the data. Reading stops after the closing "]]>".
//
// Returns the text between "[" and "]]>". An error is returned when the
// opening "[" is missing; previously that case either dropped an arbitrary
// character or panicked on an invalid slice (for input such as "<![CDATA]]>").
fn parse_cdata(mut reader io.Reader) !XMLCData {
	mut contents_buf := strings.new_builder(default_string_builder_cap)

	mut state := CDATAParserState.normal
	mut local_buf := [u8(0)]

	for {
		ch := next_char(mut reader, mut local_buf)!
		contents_buf.write_u8(ch)
		match ch {
			`]` {
				match state {
					.double {
						// A further "]" after "]]": the last two are still "]]".
					}
					.single {
						state = .double
					}
					.normal {
						state = .single
					}
				}
			}
			`>` {
				if state == .double {
					break
				}
				state = .normal
			}
			else {
				state = .normal
			}
		}
	}

	contents := contents_buf.str().trim_space()
	if !contents.starts_with('[') {
		return error('Invalid XML. Expected "[" after "<![CDATA".')
	}
	if !contents.ends_with(']]>') {
		return error('CDATA section not closed.')
	}
	// With both checks passed the string is at least "[]]>", so the slice is valid.
	return XMLCData{contents[1..contents.len - 3]}
}
175
// parse_entity parses one "<!ENTITY name value>" declaration found at the
// start of `contents`.
//
// Returns the parsed entity together with whatever follows the closing ">".
// The name ends at the first whitespace character of any kind (space, tab,
// CR or LF); the value is the remainder with surrounding whitespace and
// quote characters removed.
//
// Errors are returned when the declaration has no ">", no name, or no value.
fn parse_entity(contents string) !(DTDEntity, string) {
	// The declaration ends at the nearest ">" after its start.
	entity_end := contents.index('>') or { return error('Entity declaration not closed.') }
	entity_contents := contents[entity_len..entity_end].trim_left(' \t\r\n')

	name_end := entity_contents.index_any(' \t\r\n')
	name := if name_end < 0 { entity_contents } else { entity_contents[..name_end] }
	if name == '' {
		return error('Entity is missing name.')
	}
	value := entity_contents[name.len..].trim_space().trim('"\'')
	if value.len == 0 {
		return error('Entity is missing value.')
	}

	// TODO: Add support for SYSTEM and PUBLIC entities

	return DTDEntity{name, value}, contents[entity_end + 1..]
}
194
// parse_element parses one "<!ELEMENT name (child,child,...)>" declaration
// found at the start of `contents`.
//
// Returns the parsed element together with whatever follows the closing ">".
// The name ends at the first whitespace character (space, tab, CR or LF).
// Only a parenthesised, comma-separated child list is accepted as definition.
//
// Errors are returned when the declaration has no ">", the name is missing or
// contains a character outside the accepted set, or the definition is not a
// fully parenthesised list.
fn parse_element(contents string) !(DTDElement, string) {
	// The declaration ends at the nearest ">" after its start.
	element_end := contents.index('>') or { return error('Element declaration not closed.') }
	element_contents := contents[element_len..element_end].trim_left(' \t\r\n')

	mut name_span := TextSpan{}

	for ch in element_contents {
		match ch {
			` `, `\t`, `\r`, `\n` {
				break
			}
			// Characters accepted in an element name:
			// 1. Lowercase alphabet - a-z
			// 2. Uppercase alphabet - A-Z
			// 3. Numbers - 0-9
			// 4. Underscore - _
			// 5. Colon - :
			// 6. Period - .
			// 7. Hyphen - -
			`a`...`z`, `A`...`Z`, `0`...`9`, `_`, `:`, `.`, `-` {
				name_span.end++
			}
			else {
				// ascii_str() shows the offending character rather than its numeric code.
				return error('Invalid character in element name: "${ch.ascii_str()}"')
			}
		}
	}

	name := element_contents[name_span.start..name_span.end]
	if name == '' {
		return error('Element is missing name.')
	}
	definition_string := element_contents[name_span.end..].trim_space().trim('"\'')

	if !definition_string.starts_with('(') {
		return error('Invalid element definition: ${definition_string}')
	}
	// Both ( and ) must be present.
	if !definition_string.ends_with(')') {
		return error('Element declaration not closed.')
	}
	definition := definition_string.trim('()').split(',')

	// TODO: Add support for SYSTEM and PUBLIC entities

	return DTDElement{name, definition}, contents[element_end + 1..]
}
247
// parse_doctype reads the remainder of a DOCTYPE declaration from the reader.
// The caller has already consumed "<!DOCTYPE".
//
// The name is everything before the internal subset's "[" (or the whole
// declaration when there is no subset, as in "<!DOCTYPE html>"). The internal
// subset, when present, is the text between the first "[" and the last "]"
// and may contain only <!ENTITY ...> and <!ELEMENT ...> declarations.
//
// Errors are returned for any other item in the subset and for anything
// parse_entity / parse_element reject.
fn parse_doctype(mut reader io.Reader) !DocumentType {
	// Nested declarations bring their own "<" and ">", so track the depth.
	mut depth := 1
	mut doctype_buffer := strings.new_builder(default_string_builder_cap)
	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		match ch {
			`<` {
				depth++
			}
			`>` {
				depth--
				if depth == 0 {
					// This ">" closes the DOCTYPE itself and is not part of its contents.
					break
				}
			}
			else {}
		}
		doctype_buffer.write_u8(ch)
	}

	doctype_contents := doctype_buffer.str().trim_space()

	mut name := doctype_contents
	mut list_contents := ''
	if subset_start := doctype_contents.index('[') {
		name = doctype_contents[..subset_start].trim_space()
		// First "[" to last "]" so that brackets inside declarations do not cut the subset short.
		list_contents = doctype_contents[subset_start + 1..].all_before_last(']').trim_space()
	}

	mut items := []DTDListItem{}

	for list_contents.len > 0 {
		if list_contents.starts_with('<!ENTITY') {
			entity, remaining := parse_entity(list_contents)!
			items << entity
			list_contents = remaining.trim_space()
		} else if list_contents.starts_with('<!ELEMENT') {
			element, remaining := parse_element(list_contents)!
			items << element
			list_contents = remaining.trim_space()
		} else {
			return error('Unknown DOCTYPE list item: ${list_contents}')
		}
	}

	return DocumentType{
		name: name
		dtd: DocumentTypeDefinition{
			list: items
		}
	}
}
298
// parse_prolog reads everything that precedes the root node: leading
// whitespace, an optional UTF-8 BOM, an optional "<?xml ... ?>" declaration,
// and any comments or a single DOCTYPE that follow the declaration.
//
// Returns the collected Prolog and the first character of the root tag (the
// character right after its "<"), which the caller passes on to
// parse_single_node.
//
// When the document has no XML declaration, a default Prolog is returned
// immediately and nothing further is consumed.
//
// Errors are returned for a malformed BOM or declaration, a declaration with
// attributes but no version, a duplicate DOCTYPE, an unsupported "<!"
// sequence, and for non-whitespace content between the declaration and the
// root node.
fn parse_prolog(mut reader io.Reader) !(Prolog, u8) {
	// Skip leading whitespace and an optional UTF-8 BOM until the first "<"
	mut local_buf := [u8(0)]
	mut ch := next_char(mut reader, mut local_buf)!
	for {
		match ch {
			` `, `\t`, `\r`, `\n` {
				ch = next_char(mut reader, mut local_buf)!
				continue
			}
			`<` {
				break
			}
			byte_order_marking_first {
				// UTF-8 BOM: the two bytes after the first must match exactly
				mut bom_buf := [u8(0), 0]
				if reader.read(mut bom_buf)! != 2 {
					return error('Invalid UTF-8 BOM.')
				}
				if bom_buf != byte_order_marking_bytes {
					return error('Invalid UTF-8 BOM.')
				}
				ch = next_char(mut reader, mut local_buf)!
				continue
			}
			else {
				return error('Expecting a prolog or root node starting with "<".')
			}
		}
	}

	ch = next_char(mut reader, mut local_buf)!
	if ch != `?` {
		// No XML declaration: ch is already the first character of the root tag
		return Prolog{}, ch
	}

	ch = next_char(mut reader, mut local_buf)!
	if ch != `x` {
		return error('Expecting a prolog starting with "<?x".')
	}

	ch = next_char(mut reader, mut local_buf)!
	if ch != `m` {
		return error('Expecting a prolog starting with "<?xm".')
	}

	ch = next_char(mut reader, mut local_buf)!
	if ch != `l` {
		return error('Expecting a prolog starting with "<?xml".')
	}

	mut prolog_buffer := strings.new_builder(default_string_builder_cap)

	// Keep reading character by character until we find the "?>" that ends the declaration
	mut found_question_mark := false

	for {
		ch = next_char(mut reader, mut local_buf)!
		match ch {
			`?` {
				if found_question_mark {
					return error('Invalid prolog: Two question marks found in a row.')
				}
				found_question_mark = true
			}
			`>` {
				if found_question_mark {
					break
				}
				return error('Invalid prolog: Found ">" before "?".')
			}
			else {
				if found_question_mark {
					// The "?" was not the start of "?>", so it belongs to the contents
					found_question_mark = false
					prolog_buffer.write_u8(`?`)
				}
				prolog_buffer.write_u8(ch)
			}
		}
	}

	prolog_attributes := prolog_buffer.str().trim_space()

	attributes := if prolog_attributes.len == 0 {
		default_prolog_attributes
	} else {
		parse_attributes(prolog_attributes)!
	}

	version := attributes['version'] or { return error('XML declaration missing version.') }
	encoding := attributes['encoding'] or { 'UTF-8' }

	mut comments := []XMLComment{}
	mut doctype := DocumentType{
		name: ''
		dtd: ''
	}
	mut found_doctype := false
	for {
		ch = next_char(mut reader, mut local_buf)!
		match ch {
			` `, `\t`, `\r`, `\n` {
				continue
			}
			`<` {
				// We have a comment, DOCTYPE, or root node
				ch = next_char(mut reader, mut local_buf)!
				match ch {
					`!` {
						// A comment or DOCTYPE
						match next_char(mut reader, mut local_buf)! {
							`-` {
								// A comment
								if next_char(mut reader, mut local_buf)! != `-` {
									return error('Invalid comment.')
								}
								comments << parse_comment(mut reader)!
							}
							`D` {
								if found_doctype {
									return error('Duplicate DOCTYPE declaration.')
								}
								// <!D -> OCTYPE
								mut doc_buf := []u8{len: 6}
								if reader.read(mut doc_buf)! != 6 {
									return error('Invalid DOCTYPE.')
								}
								if doc_buf != doctype_chars {
									return error('Invalid DOCTYPE.')
								}
								found_doctype = true
								doctype = parse_doctype(mut reader)!
							}
							else {
								return error('Unsupported control sequence found in prolog.')
							}
						}
					}
					else {
						// We have found the start of the root node
						break
					}
				}
			}
			else {
				// Only whitespace, comments and a DOCTYPE may precede the root node
				return error('Unexpected content between the XML declaration and the root node.')
			}
		}
	}

	return Prolog{
		version: version
		encoding: encoding
		doctype: doctype
		comments: comments
	}, ch
}
454
// flush_pending_text appends the text gathered so far (trimmed, with CRLF
// normalised to LF) to `children` when it is not blank. Calling str() on the
// builder is what hands over the accumulated text, as the original code
// already relied on before each child node.
fn flush_pending_text(mut pending strings.Builder, mut children []XMLNodeContents) {
	text := pending.str().trim_space()
	if text.len > 0 {
		children << text.replace('\r\n', '\n')
	}
}

// parse_children reads the contents of the node `name` (whose opening tag has
// already been consumed) up to and including its closing tag "</name>".
//
// Text, comments, CDATA sections and child nodes are appended to the node's
// children in document order: pending text is flushed before every comment,
// CDATA section, child node and before the closing tag.
//
// Errors are returned for an incomplete or unknown "<!" sequence, a closing
// tag that does not match `name` exactly, and for anything a nested parser
// rejects.
fn parse_children(name string, attributes map[string]string, mut reader io.Reader) !XMLNode {
	mut inner_contents := strings.new_builder(default_string_builder_cap)

	mut children := []XMLNodeContents{}
	mut local_buf := [u8(0)]

	for {
		ch := next_char(mut reader, mut local_buf)!
		match ch {
			`<` {
				second_char := next_char(mut reader, mut local_buf)!
				match second_char {
					`!` {
						// Comment, CDATA
						mut next_two := [u8(0), 0]
						if reader.read(mut next_two)! != 2 {
							return error('Invalid XML. Incomplete comment or CDATA declaration.')
						}
						if next_two == double_dash {
							// Comment
							comment := parse_comment(mut reader)!
							// Text that came before the comment stays before it
							flush_pending_text(mut inner_contents, mut children)
							children << comment
						} else if next_two == c_tag {
							// <![CDATA -> DATA
							mut cdata_buf := []u8{len: 4}
							if reader.read(mut cdata_buf)! != 4 {
								return error('Invalid XML. Incomplete CDATA declaration.')
							}
							if cdata_buf != data_chars {
								return error('Invalid XML. Expected "CDATA" after "<![C".')
							}
							cdata := parse_cdata(mut reader)!
							// Text that came before the CDATA section stays before it
							flush_pending_text(mut inner_contents, mut children)
							children << cdata
						} else {
							return error('Invalid XML. Unknown control sequence: ${next_two.bytestr()}')
						}
					}
					`/` {
						// End of node: the next bytes must be exactly the name followed by ">"
						mut node_end_buffer := []u8{len: name.len + 1}
						if reader.read(mut node_end_buffer)! != name.len + 1 {
							return error('Invalid XML. Incomplete node end.')
						}

						mut ending_chars := name.bytes()
						ending_chars << `>`

						if node_end_buffer != ending_chars {
							return error('XML node <${name}> not closed.')
						}

						flush_pending_text(mut inner_contents, mut children)
						return XMLNode{
							name: name
							attributes: attributes
							children: children
						}
					}
					else {
						// Start of child node
						child := parse_single_node(second_char, mut reader) or {
							if err.msg() == 'XML node cannot start with "</".' {
								return error('XML node <${name}> not closed.')
							} else {
								return err
							}
						}
						flush_pending_text(mut inner_contents, mut children)
						children << child
					}
				}
			}
			else {
				inner_contents.write_u8(ch)
			}
		}
	}
	return error('XML node <${name}> not closed.')
}
541
// parse_single_node parses a single XML node from the reader. The first character of the tag
// (the one right after "<") is passed in as the first_char parameter.
// This function is meant to assist in parsing nested nodes one at a time. Using this function
// instead of the XMLDocument.from_* functions makes it easier to parse smaller nodes in
// extremely large XML documents without running out of memory.
//
// The tag is read up to its ">". A tag ending in "/" is self-closing and is returned without
// children; otherwise the node's contents are read by parse_children.
//
// Errors are returned when the tag starts with "/", when it contains no name, and for anything
// parse_attributes or parse_children reject.
pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
	// A closing tag can never open a node. parse_children checks for this exact message.
	if first_char == `/` {
		return error('XML node cannot start with "</".')
	}

	mut contents := strings.new_builder(default_string_builder_cap)
	contents.write_u8(first_char)

	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		if ch == `>` {
			break
		}
		contents.write_u8(ch)
	}

	tag_contents := contents.str().trim_space()
	// An all-whitespace tag such as "< >" would otherwise leave nothing to index below.
	if tag_contents.len == 0 {
		return error('XML node is missing a name.')
	}

	parts := tag_contents.split_any(' \t\r\n')
	name := parts[0].trim_right('/')
	if name.len == 0 {
		return error('XML node is missing a name.')
	}

	// Check if it is a self-closing tag
	if tag_contents.ends_with('/') {
		// We're not looking for children and inner text
		return XMLNode{
			name: name
			attributes: parse_attributes(tag_contents[name.len..tag_contents.len - 1].trim_space())!
		}
	}

	attribute_string := tag_contents[name.len..].trim_space()
	attributes := parse_attributes(attribute_string)!

	return parse_children(name, attributes, mut reader)
}
579
// XMLDocument.from_string parses an XML document held entirely in a string.
// The string's bytes are wrapped in a FullBufferReader and handed to
// XMLDocument.from_reader; any error from there is returned unchanged.
pub fn XMLDocument.from_string(raw_contents string) !XMLDocument {
	document_bytes := raw_contents.bytes()
	mut buffered := FullBufferReader{
		contents: document_bytes
	}
	return XMLDocument.from_reader(mut buffered)!
}
587
// XMLDocument.from_file parses the XML document stored at `path`. The whole file is read into
// memory first and then parsed; for files too large for that, use XMLDocument.from_reader.
// Errors from reading the file or from parsing are returned unchanged.
pub fn XMLDocument.from_file(path string) !XMLDocument {
	file_bytes := os.read_bytes(path)!
	mut buffered := FullBufferReader{
		contents: file_bytes
	}
	return XMLDocument.from_reader(mut buffered)!
}
596
// XMLDocument.from_reader parses an XML document from any source implementing io.Reader.
// It is the most generic entry point; the other XMLDocument.from_* functions delegate to it.
//
// The prolog is parsed first, then the root node. Hitting the end of input while still in the
// prolog is reported as 'XML document is empty.'; every other error is returned unchanged.
pub fn XMLDocument.from_reader(mut reader io.Reader) !XMLDocument {
	prolog, first_char := parse_prolog(mut reader) or {
		ran_out_of_input := err is os.Eof || err is io.Eof
			|| err.msg() == 'Unexpected End Of File.'
		if ran_out_of_input {
			return error('XML document is empty.')
		}
		return err
	}

	root_node := parse_single_node(first_char, mut reader)!

	return XMLDocument{
		version: prolog.version
		encoding: prolog.encoding
		comments: prolog.comments
		doctype: prolog.doctype
		root: root_node
	}
}
618