Gitly


1 module xml
2 
3 import io
4 import os
5 import strings
6 
// Attributes used by parse_prolog when the XML declaration carries no attributes of its own.
7 const default_prolog_attributes = {
8     'version':  '1.0'
9     'encoding': 'UTF-8'
10 }
// Initial capacity handed to strings.new_builder by the parsing functions.
11 const default_string_builder_cap = 32
12 
// Lengths of the DTD declaration keywords; parse_element and parse_entity use them to
// skip past the keyword at the start of a declaration.
13 const element_len = '<!ELEMENT'.len
14 const entity_len = '<!ENTITY'.len
15 
// Byte sequences compared against what is read after a partially consumed marker:
// - doctype_chars: the rest of "<!DOCTYPE" once "<!D" has been read
// - double_dash:   the two bytes after "<!" that open a comment
// - c_tag:         the two bytes after "<!" that open a CDATA section
// - data_chars:    the rest of "<![CDATA" once "<![C" has been read
16 const doctype_chars = 'OCTYPE'.bytes()
17 const double_dash = '--'.bytes()
18 const c_tag = '[C'.bytes()
19 const data_chars = 'DATA'.bytes()
20 
// The UTF-8 byte order mark, split into its first byte and the two bytes that follow it.
21 const byte_order_marking_first = u8(0xEF)
22 const byte_order_marking_bytes = [u8(0xBB), 0xBF]
23 
24 // Helper types to assist in parsing
25 
// TextSpan is a pair of integer offsets into a string. The parsing helpers use it as the
// bounds of a slice expression (`text[span.start..span.end]`), so `end` is exclusive.
26 struct TextSpan {
27 mut:
28     start int
29     end   int
30 }
31 
// AttributeParserState names the part of a `key="value"` pair that parse_attributes is
// currently scanning.
enum AttributeParserState {
    // Reading the attribute name, up to the "=" sign.
    key
    // Just after "=", expecting the opening quote of the value.
    eq
    // Inside the quoted value, waiting for the matching closing quote.
    value
}

// parse_attributes parses the attribute portion of a tag (for example `a="1" b='2'`) into a
// map from attribute name to raw attribute value.
//
// A value is terminated only by the same quote character that opened it, so `a="it's"` and
// `a='say "hi"'` are both read in full. Attribute names are trimmed of surrounding
// whitespace. Text after the last value that never reaches an "=" is ignored.
//
// Errors are returned when the text contains "<", when "=" is followed by anything other
// than a quote, or when a quoted value is never closed.
fn parse_attributes(attribute_contents string) !map[string]string {
    if attribute_contents.contains_u8(`<`) {
        return error('Malformed XML. Found "<" in attribute string: "${attribute_contents}"')
    }
    mut attributes := map[string]string{}

    mut state := AttributeParserState.key
    // Offsets of the current key ([key_start, key_end)) and of the first byte of the value.
    mut key_start := 0
    mut key_end := 0
    mut value_start := 0
    // The quote character that opened the value being read.
    mut opening_quote := u8(0)

    for index, ch in attribute_contents {
        match state {
            .key {
                if ch == `=` {
                    key_end = index
                    state = AttributeParserState.eq
                }
            }
            .eq {
                match ch {
                    `=` {
                        return error('Duplicate "=" in attribute string: "${attribute_contents}"')
                    }
                    `'`, `"` {
                        opening_quote = ch
                        value_start = index + 1
                        state = AttributeParserState.value
                    }
                    else {
                        return error('Invalid character in attribute string: "${attribute_contents}"')
                    }
                }
            }
            .value {
                // Only the quote kind that opened the value may close it; the other kind is
                // ordinary value text.
                if ch == opening_quote {
                    key := attribute_contents[key_start..key_end].trim_space()
                    attributes[key] = attribute_contents[value_start..index]

                    // The next key starts right after the closing quote.
                    key_start = index + 1
                    key_end = index + 1
                    state = AttributeParserState.key
                }
            }
        }
    }

    if state == .value {
        // Input ended inside a quoted value; report it instead of dropping the attribute.
        return error('Unterminated attribute value in attribute string: "${attribute_contents}"')
    }

    return attributes
}
94 
// parse_comment reads the body of an XML comment from reader, assuming the opening "<!--"
// has already been consumed, and returns it as an XMLComment once "-->" is reached.
// A "--" that is not immediately followed by ">" is reported as an error. Errors from the
// underlying reads are propagated.
fn parse_comment(mut reader io.Reader) !XMLComment {
    mut text := strings.new_builder(default_string_builder_cap)
    mut byte_buf := [u8(0)]

    for {
        current := next_char(mut reader, mut byte_buf)!
        if current != `-` {
            text.write_u8(current)
            continue
        }

        // A dash may start the terminator, so look at the byte after it.
        following := next_char(mut reader, mut byte_buf)!
        if following != `-` {
            // A lone dash: both bytes belong to the comment text.
            text.write_u8(current)
            text.write_u8(following)
            continue
        }

        // "--" has been seen; the only byte allowed next is ">".
        if next_char(mut reader, mut byte_buf)! != `>` {
            return error('XML Comment not closed. Expected ">".')
        }
        break
    }

    return XMLComment{text.str()}
}
123 
// CDATAParserState counts how many consecutive "]" characters parse_cdata has just seen.
enum CDATAParserState {
    // The previous byte was not "]".
    normal
    // Exactly one "]" has just been read.
    single
    // Two or more "]" have just been read, so a ">" now ends the section.
    double
}

// parse_cdata reads a CDATA section from reader, assuming "<![CDATA" has already been
// consumed, and returns the text between the opening "[" and the closing "]]>".
//
// An error is returned when the keyword is not followed by "[" (this also covers the
// degenerate input "<![CDATA]]>", which previously produced an invalid slice) or when the
// buffered text does not end in "]]>". Errors from the underlying reads are propagated.
fn parse_cdata(mut reader io.Reader) !XMLCData {
    mut contents_buf := strings.new_builder(default_string_builder_cap)

    mut state := CDATAParserState.normal
    mut local_buf := [u8(0)]

    for {
        ch := next_char(mut reader, mut local_buf)!
        contents_buf.write_u8(ch)
        match ch {
            `]` {
                // normal -> single -> double; further "]" keep the state at double.
                if state == .normal {
                    state = .single
                } else {
                    state = .double
                }
            }
            `>` {
                if state == .double {
                    break
                }
                state = .normal
            }
            else {
                state = .normal
            }
        }
    }

    contents := contents_buf.str().trim_space()
    if !contents.starts_with('[') {
        return error('Invalid CDATA section. Expected "[" after "<![CDATA".')
    }
    if !contents.ends_with(']]>') {
        return error('CDATA section not closed.')
    }
    // Drop the leading "[" and the trailing "]]>".
    return XMLCData{contents[1..contents.len - 3]}
}
175 
// parse_entity parses one `<!ENTITY name "value">` declaration found at the start of
// contents. It returns the entity together with the text that follows the declaration's
// closing ">".
//
// The name ends at the first whitespace character (space, tab, carriage return or newline);
// the value is the remainder with surrounding whitespace and quotes removed. An error is
// returned when no ">" is found, or when the name or the value is empty.
fn parse_entity(contents string) !(DTDEntity, string) {
    // The declaration ends at the nearest '>' after the start of the ENTITY keyword.
    entity_end := contents.index('>') or { return error('Entity declaration not closed.') }
    entity_contents := contents[entity_len..entity_end].trim_left(' \t\r\n')

    // Find where the name stops; without any whitespace the whole text is the name.
    mut name_end := entity_contents.len
    for index, ch in entity_contents {
        if ch == ` ` || ch == `\t` || ch == `\r` || ch == `\n` {
            name_end = index
            break
        }
    }

    name := entity_contents[..name_end]
    if name == '' {
        return error('Entity is missing name.')
    }
    value := entity_contents[name_end..].trim_space().trim('"\'')
    if value.len == 0 {
        return error('Entity is missing value.')
    }

    // TODO: Add support for SYSTEM and PUBLIC entities

    return DTDEntity{name, value}, contents[entity_end + 1..]
}
194 
// parse_element parses one `<!ELEMENT name (child, ...)>` declaration found at the start of
// contents. It returns the element together with the text that follows the declaration's
// closing ">".
//
// The name may contain ASCII letters, digits, "_", ":", "." and "-", and ends at the first
// whitespace character. Only parenthesised child lists are supported as definitions; the
// list is split on ",". An error is returned when no ">" is found, when the name is empty or
// contains another character, or when the definition is not a closed "(...)" list.
fn parse_element(contents string) !(DTDElement, string) {
    // The declaration ends at the nearest '>' after the start of the ELEMENT keyword.
    element_end := contents.index('>') or { return error('Element declaration not closed.') }
    element_contents := contents[element_len..element_end].trim_left(' \t\r\n')

    mut name_span := TextSpan{}

    for ch in element_contents {
        match ch {
            ` `, `\t`, `\r`, `\n` {
                break
            }
            // Characters accepted in an element name:
            // 1. Lowercase alphabet - a-z
            // 2. Uppercase alphabet - A-Z
            // 3. Numbers - 0-9
            // 4. Underscore - _
            // 5. Colon - :
            // 6. Period - .
            // 7. Hyphen - -
            `a`...`z`, `A`...`Z`, `0`...`9`, `_`, `:`, `.`, `-` {
                name_span.end++
            }
            else {
                // Show the offending character itself rather than its numeric byte value.
                return error('Invalid character in element name: "${ch.ascii_str()}"')
            }
        }
    }

    name := element_contents[name_span.start..name_span.end]
    if name == '' {
        return error('Element is missing name.')
    }
    // Everything after the name is the definition.
    definition_string := element_contents[name_span.end..].trim_space().trim('"\'')

    definition := if definition_string.starts_with('(') {
        // We have a list of possible children

        // Ensure that both ( and ) are present
        if !definition_string.ends_with(')') {
            return error('Element declaration not closed.')
        }

        definition_string.trim('()').split(',')
    } else {
        // Invalid definition
        return error('Invalid element definition: ${definition_string}')
    }

    // TODO: Add support for SYSTEM and PUBLIC entities

    return DTDElement{name, definition}, contents[element_end + 1..]
}
247 
// parse_doctype reads a DOCTYPE declaration from reader, assuming "<!DOCTYPE" has already
// been consumed, and returns its name and the items of its internal subset.
//
// Bytes are read until the ">" that balances the opening "<". When the declaration has an
// internal subset (`[ ... ]`), the name is the text before "[" and every `<!ENTITY ...>` /
// `<!ELEMENT ...>` inside the brackets becomes a list item; anything else in the subset is
// an error. When there is no "[", the whole declaration text (without the closing ">")
// becomes the name and the item list is empty.
fn parse_doctype(mut reader io.Reader) !DocumentType {
    // We may have more < in the doctype so keep count
    mut depth := 1
    mut doctype_buffer := strings.new_builder(default_string_builder_cap)
    mut local_buf := [u8(0)]
    for {
        ch := next_char(mut reader, mut local_buf)!
        doctype_buffer.write_u8(ch)
        match ch {
            `<` {
                depth++
            }
            `>` {
                depth--
                if depth == 0 {
                    break
                }
            }
            else {}
        }
    }

    doctype_contents := doctype_buffer.str().trim_space()

    // Without an internal subset there is nothing to hand to the list parser; previously the
    // whole declaration was treated as a list item and rejected.
    has_internal_subset := doctype_contents.contains('[')

    name := if has_internal_subset {
        doctype_contents.all_before('[').trim_space()
    } else {
        doctype_contents.trim_right('>').trim_space()
    }

    mut list_contents := if has_internal_subset {
        doctype_contents.all_after('[').all_before(']').trim_space()
    } else {
        ''
    }
    mut items := []DTDListItem{}

    // Consume one declaration per iteration until the subset text is used up.
    for list_contents.len > 0 {
        if list_contents.starts_with('<!ENTITY') {
            entity, remaining := parse_entity(list_contents)!
            items << entity
            list_contents = remaining.trim_space()
        } else if list_contents.starts_with('<!ELEMENT') {
            element, remaining := parse_element(list_contents)!
            items << element
            list_contents = remaining.trim_space()
        } else {
            return error('Unknown DOCTYPE list item: ${list_contents}')
        }
    }

    return DocumentType{
        name: name
        dtd:  DocumentTypeDefinition{
            list: items
        }
    }
}
298 
// parse_prolog consumes everything that precedes the root element: optional leading
// whitespace and UTF-8 byte order mark, the optional `<?xml ... ?>` declaration, and any
// comments or a single DOCTYPE that follow it.
//
// It returns the collected Prolog together with the first byte after the "<" of the root
// element. When the document does not start with "<?", an empty Prolog is returned along
// with the byte that followed "<". Errors from the underlying reads are propagated.
fn parse_prolog(mut reader io.Reader) !(Prolog, u8) {
    mut byte_buf := [u8(0)]
    mut ch := next_char(mut reader, mut byte_buf)!

    // Skip leading whitespace and an optional byte order mark until the first "<".
    for ch != `<` {
        match ch {
            ` `, `\t`, `\r`, `\n` {}
            byte_order_marking_first {
                // UTF-8 BOM: the two bytes after the first one must match exactly.
                mut bom_rest := [u8(0), 0]
                bytes_read := reader.read(mut bom_rest)!
                if bytes_read != 2 || bom_rest != byte_order_marking_bytes {
                    return error('Invalid UTF-8 BOM.')
                }
            }
            else {
                return error('Expecting a prolog or root node starting with "<".')
            }
        }
        ch = next_char(mut reader, mut byte_buf)!
    }

    ch = next_char(mut reader, mut byte_buf)!
    if ch != `?` {
        // No XML declaration: the byte just read already belongs to the next tag.
        return Prolog{}, ch
    }

    // The declaration must spell out "<?xml".
    ch = next_char(mut reader, mut byte_buf)!
    if ch != `x` {
        return error('Expecting a prolog starting with "<?x".')
    }
    ch = next_char(mut reader, mut byte_buf)!
    if ch != `m` {
        return error('Expecting a prolog starting with "<?xm".')
    }
    ch = next_char(mut reader, mut byte_buf)!
    if ch != `l` {
        return error('Expecting a prolog starting with "<?xml".')
    }

    // Collect the declaration's attribute text up to the closing "?>". A "?" is held back
    // until the next byte shows whether it was part of the terminator.
    mut declaration := strings.new_builder(default_string_builder_cap)
    mut pending_question_mark := false
    for {
        ch = next_char(mut reader, mut byte_buf)!
        if ch == `?` {
            if pending_question_mark {
                return error('Invalid prolog: Two question marks found in a row.')
            }
            pending_question_mark = true
            continue
        }
        if ch == `>` {
            if !pending_question_mark {
                return error('Invalid prolog: Found ">" before "?".')
            }
            break
        }
        if pending_question_mark {
            // The held-back "?" was ordinary text after all.
            pending_question_mark = false
            declaration.write_u8(`?`)
        }
        declaration.write_u8(ch)
    }

    raw_attributes := declaration.str().trim_space()
    attributes := if raw_attributes == '' {
        default_prolog_attributes
    } else {
        parse_attributes(raw_attributes)!
    }

    version := attributes['version'] or { return error('XML declaration missing version.') }
    encoding := attributes['encoding'] or { 'UTF-8' }

    mut comments := []XMLComment{}
    mut doctype := DocumentType{
        name: ''
        dtd:  ''
    }
    mut found_doctype := false

    // Read comments and at most one DOCTYPE until the root element's tag begins.
    for {
        ch = next_char(mut reader, mut byte_buf)!
        if ch != `<` {
            // Anything between tags at this level is skipped.
            continue
        }

        ch = next_char(mut reader, mut byte_buf)!
        if ch != `!` {
            // We have found the start of the root node
            break
        }

        // "<!" introduces either a comment or the DOCTYPE.
        marker := next_char(mut reader, mut byte_buf)!
        if marker == `-` {
            if next_char(mut reader, mut byte_buf)! != `-` {
                return error('Invalid comment.')
            }
            comments << parse_comment(mut reader)!
        } else if marker == `D` {
            if found_doctype {
                return error('Duplicate DOCTYPE declaration.')
            }
            // <!D -> OCTYPE
            mut keyword_rest := []u8{len: 6}
            if reader.read(mut keyword_rest)! != 6 {
                return error('Invalid DOCTYPE.')
            }
            if keyword_rest != doctype_chars {
                return error('Invalid DOCTYPE.')
            }
            found_doctype = true
            doctype = parse_doctype(mut reader)!
        } else {
            return error('Unsupported control sequence found in prolog.')
        }
    }

    return Prolog{
        version:  version
        encoding: encoding
        doctype:  doctype
        comments: comments
    }, ch
}
454 
// append_pending_text moves the text collected in inner_contents into children. The text is
// trimmed, "\r\n" is normalised to "\n", and nothing is appended when only whitespace was
// collected. Calling str() on the builder also empties it for the next run of text.
fn append_pending_text(mut inner_contents strings.Builder, mut children []XMLNodeContents) {
    text := inner_contents.str().trim_space()
    if text.len > 0 {
        children << text.replace('\r\n', '\n')
    }
}

// parse_children reads the content of the element called name (whose opening tag, carrying
// attributes, has already been consumed) up to and including its closing tag, and returns
// the finished XMLNode.
//
// Children are stored in document order: text collected before a comment, CDATA section,
// child element or the closing tag is appended before that item. Previously text that
// preceded a comment or CDATA section was only appended later, after it. An error is
// returned for an incomplete or unknown "<!" sequence and for a closing tag that does not
// match name; errors from the underlying reads and from nested parsing are propagated.
fn parse_children(name string, attributes map[string]string, mut reader io.Reader) !XMLNode {
    mut inner_contents := strings.new_builder(default_string_builder_cap)

    mut children := []XMLNodeContents{}
    mut local_buf := [u8(0)]

    for {
        ch := next_char(mut reader, mut local_buf)!
        if ch != `<` {
            // Plain character data.
            inner_contents.write_u8(ch)
            continue
        }

        second_char := next_char(mut reader, mut local_buf)!
        match second_char {
            `!` {
                // Comment, CDATA
                mut next_two := [u8(0), 0]
                if reader.read(mut next_two)! != 2 {
                    return error('Invalid XML. Incomplete comment or CDATA declaration.')
                }
                if next_two == double_dash {
                    // Comment
                    comment := parse_comment(mut reader)!
                    append_pending_text(mut inner_contents, mut children)
                    children << comment
                } else if next_two == c_tag {
                    // <![CDATA -> DATA
                    mut cdata_buf := []u8{len: 4}
                    if reader.read(mut cdata_buf)! != 4 {
                        return error('Invalid XML. Incomplete CDATA declaration.')
                    }
                    if cdata_buf != data_chars {
                        return error('Invalid XML. Expected "CDATA" after "<![C".')
                    }
                    cdata := parse_cdata(mut reader)!
                    append_pending_text(mut inner_contents, mut children)
                    children << cdata
                } else {
                    return error('Invalid XML. Unknown control sequence: ${next_two.bytestr()}')
                }
            }
            `/` {
                // End of node: the bytes that follow must be exactly "name>".
                mut node_end_buffer := []u8{len: name.len + 1}
                if reader.read(mut node_end_buffer)! != name.len + 1 {
                    return error('Invalid XML. Incomplete node end.')
                }

                mut ending_chars := name.bytes()
                ending_chars << `>`

                if node_end_buffer != ending_chars {
                    return error('XML node <${name}> not closed.')
                }

                append_pending_text(mut inner_contents, mut children)
                return XMLNode{
                    name:       name
                    attributes: attributes
                    children:   children
                }
            }
            else {
                // Start of child node
                child := parse_single_node(second_char, mut reader) or {
                    if err.msg() == 'XML node cannot start with "</".' {
                        return error('XML node <${name}> not closed.')
                    } else {
                        return err
                    }
                }
                append_pending_text(mut inner_contents, mut children)
                children << child
            }
        }
    }
    return error('XML node <${name}> not closed.')
}
541 
// parse_single_node parses a single XML node from the reader. The first character of the tag
// (the one right after "<") is passed in as the first_char parameter.
// This function is meant to assist in parsing nested nodes one at a time. Using this function as
// opposed to the recommended static functions makes it easier to parse smaller nodes in extremely
// large XML documents without running out of memory.
//
// The tag is read up to the first ">" that is not inside a quoted attribute value, so values
// such as `href="a>b"` no longer cut the tag short. A tag ending in "/" is returned as a
// node without children; otherwise the node's content is read up to its closing tag.
// An error is returned when first_char is "/" (a closing tag where a node was expected),
// when the tag has no name, or when attribute or child parsing fails.
pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
    if first_char == `/` {
        // parse_children looks for this exact message to report the enclosing node.
        return error('XML node cannot start with "</".')
    }

    mut contents := strings.new_builder(default_string_builder_cap)
    contents.write_u8(first_char)

    mut local_buf := [u8(0)]
    // The quote character of the attribute value being read, or 0 outside of a value.
    mut open_quote := u8(0)
    for {
        ch := next_char(mut reader, mut local_buf)!
        if open_quote != 0 {
            if ch == open_quote {
                open_quote = 0
            }
        } else if ch == `"` || ch == `'` {
            open_quote = ch
        } else if ch == `>` {
            break
        }
        contents.write_u8(ch)
    }

    tag_contents := contents.str().trim_space()
    if tag_contents.len == 0 {
        // Nothing but whitespace between "<" and ">"; there is no name to index below.
        return error('XML node is missing a name.')
    }

    parts := tag_contents.split_any(' \t\r\n')
    name := parts[0].trim_right('/')

    // Check if it is a self-closing tag
    if tag_contents.ends_with('/') {
        // We're not looking for children and inner text
        return XMLNode{
            name:       name
            attributes: parse_attributes(tag_contents[name.len..tag_contents.len - 1].trim_space())!
        }
    }

    attribute_string := tag_contents[name.len..].trim_space()
    attributes := parse_attributes(attribute_string)!

    return parse_children(name, attributes, mut reader)
}
579 
// XMLDocument.from_string parses raw_contents as a complete XML document. The string's bytes
// are wrapped in a FullBufferReader and handed to XMLDocument.from_reader, whose errors are
// propagated unchanged.
pub fn XMLDocument.from_string(raw_contents string) !XMLDocument {
    source_bytes := raw_contents.bytes()
    mut buffered := FullBufferReader{
        contents: source_bytes
    }
    document := XMLDocument.from_reader(mut buffered)!
    return document
}
587 
// XMLDocument.from_file parses the XML document stored at path. The whole file is loaded
// into memory before parsing starts; for files too large for that, feed a streaming reader
// to XMLDocument.from_reader instead. Errors from reading the file and from parsing are
// propagated.
pub fn XMLDocument.from_file(path string) !XMLDocument {
    file_bytes := os.read_bytes(path)!
    mut buffered := FullBufferReader{
        contents: file_bytes
    }
    document := XMLDocument.from_reader(mut buffered)!
    return document
}
596 
// XMLDocument.from_reader parses an XML document from any source implementing io.Reader,
// which makes it the most general of the XMLDocument constructors.
// The prolog is parsed first and the root node after it. Running out of input while reading
// the prolog is reported as 'XML document is empty.'; every other error is passed through.
pub fn XMLDocument.from_reader(mut reader io.Reader) !XMLDocument {
    prolog, first_char := parse_prolog(mut reader) or {
        reached_end := err is os.Eof || err is io.Eof || err.msg() == 'Unexpected End Of File.'
        if !reached_end {
            return err
        }
        return error('XML document is empty.')
    }

    root_node := parse_single_node(first_char, mut reader)!

    return XMLDocument{
        version:  prolog.version
        encoding: prolog.encoding
        comments: prolog.comments
        doctype:  prolog.doctype
        root:     root_node
    }
}
618

1	module xml
2
3	import io
4	import os
5	import strings
6
// Attributes used by parse_prolog when the XML declaration carries no attributes of its own.
7	const default_prolog_attributes = {
8	'version': '1.0'
9	'encoding': 'UTF-8'
10	}
// Initial capacity handed to strings.new_builder by the parsing functions.
11	const default_string_builder_cap = 32
12
// Lengths of the DTD declaration keywords; parse_element and parse_entity use them to
// skip past the keyword at the start of a declaration.
13	const element_len = '<!ELEMENT'.len
14	const entity_len = '<!ENTITY'.len
15
// Byte sequences compared against what is read after a partially consumed marker:
// - doctype_chars: the rest of "<!DOCTYPE" once "<!D" has been read
// - double_dash:   the two bytes after "<!" that open a comment
// - c_tag:         the two bytes after "<!" that open a CDATA section
// - data_chars:    the rest of "<![CDATA" once "<![C" has been read
16	const doctype_chars = 'OCTYPE'.bytes()
17	const double_dash = '--'.bytes()
18	const c_tag = '[C'.bytes()
19	const data_chars = 'DATA'.bytes()
20
// The UTF-8 byte order mark, split into its first byte and the two bytes that follow it.
21	const byte_order_marking_first = u8(0xEF)
22	const byte_order_marking_bytes = [u8(0xBB), 0xBF]
23
24	// Helper types to assist in parsing
25
// TextSpan is a pair of integer offsets into a string. The parsing helpers use it as the
// bounds of a slice expression (`text[span.start..span.end]`), so `end` is exclusive.
26	struct TextSpan {
27	mut:
28	start int
29	end int
30	}
31
// AttributeParserState names the part of a `key="value"` pair that parse_attributes is
// currently scanning.
enum AttributeParserState {
    // Reading the attribute name, up to the "=" sign.
    key
    // Just after "=", expecting the opening quote of the value.
    eq
    // Inside the quoted value, waiting for the matching closing quote.
    value
}

// parse_attributes parses the attribute portion of a tag (for example `a="1" b='2'`) into a
// map from attribute name to raw attribute value.
//
// A value is terminated only by the same quote character that opened it, so `a="it's"` and
// `a='say "hi"'` are both read in full. Attribute names are trimmed of surrounding
// whitespace. Text after the last value that never reaches an "=" is ignored.
//
// Errors are returned when the text contains "<", when "=" is followed by anything other
// than a quote, or when a quoted value is never closed.
fn parse_attributes(attribute_contents string) !map[string]string {
    if attribute_contents.contains_u8(`<`) {
        return error('Malformed XML. Found "<" in attribute string: "${attribute_contents}"')
    }
    mut attributes := map[string]string{}

    mut state := AttributeParserState.key
    // Offsets of the current key ([key_start, key_end)) and of the first byte of the value.
    mut key_start := 0
    mut key_end := 0
    mut value_start := 0
    // The quote character that opened the value being read.
    mut opening_quote := u8(0)

    for index, ch in attribute_contents {
        match state {
            .key {
                if ch == `=` {
                    key_end = index
                    state = AttributeParserState.eq
                }
            }
            .eq {
                match ch {
                    `=` {
                        return error('Duplicate "=" in attribute string: "${attribute_contents}"')
                    }
                    `'`, `"` {
                        opening_quote = ch
                        value_start = index + 1
                        state = AttributeParserState.value
                    }
                    else {
                        return error('Invalid character in attribute string: "${attribute_contents}"')
                    }
                }
            }
            .value {
                // Only the quote kind that opened the value may close it; the other kind is
                // ordinary value text.
                if ch == opening_quote {
                    key := attribute_contents[key_start..key_end].trim_space()
                    attributes[key] = attribute_contents[value_start..index]

                    // The next key starts right after the closing quote.
                    key_start = index + 1
                    key_end = index + 1
                    state = AttributeParserState.key
                }
            }
        }
    }

    if state == .value {
        // Input ended inside a quoted value; report it instead of dropping the attribute.
        return error('Unterminated attribute value in attribute string: "${attribute_contents}"')
    }

    return attributes
}
94
// parse_comment reads the body of an XML comment from reader, assuming the opening "<!--"
// has already been consumed, and returns it as an XMLComment once "-->" is reached.
// A "--" that is not immediately followed by ">" is reported as an error. Errors from the
// underlying reads are propagated.
fn parse_comment(mut reader io.Reader) !XMLComment {
    mut text := strings.new_builder(default_string_builder_cap)
    mut byte_buf := [u8(0)]

    for {
        current := next_char(mut reader, mut byte_buf)!
        if current != `-` {
            text.write_u8(current)
            continue
        }

        // A dash may start the terminator, so look at the byte after it.
        following := next_char(mut reader, mut byte_buf)!
        if following != `-` {
            // A lone dash: both bytes belong to the comment text.
            text.write_u8(current)
            text.write_u8(following)
            continue
        }

        // "--" has been seen; the only byte allowed next is ">".
        if next_char(mut reader, mut byte_buf)! != `>` {
            return error('XML Comment not closed. Expected ">".')
        }
        break
    }

    return XMLComment{text.str()}
}
123
// CDATAParserState counts how many consecutive "]" characters parse_cdata has just seen.
enum CDATAParserState {
    // The previous byte was not "]".
    normal
    // Exactly one "]" has just been read.
    single
    // Two or more "]" have just been read, so a ">" now ends the section.
    double
}

// parse_cdata reads a CDATA section from reader, assuming "<![CDATA" has already been
// consumed, and returns the text between the opening "[" and the closing "]]>".
//
// An error is returned when the keyword is not followed by "[" (this also covers the
// degenerate input "<![CDATA]]>", which previously produced an invalid slice) or when the
// buffered text does not end in "]]>". Errors from the underlying reads are propagated.
fn parse_cdata(mut reader io.Reader) !XMLCData {
    mut contents_buf := strings.new_builder(default_string_builder_cap)

    mut state := CDATAParserState.normal
    mut local_buf := [u8(0)]

    for {
        ch := next_char(mut reader, mut local_buf)!
        contents_buf.write_u8(ch)
        match ch {
            `]` {
                // normal -> single -> double; further "]" keep the state at double.
                if state == .normal {
                    state = .single
                } else {
                    state = .double
                }
            }
            `>` {
                if state == .double {
                    break
                }
                state = .normal
            }
            else {
                state = .normal
            }
        }
    }

    contents := contents_buf.str().trim_space()
    if !contents.starts_with('[') {
        return error('Invalid CDATA section. Expected "[" after "<![CDATA".')
    }
    if !contents.ends_with(']]>') {
        return error('CDATA section not closed.')
    }
    // Drop the leading "[" and the trailing "]]>".
    return XMLCData{contents[1..contents.len - 3]}
}
175
// parse_entity parses one `<!ENTITY name "value">` declaration found at the start of
// contents. It returns the entity together with the text that follows the declaration's
// closing ">".
//
// The name ends at the first whitespace character (space, tab, carriage return or newline);
// the value is the remainder with surrounding whitespace and quotes removed. An error is
// returned when no ">" is found, or when the name or the value is empty.
fn parse_entity(contents string) !(DTDEntity, string) {
    // The declaration ends at the nearest '>' after the start of the ENTITY keyword.
    entity_end := contents.index('>') or { return error('Entity declaration not closed.') }
    entity_contents := contents[entity_len..entity_end].trim_left(' \t\r\n')

    // Find where the name stops; without any whitespace the whole text is the name.
    mut name_end := entity_contents.len
    for index, ch in entity_contents {
        if ch == ` ` || ch == `\t` || ch == `\r` || ch == `\n` {
            name_end = index
            break
        }
    }

    name := entity_contents[..name_end]
    if name == '' {
        return error('Entity is missing name.')
    }
    value := entity_contents[name_end..].trim_space().trim('"\'')
    if value.len == 0 {
        return error('Entity is missing value.')
    }

    // TODO: Add support for SYSTEM and PUBLIC entities

    return DTDEntity{name, value}, contents[entity_end + 1..]
}
194
// parse_element parses one `<!ELEMENT name (child, ...)>` declaration found at the start of
// contents. It returns the element together with the text that follows the declaration's
// closing ">".
//
// The name may contain ASCII letters, digits, "_", ":", "." and "-", and ends at the first
// whitespace character. Only parenthesised child lists are supported as definitions; the
// list is split on ",". An error is returned when no ">" is found, when the name is empty or
// contains another character, or when the definition is not a closed "(...)" list.
fn parse_element(contents string) !(DTDElement, string) {
    // The declaration ends at the nearest '>' after the start of the ELEMENT keyword.
    element_end := contents.index('>') or { return error('Element declaration not closed.') }
    element_contents := contents[element_len..element_end].trim_left(' \t\r\n')

    mut name_span := TextSpan{}

    for ch in element_contents {
        match ch {
            ` `, `\t`, `\r`, `\n` {
                break
            }
            // Characters accepted in an element name:
            // 1. Lowercase alphabet - a-z
            // 2. Uppercase alphabet - A-Z
            // 3. Numbers - 0-9
            // 4. Underscore - _
            // 5. Colon - :
            // 6. Period - .
            // 7. Hyphen - -
            `a`...`z`, `A`...`Z`, `0`...`9`, `_`, `:`, `.`, `-` {
                name_span.end++
            }
            else {
                // Show the offending character itself rather than its numeric byte value.
                return error('Invalid character in element name: "${ch.ascii_str()}"')
            }
        }
    }

    name := element_contents[name_span.start..name_span.end]
    if name == '' {
        return error('Element is missing name.')
    }
    // Everything after the name is the definition.
    definition_string := element_contents[name_span.end..].trim_space().trim('"\'')

    definition := if definition_string.starts_with('(') {
        // We have a list of possible children

        // Ensure that both ( and ) are present
        if !definition_string.ends_with(')') {
            return error('Element declaration not closed.')
        }

        definition_string.trim('()').split(',')
    } else {
        // Invalid definition
        return error('Invalid element definition: ${definition_string}')
    }

    // TODO: Add support for SYSTEM and PUBLIC entities

    return DTDElement{name, definition}, contents[element_end + 1..]
}
247
// parse_doctype reads a DOCTYPE declaration from reader, assuming "<!DOCTYPE" has already
// been consumed, and returns its name and the items of its internal subset.
//
// Bytes are read until the ">" that balances the opening "<". When the declaration has an
// internal subset (`[ ... ]`), the name is the text before "[" and every `<!ENTITY ...>` /
// `<!ELEMENT ...>` inside the brackets becomes a list item; anything else in the subset is
// an error. When there is no "[", the whole declaration text (without the closing ">")
// becomes the name and the item list is empty.
fn parse_doctype(mut reader io.Reader) !DocumentType {
    // We may have more < in the doctype so keep count
    mut depth := 1
    mut doctype_buffer := strings.new_builder(default_string_builder_cap)
    mut local_buf := [u8(0)]
    for {
        ch := next_char(mut reader, mut local_buf)!
        doctype_buffer.write_u8(ch)
        match ch {
            `<` {
                depth++
            }
            `>` {
                depth--
                if depth == 0 {
                    break
                }
            }
            else {}
        }
    }

    doctype_contents := doctype_buffer.str().trim_space()

    // Without an internal subset there is nothing to hand to the list parser; previously the
    // whole declaration was treated as a list item and rejected.
    has_internal_subset := doctype_contents.contains('[')

    name := if has_internal_subset {
        doctype_contents.all_before('[').trim_space()
    } else {
        doctype_contents.trim_right('>').trim_space()
    }

    mut list_contents := if has_internal_subset {
        doctype_contents.all_after('[').all_before(']').trim_space()
    } else {
        ''
    }
    mut items := []DTDListItem{}

    // Consume one declaration per iteration until the subset text is used up.
    for list_contents.len > 0 {
        if list_contents.starts_with('<!ENTITY') {
            entity, remaining := parse_entity(list_contents)!
            items << entity
            list_contents = remaining.trim_space()
        } else if list_contents.starts_with('<!ELEMENT') {
            element, remaining := parse_element(list_contents)!
            items << element
            list_contents = remaining.trim_space()
        } else {
            return error('Unknown DOCTYPE list item: ${list_contents}')
        }
    }

    return DocumentType{
        name: name
        dtd:  DocumentTypeDefinition{
            list: items
        }
    }
}
298
// parse_prolog consumes everything that precedes the root element: optional leading
// whitespace and UTF-8 byte order mark, the optional `<?xml ... ?>` declaration, and any
// comments or a single DOCTYPE that follow it.
//
// It returns the collected Prolog together with the first byte after the "<" of the root
// element. When the document does not start with "<?", an empty Prolog is returned along
// with the byte that followed "<". Errors from the underlying reads are propagated.
fn parse_prolog(mut reader io.Reader) !(Prolog, u8) {
    mut byte_buf := [u8(0)]
    mut ch := next_char(mut reader, mut byte_buf)!

    // Skip leading whitespace and an optional byte order mark until the first "<".
    for ch != `<` {
        match ch {
            ` `, `\t`, `\r`, `\n` {}
            byte_order_marking_first {
                // UTF-8 BOM: the two bytes after the first one must match exactly.
                mut bom_rest := [u8(0), 0]
                bytes_read := reader.read(mut bom_rest)!
                if bytes_read != 2 || bom_rest != byte_order_marking_bytes {
                    return error('Invalid UTF-8 BOM.')
                }
            }
            else {
                return error('Expecting a prolog or root node starting with "<".')
            }
        }
        ch = next_char(mut reader, mut byte_buf)!
    }

    ch = next_char(mut reader, mut byte_buf)!
    if ch != `?` {
        // No XML declaration: the byte just read already belongs to the next tag.
        return Prolog{}, ch
    }

    // The declaration must spell out "<?xml".
    ch = next_char(mut reader, mut byte_buf)!
    if ch != `x` {
        return error('Expecting a prolog starting with "<?x".')
    }
    ch = next_char(mut reader, mut byte_buf)!
    if ch != `m` {
        return error('Expecting a prolog starting with "<?xm".')
    }
    ch = next_char(mut reader, mut byte_buf)!
    if ch != `l` {
        return error('Expecting a prolog starting with "<?xml".')
    }

    // Collect the declaration's attribute text up to the closing "?>". A "?" is held back
    // until the next byte shows whether it was part of the terminator.
    mut declaration := strings.new_builder(default_string_builder_cap)
    mut pending_question_mark := false
    for {
        ch = next_char(mut reader, mut byte_buf)!
        if ch == `?` {
            if pending_question_mark {
                return error('Invalid prolog: Two question marks found in a row.')
            }
            pending_question_mark = true
            continue
        }
        if ch == `>` {
            if !pending_question_mark {
                return error('Invalid prolog: Found ">" before "?".')
            }
            break
        }
        if pending_question_mark {
            // The held-back "?" was ordinary text after all.
            pending_question_mark = false
            declaration.write_u8(`?`)
        }
        declaration.write_u8(ch)
    }

    raw_attributes := declaration.str().trim_space()
    attributes := if raw_attributes == '' {
        default_prolog_attributes
    } else {
        parse_attributes(raw_attributes)!
    }

    version := attributes['version'] or { return error('XML declaration missing version.') }
    encoding := attributes['encoding'] or { 'UTF-8' }

    mut comments := []XMLComment{}
    mut doctype := DocumentType{
        name: ''
        dtd:  ''
    }
    mut found_doctype := false

    // Read comments and at most one DOCTYPE until the root element's tag begins.
    for {
        ch = next_char(mut reader, mut byte_buf)!
        if ch != `<` {
            // Anything between tags at this level is skipped.
            continue
        }

        ch = next_char(mut reader, mut byte_buf)!
        if ch != `!` {
            // We have found the start of the root node
            break
        }

        // "<!" introduces either a comment or the DOCTYPE.
        marker := next_char(mut reader, mut byte_buf)!
        if marker == `-` {
            if next_char(mut reader, mut byte_buf)! != `-` {
                return error('Invalid comment.')
            }
            comments << parse_comment(mut reader)!
        } else if marker == `D` {
            if found_doctype {
                return error('Duplicate DOCTYPE declaration.')
            }
            // <!D -> OCTYPE
            mut keyword_rest := []u8{len: 6}
            if reader.read(mut keyword_rest)! != 6 {
                return error('Invalid DOCTYPE.')
            }
            if keyword_rest != doctype_chars {
                return error('Invalid DOCTYPE.')
            }
            found_doctype = true
            doctype = parse_doctype(mut reader)!
        } else {
            return error('Unsupported control sequence found in prolog.')
        }
    }

    return Prolog{
        version:  version
        encoding: encoding
        doctype:  doctype
        comments: comments
    }, ch
}
454
// parse_children reads the body of the element `name` from `reader`, up to and including its
// closing tag, and returns the finished XMLNode carrying the given `attributes`.
//
// While scanning, every byte that is not part of a markup construct is accumulated as text.
// A `<` switches to markup handling:
//   - `<!--` is handed to parse_comment and the result is appended to the children,
//   - `<![CDATA` is handed to parse_cdata and the result is appended to the children,
//   - `</` must be followed by `name`, optional whitespace and `>`; it ends the node,
//   - anything else starts a child element, parsed through parse_single_node.
//
// Accumulated text is trimmed, has `\r\n` normalised to `\n`, and is appended as a string child
// right before a child element or when the closing tag is reached. Text is NOT flushed when a
// comment or CDATA section is met, so such text ends up after those children.
//
// Errors are returned for truncated or unknown `<!` sequences, for a closing tag that does not
// match `name`, and for anything the helper parsers or the reader report.
fn parse_children(name string, attributes map[string]string, mut reader io.Reader) !XMLNode {
	mut inner_contents := strings.new_builder(default_string_builder_cap)

	mut children := []XMLNodeContents{}
	mut local_buf := [u8(0)]

	for {
		ch := next_char(mut reader, mut local_buf)!
		match ch {
			`<` {
				second_char := next_char(mut reader, mut local_buf)!
				match second_char {
					`!` {
						// Comment, CDATA
						mut next_two := [u8(0), 0]
						if reader.read(mut next_two)! != 2 {
							return error('Invalid XML. Incomplete comment or CDATA declaration.')
						}
						if next_two == double_dash {
							// Comment
							comment := parse_comment(mut reader)!
							children << comment
						} else if next_two == c_tag {
							// <![CDATA -> DATA
							mut cdata_buf := []u8{len: 4}
							if reader.read(mut cdata_buf)! != 4 {
								return error('Invalid XML. Incomplete CDATA declaration.')
							}
							if cdata_buf != data_chars {
								return error('Invalid XML. Expected "CDATA" after "<![C".')
							}
							cdata := parse_cdata(mut reader)!
							children << cdata
						} else {
							return error('Invalid XML. Unknown control sequence: ${next_two.bytestr()}')
						}
					}
					`/` {
						// End of node: read the expected name plus the byte that follows it.
						mut node_end_buffer := []u8{len: name.len + 1}
						if reader.read(mut node_end_buffer)! != name.len + 1 {
							return error('Invalid XML. Incomplete node end.')
						}

						if node_end_buffer[..name.len] != name.bytes() {
							return error('XML node <${name}> not closed.')
						}

						// XML allows whitespace between the name and `>` in an end tag
						// (`</name >`), so skip it before insisting on the `>`.
						mut closing_char := node_end_buffer[name.len]
						for closing_char == ` ` || closing_char == `\t` || closing_char == `\r`
							|| closing_char == `\n` {
							closing_char = next_char(mut reader, mut local_buf)!
						}
						if closing_char != `>` {
							// Covers both a different tag name that merely starts with
							// `name` and stray characters after the name.
							return error('XML node <${name}> not closed.')
						}

						collected_contents := inner_contents.str().trim_space()
						if collected_contents.len > 0 {
							// We have some inner text
							children << collected_contents.replace('\r\n', '\n')
						}
						return XMLNode{
							name:       name
							attributes: attributes
							children:   children
						}
					}
					else {
						// Start of child node
						child := parse_single_node(second_char, mut reader) or {
							// Report a stray closing tag in terms of the node being parsed.
							if err.msg() == 'XML node cannot start with "</".' {
								return error('XML node <${name}> not closed.')
							} else {
								return err
							}
						}
						// Emit the text seen so far before the child itself.
						text := inner_contents.str().trim_space()
						if text.len > 0 {
							children << text.replace('\r\n', '\n')
						}
						children << child
					}
				}
			}
			else {
				inner_contents.write_u8(ch)
			}
		}
	}
	// The loop above only exits through a return.
	return error('XML node <${name}> not closed.')
}
541
// parse_single_node parses a single XML node from the reader. The first character of the tag
// (the one right after `<`) is passed in as the first_char parameter.
//
// The function reads up to the next `>`, takes the first whitespace-delimited word of the tag as
// the node name (with any trailing `/` removed) and passes the remainder to parse_attributes.
// A tag ending in `/` is self-closing and is returned with its attributes only; otherwise the
// node body is parsed with parse_children.
//
// An error is returned when the tag contains nothing but whitespace, when the attributes cannot
// be parsed, or when reading from the reader or parsing the children fails.
//
// This function is meant to assist in parsing nested nodes one at a time. Using this function as
// opposed to the recommended static functions makes it easier to parse smaller nodes in extremely
// large XML documents without running out of memory.
pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
	mut contents := strings.new_builder(default_string_builder_cap)
	contents.write_u8(first_char)

	// Collect everything up to (but excluding) the `>` that ends the tag.
	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		if ch == `>` {
			break
		}
		contents.write_u8(ch)
	}

	tag_contents := contents.str().trim_space()
	if tag_contents.len == 0 {
		// split_any returns an empty array for an empty string, so indexing the first part
		// below would panic on input such as `< >`.
		return error('Invalid XML. Found a tag without a name.')
	}

	parts := tag_contents.split_any(' \t\r\n')
	name := parts[0].trim_right('/')

	// Check if it is a self-closing tag
	if tag_contents.ends_with('/') {
		// We're not looking for children and inner text
		return XMLNode{
			name:       name
			attributes: parse_attributes(tag_contents[name.len..tag_contents.len - 1].trim_space())!
		}
	}

	attribute_string := tag_contents[name.len..].trim_space()
	attributes := parse_attributes(attribute_string)!

	return parse_children(name, attributes, mut reader)
}
579
// XMLDocument.from_string builds an XMLDocument from XML text that is already in memory.
// The bytes of raw_contents are wrapped in a FullBufferReader and parsed through
// XMLDocument.from_reader; any error from that call is returned to the caller.
pub fn XMLDocument.from_string(raw_contents string) !XMLDocument {
	document_bytes := raw_contents.bytes()
	mut buffered := FullBufferReader{
		contents: document_bytes
	}
	document := XMLDocument.from_reader(mut buffered)!
	return document
}
587
// XMLDocument.from_file loads the file at path and parses it as an XML document.
// The whole file is read into memory with os.read_bytes before parsing starts, so for very
// large inputs XMLDocument.from_reader may be the better choice. Errors from reading the
// file or from parsing are returned to the caller.
pub fn XMLDocument.from_file(path string) !XMLDocument {
	file_bytes := os.read_bytes(path)!
	mut buffered := FullBufferReader{
		contents: file_bytes
	}
	document := XMLDocument.from_reader(mut buffered)!
	return document
}
596
// XMLDocument.from_reader parses an XML document from any source implementing io.Reader,
// which makes it the most generic entry point of the parser.
// The prolog is parsed first; if that fails because the input ended (os.Eof, io.Eof or the
// message 'Unexpected End Of File.'), the error 'XML document is empty.' is returned instead.
// Any other prolog error, and any error from parsing the root node, is returned unchanged.
pub fn XMLDocument.from_reader(mut reader io.Reader) !XMLDocument {
	prolog, first_char := parse_prolog(mut reader) or {
		ran_out_of_input := err is os.Eof || err is io.Eof || err.msg() == 'Unexpected End Of File.'
		if !ran_out_of_input {
			return err
		}
		return error('XML document is empty.')
	}

	// first_char is the character parse_prolog returned alongside the prolog; the root tag
	// is parsed starting from it.
	root_node := parse_single_node(first_char, mut reader)!

	return XMLDocument{
		root:     root_node
		version:  prolog.version
		encoding: prolog.encoding
		doctype:  prolog.doctype
		comments: prolog.comments
	}
}
618