| 1 | module xml |
| 2 | |
| 3 | import io |
| 4 | import os |
| 5 | import strings |
| 6 | |
// Attributes used by parse_prolog when the XML declaration carries no attributes of its own.
const default_prolog_attributes = {
	'version': '1.0'
	'encoding': 'UTF-8'
}
// Initial capacity handed to every strings.Builder created by the parsers in this file.
const default_string_builder_cap = 32

// Lengths of the DTD keywords, used to skip past the keyword when slicing a declaration.
const element_len = '<!ELEMENT'.len
const entity_len = '<!ENTITY'.len

// Byte sequences compared against what is read from the reader after a partial match:
// `OCTYPE` follows `<!D`, `--` and `[C` follow `<!`, and `DATA` follows `<![C`.
const doctype_chars = 'OCTYPE'.bytes()
const double_dash = '--'.bytes()
const c_tag = '[C'.bytes()
const data_chars = 'DATA'.bytes()

// UTF-8 byte order mark, split into its first byte and the two bytes that must follow it.
const byte_order_marking_first = u8(0xEF)
const byte_order_marking_bytes = [u8(0xBB), 0xBF]
| 23 | |
| 24 | // Helper types to assist in parsing |
| 25 | |
// TextSpan holds a pair of indices into the text being parsed. The parsers in this
// file slice with `start..end`, so `end` is used as an exclusive bound.
struct TextSpan {
mut:
	start int
	end int
}
| 31 | |
// AttributeParserState is the state of the scanner in parse_attributes.
enum AttributeParserState {
	// Reading an attribute name, up to the `=`.
	key
	// Just consumed `=`; the next character must open a quoted value.
	eq
	// Inside a quoted attribute value.
	value
}
| 37 | |
// parse_attributes parses the attribute section of a tag (for example
// `key="value" other='x'`) into a map from attribute name to attribute value.
// A value is closed only by the same quote character that opened it, so
// `title="it's"` and `title='say "hi"'` are both read in full.
// It returns an error when the text contains `<`, when `=` is directly followed by
// another `=`, or when `=` is followed by anything other than a quote character.
// A trailing key without a value, or a value whose closing quote is missing, is
// dropped without an error.
fn parse_attributes(attribute_contents string) !map[string]string {
	if attribute_contents.contains_u8(`<`) {
		return error('Malformed XML. Found "<" in attribute string: "${attribute_contents}"')
	}
	mut attributes := map[string]string{}

	mut state := AttributeParserState.key
	mut key_span, mut value_span := TextSpan{}, TextSpan{}
	// Quote character that opened the value currently being read
	mut opening_quote := u8(0)

	for index, ch in attribute_contents {
		match state {
			.key {
				match ch {
					`=` {
						state = AttributeParserState.eq
					}
					else {
						key_span.end++
					}
				}
			}
			.eq {
				match ch {
					`=` {
						return error('Duplicate "=" in attribute string: "${attribute_contents}"')
					}
					`'`, `"` {
						opening_quote = ch
						state = AttributeParserState.value
						value_span.start = index + 1
					}
					else {
						return error('Invalid character in attribute string: "${attribute_contents}"')
					}
				}
			}
			.value {
				// The other quote character is ordinary text inside the value
				if ch == opening_quote {
					state = AttributeParserState.key
					value_span.end = index
					attributes[attribute_contents[key_span.start..key_span.end].trim_space()] = attribute_contents[value_span.start..value_span.end]

					// The next key starts right after the closing quote
					key_span.start = index + 1
					key_span.end = index + 1
				}
			}
		}
	}

	return attributes
}
| 94 | |
// parse_comment reads the body of an XML comment from the reader and returns it as
// an XMLComment. It is called once the opening `<!--` has been consumed and reads
// up to and including the closing `-->`. A `--` that is not immediately followed
// by `>` is reported as an error.
fn parse_comment(mut reader io.Reader) !XMLComment {
	mut text := strings.new_builder(default_string_builder_cap)
	mut byte_buf := [u8(0)]

	for {
		current := next_char(mut reader, mut byte_buf)!
		if current != `-` {
			text.write_u8(current)
			continue
		}

		// A single dash is ordinary text; keep it together with whatever follows it
		following := next_char(mut reader, mut byte_buf)!
		if following != `-` {
			text.write_u8(current)
			text.write_u8(following)
			continue
		}

		// Two dashes in a row must be the start of the closing `-->`
		terminator := next_char(mut reader, mut byte_buf)!
		if terminator != `>` {
			return error('XML Comment not closed. Expected ">".')
		}
		break
	}

	return XMLComment{text.str()}
}
| 123 | |
// CDATAParserState tracks how many consecutive `]` characters parse_cdata has just
// seen, so that it can recognise the closing `]]>`.
enum CDATAParserState {
	// The last character was not `]`.
	normal
	// Exactly one `]` has just been read.
	single
	// Two or more `]` have just been read; a `>` now ends the section.
	double
}
| 129 | |
// parse_cdata reads a CDATA section from the reader and returns its text.
// It is called once `<![CDATA` has been consumed, so the next character read must
// be the `[` that opens the data. Reading stops at the first `]]>`.
// It returns an error when the opening `[` is missing (for example `<![CDATA]]>`),
// rather than slicing past the collected text.
fn parse_cdata(mut reader io.Reader) !XMLCData {
	mut contents_buf := strings.new_builder(default_string_builder_cap)

	mut state := CDATAParserState.normal
	mut local_buf := [u8(0)]

	for {
		ch := next_char(mut reader, mut local_buf)!
		contents_buf.write_u8(ch)
		match ch {
			`]` {
				match state {
					.double {
						// Another ] after the ]] for some reason. Keep the state
					}
					.single {
						state = .double
					}
					.normal {
						state = .single
					}
				}
			}
			`>` {
				match state {
					.double {
						break
					}
					else {
						state = .normal
					}
				}
			}
			else {
				state = .normal
			}
		}
	}

	// The buffer holds the opening `[`, the data, and the closing `]]>`
	contents := contents_buf.str().trim_space()
	if !contents.starts_with('[') {
		return error('Invalid CDATA section. Expected "[" after "<![CDATA".')
	}
	if !contents.ends_with(']]>') {
		return error('CDATA section not closed.')
	}
	return XMLCData{contents[1..contents.len - 3]}
}
| 175 | |
// parse_entity parses one `<!ENTITY name "value">` declaration found at the start
// of `contents`. It returns the entity together with the text that follows the
// declaration's closing `>`.
// The name ends at the first whitespace character (space, tab, CR or LF); the value
// is the remaining text with surrounding whitespace and quote characters removed.
// It returns an error when no `>` is found, or when the name or the value is empty.
fn parse_entity(contents string) !(DTDEntity, string) {
	// We find the nearest '>' to the start of the ENTITY
	entity_end := contents.index('>') or { return error('Entity declaration not closed.') }
	entity_contents := contents[entity_len..entity_end].trim_left(' \t\r\n')

	// The name runs up to the first whitespace character of any kind
	mut name_end := entity_contents.len
	for index, ch in entity_contents {
		match ch {
			` `, `\t`, `\r`, `\n` {
				name_end = index
				break
			}
			else {}
		}
	}

	name := entity_contents[..name_end]
	if name == '' {
		return error('Entity is missing name.')
	}
	value := entity_contents[name_end..].trim_space().trim('"\'')
	if value.len == 0 {
		return error('Entity is missing value.')
	}

	// TODO: Add support for SYSTEM and PUBLIC entities

	return DTDEntity{name, value}, contents[entity_end + 1..]
}
| 194 | |
// parse_element parses one `<!ELEMENT name (child,child,...)>` declaration found at
// the start of `contents`. It returns the element together with the text that
// follows the declaration's closing `>`.
// The definition is the parenthesised list split on `,`; the items are not trimmed.
// It returns an error when no `>` is found, when the name is empty or contains a
// character outside the accepted set, or when the definition is not a list
// enclosed in `(` and `)`.
fn parse_element(contents string) !(DTDElement, string) {
	// We find the nearest '>' to the start of the ELEMENT
	element_end := contents.index('>') or { return error('Element declaration not closed.') }
	element_contents := contents[element_len..element_end].trim_left(' \t\r\n')

	mut name_span := TextSpan{}

	for ch in element_contents {
		match ch {
			// Any whitespace ends the name, including the CR of a CRLF line ending
			` `, `\t`, `\r`, `\n` {
				break
			}
			// Valid characters in an element name are:
			// 1. Lowercase alphabet - a-z
			// 2. Uppercase alphabet - A-Z
			// 3. Numbers - 0-9
			// 4. Underscore - _
			// 5. Colon - :
			// 6. Period - .
			// 7. Hyphen - -
			`a`...`z`, `A`...`Z`, `0`...`9`, `_`, `:`, `.`, `-` {
				name_span.end++
			}
			else {
				return error('Invalid character in element name: "${ch}"')
			}
		}
	}

	name := element_contents[name_span.start..name_span.end]
	if name == '' {
		return error('Element is missing name.')
	}
	definition_string := element_contents.all_after_first(name).trim_space().trim('"\'')

	definition := if definition_string.starts_with('(') {
		// We have a list of possible children

		// Ensure that both ( and ) are present
		if !definition_string.ends_with(')') {
			return error('Element declaration not closed.')
		}

		definition_string.trim('()').split(',')
	} else {
		// Invalid definition
		return error('Invalid element definition: ${definition_string}')
	}

	// TODO: Add support for definitions that are not a parenthesised list

	return DTDElement{name, definition}, contents[element_end + 1..]
}
| 247 | |
// parse_doctype reads a DOCTYPE declaration from the reader and returns it.
// It is called once `<!DOCTYPE` has been consumed and reads up to the `>` that
// balances that opening `<`.
// When the declaration has an internal subset (`[ ... ]`), the name is the text
// before the `[` and every `<!ENTITY` and `<!ELEMENT` item inside it is parsed.
// When there is no internal subset (for example `<!DOCTYPE html>`), the name is
// the whole declaration text without the closing `>` and the item list is empty.
// It returns an error for any other kind of item inside the internal subset.
fn parse_doctype(mut reader io.Reader) !DocumentType {
	// We may have more < in the doctype so keep count
	mut depth := 1
	mut doctype_buffer := strings.new_builder(default_string_builder_cap)
	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		doctype_buffer.write_u8(ch)
		match ch {
			`<` {
				depth++
			}
			`>` {
				depth--
				if depth == 0 {
					break
				}
			}
			else {}
		}
	}

	// The buffer always ends with the `>` that closed the declaration
	doctype_contents := doctype_buffer.str().trim_space()

	mut name := ''
	mut list_contents := ''
	if doctype_contents.contains('[') {
		name = doctype_contents.all_before('[').trim_space()
		list_contents = doctype_contents.all_after('[').all_before(']').trim_space()
	} else {
		// No internal subset: all_before/all_after would hand back the whole text,
		// so take the name directly and leave the item list empty
		name = doctype_contents.all_before_last('>').trim_space()
	}
	mut items := []DTDListItem{}

	for list_contents.len > 0 {
		if list_contents.starts_with('<!ENTITY') {
			entity, remaining := parse_entity(list_contents)!
			items << entity
			list_contents = remaining.trim_space()
		} else if list_contents.starts_with('<!ELEMENT') {
			element, remaining := parse_element(list_contents)!
			items << element
			list_contents = remaining.trim_space()
		} else {
			return error('Unknown DOCTYPE list item: ${list_contents}')
		}
	}

	return DocumentType{
		name: name
		dtd: DocumentTypeDefinition{
			list: items
		}
	}
}
| 298 | |
// parse_prolog reads everything that precedes the root node: an optional UTF-8 BOM,
// the optional `<?xml ... ?>` declaration and, after a declaration, any comments
// and at most one DOCTYPE.
// It returns the collected Prolog together with the first character after the `<`
// of the root node, which the caller passes on to parse_single_node.
// When the document does not start with `<?`, an empty Prolog is returned at once
// along with the character that followed the first `<`.
// It returns an error for a malformed BOM, a malformed declaration, a declaration
// without a `version` attribute, a second DOCTYPE, or an unsupported `<!` sequence.
fn parse_prolog(mut reader io.Reader) !(Prolog, u8) {
	// Skip leading whitespace and a UTF-8 BOM; anything else before the first "<" is an error
	mut local_buf := [u8(0)]
	mut ch := next_char(mut reader, mut local_buf)!
	for {
		match ch {
			` `, `\t`, `\r`, `\n` {
				ch = next_char(mut reader, mut local_buf)!
				continue
			}
			`<` {
				break
			}
			byte_order_marking_first {
				// UTF-8 BOM
				mut bom_buf := [u8(0), 0]
				if reader.read(mut bom_buf)! != 2 {
					return error('Invalid UTF-8 BOM.')
				}
				if bom_buf != byte_order_marking_bytes {
					return error('Invalid UTF-8 BOM.')
				}
				ch = next_char(mut reader, mut local_buf)!
				continue
			}
			else {
				return error('Expecting a prolog or root node starting with "<".')
			}
		}
	}

	// No "<?" means there is no XML declaration: hand the character back to the caller
	ch = next_char(mut reader, mut local_buf)!
	if ch != `?` {
		return Prolog{}, ch
	}

	// Match the literal "xml" one character at a time
	ch = next_char(mut reader, mut local_buf)!
	if ch != `x` {
		return error('Expecting a prolog starting with "<?x".')
	}

	ch = next_char(mut reader, mut local_buf)!
	if ch != `m` {
		return error('Expecting a prolog starting with "<?xm".')
	}

	ch = next_char(mut reader, mut local_buf)!
	if ch != `l` {
		return error('Expecting a prolog starting with "<?xml".')
	}

	mut prolog_buffer := strings.new_builder(default_string_builder_cap)

	// Keep reading character by character until we find the end of the prolog
	mut found_question_mark := false

	for {
		ch = next_char(mut reader, mut local_buf)!
		match ch {
			`?` {
				if found_question_mark {
					return error('Invalid prolog: Two question marks found in a row.')
				}
				found_question_mark = true
			}
			`>` {
				if found_question_mark {
					break
				}
				return error('Invalid prolog: Found ">" before "?".')
			}
			else {
				// A "?" that was not followed by ">" is part of the attribute text
				if found_question_mark {
					found_question_mark = false
					prolog_buffer.write_u8(`?`)
				}
				prolog_buffer.write_u8(ch)
			}
		}
	}

	prolog_attributes := prolog_buffer.str().trim_space()

	// A bare "<?xml?>" falls back to the default attributes
	attributes := if prolog_attributes.len == 0 {
		default_prolog_attributes
	} else {
		parse_attributes(prolog_attributes)!
	}

	// The version is mandatory; the encoding falls back to UTF-8
	version := attributes['version'] or { return error('XML declaration missing version.') }
	encoding := attributes['encoding'] or { 'UTF-8' }

	mut comments := []XMLComment{}
	mut doctype := DocumentType{
		name: ''
		dtd: ''
	}
	mut found_doctype := false
	// Collect comments and the DOCTYPE until the root node starts.
	// Characters other than "<" between these items are ignored.
	for {
		ch = next_char(mut reader, mut local_buf)!
		match ch {
			` `, `\t`, `\n` {
				continue
			}
			`<` {
				// We have a comment, DOCTYPE, or root node
				ch = next_char(mut reader, mut local_buf)!
				match ch {
					`!` {
						// A comment or DOCTYPE
						match next_char(mut reader, mut local_buf)! {
							`-` {
								// A comment
								if next_char(mut reader, mut local_buf)! != `-` {
									return error('Invalid comment.')
								}
								comments << parse_comment(mut reader)!
							}
							`D` {
								if found_doctype {
									return error('Duplicate DOCTYPE declaration.')
								}
								// <!D -> OCTYPE
								mut doc_buf := []u8{len: 6}
								if reader.read(mut doc_buf)! != 6 {
									return error('Invalid DOCTYPE.')
								}
								if doc_buf != doctype_chars {
									return error('Invalid DOCTYPE.')
								}
								found_doctype = true
								doctype = parse_doctype(mut reader)!
							}
							else {
								return error('Unsupported control sequence found in prolog.')
							}
						}
					}
					else {
						// We have found the start of the root node
						break
					}
				}
			}
			else {}
		}
	}

	// ch now holds the first character of the root node's tag
	return Prolog{
		version: version
		encoding: encoding
		doctype: doctype
		comments: comments
	}, ch
}
| 454 | |
// parse_children reads the contents of the node called `name` from the reader, up
// to and including its closing tag, and returns the finished XMLNode carrying the
// given attributes.
// Comments, CDATA sections and child nodes are appended as they are met. Text
// gathered so far is trimmed, has `\r\n` replaced with `\n`, and is appended just
// before a child node or, at the end, before returning.
// It returns an error for an unknown `<!` sequence, a truncated tag, or a closing
// tag that does not match `name`.
fn parse_children(name string, attributes map[string]string, mut reader io.Reader) !XMLNode {
	mut pending_text := strings.new_builder(default_string_builder_cap)

	mut nodes := []XMLNodeContents{}
	mut byte_buf := [u8(0)]

	for {
		current := next_char(mut reader, mut byte_buf)!
		if current != `<` {
			// Plain text: keep collecting it
			pending_text.write_u8(current)
			continue
		}

		marker := next_char(mut reader, mut byte_buf)!
		if marker == `!` {
			// Either a comment or a CDATA section
			mut prefix := [u8(0), 0]
			if reader.read(mut prefix)! != 2 {
				return error('Invalid XML. Incomplete comment or CDATA declaration.')
			}
			if prefix == double_dash {
				parsed_comment := parse_comment(mut reader)!
				nodes << parsed_comment
			} else if prefix == c_tag {
				// `<![C` has been read; `DATA` must follow
				mut keyword := []u8{len: 4}
				if reader.read(mut keyword)! != 4 {
					return error('Invalid XML. Incomplete CDATA declaration.')
				}
				if keyword != data_chars {
					return error('Invalid XML. Expected "CDATA" after "<![C".')
				}
				parsed_cdata := parse_cdata(mut reader)!
				nodes << parsed_cdata
			} else {
				return error('Invalid XML. Unknown control sequence: ${prefix.bytestr()}')
			}
		} else if marker == `/` {
			// Closing tag: it must be exactly `name` followed by `>`
			mut closing := []u8{len: name.len + 1}
			if reader.read(mut closing)! != name.len + 1 {
				return error('Invalid XML. Incomplete node end.')
			}

			mut expected := name.bytes()
			expected << `>`

			if closing != expected {
				return error('XML node <${name}> not closed.')
			}

			trailing_text := pending_text.str().trim_space()
			if trailing_text.len > 0 {
				nodes << trailing_text.replace('\r\n', '\n')
			}
			return XMLNode{
				name: name
				attributes: attributes
				children: nodes
			}
		} else {
			// Anything else opens a child node
			child_node := parse_single_node(marker, mut reader) or {
				if err.msg() != 'XML node cannot start with "</".' {
					return err
				}
				return error('XML node <${name}> not closed.')
			}
			leading_text := pending_text.str().trim_space()
			if leading_text.len > 0 {
				nodes << leading_text.replace('\r\n', '\n')
			}
			nodes << child_node
		}
	}
	return error('XML node <${name}> not closed.')
}
| 541 | |
// parse_single_node parses a single XML node from the reader. The first character of the tag is passed
// in as the first_char parameter.
// This function is meant to assist in parsing nested nodes one at a time. Using this function as
// opposed to the recommended static functions makes it easier to parse smaller nodes in extremely large
// XML documents without running out of memory.
// A tag ending in `/` is returned as a self-closing node without children; any other tag is followed
// by its children, which are read up to the matching closing tag.
// It returns an error when the tag is empty (`< >`), when it starts with `/`, or when its attributes
// or children cannot be parsed.
pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
	mut contents := strings.new_builder(default_string_builder_cap)
	contents.write_u8(first_char)

	// Collect the text of the tag up to, but not including, the closing ">"
	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		if ch == `>` {
			break
		}
		contents.write_u8(ch)
	}

	tag_contents := contents.str().trim_space()

	// An empty tag would leave `parts` empty and make `parts[0]` panic
	if tag_contents.len == 0 {
		return error('XML node is missing a name.')
	}
	// parse_children matches on this exact message to report an unclosed parent node
	if tag_contents.starts_with('/') {
		return error('XML node cannot start with "</".')
	}

	parts := tag_contents.split_any(' \t\r\n')
	name := parts[0].trim_right('/')

	// Check if it is a self-closing tag
	if tag_contents.ends_with('/') {
		// We're not looking for children and inner text
		return XMLNode{
			name: name
			attributes: parse_attributes(tag_contents[name.len..tag_contents.len - 1].trim_space())!
		}
	}

	attribute_string := tag_contents[name.len..].trim_space()
	attributes := parse_attributes(attribute_string)!

	return parse_children(name, attributes, mut reader)
}
| 579 | |
// XMLDocument.from_string parses an XML document held in a string.
// The bytes of the string are wrapped in a FullBufferReader and handed to
// XMLDocument.from_reader; any error from it is returned unchanged.
pub fn XMLDocument.from_string(raw_contents string) !XMLDocument {
	mut source := FullBufferReader{
		contents: raw_contents.bytes()
	}
	document := XMLDocument.from_reader(mut source)!
	return document
}
| 587 | |
// XMLDocument.from_file parses the XML document stored at `path`.
// The whole file is read first and parsed afterwards. For files too large for
// that, use XMLDocument.from_reader instead.
pub fn XMLDocument.from_file(path string) !XMLDocument {
	file_bytes := os.read_bytes(path)!
	mut source := FullBufferReader{
		contents: file_bytes
	}
	document := XMLDocument.from_reader(mut source)!
	return document
}
| 596 | |
// XMLDocument.from_reader parses an XML document from any source that implements
// the io.Reader interface, which makes it the most generic entry point.
// The prolog is parsed first and the root node after it. Reaching the end of the
// input while parsing the prolog is reported as 'XML document is empty.'.
pub fn XMLDocument.from_reader(mut reader io.Reader) !XMLDocument {
	prolog, root_first_char := parse_prolog(mut reader) or {
		reached_end := err is os.Eof || err is io.Eof || err.msg() == 'Unexpected End Of File.'
		if !reached_end {
			return err
		}
		return error('XML document is empty.')
	}

	root_node := parse_single_node(root_first_char, mut reader)!

	return XMLDocument{
		version: prolog.version
		encoding: prolog.encoding
		comments: prolog.comments
		doctype: prolog.doctype
		root: root_node
	}
}
| 618 | |