From c9429f33313165110ef84234d8ec477646eec7b2 Mon Sep 17 00:00:00 2001 From: Subhomoy Haldar Date: Thu, 16 Nov 2023 18:13:36 +0000 Subject: [PATCH] encoding.xml: make functions public, add documentation, tests, fix attribute parsing for self-closing tags (#19901) --- vlib/encoding/xml/README.md | 4 + vlib/encoding/xml/encoding.v | 109 +++++++++++------- vlib/encoding/xml/encoding_test.v | 179 ++++++++++++++++++++++++++++++ vlib/encoding/xml/parser.v | 9 +- vlib/encoding/xml/parser_test.v | 125 +++++++++++++++++++++ vlib/encoding/xml/query.v | 40 ++++++- vlib/encoding/xml/query_test.v | 52 +++++++++ vlib/encoding/xml/types.v | 18 +-- 8 files changed, 482 insertions(+), 54 deletions(-) create mode 100644 vlib/encoding/xml/encoding_test.v create mode 100644 vlib/encoding/xml/parser_test.v create mode 100644 vlib/encoding/xml/query_test.v diff --git a/vlib/encoding/xml/README.md b/vlib/encoding/xml/README.md index 161baf1af..7b28ebdbe 100644 --- a/vlib/encoding/xml/README.md +++ b/vlib/encoding/xml/README.md @@ -7,6 +7,10 @@ Note that this is not a streaming XML parser. It reads the entire document into memory and then parses it. This is not a problem for small documents, but it might be a problem for extremely large documents (several hundred megabytes or more). +The public function `parse_single_node` can be used to parse a single node from +an implementation of `io.Reader`, which can help parse large XML documents on an +element-by-element basis. Sample usage is provided in the `parser_test.v` file. + ## Usage ### Parsing XML Files diff --git a/vlib/encoding/xml/encoding.v b/vlib/encoding/xml/encoding.v index 2ef924af8..159287d88 100644 --- a/vlib/encoding/xml/encoding.v +++ b/vlib/encoding/xml/encoding.v @@ -26,38 +26,43 @@ pub fn (node XMLNode) pretty_str(original_indent string, depth int, reverse_enti builder.write_string(value) builder.write_u8(`"`) } - builder.write_string('>\n') - for child in node.children { - match child { - string { - builder.write_string(indent) - builder.write_string(original_indent) - builder.write_string(escape_text(child, reverse_entities: reverse_entities)) - } - XMLNode { - builder.write_string(child.pretty_str(original_indent, depth + 1, reverse_entities)) - } - XMLComment { - builder.write_string(indent) - builder.write_string(original_indent) - builder.write_string('') - } - XMLCData { - builder.write_string(indent) - builder.write_string(original_indent) - builder.write_string('') + if node.children.len > 0 { + builder.write_string('>\n') + for child in node.children { + match child { + string { + builder.write_string(indent) + builder.write_string(original_indent) + builder.write_string(escape_text(child, reverse_entities: reverse_entities)) + } + XMLNode { + builder.write_string(child.pretty_str(original_indent, depth + 1, + reverse_entities)) + } + XMLComment { + builder.write_string(indent) + builder.write_string(original_indent) + builder.write_string('') + } + XMLCData { + builder.write_string(indent) + builder.write_string(original_indent) + builder.write_string('') + } } + builder.write_u8(`\n`) } - builder.write_u8(`\n`) + builder.write_string(indent) + builder.write_string('`) + } else { + builder.write_string('/>') } - builder.write_string(indent) - builder.write_string('`) return builder.str() } @@ -73,10 +78,20 @@ fn (list []DTDListItem) pretty_str(indent string) string { for item in list { match item { DTDEntity { - builder.write_string('${indent}') + builder.write_string(indent) + builder.write_string('') } DTDElement { - builder.write_string('${indent}') + builder.write_string(indent) + builder.write_string('') } } builder.write_u8(`\n`) @@ -86,11 +101,17 @@ fn (list []DTDListItem) pretty_str(indent string) string { } fn (doctype DocumentType) pretty_str(indent string) string { + mut builder := strings.new_builder(1024) match doctype.dtd { string { content := doctype.dtd return if content.len > 0 { - '' + builder.write_string('\n') + builder.str() } else { '' } @@ -100,13 +121,11 @@ fn (doctype DocumentType) pretty_str(indent string) string { return '' } - mut builder := strings.new_builder(1024) builder.write_string('') - builder.write_u8(`\n`) + builder.write_string('>\n') return builder.str() } } @@ -117,7 +136,12 @@ fn (doctype DocumentType) pretty_str(indent string) string { pub fn (doc XMLDocument) pretty_str(indent string) string { mut document_builder := strings.new_builder(1024) - prolog := '' + document_builder.write_string('\n') + comments := if doc.comments.len > 0 { mut comments_buffer := strings.new_builder(512) for comment in doc.comments { @@ -131,11 +155,14 @@ pub fn (doc XMLDocument) pretty_str(indent string) string { '' } - document_builder.write_string(prolog) - document_builder.write_u8(`\n`) - document_builder.write_string(doc.doctype.pretty_str(indent)) - document_builder.write_u8(`\n`) - document_builder.write_string(comments) + doctype_string := doc.doctype.pretty_str(indent) + if doctype_string.len > 0 { + document_builder.write_string(doctype_string) + document_builder.write_u8(`\n`) + } + if comments.len > 0 { + document_builder.write_string(comments) + } document_builder.write_string(doc.root.pretty_str(indent, 0, doc.parsed_reverse_entities)) return document_builder.str() diff --git a/vlib/encoding/xml/encoding_test.v b/vlib/encoding/xml/encoding_test.v new file mode 100644 index 000000000..26fbe5bd4 --- /dev/null +++ b/vlib/encoding/xml/encoding_test.v @@ -0,0 +1,179 @@ +module main + +import encoding.xml + +fn test_node() { + nodes := [ + xml.XMLNode{ + name: 'test' + attributes: { + 'test:key': ' test_value ' + 'test:other': '123456' + } + children: [ + xml.XMLNode{ + name: 'child' + attributes: { + 'child:key': 'child_value' + } + }, + 'Sample text', + ] + }, + xml.XMLNode{ + name: 's' + attributes: { + 'k': 'v' + } + children: [ + 'Hello, world!', + xml.XMLNode{ + name: 'c' + attributes: { + 'k2': 'v2' + } + }, + ] + }, + xml.XMLNode{ + name: 'ext' + attributes: { + 'uri': '{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}' + 'xmlns:xcalcf': 'http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures' + } + children: [ + xml.XMLNode{ + name: 'xcalcf:calcFeatures' + children: [ + xml.XMLNode{ + name: 'xcalcf:feature' + attributes: { + 'name': 'microsoft.com:RD' + } + }, + xml.XMLNode{ + name: 'xcalcf:feature' + attributes: { + 'name': 'microsoft.com:Single' + } + }, + xml.XMLNode{ + name: 'xcalcf:feature' + attributes: { + 'name': 'microsoft.com:FV' + } + }, + xml.XMLNode{ + name: 'xcalcf:feature' + attributes: { + 'name': 'microsoft.com:CNMTM' + } + }, + xml.XMLNode{ + name: 'xcalcf:feature' + attributes: { + 'name': 'microsoft.com:LET_WF' + } + }, + xml.XMLNode{ + name: 'xcalcf:feature' + attributes: { + 'name': 'microsoft.com:LAMBDA_WF' + } + }, + xml.XMLNode{ + name: 'xcalcf:feature' + attributes: { + 'name': 'microsoft.com:ARRAYTEXT_WF' + } + }, + ] + }, + ] + }, + ] + values := [ + ' + + + Sample text + '.trim_indent(), + ' + + Hello, world! + + '.trim_indent(), + ' + + + + + + + + + + + '.trim_indent(), + ] + for i, node in nodes { + assert node.pretty_str('\t', 0, xml.default_entities_reverse) == values[i] + } +} + +fn test_doc() { + docs := [ + xml.XMLDocument{ + root: xml.XMLNode{ + name: 'test' + attributes: { + 'test:key': ' test_value ' + 'test:other': '123456' + } + children: [ + xml.XMLNode{ + name: 'child' + attributes: { + 'child:key': 'child_value' + } + }, + 'Sample text', + ] + } + }, + xml.XMLDocument{ + root: xml.XMLNode{ + name: 's' + attributes: { + 'k': 'v' + } + children: [ + 'Hello, world!', + xml.XMLNode{ + name: 'c' + attributes: { + 'k2': 'v2' + } + }, + ] + } + }, + ] + values := [ + ' + + + + Sample text + '.trim_indent(), + ' + + + Hello, world! + + '.trim_indent(), + ] + for i, doc in docs { + assert doc.pretty_str('\t') == values[i] + } +} diff --git a/vlib/encoding/xml/parser.v b/vlib/encoding/xml/parser.v index 8f5a63f6e..86dbc6594 100644 --- a/vlib/encoding/xml/parser.v +++ b/vlib/encoding/xml/parser.v @@ -541,7 +541,12 @@ fn parse_children(name string, attributes map[string]string, mut reader io.Reade return error('XML node <${name}> not closed.') } -fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode { +// parse_single_node parses a single XML node from the reader. The first character of the tag is passed +// in as the first_char parameter. +// This function is meant to assist in parsing nested nodes one at a time. Using this function as +// opposed to the recommended static functions makes it easier to parse smaller nodes in extremely large +// XML documents without running out of memory. +pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode { mut contents := strings.new_builder(xml.default_string_builder_cap) contents.write_u8(first_char) @@ -564,7 +569,7 @@ fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode { // We're not looking for children and inner text return XMLNode{ name: name - attributes: parse_attributes(tag_contents[name.len - 1..tag_contents.len].trim_space())! + attributes: parse_attributes(tag_contents[name.len..tag_contents.len - 1].trim_space())! } } diff --git a/vlib/encoding/xml/parser_test.v b/vlib/encoding/xml/parser_test.v new file mode 100644 index 000000000..3de74829b --- /dev/null +++ b/vlib/encoding/xml/parser_test.v @@ -0,0 +1,125 @@ +module xml + +const ( + sample_doc = ' + + + + Sample Text + + + + + + + + + More Sample Text + + +' + xml_elements = [ + XMLNode{ + name: 'c' + attributes: { + 'id': 'c1' + } + }, + XMLNode{ + name: 'c' + attributes: { + 'id': 'c2' + } + children: [ + 'Sample Text', + ] + }, + XMLNode{ + name: 'c' + attributes: { + 'id': 'c3' + } + }, + XMLNode{ + name: 'abc' + attributes: { + 'id': 'c4' + } + }, + XMLNode{ + name: 'xyz' + attributes: { + 'id': 'c5' + } + }, + XMLNode{ + name: 'c' + attributes: { + 'id': 'c6' + } + }, + XMLNode{ + name: 'cx' + attributes: { + 'id': 'c7' + } + }, + XMLNode{ + name: 'cd' + attributes: { + 'id': 'c8' + } + }, + XMLNode{ + name: 'child' + attributes: { + 'id': 'c9' + } + children: [ + 'More Sample Text', + ] + }, + XMLNode{ + name: 'cz' + attributes: { + 'id': 'c10' + } + }, + ] +) + +fn test_single_element_parsing() ! { + mut reader := FullBufferReader{ + contents: xml.sample_doc.bytes() + } + // Skip the "" tag + mut skip := []u8{len: 6} + reader.read(mut skip)! + + mut local_buf := [u8(0)] + mut ch := next_char(mut reader, mut local_buf)! + + mut count := 0 + + for count < xml.xml_elements.len { + match ch { + `<` { + next_ch := next_char(mut reader, mut local_buf)! + match next_ch { + `/` {} + else { + parsed_element := parse_single_node(next_ch, mut reader)! + assert xml.xml_elements[count] == parsed_element + count++ + } + } + ch = next_char(mut reader, mut local_buf)! + } + else { + for ch != `<` { + ch = next_char(mut reader, mut local_buf)! + } + } + } + } +} diff --git a/vlib/encoding/xml/query.v b/vlib/encoding/xml/query.v index 9d310aff7..22a6a7992 100644 --- a/vlib/encoding/xml/query.v +++ b/vlib/encoding/xml/query.v @@ -1,6 +1,8 @@ module xml -fn (node XMLNode) get_element_by_id(id string) ?XMLNode { +// get_element_by_id returns the first element with the given id, or none if no +// such element exists in the subtree rooted at this node. +pub fn (node XMLNode) get_element_by_id(id string) ?XMLNode { // Is this the node we're looking for? if attribute_id := node.attributes['id'] { if attribute_id == id { @@ -27,7 +29,9 @@ fn (node XMLNode) get_element_by_id(id string) ?XMLNode { return none } -fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode { +// get_elements_by_tag returns all elements with the given tag name in the subtree +// rooted at this node. If there are no such elements, an empty array is returned. +pub fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode { mut result := []XMLNode{} if node.name == tag { @@ -48,13 +52,45 @@ fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode { return result } +// get_elements_by_attribute returns all elements with the given attribute-value pair in +// the subtree rooted at this node. If there are no such elements, an empty array is returned. +pub fn (node XMLNode) get_elements_by_attribute(attribute string, value string) []XMLNode { + mut result := []XMLNode{} + + if attribute_value := node.attributes[attribute] { + if attribute_value == value { + result << node + } + } + + if node.children.len == 0 { + return result + } + + // Recurse into children + for child in node.children { + if child is XMLNode { + result << child.get_elements_by_attribute(attribute, value) + } + } + + return result +} + // get_element_by_id returns the first element with the given id, or none if no // such element exists. pub fn (doc XMLDocument) get_element_by_id(id string) ?XMLNode { return doc.root.get_element_by_id(id) } +// get_elements_by_attribute returns all elements with the given attribute-value pair. +// If there are no such elements, an empty array is returned. +pub fn (doc XMLDocument) get_elements_by_attribute(attribute string, value string) []XMLNode { + return doc.root.get_elements_by_attribute(attribute, value) +} + // get_elements_by_tag returns all elements with the given tag name. +// If there are no such elements, an empty array is returned. pub fn (doc XMLDocument) get_elements_by_tag(tag string) []XMLNode { return doc.root.get_elements_by_tag(tag) } diff --git a/vlib/encoding/xml/query_test.v b/vlib/encoding/xml/query_test.v new file mode 100644 index 000000000..a1e84973b --- /dev/null +++ b/vlib/encoding/xml/query_test.v @@ -0,0 +1,52 @@ +module main + +import encoding.xml + +const ( + sample_document = ' + + + + Text1 + Text2 + + Text3 + Text4 + Text5 + + Text6 + + Text7 + + Text8 + Text9 + +' +) + +fn test_querying() ! { + doc := xml.XMLDocument.from_string(sample_document)! + + assert doc.root.name == 'root' + assert doc.root.children.len == 3 + + middle_tag := doc.get_element_by_id('middle-tag')? + assert middle_tag.name == 'b' + assert middle_tag.attributes['attr'] == 'value2' + assert middle_tag.children.len == 4 + + innermost := middle_tag.get_element_by_id('innermost')? + assert innermost.name == 'f' + assert innermost.attributes['attr'] == 'value6' + + for count in 1 .. 13 { + assert doc.get_elements_by_attribute('attr', 'value${count}').len == 1 + } + + i_tags := doc.get_elements_by_tag('i') + assert i_tags.len == 1 + assert i_tags[0].name == 'i' + assert i_tags[0].attributes['attr'] == 'value9' + assert i_tags[0].children.len == 1 + assert i_tags[0].children[0] as string == 'Text6' +} diff --git a/vlib/encoding/xml/types.v b/vlib/encoding/xml/types.v index f61b7c953..7c013971f 100644 --- a/vlib/encoding/xml/types.v +++ b/vlib/encoding/xml/types.v @@ -4,12 +4,12 @@ pub type XMLNodeContents = XMLCData | XMLComment | XMLNode | string pub struct XMLCData { pub: - text string [required] + text string @[required] } pub struct XMLComment { pub: - text string [required] + text string @[required] } // XMLNode represents a single XML node. It contains the node name, @@ -17,7 +17,7 @@ pub: // other XML nodes, CDATA, plain text, or comments. pub struct XMLNode { pub: - name string [required] + name string @[required] attributes map[string]string children []XMLNodeContents } @@ -31,19 +31,19 @@ pub: pub struct XMLDocument { Prolog pub: - root XMLNode [required] + root XMLNode @[required] } pub type DTDListItem = DTDElement | DTDEntity pub struct DTDEntity { - name string [required] - value string [required] + name string @[required] + value string @[required] } pub struct DTDElement { - name string [required] - definition []string [required] + name string @[required] + definition []string @[required] } pub struct DocumentTypeDefinition { @@ -52,7 +52,7 @@ pub struct DocumentTypeDefinition { } pub struct DocumentType { - name string [required] + name string @[required] dtd DTDInfo } -- 2.39.5