From c9429f33313165110ef84234d8ec477646eec7b2 Mon Sep 17 00:00:00 2001
From: Subhomoy Haldar <hungrybluedev@gmail.com>
Date: Thu, 16 Nov 2023 18:13:36 +0000
Subject: [PATCH] encoding.xml: make functions public, add documentation,
 tests, fix attribute parsing for self-closing tags  (#19901)

---
 vlib/encoding/xml/README.md       |   4 +
 vlib/encoding/xml/encoding.v      | 109 +++++++++++-------
 vlib/encoding/xml/encoding_test.v | 179 ++++++++++++++++++++++++++++++
 vlib/encoding/xml/parser.v        |   9 +-
 vlib/encoding/xml/parser_test.v   | 125 +++++++++++++++++++++
 vlib/encoding/xml/query.v         |  40 ++++++-
 vlib/encoding/xml/query_test.v    |  52 +++++++++
 vlib/encoding/xml/types.v         |  18 +--
 8 files changed, 482 insertions(+), 54 deletions(-)
 create mode 100644 vlib/encoding/xml/encoding_test.v
 create mode 100644 vlib/encoding/xml/parser_test.v
 create mode 100644 vlib/encoding/xml/query_test.v

diff --git a/vlib/encoding/xml/README.md b/vlib/encoding/xml/README.md
index 161baf1af..7b28ebdbe 100644
--- a/vlib/encoding/xml/README.md
+++ b/vlib/encoding/xml/README.md
@@ -7,6 +7,10 @@ Note that this is not a streaming XML parser. It reads the entire document into
 memory and then parses it. This is not a problem for small documents, but it
 might be a problem for extremely large documents (several hundred megabytes or more).
 
+The public function `parse_single_node` can be used to parse a single node from
+an implementation of `io.Reader`, which can help parse large XML documents on an
+element-by-element basis. Sample usage is provided in the `parser_test.v` file.
+
 ## Usage
 
 ### Parsing XML Files
diff --git a/vlib/encoding/xml/encoding.v b/vlib/encoding/xml/encoding.v
index 2ef924af8..159287d88 100644
--- a/vlib/encoding/xml/encoding.v
+++ b/vlib/encoding/xml/encoding.v
@@ -26,38 +26,43 @@ pub fn (node XMLNode) pretty_str(original_indent string, depth int, reverse_enti
 		builder.write_string(value)
 		builder.write_u8(`"`)
 	}
-	builder.write_string('>\n')
-	for child in node.children {
-		match child {
-			string {
-				builder.write_string(indent)
-				builder.write_string(original_indent)
-				builder.write_string(escape_text(child, reverse_entities: reverse_entities))
-			}
-			XMLNode {
-				builder.write_string(child.pretty_str(original_indent, depth + 1, reverse_entities))
-			}
-			XMLComment {
-				builder.write_string(indent)
-				builder.write_string(original_indent)
-				builder.write_string('<!--')
-				builder.write_string(child.text)
-				builder.write_string('-->')
-			}
-			XMLCData {
-				builder.write_string(indent)
-				builder.write_string(original_indent)
-				builder.write_string('<![CDATA[')
-				builder.write_string(child.text)
-				builder.write_string(']]>')
+	if node.children.len > 0 {
+		builder.write_string('>\n')
+		for child in node.children {
+			match child {
+				string {
+					builder.write_string(indent)
+					builder.write_string(original_indent)
+					builder.write_string(escape_text(child, reverse_entities: reverse_entities))
+				}
+				XMLNode {
+					builder.write_string(child.pretty_str(original_indent, depth + 1,
+						reverse_entities))
+				}
+				XMLComment {
+					builder.write_string(indent)
+					builder.write_string(original_indent)
+					builder.write_string('<!--')
+					builder.write_string(child.text)
+					builder.write_string('-->')
+				}
+				XMLCData {
+					builder.write_string(indent)
+					builder.write_string(original_indent)
+					builder.write_string('<![CDATA[')
+					builder.write_string(child.text)
+					builder.write_string(']]>')
+				}
 			}
+			builder.write_u8(`\n`)
 		}
-		builder.write_u8(`\n`)
+		builder.write_string(indent)
+		builder.write_string('</')
+		builder.write_string(node.name)
+		builder.write_u8(`>`)
+	} else {
+		builder.write_string('/>')
 	}
-	builder.write_string(indent)
-	builder.write_string('</')
-	builder.write_string(node.name)
-	builder.write_u8(`>`)
 	return builder.str()
 }
 
@@ -73,10 +78,20 @@ fn (list []DTDListItem) pretty_str(indent string) string {
 	for item in list {
 		match item {
 			DTDEntity {
-				builder.write_string('${indent}<!ENTITY ${item.name} "${item.value}">')
+				builder.write_string(indent)
+				builder.write_string('<!ENTITY ')
+				builder.write_string(item.name)
+				builder.write_string(' "')
+				builder.write_string(item.value)
+				builder.write_string('">')
 			}
 			DTDElement {
-				builder.write_string('${indent}<!ELEMENT ${item.name} ${item.definition}>')
+				builder.write_string(indent)
+				builder.write_string('<!ELEMENT ')
+				builder.write_string(item.name)
+				builder.write_string(' [')
+				builder.write_string(item.definition.join(', '))
+				builder.write_string(']>')
 			}
 		}
 		builder.write_u8(`\n`)
@@ -86,11 +101,17 @@ fn (list []DTDListItem) pretty_str(indent string) string {
 }
 
 fn (doctype DocumentType) pretty_str(indent string) string {
+	mut builder := strings.new_builder(1024)
 	match doctype.dtd {
 		string {
 			content := doctype.dtd
 			return if content.len > 0 {
-				'<!DOCTYPE ${doctype.name} SYSTEM "${content}">'
+				builder.write_string('<!DOCTYPE ')
+				builder.write_string(doctype.name)
+				builder.write_string(' SYSTEM ')
+				builder.write_string(content)
+				builder.write_string('>\n')
+				builder.str()
 			} else {
 				''
 			}
@@ -100,13 +121,11 @@ fn (doctype DocumentType) pretty_str(indent string) string {
 				return ''
 			}
 
-			mut builder := strings.new_builder(1024)
 			builder.write_string('<!DOCTYPE ')
 			builder.write_string(doctype.name)
 			builder.write_string(' ')
 			builder.write_string(doctype.dtd.list.pretty_str(indent))
-			builder.write_string('>')
-			builder.write_u8(`\n`)
+			builder.write_string('>\n')
 			return builder.str()
 		}
 	}
@@ -117,7 +136,12 @@ fn (doctype DocumentType) pretty_str(indent string) string {
 pub fn (doc XMLDocument) pretty_str(indent string) string {
 	mut document_builder := strings.new_builder(1024)
 
-	prolog := '<?xml version="${doc.version}" encoding="${doc.encoding}"?>'
+	document_builder.write_string('<?xml version="')
+	document_builder.write_string(doc.version)
+	document_builder.write_string('" encoding="')
+	document_builder.write_string(doc.encoding)
+	document_builder.write_string('"?>\n')
+
 	comments := if doc.comments.len > 0 {
 		mut comments_buffer := strings.new_builder(512)
 		for comment in doc.comments {
@@ -131,11 +155,14 @@ pub fn (doc XMLDocument) pretty_str(indent string) string {
 		''
 	}
 
-	document_builder.write_string(prolog)
-	document_builder.write_u8(`\n`)
-	document_builder.write_string(doc.doctype.pretty_str(indent))
-	document_builder.write_u8(`\n`)
-	document_builder.write_string(comments)
+	doctype_string := doc.doctype.pretty_str(indent)
+	if doctype_string.len > 0 {
+		document_builder.write_string(doctype_string)
+		document_builder.write_u8(`\n`)
+	}
+	if comments.len > 0 {
+		document_builder.write_string(comments)
+	}
 	document_builder.write_string(doc.root.pretty_str(indent, 0, doc.parsed_reverse_entities))
 
 	return document_builder.str()
diff --git a/vlib/encoding/xml/encoding_test.v b/vlib/encoding/xml/encoding_test.v
new file mode 100644
index 000000000..26fbe5bd4
--- /dev/null
+++ b/vlib/encoding/xml/encoding_test.v
@@ -0,0 +1,179 @@
+module main
+
+import encoding.xml
+
+fn test_node() {
+	nodes := [
+		xml.XMLNode{
+			name: 'test'
+			attributes: {
+				'test:key':   ' test_value '
+				'test:other': '123456'
+			}
+			children: [
+				xml.XMLNode{
+					name: 'child'
+					attributes: {
+						'child:key': 'child_value'
+					}
+				},
+				'Sample text',
+			]
+		},
+		xml.XMLNode{
+			name: 's'
+			attributes: {
+				'k': 'v'
+			}
+			children: [
+				'Hello, world!',
+				xml.XMLNode{
+					name: 'c'
+					attributes: {
+						'k2': 'v2'
+					}
+				},
+			]
+		},
+		xml.XMLNode{
+			name: 'ext'
+			attributes: {
+				'uri':          '{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}'
+				'xmlns:xcalcf': 'http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures'
+			}
+			children: [
+				xml.XMLNode{
+					name: 'xcalcf:calcFeatures'
+					children: [
+						xml.XMLNode{
+							name: 'xcalcf:feature'
+							attributes: {
+								'name': 'microsoft.com:RD'
+							}
+						},
+						xml.XMLNode{
+							name: 'xcalcf:feature'
+							attributes: {
+								'name': 'microsoft.com:Single'
+							}
+						},
+						xml.XMLNode{
+							name: 'xcalcf:feature'
+							attributes: {
+								'name': 'microsoft.com:FV'
+							}
+						},
+						xml.XMLNode{
+							name: 'xcalcf:feature'
+							attributes: {
+								'name': 'microsoft.com:CNMTM'
+							}
+						},
+						xml.XMLNode{
+							name: 'xcalcf:feature'
+							attributes: {
+								'name': 'microsoft.com:LET_WF'
+							}
+						},
+						xml.XMLNode{
+							name: 'xcalcf:feature'
+							attributes: {
+								'name': 'microsoft.com:LAMBDA_WF'
+							}
+						},
+						xml.XMLNode{
+							name: 'xcalcf:feature'
+							attributes: {
+								'name': 'microsoft.com:ARRAYTEXT_WF'
+							}
+						},
+					]
+				},
+			]
+		},
+	]
+	values := [
+		'
+		<test test:key=" test_value " test:other="123456">
+			<child child:key="child_value"/>
+			Sample text
+		</test>'.trim_indent(),
+		'
+		<s k="v">
+			Hello, world!
+			<c k2="v2"/>
+		</s>'.trim_indent(),
+		'
+		<ext uri="{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}" xmlns:xcalcf="http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures">
+			<xcalcf:calcFeatures>
+				<xcalcf:feature name="microsoft.com:RD"/>
+				<xcalcf:feature name="microsoft.com:Single"/>
+				<xcalcf:feature name="microsoft.com:FV"/>
+				<xcalcf:feature name="microsoft.com:CNMTM"/>
+				<xcalcf:feature name="microsoft.com:LET_WF"/>
+				<xcalcf:feature name="microsoft.com:LAMBDA_WF"/>
+				<xcalcf:feature name="microsoft.com:ARRAYTEXT_WF"/>
+			</xcalcf:calcFeatures>
+		</ext>'.trim_indent(),
+	]
+	for i, node in nodes {
+		assert node.pretty_str('\t', 0, xml.default_entities_reverse) == values[i]
+	}
+}
+
+fn test_doc() {
+	docs := [
+		xml.XMLDocument{
+			root: xml.XMLNode{
+				name: 'test'
+				attributes: {
+					'test:key':   ' test_value '
+					'test:other': '123456'
+				}
+				children: [
+					xml.XMLNode{
+						name: 'child'
+						attributes: {
+							'child:key': 'child_value'
+						}
+					},
+					'Sample text',
+				]
+			}
+		},
+		xml.XMLDocument{
+			root: xml.XMLNode{
+				name: 's'
+				attributes: {
+					'k': 'v'
+				}
+				children: [
+					'Hello, world!',
+					xml.XMLNode{
+						name: 'c'
+						attributes: {
+							'k2': 'v2'
+						}
+					},
+				]
+			}
+		},
+	]
+	values := [
+		'
+		<?xml version="1.0" encoding="UTF-8"?>
+		<test test:key=" test_value " test:other="123456">
+			<child child:key="child_value"/>
+			Sample text
+		</test>'.trim_indent(),
+		'
+		<?xml version="1.0" encoding="UTF-8"?>
+		<s k="v">
+			Hello, world!
+			<c k2="v2"/>
+		</s>'.trim_indent(),
+	]
+	for i, doc in docs {
+		assert doc.pretty_str('\t') == values[i]
+	}
+}
diff --git a/vlib/encoding/xml/parser.v b/vlib/encoding/xml/parser.v
index 8f5a63f6e..86dbc6594 100644
--- a/vlib/encoding/xml/parser.v
+++ b/vlib/encoding/xml/parser.v
@@ -541,7 +541,12 @@ fn parse_children(name string, attributes map[string]string, mut reader io.Reade
 	return error('XML node <${name}> not closed.')
 }
 
-fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
+// parse_single_node parses a single XML node from the reader. The caller must already have
+// consumed the opening `<` of the tag; first_char is the character that immediately follows it.
+// This function is meant to assist in parsing nested nodes one at a time. Using this function
+// instead of the recommended static functions makes it easier to parse smaller nodes in extremely
+// large XML documents without running out of memory.
+pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
 	mut contents := strings.new_builder(xml.default_string_builder_cap)
 	contents.write_u8(first_char)
 
@@ -564,7 +569,7 @@ fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
 		// We're not looking for children and inner text
 		return XMLNode{
 			name: name
-			attributes: parse_attributes(tag_contents[name.len - 1..tag_contents.len].trim_space())!
+			attributes: parse_attributes(tag_contents[name.len..tag_contents.len - 1].trim_space())!
 		}
 	}
 
diff --git a/vlib/encoding/xml/parser_test.v b/vlib/encoding/xml/parser_test.v
new file mode 100644
index 000000000..3de74829b
--- /dev/null
+++ b/vlib/encoding/xml/parser_test.v
@@ -0,0 +1,125 @@
+module xml
+
+const (
+	sample_doc = '
+<root>
+	<c id="c1"/>
+	<c id="c2">
+		Sample Text
+	</c>
+	<c id="c3"/>
+	<abc id="c4"/>
+	<xyz id="c5"/>
+	<c id="c6"/>
+	<cx id="c7"/>
+	<cd id="c8"/>
+	<child id="c9">
+		More Sample Text
+	</child>
+	<cz id="c10"/>
+</root>'
+	xml_elements = [
+		XMLNode{
+			name: 'c'
+			attributes: {
+				'id': 'c1'
+			}
+		},
+		XMLNode{
+			name: 'c'
+			attributes: {
+				'id': 'c2'
+			}
+			children: [
+				'Sample Text',
+			]
+		},
+		XMLNode{
+			name: 'c'
+			attributes: {
+				'id': 'c3'
+			}
+		},
+		XMLNode{
+			name: 'abc'
+			attributes: {
+				'id': 'c4'
+			}
+		},
+		XMLNode{
+			name: 'xyz'
+			attributes: {
+				'id': 'c5'
+			}
+		},
+		XMLNode{
+			name: 'c'
+			attributes: {
+				'id': 'c6'
+			}
+		},
+		XMLNode{
+			name: 'cx'
+			attributes: {
+				'id': 'c7'
+			}
+		},
+		XMLNode{
+			name: 'cd'
+			attributes: {
+				'id': 'c8'
+			}
+		},
+		XMLNode{
+			name: 'child'
+			attributes: {
+				'id': 'c9'
+			}
+			children: [
+				'More Sample Text',
+			]
+		},
+		XMLNode{
+			name: 'cz'
+			attributes: {
+				'id': 'c10'
+			}
+		},
+	]
+)
+
+fn test_single_element_parsing() ! {
+	mut reader := FullBufferReader{
+		contents: xml.sample_doc.bytes()
+	}
+	// Skip the opening root tag: sample_doc starts with a newline, so these 6 bytes are that newline plus "<root" (assuming read fills skip); the ">" left behind is passed over by the scan below
+	mut skip := []u8{len: 6}
+	reader.read(mut skip)!
+
+	mut local_buf := [u8(0)]
+	mut ch := next_char(mut reader, mut local_buf)!
+
+	mut count := 0
+	// Scan forward until every expected element has been parsed and compared, in document order
+	for count < xml.xml_elements.len {
+		match ch {
+			`<` {
+				next_ch := next_char(mut reader, mut local_buf)!
+				match next_ch {
+					`/` {}
+					else {
+						parsed_element := parse_single_node(next_ch, mut reader)!
+						assert xml.xml_elements[count] == parsed_element
+						count++
+					}
+				}
+				ch = next_char(mut reader, mut local_buf)!
+			}
+			else {
+				for ch != `<` {
+					ch = next_char(mut reader, mut local_buf)!
+				}
+			}
+		}
+	}
+}
diff --git a/vlib/encoding/xml/query.v b/vlib/encoding/xml/query.v
index 9d310aff7..22a6a7992 100644
--- a/vlib/encoding/xml/query.v
+++ b/vlib/encoding/xml/query.v
@@ -1,6 +1,8 @@
 module xml
 
-fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
+// get_element_by_id returns the first element with the given id, or none if no
+// such element exists in the subtree rooted at this node.
+pub fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
 	// Is this the node we're looking for?
 	if attribute_id := node.attributes['id'] {
 		if attribute_id == id {
@@ -27,7 +29,9 @@ fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
 	return none
 }
 
-fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
+// get_elements_by_tag returns all elements with the given tag name in the subtree
+// rooted at this node. If there are no such elements, an empty array is returned.
+pub fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
 	mut result := []XMLNode{}
 
 	if node.name == tag {
@@ -48,13 +52,45 @@ fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
 	return result
 }
 
+// get_elements_by_attribute collects every element in the subtree rooted at this node
+// (the node itself included) whose attribute named `attribute` is present and equal to
+// `value`. If there are no such elements, an empty array is returned.
+pub fn (node XMLNode) get_elements_by_attribute(attribute string, value string) []XMLNode {
+	mut matches := []XMLNode{}
+
+	// Test the node itself first, so that it comes before its descendants in the
+	// returned array.
+	if current := node.attributes[attribute] {
+		if current == value {
+			matches << node
+		}
+	}
+
+	// Only element children are searched: text, comment and CDATA children carry no
+	// attributes. An empty children array simply skips the loop.
+	for entry in node.children {
+		if entry is XMLNode {
+			matches << entry.get_elements_by_attribute(attribute, value)
+		}
+	}
+
+	return matches
+}
+
 // get_element_by_id returns the first element with the given id, or none if no
 // such element exists.
 pub fn (doc XMLDocument) get_element_by_id(id string) ?XMLNode {
 	return doc.root.get_element_by_id(id)
 }
 
+// get_elements_by_attribute returns all elements of the document, searched from the root
+// element down, whose `attribute` equals `value`. An empty array means nothing matched.
+pub fn (doc XMLDocument) get_elements_by_attribute(attribute string, value string) []XMLNode {
+	return doc.root.get_elements_by_attribute(attribute, value)
+}
+
 // get_elements_by_tag returns all elements with the given tag name.
+// If there are no such elements, an empty array is returned.
 pub fn (doc XMLDocument) get_elements_by_tag(tag string) []XMLNode {
 	return doc.root.get_elements_by_tag(tag)
 }
diff --git a/vlib/encoding/xml/query_test.v b/vlib/encoding/xml/query_test.v
new file mode 100644
index 000000000..a1e84973b
--- /dev/null
+++ b/vlib/encoding/xml/query_test.v
@@ -0,0 +1,52 @@
+module main
+
+import encoding.xml
+
+const (
+	sample_document = '
+<root>
+	<a attr="value1">
+		<b id="middle-tag" attr="value2">
+			<c attr="value3">Text1</c>
+			<d attr="value4">Text2</d>
+			<e attr="value5">
+				<f id="innermost" attr="value6">Text3</f>
+				<g attr="value7">Text4</g>
+				<h attr="value8">Text5</h>
+			</e>
+			<i attr="value9">Text6</i>
+		</b>
+		<j attr="value10">Text7</j>
+	</a>
+	<k attr="value11">Text8</k>
+	<l attr="value12">Text9</l>
+</root>
+'
+)
+
+fn test_querying() ! {
+	doc := xml.XMLDocument.from_string(sample_document)!
+	// The sample's <root> has three child elements: <a>, <k> and <l>.
+	assert doc.root.name == 'root'
+	assert doc.root.children.len == 3
+	// Lookup by id on the document: <b id="middle-tag"> has the children <c>, <d>, <e> and <i>.
+	middle_tag := doc.get_element_by_id('middle-tag')?
+	assert middle_tag.name == 'b'
+	assert middle_tag.attributes['attr'] == 'value2'
+	assert middle_tag.children.len == 4
+	// Lookup by id on a node: <f id="innermost"> is nested inside <b>, under <e>.
+	innermost := middle_tag.get_element_by_id('innermost')?
+	assert innermost.name == 'f'
+	assert innermost.attributes['attr'] == 'value6'
+	// Each of value1 through value12 is used by exactly one element in the sample.
+	for count in 1 .. 13 {
+		assert doc.get_elements_by_attribute('attr', 'value${count}').len == 1
+	}
+	// Lookup by tag name: the sample has a single <i> element, whose only child is the text 'Text6'.
+	i_tags := doc.get_elements_by_tag('i')
+	assert i_tags.len == 1
+	assert i_tags[0].name == 'i'
+	assert i_tags[0].attributes['attr'] == 'value9'
+	assert i_tags[0].children.len == 1
+	assert i_tags[0].children[0] as string == 'Text6'
+}
diff --git a/vlib/encoding/xml/types.v b/vlib/encoding/xml/types.v
index f61b7c953..7c013971f 100644
--- a/vlib/encoding/xml/types.v
+++ b/vlib/encoding/xml/types.v
@@ -4,12 +4,12 @@ pub type XMLNodeContents = XMLCData | XMLComment | XMLNode | string
 
 pub struct XMLCData {
 pub:
-	text string [required]
+	text string @[required]
 }
 
 pub struct XMLComment {
 pub:
-	text string [required]
+	text string @[required]
 }
 
 // XMLNode represents a single XML node. It contains the node name,
@@ -17,7 +17,7 @@ pub:
 // other XML nodes, CDATA, plain text, or comments.
 pub struct XMLNode {
 pub:
-	name       string            [required]
+	name       string            @[required]
 	attributes map[string]string
 	children   []XMLNodeContents
 }
@@ -31,19 +31,19 @@ pub:
 pub struct XMLDocument {
 	Prolog
 pub:
-	root XMLNode [required]
+	root XMLNode @[required]
 }
 
 pub type DTDListItem = DTDElement | DTDEntity
 
 pub struct DTDEntity {
-	name  string [required]
-	value string [required]
+	name  string @[required]
+	value string @[required]
 }
 
 pub struct DTDElement {
-	name       string   [required]
-	definition []string [required]
+	name       string   @[required]
+	definition []string @[required]
 }
 
 pub struct DocumentTypeDefinition {
@@ -52,7 +52,7 @@ pub struct DocumentTypeDefinition {
 }
 
 pub struct DocumentType {
-	name string  [required]
+	name string  @[required]
 	dtd  DTDInfo
 }
 
-- 
2.39.5