From e158655f4f28a73358b9e24e82963a193d9fc30e Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Tue, 21 Apr 2026 16:46:11 +0300 Subject: [PATCH] net: fix net.html parser ignoring inner tags in tag.content (fixes #14138) --- vlib/net/html/README.md | 13 +++++++++++++ vlib/net/html/dom.v | 19 +++++++++++++++++++ vlib/net/html/html_test.v | 12 +++++++++++- vlib/net/html/tag.v | 23 +++++++++++++++++++++-- 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/vlib/net/html/README.md b/vlib/net/html/README.md index a4f503f44..8aa6c74ce 100644 --- a/vlib/net/html/README.md +++ b/vlib/net/html/README.md @@ -15,4 +15,17 @@ fn main() { } ``` +For tags with nested markup, `tag.content` preserves the inner HTML: + +```v +import net.html + +fn main() { + doc := html.parse('

<p>before <span>in between</span> after</p>

') + tag := doc.get_tags(name: 'p')[0] + println(tag.content) // before <span>in between</span> after + println(tag.text()) // before in between after +} +``` + More examples found on [`parser_test.v`](parser_test.v) and [`html_test.v`](html_test.v) diff --git a/vlib/net/html/dom.v b/vlib/net/html/dom.v index 435d3f323..028dd3583 100644 --- a/vlib/net/html/dom.v +++ b/vlib/net/html/dom.v @@ -1,6 +1,7 @@ module html import os +import strings // The W3C Document Object Model (DOM) is a platform and language-neutral // interface that allows programs and scripts to dynamically access and @@ -172,6 +173,24 @@ fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) { } } // println(tag_list[root_index]) for debug purposes dom.root = tag_list[0] + mut root := dom.root + dom.normalize_tag_content(mut root) +} + +fn (mut dom DocumentObjectModel) normalize_tag_content(mut tag Tag) { + tag.text_content = tag.content + tag.content_is_inner_html = true + if tag.children.len == 0 { + return + } + mut inner_html := strings.new_builder(tag.content.len + tag.children.len * 32) + inner_html.write_string(tag.text_content) + for idx := 0; idx < tag.children.len; idx++ { + mut child := tag.children[idx] + dom.normalize_tag_content(mut child) + inner_html.write_string(child.str()) + } + tag.content = inner_html.str() } // get_root returns the root of the document. diff --git a/vlib/net/html/html_test.v b/vlib/net/html/html_test.v index 2f30fe31d..f66927ac3 100644 --- a/vlib/net/html/html_test.v +++ b/vlib/net/html/html_test.v @@ -26,7 +26,17 @@ fn test_parse_inline_tags() { assert p_tags.len == 1 p_tag := p_tags[0] - assert p_tag.str() == '

<p>before <span>in between</span> after</p>

' + assert p_tag.content == 'before <span>in between</span> after' + assert p_tag.str() == '

<p>before <span>in between</span> after</p>

' assert p_tag.text() == 'before in between after' } + +fn test_parse_content_with_nested_tags() { + doc := parse('womAll') + td_tag := doc.get_tags(name: 'td')[0] + a_tag := td_tag.get_tags('a')[0] + assert td_tag.content == 'womAll' + assert a_tag.content == 'womAll' + assert td_tag.text() == 'womAll' +} diff --git a/vlib/net/html/tag.v b/vlib/net/html/tag.v index 7059b4f5e..df52e85fe 100644 --- a/vlib/net/html/tag.v +++ b/vlib/net/html/tag.v @@ -22,6 +22,9 @@ pub mut: position_in_parent int closed bool close_type CloseTagType = .in_name +mut: + text_content string + content_is_inner_html bool } fn (mut tag Tag) add_parent(t &Tag, position int) { @@ -36,18 +39,34 @@ fn (mut tag Tag) add_child(t &Tag) int { // text returns the text contents of the tag. pub fn (tag &Tag) text() string { + if tag.name == 'text' { + return tag.leading_text() + } if tag.name.len >= 2 && tag.name[..2] == 'br' { return '\n' } mut text_str := strings.new_builder(200) - text_str.write_string(tag.content) + text_str.write_string(tag.leading_text()) for child in tag.children { text_str.write_string(child.text()) } return text_str.str() } +fn (tag &Tag) leading_text() string { + if tag.text_content.len > 0 { + return tag.text_content + } + if !tag.content_is_inner_html || tag.children.len == 0 || tag.name == 'text' { + return tag.content + } + return '' +} + pub fn (tag &Tag) str() string { + if tag.name == 'text' { + return tag.leading_text() + } mut html_str := strings.new_builder(200) html_str.write_string('<${tag.name}') for key, value in tag.attributes { @@ -58,7 +77,7 @@ pub fn (tag &Tag) str() string { } html_str.write_string(if tag.closed && tag.close_type == .in_name { '/>' } else { '>' }) html_str.write_string(tag.content) - if tag.children.len > 0 { + if !tag.content_is_inner_html && tag.children.len > 0 { for child in tag.children { html_str.write_string(child.str()) } -- 2.39.5