From 46c3d7f13d605a08603985fe4e6f82f2a8771775 Mon Sep 17 00:00:00 2001 From: JalonSolov Date: Sat, 9 May 2026 19:10:40 -0400 Subject: [PATCH] x.markdown: UTF-8 handling and plaintext render (#27115) --- vlib/x/markdown/README.md | 13 +- vlib/x/markdown/inline.v | 66 ++++++-- vlib/x/markdown/markdown.v | 16 ++ vlib/x/markdown/markdown_test.v | 36 ++++ vlib/x/markdown/parser.v | 4 +- vlib/x/markdown/plaintext.v | 287 ++++++++++++++++++++++++++++++++ vlib/x/markdown/util.v | 26 +-- 7 files changed, 414 insertions(+), 34 deletions(-) create mode 100644 vlib/x/markdown/plaintext.v diff --git a/vlib/x/markdown/README.md b/vlib/x/markdown/README.md index e27145e7a..159c8fea9 100644 --- a/vlib/x/markdown/README.md +++ b/vlib/x/markdown/README.md @@ -2,9 +2,12 @@ // Use of this source code is governed by an MIT license // that can be found in the LICENSE file. -# vlib/x/markdown - Markdown Parser and HTML Renderer +# vlib/x/markdown - Markdown Parser and Renderers -A CommonMark-compliant Markdown parser and HTML renderer for V, with support for GitHub Flavored Markdown (GFM) extensions. Designed for feature parity with [github.com/yuin/goldmark](https://github.com/yuin/goldmark). +A CommonMark-compliant Markdown parser for V, with HTML and console-friendly +plain-text renderers, plus support for GitHub Flavored Markdown (GFM) +extensions. Designed for feature parity with +[github.com/yuin/goldmark](https://github.com/yuin/goldmark). ## Features @@ -39,8 +42,9 @@ import x.markdown fn main() { html := markdown.to_html('# Hello\n\nWorld') + text := markdown.to_plaintext('# Hello\n\nWorld') println(html) - // Output:

<h1>Hello</h1>\n<p>World</p>
\n + println(text) } ``` @@ -108,6 +112,8 @@ fn main() { ### Top-Level Functions - `to_html(src: string) string` - Convert Markdown to HTML with default settings - `to_html(src: string, opts: Options) string` - Convert with custom options +- `to_plaintext(src: string) string` - Convert Markdown to UTF-8 plain text +- `to_plaintext(src: string, opts: Options) string` - Convert plain text with options - `parse_inline(src: string, opts: Options, ref_map: map) []&Node` - Parse inline content only ### Main Structs @@ -118,6 +124,7 @@ to share link references. Methods: - `convert(src: string) string` - Parse and render to HTML in one call +- `convert_plaintext(src: string) string` - Parse and render to plain text - `parse(src: string) &Node` - Parse to AST only #### `Options` (@[params]) diff --git a/vlib/x/markdown/inline.v b/vlib/x/markdown/inline.v index 1b3ad44a9..7a79e1d53 100644 --- a/vlib/x/markdown/inline.v +++ b/vlib/x/markdown/inline.v @@ -35,6 +35,30 @@ mut: active bool = true } +@[inline] +fn rune_at_or_space(s string, pos int) rune { + if pos < 0 || pos >= s.len { + return ` ` + } + char_len := utf8_char_len(s[pos]) + if char_len <= 0 || pos + char_len > s.len { + return rune(s[pos]) + } + return s[pos..pos + char_len].runes()[0] +} + +@[inline] +fn prev_rune_or_space(s string, pos int) rune { + if pos <= 0 { + return ` ` + } + mut i := pos - 1 + for i > 0 && (s[i] & u8(0xC0)) == u8(0x80) { + i-- + } + return rune_at_or_space(s, i) +} + // parse parses the full inline source and returns a node slice. 
fn (mut p InlineParser) parse() []&Node { mut result := []&Node{} @@ -47,8 +71,8 @@ fn (mut p InlineParser) parse() []&Node { p.pos++ } run := p.src[start..p.pos] - before := if start > 0 { p.src[start - 1] } else { u8(` `) } - after := if p.pos < p.src.len { p.src[p.pos] } else { u8(` `) } + before := prev_rune_or_space(p.src, start) + after := rune_at_or_space(p.src, p.pos) can_open := can_open_emphasis(ch, before, after) can_close := can_close_emphasis(ch, before, after) result << text_node(run) @@ -248,13 +272,15 @@ fn (mut p InlineParser) parse_one() []&Node { return [p.parse_newline()] } else { + r := rune_at_or_space(p.src, p.pos) + step := utf8_char_len(p.src[p.pos]) if p.opts.linkify { if node := p.try_linkify() { return [node] } } - p.pos++ - return [text_node(c.ascii_str())] + p.pos += step + return [text_node(r.str())] } } } @@ -283,7 +309,7 @@ fn merge_text_nodes(nodes []&Node) []&Node { } // can_open_emphasis reports whether a delimiter run can open emphasis. -fn can_open_emphasis(delim u8, before u8, after u8) bool { +fn can_open_emphasis(delim u8, before rune, after rune) bool { left_flanking := !is_unicode_space(after) && (!is_ascii_punct(after) || is_unicode_space(before) || is_ascii_punct(before)) right_flanking := !is_unicode_space(before) @@ -299,7 +325,7 @@ fn can_open_emphasis(delim u8, before u8, after u8) bool { } // can_close_emphasis reports whether a delimiter run can close emphasis. -fn can_close_emphasis(delim u8, before u8, after u8) bool { +fn can_close_emphasis(delim u8, before rune, after rune) bool { left_flanking := !is_unicode_space(after) && (!is_ascii_punct(after) || is_unicode_space(before) || is_ascii_punct(before)) right_flanking := !is_unicode_space(before) @@ -381,16 +407,16 @@ fn (mut p InlineParser) try_emphasis(c u8) ?&Node { // Prevent splitting an intraword __ run into a synthetic single-underscore opener. 
if c == `_` && run == 1 && start > 1 && p.src[start - 1] == `_` { - before2 := p.src[start - 2] - after1 := if start + run < p.src.len { p.src[start + run] } else { u8(` `) } + before2 := prev_rune_or_space(p.src, start - 1) + after1 := rune_at_or_space(p.src, start + run) if is_wordish(before2) && is_wordish(after1) { p.pos = start return none } } - before := if start > 0 { p.src[start - 1] } else { u8(` `) } - after := if start + run < p.src.len { p.src[start + run] } else { u8(` `) } + before := prev_rune_or_space(p.src, start) + after := rune_at_or_space(p.src, start + run) opener_can_open := can_open_emphasis(c, before, after) opener_can_close := can_close_emphasis(c, before, after) @@ -431,7 +457,7 @@ fn (mut p InlineParser) try_emphasis(c u8) ?&Node { // is_wordish reports whether c behaves like a word character for emphasis // boundary checks (includes non-ASCII bytes used in UTF-8 sequences). @[inline] -fn is_wordish(c u8) bool { +fn is_wordish(c rune) bool { return !is_unicode_space(c) && !is_ascii_punct(c) } @@ -454,8 +480,8 @@ fn (mut p InlineParser) match_close_delim(c u8, count int, opener_run int, opene } if close_run >= count { // Verify right-flanking. - before_close := if close_pos > 0 { p.src[close_pos - 1] } else { u8(` `) } - after_close := if p.pos < p.src.len { p.src[p.pos] } else { u8(` `) } + before_close := prev_rune_or_space(p.src, close_pos) + after_close := rune_at_or_space(p.src, p.pos) closer_can_close := can_close_emphasis(c, before_close, after_close) closer_can_open := can_open_emphasis(c, before_close, after_close) if closer_can_close { @@ -511,12 +537,14 @@ fn (mut p InlineParser) match_close_delim(c u8, count int, opener_run int, opene inner := p.parse_one() if p.pos <= loop_start_pos { // Safety net: force progress to avoid recursive delimiter stalls. 
- content_nodes << text_node(p.src[loop_start_pos].ascii_str()) - p.pos = loop_start_pos + 1 + fallback := rune_at_or_space(p.src, loop_start_pos) + content_nodes << text_node(fallback.str()) + p.pos = loop_start_pos + utf8_char_len(p.src[loop_start_pos]) continue } content_nodes << inner } + // Not found; reset. p.pos = content_start return none @@ -722,8 +750,12 @@ fn parse_inline_link_dest_from(s string, start int) (string, string, int) { // skip_ws returns the position in s after skipping whitespace from start. fn skip_ws(s string, start int) int { mut i := start - for i < s.len && (s[i] == ` ` || s[i] == `\t` || s[i] == `\n`) { - i++ + for i < s.len { + ch := rune_at_or_space(s, i) + if ch !in unicode_space { + break + } + i += utf8_char_len(s[i]) } return i } diff --git a/vlib/x/markdown/markdown.v b/vlib/x/markdown/markdown.v index 78c8daeda..eec5c7555 100644 --- a/vlib/x/markdown/markdown.v +++ b/vlib/x/markdown/markdown.v @@ -112,6 +112,12 @@ pub fn to_html(src string, opts Options) string { return md.convert(src) } +// to_plaintext converts the markdown source to UTF-8 plain text with the given options. +pub fn to_plaintext(src string, opts Options) string { + mut md := Markdown.new(opts) + return md.convert_plaintext(src) +} + // convert parses the markdown source and renders it to an HTML string. pub fn (mut m Markdown) convert(src string) string { doc := m.parse(src) @@ -122,6 +128,16 @@ pub fn (mut m Markdown) convert(src string) string { return r.render(doc) } +// convert_plaintext parses the markdown source and renders it to UTF-8 plain text. +pub fn (mut m Markdown) convert_plaintext(src string) string { + doc := m.parse(src) + mut r := PlainTextRenderer{ + opts: m.opts + ref_map: m.ref_map + } + return r.render(doc) +} + // parse parses the markdown source into an AST and returns the document root. // Link reference definitions collected during parsing are cached so that // subsequent parse/convert calls on the same Markdown instance share them. 
diff --git a/vlib/x/markdown/markdown_test.v b/vlib/x/markdown/markdown_test.v index e27de7735..810eca404 100644 --- a/vlib/x/markdown/markdown_test.v +++ b/vlib/x/markdown/markdown_test.v @@ -323,3 +323,39 @@ fn test_emphasis_goldmark_parity_edge_cases() { assert to_html('_a_*_b**_aba*') == '

a_b**_aba

\n' assert to_html('x_ ***b*ab*bb_a*a a') == '

x_ babbb_aa a

\n' } + +fn test_to_plaintext_basic_blocks_and_inlines() { + text := to_plaintext('# Héllo\n\nA *b* [site](https://example.com)') + assert text.contains('# Héllo') + assert text.contains('A *b* site (https://example.com)') +} + +fn test_to_plaintext_task_list() { + text := to_plaintext('- [ ] todo\n- [x] done', task_list: true) + assert text.contains('☐') + assert text.contains('☑') + assert text.contains('todo') + assert text.contains('done') +} + +fn test_to_plaintext_footnotes() { + text := to_plaintext('Text[^n]\n\n[^n]: note body', footnotes: true) + assert text.contains('Text[1]') + assert text.contains('Footnotes:') + assert text.contains('[1] note body') +} + +fn test_to_plaintext_table_rows_are_separated() { + text := to_plaintext('| a | b |\n|---|---|\n| 1 | 2 |', extensions: gfm()) + assert text.contains('a | b |') + assert text.contains('1 | 2 |') + assert text.contains('a | b | \n1 | 2 |') + assert !text.contains('a | b | 1 | 2 |') +} + +fn test_to_plaintext_blockquote_footnotes_share_global_order() { + text := to_plaintext('> quote[^q]\n\n[^q]: note body', footnotes: true) + assert text.contains('> quote[1]') + assert text.contains('Footnotes:') + assert text.contains('[1] note body') +} diff --git a/vlib/x/markdown/parser.v b/vlib/x/markdown/parser.v index 44f62feb3..6d7aff417 100644 --- a/vlib/x/markdown/parser.v +++ b/vlib/x/markdown/parser.v @@ -15,7 +15,7 @@ const block_level_tags = [ 'menuitem', 'meta', 'nav', 'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'search', 'section', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track', 'ul' -] +]! // vfmt on // BlockParser parses markdown block structure line by line into an AST. 
@@ -985,7 +985,7 @@ fn is_table_separator(line string) bool { } else { c } - for ch in inner.bytes() { + for ch in inner.runes() { if ch != `-` { return false } diff --git a/vlib/x/markdown/plaintext.v b/vlib/x/markdown/plaintext.v new file mode 100644 index 000000000..a352a4476 --- /dev/null +++ b/vlib/x/markdown/plaintext.v @@ -0,0 +1,287 @@ +// Copyright 2026 The V Language. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module markdown + +import strings + +// PlainTextRenderer renders a parsed markdown AST to UTF-8 console-friendly text. +struct PlainTextRenderer { + opts Options + ref_map map[string]LinkRef +mut: + sb strings.Builder + // footnote tracking + fn_order []string + fn_nodes map[string]&Node + // list tracking + list_depth int + list_nums []int +} + +// render renders the document node to plain text. +pub fn (mut r PlainTextRenderer) render(doc &Node) string { + r.sb = strings.new_builder(1024) + if r.opts.footnotes { + for child in doc.children { + if child.kind == .footnote_def { + r.fn_nodes[child.fn_label] = child + } + } + } + r.render_children(doc) + if r.opts.footnotes && r.fn_order.len > 0 { + r.render_footnotes_section() + } + return r.sb.str().trim_right('\n') + '\n' +} + +fn (mut r PlainTextRenderer) render_node(node &Node) { + match node.kind { + .document { r.render_children(node) } + .heading { r.render_heading(node) } + .paragraph { r.render_paragraph(node) } + .blockquote { r.render_blockquote(node) } + .list { r.render_list(node) } + .list_item { r.render_list_item(node) } + .code_block, .fenced_code { r.render_code_block(node) } + .thematic_break { r.sb.write_string('---\n') } + .html_block { r.render_html_block(node) } + .link_ref_def, .footnote_def {} + .table { r.render_table(node) } + .table_head, .table_body { r.render_children(node) } + .table_row { r.render_table_row(node) } + .table_cell { r.render_table_cell(node) } + .definition_list { 
r.render_children(node) } + .definition_term { r.render_definition_term(node) } + .definition_desc { r.render_definition_desc(node) } + .text { r.render_text(node) } + .emphasis { r.render_wrapped(node, '*') } + .strong { r.render_wrapped(node, '**') } + .code_span { r.sb.write_string('`' + node.literal + '`') } + .link { r.render_link(node) } + .image { r.render_image(node) } + .autolink { r.sb.write_string(node.literal) } + .raw_html { r.render_raw_html(node) } + .hard_break, .soft_break { r.sb.write_string('\n') } + .strikethrough { r.render_wrapped(node, '~~') } + .task_checkbox { r.render_task_checkbox(node) } + .footnote_ref { r.render_footnote_ref(node) } + } +} + +fn (mut r PlainTextRenderer) render_children(node &Node) { + for child in node.children { + r.render_node(child) + } +} + +fn (mut r PlainTextRenderer) render_inline(src string) { + nodes := parse_inline(src, r.opts, r.ref_map) + for node in nodes { + r.render_node(node) + } +} + +fn (mut r PlainTextRenderer) render_heading(node &Node) { + r.sb.write_string('${'#'.repeat(node.level)} ') + if node.children.len > 0 { + r.render_children(node) + } else { + r.render_inline(node.literal) + } + r.sb.write_string('\n\n') +} + +fn (mut r PlainTextRenderer) render_paragraph(node &Node) { + if node.children.len > 0 { + r.render_children(node) + } else { + r.render_inline(node.literal) + } + r.sb.write_string('\n') +} + +fn (mut r PlainTextRenderer) render_blockquote(node &Node) { + mut inner := strings.new_builder(128) + mut rr := PlainTextRenderer{ + opts: r.opts + ref_map: r.ref_map + fn_order: r.fn_order.clone() + fn_nodes: r.fn_nodes + } + rr.sb = inner + rr.render_children(node) + for line in rr.sb.str().trim_right('\n').split('\n') { + r.sb.write_string('> ${line}\n') + } + // Keep footnote reference order discovered inside the blockquote. 
+ r.fn_order = rr.fn_order + r.sb.write_string('\n') +} + +fn (mut r PlainTextRenderer) render_list(node &Node) { + r.list_depth++ + if node.is_ordered { + r.list_nums << node.list_start + } else { + r.list_nums << 0 + } + r.render_children(node) + r.list_nums.delete_last() + r.list_depth-- + if r.list_depth == 0 { + r.sb.write_string('\n') + } +} + +fn (mut r PlainTextRenderer) render_list_item(node &Node) { + indent := ' '.repeat(if r.list_depth > 0 { r.list_depth - 1 } else { 0 }) + idx := r.list_nums.len - 1 + marker := if idx >= 0 && r.list_nums[idx] > 0 { + m := '${r.list_nums[idx]}. ' + r.list_nums[idx]++ + m + } else { + '- ' + } + r.sb.write_string(indent + marker) + for i, child in node.children { + if i > 0 { + r.sb.write_string(' ') + } + if child.kind == .paragraph { + if child.children.len > 0 { + r.render_children(child) + } else { + r.render_inline(child.literal) + } + } else if child.kind == .list { + r.sb.write_string('\n') + r.render_node(child) + } else { + r.render_node(child) + } + } + r.sb.write_string('\n') +} + +fn (mut r PlainTextRenderer) render_code_block(node &Node) { + r.sb.write_string('```\n') + r.sb.write_string(node.literal.trim_right('\n')) + r.sb.write_string('\n```\n\n') +} + +fn (mut r PlainTextRenderer) render_html_block(node &Node) { + if r.opts.renderer_opts.unsafe_ { + r.sb.write_string(node.literal) + } else { + r.sb.write_string('[raw HTML omitted]\n') + } +} + +fn (mut r PlainTextRenderer) render_table(node &Node) { + r.render_children(node) + r.sb.write_string('\n') +} + +fn (mut r PlainTextRenderer) render_table_row(node &Node) { + r.render_children(node) + r.sb.write_string('\n') +} + +fn (mut r PlainTextRenderer) render_table_cell(node &Node) { + r.sb.write_string(node.literal.trim_space()) + r.sb.write_string(' | ') +} + +fn (mut r PlainTextRenderer) render_definition_term(node &Node) { + r.render_inline(node.literal) + r.sb.write_string('\n') + for child in node.children { + r.render_node(child) + } +} + +fn (mut r 
PlainTextRenderer) render_definition_desc(node &Node) { + r.sb.write_string(' - ') + r.render_inline(node.literal) + r.sb.write_string('\n') +} + +fn (mut r PlainTextRenderer) render_text(node &Node) { + content := if r.opts.typographer { + smart_punctuate(node.literal) + } else { + node.literal + } + r.sb.write_string(content) +} + +fn (mut r PlainTextRenderer) render_wrapped(node &Node, marker string) { + r.sb.write_string(marker) + r.render_children(node) + r.sb.write_string(marker) +} + +fn (mut r PlainTextRenderer) render_link(node &Node) { + r.render_children(node) + if node.dest.len > 0 { + r.sb.write_string(' (${node.dest})') + } +} + +fn (mut r PlainTextRenderer) render_image(node &Node) { + alt := node.text_content().trim_space() + if alt.len == 0 { + r.sb.write_string('[image]') + } else { + r.sb.write_string('[image: ${alt}]') + } + if node.dest.len > 0 { + r.sb.write_string(' (${node.dest})') + } +} + +fn (mut r PlainTextRenderer) render_raw_html(node &Node) { + if r.opts.renderer_opts.unsafe_ { + r.sb.write_string(node.literal) + } +} + +fn (mut r PlainTextRenderer) render_task_checkbox(node &Node) { + r.sb.write_string(if node.checked { '☑' } else { '☐' }) +} + +fn (mut r PlainTextRenderer) render_footnote_ref(node &Node) { + label := node.fn_label + mut idx := 0 + for i, l in r.fn_order { + if l == label { + idx = i + 1 + break + } + } + if idx == 0 { + r.fn_order << label + idx = r.fn_order.len + } + r.sb.write_string('[${idx}]') +} + +fn (mut r PlainTextRenderer) render_footnotes_section() { + r.sb.write_string('\nFootnotes:\n') + for label in r.fn_order { + fn_node := r.fn_nodes[label] or { continue } + mut idx := 0 + for i, l in r.fn_order { + if l == label { + idx = i + 1 + break + } + } + r.sb.write_string('[${idx}] ') + r.render_inline(fn_node.literal) + r.sb.write_string('\n') + } +} diff --git a/vlib/x/markdown/util.v b/vlib/x/markdown/util.v index 28b336231..0a6bf721e 100644 --- a/vlib/x/markdown/util.v +++ b/vlib/x/markdown/util.v @@ 
-98,15 +98,18 @@ fn url_encode(s string) string { fn normalize_label(s string) string { mut out := strings.new_builder(s.len) mut in_space := true // start true so we trim leading space - for i := 0; i < s.len; i++ { - c := s[i] - if c == ` ` || c == `\t` || c == `\n` || c == `\r` { + for ch in s.runes() { + if ch in unicode_space { if !in_space { out.write_u8(` `) in_space = true } } else { - out.write_u8(ascii_lower(c)) + if ch >= `A` && ch <= `Z` { + out.write_u8(u8(ch + 32)) + } else { + out.write_string(ch.str()) + } in_space = false } } @@ -129,14 +132,14 @@ fn ascii_lower(c u8) u8 { // is_unicode_space returns true for CommonMark Unicode whitespace. @[inline] -fn is_unicode_space(c u8) bool { +fn is_unicode_space(c rune) bool { return c in unicode_space } // is_ascii_punct returns true if c is an ASCII punctuation character. @[inline] -fn is_ascii_punct(c u8) bool { - return c in ascii_punct +fn is_ascii_punct(c rune) bool { + return c <= 0x7f && u8(c) in ascii_punct } const digits = [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`]! @@ -163,12 +166,11 @@ fn is_alnum(c u8) bool { fn heading_id_from_text(text string) string { mut sb := strings.new_builder(text.len) mut prev_dash := true // start true so we trim leading dashes - for i := 0; i < text.len; i++ { - c := text[i] - if is_alnum(c) { - sb.write_u8(ascii_lower(c)) + for ch in text.runes() { + if ch <= 0x7f && is_alnum(u8(ch)) { + sb.write_u8(ascii_lower(u8(ch))) prev_dash = false - } else if c == `-` || is_unicode_space(c) || c == `_` { + } else if ch == `-` || ch == `_` || ch in unicode_space { if !prev_dash { sb.write_u8(`-`) prev_dash = true -- 2.39.5