From 3094cf12ec589dd1a0c63b37010c156c581dd373 Mon Sep 17 00:00:00 2001 From: JalonSolov Date: Thu, 23 Apr 2026 21:42:32 -0400 Subject: [PATCH] x.markdown: small cleanups, speedups (#26965) * x.markdown: small cleanups, speedups * x.markdown: more small changes --- vlib/x/markdown/html.v | 11 ------ vlib/x/markdown/markdown.v | 19 ++++------ vlib/x/markdown/markdown_test.v | 40 ++++++--------------- vlib/x/markdown/node.v | 52 +++++++--------------------- vlib/x/markdown/parser.v | 57 ++++++++---------------------- vlib/x/markdown/util.v | 61 ++++++++++++++++++++++++++++++--- 6 files changed, 101 insertions(+), 139 deletions(-) diff --git a/vlib/x/markdown/html.v b/vlib/x/markdown/html.v index 7f64d52a9..f23e1986a 100644 --- a/vlib/x/markdown/html.v +++ b/vlib/x/markdown/html.v @@ -60,7 +60,6 @@ fn (mut r HTMLRenderer) render_node(node &Node) { .definition_term { r.render_definition_term(node) } .definition_desc { r.render_definition_desc(node) } .footnote_def {} // rendered in the footnote section - // Inline nodes. .text { r.render_text(node) } .emphasis { r.render_emphasis(node) } .strong { r.render_strong(node) } @@ -92,8 +91,6 @@ fn (mut r HTMLRenderer) render_inline(src string) { } } -// ---- Block elements ---- - fn (mut r HTMLRenderer) render_heading(node &Node) { tag := 'h${node.level}' if node.id.len > 0 { @@ -203,8 +200,6 @@ fn (mut r HTMLRenderer) render_html_block(node &Node) { } } -// ---- Table ---- - fn (mut r HTMLRenderer) render_table(node &Node) { r.sb.write_string('\n') r.render_children(node) @@ -249,8 +244,6 @@ fn (mut r HTMLRenderer) render_table_cell(node &Node) { r.sb.write_string('\n') } -// ---- Definition list ---- - fn (mut r HTMLRenderer) render_definition_list(node &Node) { r.sb.write_string('
\n') r.render_children(node) @@ -272,8 +265,6 @@ fn (mut r HTMLRenderer) render_definition_desc(node &Node) { r.sb.write_string('\n') } -// ---- Footnotes ---- - fn (mut r HTMLRenderer) render_footnote_ref(node &Node) { label := node.fn_label // Assign an ordinal on first encounter. @@ -302,8 +293,6 @@ fn (mut r HTMLRenderer) render_footnotes_section() { r.sb.write_string('\n\n') } -// ---- Inline elements ---- - fn (mut r HTMLRenderer) render_text(node &Node) { content := if r.opts.typographer { smart_punctuate(node.literal) diff --git a/vlib/x/markdown/markdown.v b/vlib/x/markdown/markdown.v index a8a5f4ba5..78c8daeda 100644 --- a/vlib/x/markdown/markdown.v +++ b/vlib/x/markdown/markdown.v @@ -93,9 +93,9 @@ pub mut: ref_map map[string]LinkRef } -// new creates a Markdown processor with the given options. +// Markdown.new creates a Markdown processor with the given options. // All extensions in opts.extensions are applied immediately. -pub fn new(opts Options) Markdown { +pub fn Markdown.new(opts Options) Markdown { mut m := Markdown{ opts: opts ref_map: map[string]LinkRef{} @@ -106,16 +106,9 @@ pub fn new(opts Options) Markdown { return m } -// to_html converts the markdown source to HTML using default settings -// (CommonMark only, no extensions, raw HTML stripped). -pub fn to_html(src string) string { - mut md := new(Options{}) - return md.convert(src) -} - -// to_html_opts converts the markdown source to HTML with the given options. -pub fn to_html_opts(src string, opts Options) string { - mut md := new(opts) +// to_html converts the markdown source to HTML with the given options. +pub fn to_html(src string, opts Options) string { + mut md := Markdown.new(opts) return md.convert(src) } @@ -133,7 +126,7 @@ pub fn (mut m Markdown) convert(src string) string { // Link reference definitions collected during parsing are cached so that // subsequent parse/convert calls on the same Markdown instance share them. pub fn (mut m Markdown) parse(src string) &Node { - mut p := new_block_parser(src, m.opts, m.ref_map) + mut p := BlockParser.new(src, m.opts, m.ref_map) doc := p.parse() for k, v in p.ref_map { m.ref_map[k] = v diff --git a/vlib/x/markdown/markdown_test.v b/vlib/x/markdown/markdown_test.v index 6293b3571..e27de7735 100644 --- a/vlib/x/markdown/markdown_test.v +++ b/vlib/x/markdown/markdown_test.v @@ -137,9 +137,7 @@ fn test_shortcut_reference_still_resolves_normally() { fn test_gfm_table_header_uses_th_cells() { src := '| a | b |\n| --- | --- |\n| 1 | 2 |' - html := to_html_opts(src, Options{ - extensions: gfm() - }) + html := to_html(src, extensions: gfm()) assert html.contains('
') assert html.contains('') assert html.contains('') @@ -225,9 +223,7 @@ fn test_setext_heading_multiline_text() { fn test_task_list() { src := '- [ ] unchecked\n- [x] checked\n- [X] also checked' - html := to_html_opts(src, Options{ - task_list: true - }) + html := to_html(src, task_list: true) assert html.contains('') assert html.contains('') assert html.contains('unchecked') @@ -244,29 +240,25 @@ fn test_task_list_not_applied_without_extension() { fn test_task_list_marker_requires_space_after_closing_bracket() { // GFM task markers are [ ]/[x]/[X] followed by whitespace or end of item. src := '- [x]ok\n- [ ]todo' - html := to_html_opts(src, Options{ - task_list: true - }) + html := to_html(src, task_list: true) assert !html.contains('') } fn test_footnote_definition_inside_list_item_is_preserved() { src := '- item[^note]\n\n [^note]: footnote in list\n\noutside[^note]' - html := to_html_opts(src, Options{ - footnotes: true - }) + html := to_html(src, footnotes: true) assert html.contains('item1') assert html.contains('outside1') assert html.contains('
  • footnote in list') @@ -275,9 +267,7 @@ fn test_footnote_definition_inside_list_item_is_preserved() { fn test_footnote_definition_inside_blockquote_is_preserved() { src := '> quote[^q]\n>\n> [^q]: footnote in quote' - html := to_html_opts(src, Options{ - footnotes: true - }) + html := to_html(src, footnotes: true) assert html.contains('quote1') assert html.contains('
  • footnote in quote') assert html.contains('
  • ') @@ -308,9 +298,7 @@ fn test_link_ref_def_multiline_no_title_next_line_is_content() { } fn test_gfm_helper_sets_core_extension_flags() { - md := new(Options{ - extensions: gfm() - }) + md := Markdown.new(extensions: gfm()) assert md.opts.tables assert md.opts.strikethrough assert md.opts.linkify @@ -318,19 +306,13 @@ fn test_gfm_helper_sets_core_extension_flags() { } fn test_individual_extension_helpers_set_flags() { - md_footnote := new(Options{ - extensions: [Extension(footnote())] - }) + md_footnote := Markdown.new(extensions: [Extension(footnote())]) assert md_footnote.opts.footnotes - md_typographer := new(Options{ - extensions: [Extension(typographer())] - }) + md_typographer := Markdown.new(extensions: [Extension(typographer())]) assert md_typographer.opts.typographer - md_definition_list := new(Options{ - extensions: [Extension(definition_list())] - }) + md_definition_list := Markdown.new(extensions: [Extension(definition_list())]) assert md_definition_list.opts.definition_list } diff --git a/vlib/x/markdown/node.v b/vlib/x/markdown/node.v index 720c90e1b..f03fb7170 100644 --- a/vlib/x/markdown/node.v +++ b/vlib/x/markdown/node.v @@ -7,9 +7,7 @@ import strings // NodeKind identifies what kind of AST node a Node represents. pub enum NodeKind { - // ------- document root ------- document - // ------- block elements ------- heading paragraph blockquote @@ -20,19 +18,15 @@ pub enum NodeKind { thematic_break html_block link_ref_def - // GFM block extensions table table_head table_body table_row table_cell - // Definition list (Pandoc-style) definition_list definition_term definition_desc - // Footnote definition block footnote_def - // ------- inline elements ------- text emphasis strong @@ -43,11 +37,8 @@ pub enum NodeKind { raw_html hard_break soft_break - // GFM inline extensions strikethrough - // Footnote reference inline footnote_ref - // Task list checkbox (inline, first child of a list_item) task_checkbox } @@ -64,39 +55,22 @@ pub enum Alignment { @[heap] pub struct Node { pub mut: - kind NodeKind - // ----- block-level fields ----- - // heading: 1–6 - level int - // list: true when there are no blank lines between items - is_tight bool - // list: true for ordered (1. 2. 3.), false for bullet (- * +) + kind NodeKind + level int + is_tight bool is_ordered bool - // list: starting number of an ordered list list_start int = 1 - // fenced_code: the info string after the opening fence (e.g. "go") fence_info string - // ----- inline-level fields ----- - // text / code_span / raw_html / html_block: literal string content - literal string - // link / image: URL destination - dest string - // link / image: optional title - title string - // link: reference label (for reference-style links) - label string - // task_checkbox: true when the checkbox is checked ([x]) - checked bool - // table_cell: column alignment - align Alignment - // heading: optional explicit or auto-generated id attribute - id string - // footnote_ref / footnote_def: footnote label - fn_label string - // footnote_def: 1-based ordinal assigned during rendering - fn_index int - // ----- tree structure ----- - children []&Node + literal string + dest string + title string + label string + checked bool + align Alignment + id string + fn_label string + fn_index int + children []&Node } // new_node allocates and returns a new Node of the given kind. diff --git a/vlib/x/markdown/parser.v b/vlib/x/markdown/parser.v index 43127ea75..44f62feb3 100644 --- a/vlib/x/markdown/parser.v +++ b/vlib/x/markdown/parser.v @@ -5,6 +5,19 @@ module markdown import strings +// block_level_tags lists HTML tags that start an HTML block (type 6). +// vfmt off +const block_level_tags = [ + 'address', 'article', 'aside', 'base', 'basefont', 'blockquote', 'body', 'caption', 'center', + 'col', 'colgroup', 'dd', 'details', 'dialog', 'dir', 'div', 'dl', 'dt', 'fieldset', + 'figcaption', 'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', + 'h6', 'head', 'header', 'hr', 'html', 'iframe', 'legend', 'li', 'link', 'main', 'menu', + 'menuitem', 'meta', 'nav', 'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'search', + 'section', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track', + 'ul' +] +// vfmt on + // BlockParser parses markdown block structure line by line into an AST. // After block parsing, inline content is parsed for every leaf node. struct BlockParser { @@ -16,8 +29,8 @@ mut: fn_defs map[string]&Node } -// new_block_parser creates a BlockParser for the given source. -fn new_block_parser(src string, opts Options, ref_map map[string]LinkRef) BlockParser { +// BlockParser.new creates a BlockParser for the given source. +fn BlockParser.new(src string, opts Options, ref_map map[string]LinkRef) BlockParser { normalized := src.replace('\r\n', '\n').replace('\r', '\n') lines := normalized.split('\n') mut refs := map[string]LinkRef{} @@ -179,8 +192,6 @@ fn (mut p BlockParser) parse_blocks(mut parent Node, indent int) { } } -// ---- Thematic break ---- - // is_thematic_break returns true if line is a valid thematic break // (three or more -, *, or _ with optional spaces). fn is_thematic_break(line string) bool { @@ -204,8 +215,6 @@ fn is_thematic_break(line string) bool { return count >= 3 } -// ---- ATX headings ---- - // try_atx_heading attempts to parse an ATX heading from line. // Returns the heading node on success. fn (mut p BlockParser) try_atx_heading(line string) ?&Node { @@ -241,8 +250,6 @@ fn (mut p BlockParser) try_atx_heading(line string) ?&Node { return node } -// ---- Fenced code blocks ---- - // try_fenced_code attempts to parse a fenced code block starting at p.pos. fn (mut p BlockParser) try_fenced_code(line string, indent int) ?&Node { fence_char, fence_len := detect_fence(line) @@ -297,8 +304,6 @@ fn detect_fence(line string) (u8, int) { return 0, 0 } -// ---- Indented code block ---- - // parse_indented_code collects lines that are indented by at least (indent+4) // spaces (or blank) into an indented code block. fn (mut p BlockParser) parse_indented_code(indent int) &Node { @@ -328,17 +333,6 @@ fn (mut p BlockParser) parse_indented_code(indent int) &Node { return node } -// ---- HTML blocks ---- - -// block_level_tags lists HTML tags that start an HTML block (type 6). -const block_level_tags = ['address', 'article', 'aside', 'base', 'basefont', 'blockquote', 'body', - 'caption', 'center', 'col', 'colgroup', 'dd', 'details', 'dialog', 'dir', 'div', 'dl', 'dt', - 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3', - 'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html', 'iframe', 'legend', 'li', 'link', 'main', - 'menu', 'menuitem', 'meta', 'nav', 'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'search', - 'section', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track', - 'ul'] - // try_html_block attempts to parse an HTML block starting at p.pos. fn (mut p BlockParser) try_html_block(line string, indent int) ?&Node { html_type := detect_html_block_type(line) @@ -478,8 +472,6 @@ fn is_complete_html_tag(s string) bool { return end == s.len - 1 || s[end + 1..].trim_space().len == 0 } -// ---- Link reference definitions ---- - // try_link_ref_def attempts to parse a link reference definition at p.pos. // CommonMark allows the title to appear on the next line when the destination // is alone on the first line. Returns true and advances p.pos if successful. @@ -537,9 +529,6 @@ fn (mut p BlockParser) try_link_ref_def(line string) bool { title = parsed_title extra_lines = 1 } - // If the next line starts with something that is not a title - // delimiter, we simply leave `title` empty and do not consume - // that line (it will be parsed as the next block). } } } @@ -626,8 +615,6 @@ fn parse_link_title(s string) (string, string) { return '', s } -// ---- Blockquote ---- - // parse_blockquote parses a blockquote block and returns a blockquote node. fn (mut p BlockParser) parse_blockquote(indent int) &Node { mut bq_lines := []string{} @@ -659,8 +646,6 @@ fn (mut p BlockParser) parse_blockquote(indent int) &Node { return node } -// ---- Lists ---- - // ListMarker holds parsed list marker information. struct ListMarker { is_ordered bool @@ -926,8 +911,6 @@ fn (mut p BlockParser) parse_list_item(base_indent int) &Node { return item } -// ---- Tables (GFM) ---- - // try_table attempts to parse a GFM table starting at p.pos. // A table requires a header row, an alignment row (|---|), then data rows. fn (mut p BlockParser) try_table(indent int) ?&Node { @@ -1082,8 +1065,6 @@ fn split_table_cells(line string) []string { return cells } -// ---- Definition list ---- - // try_definition_list attempts to parse a definition list starting at p.pos. fn (mut p BlockParser) try_definition_list(indent int) ?&Node { if p.pos + 1 >= p.lines.len { @@ -1127,8 +1108,6 @@ fn (mut p BlockParser) try_definition_list(indent int) ?&Node { return dl } -// ---- Footnote definitions ---- - // try_footnote_def attempts to parse a footnote definition starting at p.pos. fn (mut p BlockParser) try_footnote_def(line string, indent int) bool { if !line.starts_with('[^') { @@ -1164,8 +1143,6 @@ fn (mut p BlockParser) try_footnote_def(line string, indent int) bool { return true } -// ---- Paragraph / Setext heading ---- - // parse_paragraph parses a paragraph block, upgrading it to a setext heading // if the immediately following line is a setext underline (=== or ---). fn (mut p BlockParser) parse_paragraph(indent int) &Node { @@ -1237,10 +1214,6 @@ fn is_setext_underline(line string) bool { return true } -// ---- Inline parsing kick-off ---- -// After block parsing, leaf node .literal fields contain raw inline text. -// The inline parser is invoked lazily by the HTML renderer. - // unescape_string decodes CommonMark backslash escapes in s. fn unescape_string(s string) string { if !s.contains('\\') { diff --git a/vlib/x/markdown/util.v b/vlib/x/markdown/util.v index 402e56d3c..28b336231 100644 --- a/vlib/x/markdown/util.v +++ b/vlib/x/markdown/util.v @@ -5,6 +5,57 @@ module markdown import strings +// unicode_space lists Unicode code points considered whitespace +// vfmt off +const unicode_space = [ + ` `, // space + `\t`, // tab + 0x0a, // LF + 0x0b, // Vertical Tab + 0x0c, // FF + 0x0d, // CR + 0x0085, // next line + 0x00A0, // no-break space + 0x1680, // ogham space mark + 0x180E, // mongolian vowel separator + 0x2000, // en quad + 0x2001, // em quad + 0x2002, // en space + 0x2003, // em space + 0x2004, // three-per-em space + 0x2005, // four-per-em space + 0x2006, // six-per-em space + 0x2007, // figure space + 0x2008, // punctuation space + 0x2009, // thin space + 0x200A, // hair space + 0x200B, // zero width space + 0x200C, // zero width non-joiner + 0x200D, // zero width joiner + 0x2028, // line separator + 0x2029, // paragraph separator + 0x202F, // narrow no-break space + 0x205F, // medium mathematical space + 0x2060, // word joiner + 0x3000, // ideographic space + 0xFEFF, // zero width non-breaking space +]! + +// ascii_punct lists ASCII punctuation characters +const ascii_punct = [ + `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `-`, `.`, `/`, `:`, + `;`, `<`, `=`, `>`, `?`, `@`, `[`, `\\`, `]`, `^`, `_`, `\``, `{`, `|`, `}`, `~`, +]! + +// alpha lists ASCII letters a-z and A-Z +const alpha = [ + `a`, `b`, `c`, `d`, `e`, `f`, `g`, `h`, `i`, `j`, `k`, `l`, `m`, + `n`, `o`, `p`, `q`, `r`, `s`, `t`, `u`, `v`, `w`, `x`, `y`, `z`, + `A`, `B`, `C`, `D`, `E`, `F`, `G`, `H`, `I`, `J`, `K`, `L`, `M`, + `N`, `O`, `P`, `Q`, `R`, `S`, `T`, `U`, `V`, `W`, `X`, `Y`, `Z`, +]! +// vfmt on + // html_escape replaces HTML special characters in s with their entity equivalents. fn html_escape(s string) string { if s.index_any('&<>"') == -1 { @@ -79,26 +130,26 @@ fn ascii_lower(c u8) u8 { // is_unicode_space returns true for CommonMark Unicode whitespace. @[inline] fn is_unicode_space(c u8) bool { - return c == ` ` || c == `\t` || c == `\n` || c == `\r` || c == 0x0c || c == 0x0b + return c in unicode_space } // is_ascii_punct returns true if c is an ASCII punctuation character. @[inline] fn is_ascii_punct(c u8) bool { - return (c >= `!` && c <= `/`) || (c >= `:` && c <= `@`) || (c >= `[` && c <= 96) - || (c >= `{` && c <= `~`) + return c in ascii_punct } +const digits = [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`]! // is_digit returns true if c is an ASCII decimal digit. @[inline] fn is_digit(c u8) bool { - return c >= `0` && c <= `9` + return c in digits } // is_alpha returns true if c is an ASCII letter. @[inline] fn is_alpha(c u8) bool { - return (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) + return c in alpha } // is_alnum returns true if c is an ASCII letter or digit. -- 2.39.5
    ab