// Copyright 2026 The V Language. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module markdown

import strings

// block_level_tags lists HTML tags that start an HTML block (type 6).
// vfmt off
const block_level_tags = [
	'address', 'article', 'aside', 'base', 'basefont', 'blockquote', 'body',
	'caption', 'center', 'col', 'colgroup', 'dd', 'details', 'dialog', 'dir',
	'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
	'frame', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header',
	'hr', 'html', 'iframe', 'legend', 'li', 'link', 'main', 'menu', 'menuitem',
	'meta', 'nav', 'noframes', 'ol', 'optgroup', 'option', 'p', 'param',
	'search', 'section', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th',
	'thead', 'title', 'tr', 'track', 'ul'
]!
// vfmt on

// BlockParser parses markdown block structure line by line into an AST.
// After block parsing, inline content is parsed for every leaf node.
struct BlockParser {
	opts Options
mut:
	lines   []string
	pos     int
	ref_map map[string]LinkRef
	fn_defs map[string]&Node
}

// copy_link_refs returns a new map that holds every entry of src.
fn copy_link_refs(src map[string]LinkRef) map[string]LinkRef {
	mut out := map[string]LinkRef{}
	for label, link_ref in src {
		out[label] = link_ref
	}
	return out
}

// BlockParser.new builds a BlockParser over src.
// Line endings are normalised to '\n' (CRLF first, then lone CR) before the
// text is split into lines. The entries of ref_map are copied into a map
// owned by the parser, and the footnote table starts out empty.
fn BlockParser.new(src string, opts Options, ref_map map[string]LinkRef) BlockParser {
	unix_src := src.replace('\r\n', '\n').replace('\r', '\n')
	return BlockParser{
		opts:    opts
		lines:   unix_src.split('\n')
		ref_map: copy_link_refs(ref_map)
		fn_defs: map[string]&Node{}
	}
}

// nested_block_parser builds a parser over lines for nested content.
// It reuses the options of p, starts with a copy of p's reference
// definitions and with an empty footnote table.
fn (p &BlockParser) nested_block_parser(lines []string) BlockParser {
	return BlockParser{
		opts:    p.opts
		lines:   lines
		ref_map: copy_link_refs(p.ref_map)
		fn_defs: map[string]&Node{}
	}
}

// merge_nested_state propagates nested parser state back to the parent parser.
fn (mut p BlockParser) merge_nested_state(inner BlockParser) {
	// Reference definitions collected by the nested parser overwrite the
	// parent's entries with the same key.
	for label, link_ref in inner.ref_map {
		p.ref_map[label] = link_ref
	}
	if !p.opts.footnotes {
		return
	}
	// Footnote definitions are only added when the parent has no definition
	// for that key yet.
	for label, def_node in inner.fn_defs {
		if label !in p.fn_defs {
			p.fn_defs[label] = def_node
		}
	}
}

// parse parses the full document and returns the AST root node.
// When the footnotes option is enabled, every collected footnote definition
// is appended to the document node after the regular blocks.
fn (mut p BlockParser) parse() &Node {
	mut doc := new_node(.document)
	p.parse_blocks(mut doc, 0)
	// Attach collected footnote definitions as children of the document.
	if p.opts.footnotes {
		for _, fn_node in p.fn_defs {
			doc.append_child(fn_node)
		}
	}
	return doc
}

// parse_blocks fills parent with block-level children parsed from p.lines[p.pos..].
// indent is the minimum leading-space indent already consumed by a container.
//
// Each iteration looks at the line at p.pos and tries the block starts in a
// fixed order: thematic break, ATX heading, fenced code, HTML block, link
// reference definition, footnote definition, blockquote, indented code, list,
// table, definition list and finally paragraph. Blank lines are skipped.
fn (mut p BlockParser) parse_blocks(mut parent Node, indent int) {
	for p.pos < p.lines.len {
		line := expand_tabs(p.lines[p.pos])
		// --- blank line ---
		if is_blank(line) {
			p.pos++
			continue
		}
		// stripped: line without the container indent.
		// sp:       leading spaces that remain after that.
		// content:  stripped without those leading spaces.
		stripped := trim_indent(line, indent)
		sp := leading_spaces(stripped)
		content := trim_indent(stripped, sp)
		// --- thematic break (---, ***, ___) ---
		// Only 0-3 leading spaces are allowed; a deeper indent must fall
		// through to the indented code block check below.
		if sp <= 3 && is_thematic_break(stripped) {
			node := new_node(.thematic_break)
			parent.append_child(node)
			p.pos++
			continue
		}
		// --- ATX heading (# ... ######) ---
		// try_atx_heading expects `#` in the first column, so hand it the
		// line without its 0-3 leading spaces.
		if sp <= 3 {
			if heading := p.try_atx_heading(content) {
				parent.append_child(heading)
				p.pos++
				continue
			}
		}
		// --- fenced code block (``` or ~~~) ---
		if fenced := p.try_fenced_code(stripped, indent) {
			parent.append_child(fenced)
			continue
		}
		// --- HTML block ---
		if html_blk := p.try_html_block(stripped, indent) {
			parent.append_child(html_blk)
			continue
		}
		// --- link reference definition ---
		// CommonMark allows 0-3 leading spaces after container indentation.
		if sp <= 3 && p.try_link_ref_def(content) {
			continue
		}
		// --- footnote definition (if footnotes extension enabled) ---
		if p.opts.footnotes {
			if p.try_footnote_def(stripped, indent) {
				continue
			}
		}
		// --- blockquote (>) ---
		if stripped.starts_with('>') {
			bq := p.parse_blockquote(indent)
			parent.append_child(bq)
			continue
		}
		// --- indented code block (4 spaces) ---
		if sp >= 4 && !is_blank(stripped) {
			cb := p.parse_indented_code(indent)
			parent.append_child(cb)
			continue
		}
		// --- list (bullet or ordered) ---
		if is_list_marker(stripped) {
			lst := p.parse_list(indent)
			parent.append_child(lst)
			continue
		}
		// --- GFM table (if tables extension enabled) ---
		if p.opts.tables {
			if tbl := p.try_table(indent) {
				parent.append_child(tbl)
				continue
			}
		}
		// --- definition list (if extension enabled) ---
		if p.opts.definition_list {
			if dl := p.try_definition_list(indent) {
				parent.append_child(dl)
				continue
			}
		}
		// --- paragraph (including setext headings) ---
		// A result of any other kind is not attached to parent.
		para := p.parse_paragraph(indent)
		if para.kind == .heading || para.kind == .paragraph {
			parent.append_child(para)
		}
	}
}

// is_thematic_break returns true if line is a valid thematic break
// (three or more -, *, or _ with optional spaces).
// Surrounding whitespace is trimmed before the check, so the caller is
// responsible for rejecting lines that are indented too deeply.
fn is_thematic_break(line string) bool {
	trimmed := line.trim_space()
	if trimmed.len < 3 {
		return false
	}
	marker := trimmed[0]
	if marker != `-` && marker != `*` && marker != `_` {
		return false
	}
	mut count := 0
	for ch in trimmed {
		if ch == marker {
			count++
		} else if ch != ` ` && ch != `\t` {
			// Anything other than the marker or inner whitespace disqualifies the line.
			return false
		}
	}
	return count >= 3
}

// try_atx_heading attempts to parse an ATX heading from line.
// Returns the heading node on success.
fn (mut p BlockParser) try_atx_heading(line string) ?&Node {
	if line.len == 0 || line[0] != `#` {
		return none
	}
	// The heading level is the length of the opening run of `#`.
	mut level := 0
	for level < line.len && line[level] == `#` {
		level++
	}
	if level > 6 {
		return none
	}
	// The opening run must be followed by a space, a tab or the end of the line.
	if level < line.len && line[level] != ` ` && line[level] != `\t` {
		return none
	}
	mut content := line[level..].trim_space()
	// Strip the optional closing `#` run. It only counts as a closing sequence
	// when it is preceded by whitespace or makes up the whole content.
	if content.ends_with('#') {
		without_hashes := content.trim_right('#')
		if without_hashes.len == 0 || without_hashes.ends_with(' ')
			|| without_hashes.ends_with('\t') {
			content = without_hashes.trim_right(' \t')
		}
	}
	mut node := new_node(.heading)
	node.level = level
	node.literal = content
	if p.opts.parser_opts.auto_heading_id {
		node.id = heading_id_from_text(content)
	}
	return node
}

// try_fenced_code attempts to parse a fenced code block starting at p.pos.
// line is the current line without the container indent. On success p.pos is
// moved past the closing fence (or to the end of input when the fence is never
// closed) and the fenced_code node is returned. On failure p.pos is left
// untouched and none is returned.
fn (mut p BlockParser) try_fenced_code(line string, indent int) ?&Node {
	fence_char, fence_len := detect_fence(line)
	if fence_len < 3 {
		return none
	}
	info := line[fence_len..].trim_space()
	// info string must not contain a backtick when using backtick fence.
	// (96 is the character code of the backtick.)
	if fence_char == 96 && info.contains('`') {
		return none
	}
	p.pos++
	mut code_lines := []string{}
	for p.pos < p.lines.len {
		stripped := trim_indent(expand_tabs(p.lines[p.pos]), indent)
		p.pos++
		// A closing fence uses the same character, is at least as long as the
		// opening fence and is followed by nothing but whitespace.
		close_char, close_len := detect_fence(stripped)
		if close_char == fence_char && close_len >= fence_len
			&& stripped[close_len..].trim_space().len == 0 {
			break
		}
		code_lines << stripped
	}
	mut node := new_node(.fenced_code)
	node.fence_info = info
	// Every content line is newline-terminated; a block without content lines
	// has an empty literal rather than a lone newline.
	node.literal = if code_lines.len == 0 { '' } else { code_lines.join('\n') + '\n' }
	return node
}

// detect_fence returns (fence_char, fence_length) if line starts with a valid
// code-fence sequence, or (0, 0) if not.
fn detect_fence(line string) (u8, int) {
	if line.len < 3 {
		return 0, 0
	}
	c := line[0]
	// 96 is the character code of the backtick.
	if c != 96 && c != `~` {
		return 0, 0
	}
	// Measure the run of identical fence characters at the start of the line.
	mut n := 0
	for n < line.len && line[n] == c {
		n++
	}
	if n >= 3 {
		return c, n
	}
	return 0, 0
}

// parse_indented_code collects lines that are indented by at least (indent+4)
// spaces (or blank) into an indented code block.
// Blank lines are recorded as empty strings; trailing ones are removed again
// before the literal is built. p.pos ends up on the first line that is neither
// blank nor indented deeply enough.
fn (mut p BlockParser) parse_indented_code(indent int) &Node {
	mut lines := []string{}
	for p.pos < p.lines.len {
		raw := expand_tabs(p.lines[p.pos])
		if is_blank(raw) {
			// Blank line may be included, but only if followed by more indented code.
			lines << ''
			p.pos++
			continue
		}
		stripped := trim_indent(raw, indent)
		if leading_spaces(stripped) < 4 {
			break
		}
		lines << trim_indent(stripped, 4)
		p.pos++
	}
	// Trim trailing blank lines.
	for lines.len > 0 && lines[lines.len - 1] == '' {
		lines.delete_last()
	}
	mut node := new_node(.code_block)
	node.literal = lines.join('\n') + '\n'
	return node
}

// try_html_block attempts to parse an HTML block starting at p.pos.
// line is the current line without the container indent. When the line starts
// an HTML block, the raw source lines of the block are collected, p.pos is
// moved past them and the html_block node is returned; otherwise none is
// returned and p.pos is left untouched.
fn (mut p BlockParser) try_html_block(line string, indent int) ?&Node {
	html_type := detect_html_block_type(line)
	if html_type == 0 {
		return none
	}
	mut raw_lines := []string{}
	raw_lines << p.lines[p.pos]
	p.pos++
	// Types 1-5 end at specific end patterns; type 6-7 end at blank line.
	// The end pattern may already occur on the start line, in which case the
	// block consists of that single line.
	mut closed := html_type <= 5 && html_block_end_reached(html_type, line)
	for !closed && p.pos < p.lines.len {
		raw := p.lines[p.pos]
		stripped := trim_indent(expand_tabs(raw), indent)
		if html_type <= 5 {
			// The line carrying the end pattern is part of the block.
			raw_lines << raw
			p.pos++
			closed = html_block_end_reached(html_type, stripped)
		} else {
			// The terminating blank line is not part of the block and is not consumed.
			if is_blank(stripped) {
				break
			}
			raw_lines << raw
			p.pos++
		}
	}
	mut node := new_node(.html_block)
	node.literal = raw_lines.join('\n') + '\n'
	return node
}

// html_block_end_reached reports whether line contains the end pattern of an
// HTML block of the given type (1-5). Other types have no end pattern and
// always yield false.
fn html_block_end_reached(html_type int, line string) bool {
	match html_type {
		1 {
			// Closing tags are matched case-insensitively.
			low := line.to_lower()
			return low.contains('</script>') || low.contains('</pre>')
				|| low.contains('</style>') || low.contains('</textarea>')
		}
		2 {
			return line.contains('-->')
		}
		3 {
			return line.contains('?>')
		}
		4 {
			return line.contains('>')
		}
		5 {
			return line.contains(']]>')
		}
		else {}
	}
	return false
}

// detect_html_block_type returns the HTML block type (1-7) or 0 if the line
// does not start an HTML block.
fn detect_html_block_type(line string) int {
	stripped := line.trim_left(' \t')
	if stripped.len == 0 || stripped[0] != `<` {
		return 0
	}
	low := stripped.to_lower()
	// Type 2: HTML comment
	if low.starts_with('