From 5e4a634512f74766ee5732dbb33f1d4555b5cb0a Mon Sep 17 00:00:00 2001 From: JalonSolov Date: Mon, 20 Apr 2026 13:08:51 -0400 Subject: [PATCH] x.markdown: add pure V markdown module (#26925) * add pure V markdown module * x.markdown: implement AI suggestions --- .gitignore | 37 +- vlib/x/markdown/README.md | 288 +++++++ vlib/x/markdown/extension.v | 115 +++ vlib/x/markdown/html.v | 405 ++++++++++ vlib/x/markdown/inline.v | 877 +++++++++++++++++++++ vlib/x/markdown/markdown.v | 142 ++++ vlib/x/markdown/markdown_test.v | 343 +++++++++ vlib/x/markdown/node.v | 144 ++++ vlib/x/markdown/parser.v | 1261 +++++++++++++++++++++++++++++++ vlib/x/markdown/util.v | 244 ++++++ 10 files changed, 3835 insertions(+), 21 deletions(-) create mode 100644 vlib/x/markdown/README.md create mode 100644 vlib/x/markdown/extension.v create mode 100644 vlib/x/markdown/html.v create mode 100644 vlib/x/markdown/inline.v create mode 100644 vlib/x/markdown/markdown.v create mode 100644 vlib/x/markdown/markdown_test.v create mode 100644 vlib/x/markdown/node.v create mode 100644 vlib/x/markdown/parser.v create mode 100644 vlib/x/markdown/util.v diff --git a/.gitignore b/.gitignore index 925ab8389..1a88525cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,13 @@ # ignore sub-level untracked files and the v binary */**/* + # unignore checker test files !vlib/v/checker/tests/*.vv !vlib/v/checker/tests/*.out -v -v.exe -v2 -v2.exe -vdbg -vdbg.exe + +# unignore vlib/x/markdown +!vlib/x/markdown/** + !*/ # Do not add !*.* here; it overrides local excludes from .git/info/exclude. 
*.exe @@ -28,23 +27,20 @@ a.out .noprefix.vrepl_temp # ignore v build files +# *.exe is already ignored above +/v +/v2 /vc /v.c /v.*.c /v.c.out /v_old -/v_old.exe +/vdbg /vold -/vold.exe /vnew -/vnew.exe /vprod /vprod_gcc /vprod_clang -/vprod.exe -/vprod_gcc.exe -/vprod_clang.exe -/vprod_msvc.exe .vrepl_temp.v fns.txt .noprefix.vrepl_temp.v @@ -68,11 +64,10 @@ cache/ !GNUmakefile # ignore editor files -.idea -.project -.classpath -.c9 -.vs +.project/ +.classpath/ +.c9/ +.vs/ *.launch .settings/ *.sublime-workspace @@ -143,7 +138,7 @@ vls.log .idea/ /*.iml -#ignore generated files: +# ignore generated files: wasm.v # ignore large GTK *.gir files @@ -153,7 +148,7 @@ Gtk-4.0.gir vlib/builtin/js/*.js vlib/v/tests/*.js -#ignore tags indexes, used by emacs/vim: +# ignore tags indexes, used by emacs/vim: ETAGS TAGS tags @@ -169,7 +164,7 @@ autofuzz.log .project.gf .aider* -#ignore common file names for bugs/reproductions +# ignore common file names for bugs/reproductions bug* issue* diff --git a/vlib/x/markdown/README.md b/vlib/x/markdown/README.md new file mode 100644 index 000000000..96ad43e4f --- /dev/null +++ b/vlib/x/markdown/README.md @@ -0,0 +1,288 @@ +// Copyright 2026 The V Language. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. + +# vlib/x/markdown - Markdown Parser and HTML Renderer + +A CommonMark-compliant Markdown parser and HTML renderer for V, with support for GitHub Flavored Markdown (GFM) extensions. Designed for feature parity with [github.com/yuin/goldmark](https://github.com/yuin/goldmark). 
+ +## Features + +### CommonMark Support +- **Block-level elements**: headings (ATX and setext), paragraphs, blockquotes, + lists (bullet and ordered), code blocks (indented and fenced), HTML blocks, + thematic breaks +- **Inline elements**: emphasis (*em* and **strong**), code spans, links + (inline and reference), images, autolinks, hard/soft line breaks, HTML + entities, raw HTML +- **Link reference definitions** for DRY Markdown + +### GFM Extensions (via `.gfm()` helper or individual extensions) +- **Tables**: `| col | col |` with alignment (`:--`, `:--:`, `--:`) +- **Strikethrough**: `~~text~~` +- **Task lists**: `- [ ] todo` and `- [x] done` +- **Linkify**: bare URLs become links + +### Additional Extensions +- **Footnotes**: `[^1]` references and `[^1]: footnote text` definitions +- **Typographer**: smart punctuation (`--` → en-dash, `---` → em-dash, + `...` → ellipsis, smart quotes) +- **Auto-heading IDs**: automatic `id` attributes on headings from text content +- **Definition lists**: Pandoc-style (requires extension) + +## Quick Start + +### Basic Usage + +```v +import x.markdown + +fn main() { + html := markdown.to_html('# Hello\n\nWorld') + println(html) + // Output:

<h1>Hello</h1>\n<p>World</p>

\n +} +``` + +### With Extensions + +```v oksyntax +mut md := markdown.new(Options{ + extensions: markdown.gfm() +}) +html := md.convert('| Name |\n|------|\n| Alice |') +println(html) // Renders as HTML table +``` + +### Fine-Grained Configuration + +```v +import x.markdown + +fn main() { + mut md := markdown.new(markdown.Options{ + extensions: [markdown.Extension(markdown.footnote()), markdown.typographer()] + parser_opts: markdown.ParserOptions{ + auto_heading_id: true + } + renderer_opts: markdown.RendererOptions{ + unsafe_: true + xhtml: true + } + }) + source := '# Title' + html := md.convert(source) + println(html) +} +``` + +### Parse to AST and Walk + +```v +import x.markdown + +fn main() { + mut md := markdown.new(markdown.Options{}) + source := '# Hello\n\n`x`' + doc := md.parse(source) + doc.walk(fn (node &markdown.Node) bool { + match node.kind { + .heading { + println('Heading level ${node.level}') + } + .code_span { + println('Code: ${node.literal}') + } + else {} + } + + return true + }) +} +``` + +## API Overview + +### Top-Level Functions +- `to_html(src: string) string` - Convert Markdown to HTML with default settings +- `to_html_opts(src: string, opts: Options) string` - Convert with custom options +- `parse_inline(src: string, opts: Options, ref_map: map) []&Node` - Parse inline content only + +### Main Structs + +#### `Markdown` +The main processor. Create with `new()`, reuse across multiple calls to share link references. 
+ +Methods: +- `convert(src: string) string` - Parse and render to HTML in one call +- `parse(src: string) &Node` - Parse to AST only + +#### `Options` (@[params]) +```v oksyntax +pub struct Options { +pub mut: + extensions []Extension + parser_opts ParserOptions + renderer_opts RendererOptions + // Extension feature flags (set by extensions) + tables bool + strikethrough bool + linkify bool + task_list bool + footnotes bool + typographer bool + definition_list bool +} +``` + +#### `ParserOptions` (@[params]) +```v oksyntax +pub struct ParserOptions { +pub mut: + auto_heading_id bool // Generate id from heading text +} +``` + +#### `RendererOptions` (@[params]) +```v oksyntax +pub struct RendererOptions { +pub mut: + unsafe_ bool // Allow raw HTML (default: false) + hard_wraps bool // Convert all \n to
(default: false) + xhtml bool // Output XHTML self-closing tags (default: false) +} +``` + +#### `Node` +An AST node. Navigate with `.children`, inspect with `.kind`, `.literal`, `.level`, etc. + +Methods: +- `text_content() string` - Extract plain text from this node and descendants +- `walk(f: fn(&Node) bool) bool` - Traverse AST pre-order; return false from callback to stop + +### Extensions + +Available as functions returning extension structs: +- `table()` - GFM tables +- `strikethrough()` - GFM strikethrough +- `linkify()` - Bare URL autolinks +- `task_list()` - GFM task lists +- `footnote()` - Footnote references and definitions +- `typographer()` - Smart punctuation +- `definition_list()` - Pandoc-style definition lists +- `gfm()` - Convenience helper returning `[table(), strikethrough(), linkify(), task_list()]` + +## Examples + +### Simple Emphasis + +```v oksyntax +assert markdown.to_html('*em*').contains('em') +assert markdown.to_html('**strong**').contains('strong') +``` + +### Links and Images + +```v oksyntax +// Inline link +html := markdown.to_html('[click](https://example.com)') +// Reference link +html = markdown.to_html('[click][ref]\n\n[ref]: https://example.com') +// Image +html = markdown.to_html('![alt](image.png "title")') +``` + +### Code Blocks + +```v oksyntax +// Indented code +html := markdown.to_html(' code') +// Fenced code +html = markdown.to_html('```v\nfn main() {}\n```') +``` + +### Lists + +```v oksyntax +// Bullet list +html := markdown.to_html('- item 1\n- item 2') +// Ordered list +html = markdown.to_html('1. first\n2. second') +// Task list (enable via extension or task_list option) +html = markdown.to_html_opts('- [x] done', Options{ task_list: true }) +``` + +### Tables (GFM) + +```v oksyntax +src := '| Left | Center | Right |\n|:--|:--:|--:|\n| A | B | C |' +html := markdown.to_html_opts(src, Options{ tables: true }) +``` + +### Footnotes + +```v oksyntax +src := 'Text[^1]\n\n[^1]: Footnote body.' 
+html := markdown.to_html_opts(src, Options{ footnotes: true }) +// Renders with reference and footnote section at bottom +``` + +## Design Notes + +### Block Parsing +- Reads source line-by-line, building a block-level AST +- Handles lazy continuation lines for blockquotes and lists +- Collects link reference definitions for inline resolution + +### Inline Parsing +- Parses raw text from paragraph/heading/cell nodes using a simple state machine +- Emphasis/strong uses a delimiter-run resolution pass aligned with CommonMark rules +- Backticks, brackets, and HTML are handled specially + +### Rendering +- Tree walk via `render_node()` dispatch on `NodeKind` +- Inline nodes parsed on-demand during rendering +- Link references cached in `Markdown` for reuse across multiple convert calls + +## Limitations and Known Issues + +- Definition list syntax is Pandoc-style; CommonMark does not define this + +**Status**: All core features (headings, emphasis, links, code, lists, +blockquotes, task lists, tables, HTML escaping) work reliably without crashes. + +## Testing + +Run the test suite: + +```bash +v -silent test vlib/x/markdown/markdown_test.v +``` + +Or write your own: + +```v oksyntax +import x.markdown + +fn test_my_markdown() { + html := markdown.to_html('# Test') + assert html == '

<h1>Test</h1>

\n' +} +``` + +## Contributing + +- Follow V style guidelines (use `v fmt -w` on edits) +- Add tests for new features +- Update documentation for public API changes +- Keep CommonMark compliance as the baseline + +## License + +MIT, same as V. + +## References + +- [CommonMark Specification](https://spec.commonmark.org/) +- [GitHub Flavored Markdown](https://github.github.com/gfm/) +- [goldmark (Go implementation)](https://github.com/yuin/goldmark) diff --git a/vlib/x/markdown/extension.v b/vlib/x/markdown/extension.v new file mode 100644 index 000000000..aff85c4db --- /dev/null +++ b/vlib/x/markdown/extension.v @@ -0,0 +1,115 @@ +// Copyright 2026 The V Language. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module markdown + +// Extension is the interface implemented by markdown extensions. +// An extension configures the Markdown processor by enabling parser and +// renderer features. +pub interface Extension { + // extend is called once when the extension is registered with a Markdown processor. + extend(mut m Markdown) +} + +// TableExt adds GitHub Flavored Markdown table support (| col | col |). +pub struct TableExt {} + +// extend implements Extension for TableExt. +pub fn (_ TableExt) extend(mut m Markdown) { + m.opts.tables = true +} + +// StrikethroughExt adds GFM strikethrough support (~~text~~). +pub struct StrikethroughExt {} + +// extend implements Extension for StrikethroughExt. +pub fn (_ StrikethroughExt) extend(mut m Markdown) { + m.opts.strikethrough = true +} + +// LinkifyExt adds autolink support for bare URLs and email addresses. +pub struct LinkifyExt {} + +// extend implements Extension for LinkifyExt. +pub fn (_ LinkifyExt) extend(mut m Markdown) { + m.opts.linkify = true +} + +// TaskListExt adds GFM task list item support (- [ ] / - [x]). +pub struct TaskListExt {} + +// extend implements Extension for TaskListExt. 
+pub fn (_ TaskListExt) extend(mut m Markdown) {
+	// Registration only flips the feature flag on the processor's options.
+	m.opts.task_list = true
+}
+
+// FootnoteExt adds footnote support ([^label] references and [^label]: definitions).
+pub struct FootnoteExt {}
+
+// extend implements Extension for FootnoteExt by setting `m.opts.footnotes`.
+pub fn (_ FootnoteExt) extend(mut m Markdown) {
+	m.opts.footnotes = true
+}
+
+// TypographerExt replaces ASCII punctuation sequences with Unicode typographic
+// equivalents: -- en dash, --- em dash, ... ellipsis, and smart quotes.
+pub struct TypographerExt {}
+
+// extend implements Extension for TypographerExt by setting `m.opts.typographer`.
+pub fn (_ TypographerExt) extend(mut m Markdown) {
+	m.opts.typographer = true
+}
+
+// DefinitionListExt adds Pandoc-style definition list support.
+pub struct DefinitionListExt {}
+
+// extend implements Extension for DefinitionListExt by setting
+// `m.opts.definition_list`.
+pub fn (_ DefinitionListExt) extend(mut m Markdown) {
+	m.opts.definition_list = true
+}
+
+// table returns a TableExt extension value.
+// The struct has no fields; all behaviour comes from its `extend` method.
+pub fn table() TableExt {
+	return TableExt{}
+}
+
+// strikethrough returns a StrikethroughExt extension value.
+pub fn strikethrough() StrikethroughExt {
+	return StrikethroughExt{}
+}
+
+// linkify returns a LinkifyExt extension value.
+pub fn linkify() LinkifyExt {
+	return LinkifyExt{}
+}
+
+// task_list returns a TaskListExt extension value.
+pub fn task_list() TaskListExt {
+	return TaskListExt{}
+}
+
+// footnote returns a FootnoteExt extension value.
+pub fn footnote() FootnoteExt {
+	return FootnoteExt{}
+}
+
+// typographer returns a TypographerExt extension value.
+pub fn typographer() TypographerExt {
+	return TypographerExt{}
+}
+
+// definition_list returns a DefinitionListExt extension value.
+pub fn definition_list() DefinitionListExt {
+	return DefinitionListExt{}
+}
+
+// gfm returns the core GitHub Flavored Markdown extensions:
+// TableExt, StrikethroughExt, LinkifyExt, and TaskListExt.
+pub fn gfm() []Extension {
+	// The explicit cast on the first element fixes the array's element type
+	// to the Extension interface; the remaining literals follow it.
+	return [
+		Extension(TableExt{}),
+		StrikethroughExt{},
+		LinkifyExt{},
+		TaskListExt{},
+	]
+}
diff --git a/vlib/x/markdown/html.v b/vlib/x/markdown/html.v
new file mode 100644
index 000000000..7f64d52a9
--- /dev/null
+++ b/vlib/x/markdown/html.v
@@ -0,0 +1,405 @@
+// Copyright 2026 The V Language. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module markdown
+
+import strings
+
+// HTMLRenderer renders a parsed markdown AST to an HTML string.
+// `opts` and `ref_map` are read-only inputs; everything under `mut:` is
+// scratch state that changes while a document is being rendered.
+struct HTMLRenderer {
+	opts    Options
+	ref_map map[string]LinkRef
+mut:
+	sb strings.Builder
+	// footnote tracking
+	fn_order      []string         // ordered list of encountered fn labels
+	fn_nodes      map[string]&Node // label → footnote_def node
+	tight_list    bool             // whether we're inside a tight list
+	in_table_head bool             // set by render_table_section; selects th vs td cells
+}
+
+// render renders the document node to an HTML string.
+// It replaces `r.sb` with a fresh builder on every call. `fn_order` and
+// `fn_nodes` are not cleared here.
+pub fn (mut r HTMLRenderer) render(doc &Node) string {
+	r.sb = strings.new_builder(1024)
+	// Pre-collect footnote definitions if extension is enabled.
+	// Only direct children of the document are scanned; a later definition
+	// with the same label overwrites an earlier one.
+	if r.opts.footnotes {
+		for child in doc.children {
+			if child.kind == .footnote_def {
+				r.fn_nodes[child.fn_label] = child
+			}
+		}
+	}
+	r.render_children(doc)
+	// Append footnotes section if any refs were used.
+	// `fn_order` is filled by render_footnote_ref while the body is rendered.
+	if r.opts.footnotes && r.fn_order.len > 0 {
+		r.render_footnotes_section()
+	}
+	return r.sb.str()
+}
+
+// render_node dispatches rendering to the appropriate method.
+fn (mut r HTMLRenderer) render_node(node &Node) {
+	// One arm per node kind. Two kinds intentionally emit nothing at their
+	// position in the tree (see the comments on those arms).
+	match node.kind {
+		// Block nodes.
+		.document { r.render_children(node) }
+		.heading { r.render_heading(node) }
+		.paragraph { r.render_paragraph(node) }
+		.blockquote { r.render_blockquote(node) }
+		.list { r.render_list(node) }
+		.list_item { r.render_list_item(node) }
+		.code_block { r.render_code_block(node) }
+		.fenced_code { r.render_fenced_code(node) }
+		.thematic_break { r.render_thematic_break() }
+		.html_block { r.render_html_block(node) }
+		.link_ref_def {} // already collected, nothing to render
+		.table { r.render_table(node) }
+		// Head and body share one method; the tag name tells it which one.
+		.table_head { r.render_table_section(node, 'thead') }
+		.table_body { r.render_table_section(node, 'tbody') }
+		.table_row { r.render_table_row(node) }
+		.table_cell { r.render_table_cell(node) }
+		.definition_list { r.render_definition_list(node) }
+		.definition_term { r.render_definition_term(node) }
+		.definition_desc { r.render_definition_desc(node) }
+		.footnote_def {} // rendered in the footnote section
+		// Inline nodes.
+		.text { r.render_text(node) }
+		.emphasis { r.render_emphasis(node) }
+		.strong { r.render_strong(node) }
+		.code_span { r.render_code_span(node) }
+		.link { r.render_link(node) }
+		.image { r.render_image(node) }
+		.autolink { r.render_autolink(node) }
+		.raw_html { r.render_raw_html(node) }
+		.hard_break { r.render_hard_break() }
+		.soft_break { r.render_soft_break() }
+		.strikethrough { r.render_strikethrough(node) }
+		.task_checkbox { r.render_task_checkbox(node) }
+		.footnote_ref { r.render_footnote_ref(node) }
+	}
+}
+
+// render_children renders all children of node.
+// Children are visited in order, each through render_node.
+fn (mut r HTMLRenderer) render_children(node &Node) {
+	for child in node.children {
+		r.render_node(child)
+	}
+}
+
+// render_inline parses and renders inline content from a literal string.
+fn (mut r HTMLRenderer) render_inline(src string) {
+	nodes := parse_inline(src, r.opts, r.ref_map)
+	for node in nodes {
+		r.render_node(node)
+	}
+}
+
+// ---- Block elements ----
+
+// render_heading writes `<hN>…</hN>\n`, adding an `id` attribute when the
+// node carries one. Pre-parsed children win over the raw literal.
+fn (mut r HTMLRenderer) render_heading(node &Node) {
+	tag := 'h${node.level}'
+	if node.id.len > 0 {
+		r.sb.write_string('<${tag} id="${html_escape(node.id)}">')
+	} else {
+		r.sb.write_string('<${tag}>')
+	}
+	if node.children.len > 0 {
+		r.render_children(node)
+	} else {
+		r.render_inline(node.literal)
+	}
+	r.sb.write_string('</${tag}>\n')
+}
+
+// render_paragraph writes `<p>…</p>\n`, or only the content inside a tight list.
+fn (mut r HTMLRenderer) render_paragraph(node &Node) {
+	if r.tight_list {
+		// In a tight list, paragraph content is rendered directly without <p> tags.
+		if node.children.len > 0 {
+			r.render_children(node)
+		} else {
+			r.render_inline(node.literal)
+		}
+		return
+	}
+	r.sb.write_string('<p>')
+	if node.children.len > 0 {
+		r.render_children(node)
+	} else {
+		r.render_inline(node.literal)
+	}
+	r.sb.write_string('</p>\n')
+}
+
+// render_blockquote wraps the rendered children in `<blockquote>`.
+fn (mut r HTMLRenderer) render_blockquote(node &Node) {
+	r.sb.write_string('<blockquote>\n')
+	r.render_children(node)
+	r.sb.write_string('</blockquote>\n')
+}
+
+// render_list writes `<ol>`/`<ul>` around the items. An ordered list that
+// does not start at 1 gets a `start` attribute. The tight-list flag is
+// saved and restored so nested lists do not leak their tightness outward.
+fn (mut r HTMLRenderer) render_list(node &Node) {
+	tag := if node.is_ordered { 'ol' } else { 'ul' }
+	if node.is_ordered && node.list_start != 1 {
+		r.sb.write_string('<${tag} start="${node.list_start}">\n')
+	} else {
+		r.sb.write_string('<${tag}>\n')
+	}
+	prev_tight := r.tight_list
+	r.tight_list = node.is_tight
+	r.render_children(node)
+	r.tight_list = prev_tight
+	r.sb.write_string('</${tag}>\n')
+}
+
+// render_list_item writes one `<li>`. When task lists are enabled and the
+// first child is a task_checkbox, the checkbox is emitted inline and that
+// child is skipped when rendering the rest.
+fn (mut r HTMLRenderer) render_list_item(node &Node) {
+	// Check if this is a task list item (first child is task_checkbox).
+	if r.opts.task_list && node.children.len > 0 && node.children[0].kind == .task_checkbox {
+		chk := node.children[0]
+		checked_attr := if chk.checked { ' checked=""' } else { '' }
+		// NOTE(review): the original tag text was lost; attribute order and the
+		// trailing space follow the GFM spec / goldmark output. Confirm against
+		// markdown_test.v.
+		if r.opts.renderer_opts.xhtml {
+			r.sb.write_string('<li><input${checked_attr} disabled="" type="checkbox" /> ')
+		} else {
+			r.sb.write_string('<li><input${checked_attr} disabled="" type="checkbox"> ')
+		}
+		for i := 1; i < node.children.len; i++ {
+			r.render_node(node.children[i])
+		}
+		r.sb.write_string('</li>\n')
+		return
+	}
+	r.sb.write_string('<li>')
+	r.render_children(node)
+	r.sb.write_string('</li>\n')
+}
+
+// render_code_block writes an indented code block as escaped `<pre><code>`.
+fn (mut r HTMLRenderer) render_code_block(node &Node) {
+	r.sb.write_string('<pre><code>')
+	r.sb.write_string(html_escape(node.literal))
+	r.sb.write_string('</code></pre>\n')
+}
+
+// render_fenced_code writes a fenced code block; a non-empty info string
+// contributes a `language-…` class.
+fn (mut r HTMLRenderer) render_fenced_code(node &Node) {
+	if node.fence_info.len > 0 {
+		// Use only the first word of the info string as the language class.
+		lang := node.fence_info.split(' ')[0].split('\t')[0]
+		// The info string is document-controlled, so it is escaped before it
+		// is placed inside the attribute value.
+		r.sb.write_string('<pre><code class="language-${html_escape(lang)}">')
+	} else {
+		r.sb.write_string('<pre><code>')
+	}
+	r.sb.write_string(html_escape(node.literal))
+	r.sb.write_string('</code></pre>\n')
+}
+
+// render_thematic_break writes `<hr>` (self-closed in XHTML mode).
+fn (mut r HTMLRenderer) render_thematic_break() {
+	if r.opts.renderer_opts.xhtml {
+		r.sb.write_string('<hr />\n')
+	} else {
+		r.sb.write_string('<hr>\n')
+	}
+}
+
+// render_html_block passes raw HTML through only in unsafe mode; otherwise
+// it is replaced by a placeholder comment.
+fn (mut r HTMLRenderer) render_html_block(node &Node) {
+	if r.opts.renderer_opts.unsafe_ {
+		r.sb.write_string(node.literal)
+	} else {
+		r.sb.write_string('<!-- raw HTML omitted -->\n')
+	}
+}
+
+// ---- Table ----
+
+// render_table wraps the head/body sections in `<table>`.
+fn (mut r HTMLRenderer) render_table(node &Node) {
+	r.sb.write_string('<table>\n')
+	r.render_children(node)
+	r.sb.write_string('</table>\n')
+}
+
+// render_table_section writes `<thead>` or `<tbody>` and records, for the
+// duration of the section, whether cells belong to the header.
+fn (mut r HTMLRenderer) render_table_section(node &Node, tag string) {
+	prev_in_table_head := r.in_table_head
+	r.in_table_head = tag == 'thead'
+	r.sb.write_string('<${tag}>\n')
+	r.render_children(node)
+	r.sb.write_string('</${tag}>\n')
+	r.in_table_head = prev_in_table_head
+}
+
+// render_table_row writes one `<tr>`.
+fn (mut r HTMLRenderer) render_table_row(node &Node) {
+	// Nodes have no parent pointer, so the row cannot tell whether it sits in
+	// the header. The row always emits <tr>; render_table_cell picks <th> or
+	// <td> from r.in_table_head, which render_table_section maintains.
+	r.sb.write_string('<tr>\n')
+	r.render_children(node)
+	r.sb.write_string('</tr>\n')
+}
+
+// render_table_cell writes one `<th>`/`<td>` with an optional `align`
+// attribute. Cell content is always parsed from the raw literal.
+fn (mut r HTMLRenderer) render_table_cell(node &Node) {
+	align_attr := match node.align {
+		.left { ' align="left"' }
+		.center { ' align="center"' }
+		.right { ' align="right"' }
+		else { '' }
+	}
+
+	cell_tag := if r.in_table_head { 'th' } else { 'td' }
+	r.sb.write_string('<${cell_tag}${align_attr}>')
+	r.render_inline(node.literal)
+	r.sb.write_string('</${cell_tag}>\n')
+}
+
+// ---- Definition list ----
+
+// render_definition_list wraps terms and descriptions in `<dl>`.
+fn (mut r HTMLRenderer) render_definition_list(node &Node) {
+	r.sb.write_string('<dl>\n')
+	r.render_children(node)
+	r.sb.write_string('</dl>\n')
+}
+
+// render_definition_term writes `<dt>` for the term, then renders the
+// term's children after it.
+fn (mut r HTMLRenderer) render_definition_term(node &Node) {
+	r.sb.write_string('<dt>')
+	r.render_inline(node.literal)
+	r.sb.write_string('</dt>\n')
+	for child in node.children {
+		r.render_node(child)
+	}
+}
+
+// render_definition_desc writes one `<dd>` parsed from the raw literal.
+fn (mut r HTMLRenderer) render_definition_desc(node &Node) {
+	r.sb.write_string('<dd>')
+	r.render_inline(node.literal)
+	r.sb.write_string('</dd>\n')
+}
+
+// ---- Footnotes ----
+
+// render_footnote_ref writes the superscript reference and records the
+// label in `fn_order` the first time it is seen, so repeated references
+// share one ordinal.
+fn (mut r HTMLRenderer) render_footnote_ref(node &Node) {
+	label := node.fn_label
+	// Assign an ordinal on first encounter.
+	mut idx := 0
+	for i, l in r.fn_order {
+		if l == label {
+			idx = i + 1
+			break
+		}
+	}
+	if idx == 0 {
+		r.fn_order << label
+		idx = r.fn_order.len
+	}
+	// NOTE(review): only `${idx}` survived in the original literal; the id/href
+	// scheme below is reconstructed and keyed on the label so that it matches
+	// render_footnotes_section, which iterates labels without an index.
+	anchor := html_escape(label)
+	r.sb.write_string('<sup id="fnref:${anchor}"><a href="#fn:${anchor}" class="footnote-ref">${idx}</a></sup>')
+}
+
+// render_footnotes_section writes the footnote list in first-reference
+// order. Labels without a collected definition are skipped.
+fn (mut r HTMLRenderer) render_footnotes_section() {
+	// NOTE(review): class/id attributes are reconstructed; confirm against tests.
+	r.sb.write_string('<div class="footnotes">\n<hr>\n<ol>\n')
+	for label in r.fn_order {
+		fn_node := r.fn_nodes[label] or { continue }
+		r.sb.write_string('<li id="fn:${html_escape(label)}">')
+		r.render_inline(fn_node.literal)
+		r.sb.write_string('</li>\n')
+	}
+	r.sb.write_string('</ol>\n</div>\n')
+}
+
+// ---- Inline elements ----
+
+// render_text writes escaped text, applying smart punctuation first when
+// the typographer option is on.
+fn (mut r HTMLRenderer) render_text(node &Node) {
+	content := if r.opts.typographer {
+		smart_punctuate(node.literal)
+	} else {
+		node.literal
+	}
+	r.sb.write_string(html_escape(content))
+}
+
+// render_emphasis wraps children in `<em>`.
+fn (mut r HTMLRenderer) render_emphasis(node &Node) {
+	r.sb.write_string('<em>')
+	r.render_children(node)
+	r.sb.write_string('</em>')
+}
+
+// render_strong wraps children in `<strong>`.
+fn (mut r HTMLRenderer) render_strong(node &Node) {
+	r.sb.write_string('<strong>')
+	r.render_children(node)
+	r.sb.write_string('</strong>')
+}
+
+// render_code_span writes the escaped literal inside `<code>`.
+fn (mut r HTMLRenderer) render_code_span(node &Node) {
+	r.sb.write_string('<code>')
+	r.sb.write_string(html_escape(node.literal))
+	r.sb.write_string('</code>')
+}
+
+// render_link writes `<a href=… [title=…]>children</a>`.
+fn (mut r HTMLRenderer) render_link(node &Node) {
+	// NOTE(review): the destination field name was lost with the stripped tag;
+	// `dest` is assumed here and in render_image/render_autolink. Confirm
+	// against the Node struct in node.v.
+	r.sb.write_string('<a href="${html_escape(node.dest)}"')
+	if node.title.len > 0 {
+		r.sb.write_string(' title="${html_escape(node.title)}"')
+	}
+	r.sb.write_string('>')
+	r.render_children(node)
+	r.sb.write_string('</a>')
+}
+
+// render_image writes `<img>`; the alt text is the plain text content of
+// the node's descendants.
+fn (mut r HTMLRenderer) render_image(node &Node) {
+	alt := node.text_content()
+	// NOTE(review): `dest` is an assumed field name (see render_link).
+	r.sb.write_string('<img src="${html_escape(node.dest)}" alt="${html_escape(alt)}"')
+	if node.title.len > 0 {
+		r.sb.write_string(' title="${html_escape(node.title)}"')
+	}
+	if r.opts.renderer_opts.xhtml {
+		r.sb.write_string(' />')
+	} else {
+		r.sb.write_string('>')
+	}
+}
+
+// render_autolink writes a link whose visible text is the node literal.
+fn (mut r HTMLRenderer) render_autolink(node &Node) {
+	// NOTE(review): `dest` is an assumed field name (see render_link).
+	r.sb.write_string('<a href="${html_escape(node.dest)}">')
+	r.sb.write_string(html_escape(node.literal))
+	r.sb.write_string('</a>')
+}
+
+// render_raw_html passes inline HTML through only in unsafe mode.
+fn (mut r HTMLRenderer) render_raw_html(node &Node) {
+	if r.opts.renderer_opts.unsafe_ {
+		r.sb.write_string(node.literal)
+	} else {
+		r.sb.write_string('<!-- raw HTML omitted -->')
+	}
+}
+
+// render_hard_break writes `<br>` followed by a newline.
+fn (mut r HTMLRenderer) render_hard_break() {
+	if r.opts.renderer_opts.xhtml {
+		r.sb.write_string('<br />\n')
+	} else {
+		r.sb.write_string('<br>\n')
+	}
+}
+
+// render_soft_break writes a plain newline, or a `<br>` when hard_wraps is on.
+fn (mut r HTMLRenderer) render_soft_break() {
+	if r.opts.renderer_opts.hard_wraps {
+		if r.opts.renderer_opts.xhtml {
+			r.sb.write_string('<br />\n')
+		} else {
+			r.sb.write_string('<br>\n')
+		}
+	} else {
+		r.sb.write_string('\n')
+	}
+}
+
+// render_strikethrough wraps children in `<del>`.
+fn (mut r HTMLRenderer) render_strikethrough(node &Node) {
+	r.sb.write_string('<del>')
+	r.render_children(node)
+	r.sb.write_string('</del>')
+}
+
+// render_task_checkbox writes a standalone checkbox.
+fn (mut r HTMLRenderer) render_task_checkbox(node &Node) {
+	// Rendered inline in render_list_item; standalone fallback:
+	checked := if node.checked { ' checked=""' } else { '' }
+	// NOTE(review): attribute set reconstructed to match render_list_item.
+	if r.opts.renderer_opts.xhtml {
+		r.sb.write_string('<input${checked} disabled="" type="checkbox" />')
+	} else {
+		r.sb.write_string('<input${checked} disabled="" type="checkbox">')
+	}
+}
diff --git a/vlib/x/markdown/inline.v b/vlib/x/markdown/inline.v
new file mode 100644
index 000000000..1b3ad44a9
--- /dev/null
+++ b/vlib/x/markdown/inline.v
@@ -0,0 +1,877 @@
+// Copyright 2026 The V Language. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module markdown
+
+import encoding.html as ehtml
+
+// parse_inline parses src as inline content and returns a slice of inline nodes.
+pub fn parse_inline(src string, opts Options, ref_map map[string]LinkRef) []&Node {
+	mut p := InlineParser{
+		src:     src
+		opts:    opts
+		ref_map: ref_map
+	}
+	return p.parse()
+}
+
+// InlineParser parses inline markdown content.
+struct InlineParser {
+	opts    Options
+	ref_map map[string]LinkRef
+mut:
+	src string
+	pos int
+}
+
+// EmphDelim describes one `*`/`_` delimiter run recorded by the parser.
+struct EmphDelim {
+mut:
+	node_idx  int
+	ch        u8
+	length    int
+	orig_len  int
+	can_open  bool
+	can_close bool
+	active    bool = true
+}
+
+// parse parses the full inline source and returns a node slice.
+fn (mut p InlineParser) parse() []&Node {
+	mut result := []&Node{}
+	mut delims := []EmphDelim{}
+	for p.pos < p.src.len {
+		// `*` and `_` runs are handled here rather than in parse_one: each run
+		// becomes a plain text node now and is paired up afterwards.
+		if p.src[p.pos] == `*` || p.src[p.pos] == `_` {
+			start := p.pos
+			ch := p.src[p.pos]
+			for p.pos < p.src.len && p.src[p.pos] == ch {
+				p.pos++
+			}
+			run := p.src[start..p.pos]
+			// Start/end of input is treated as a space for the flanking checks.
+			before := if start > 0 { p.src[start - 1] } else { u8(` `) }
+			after := if p.pos < p.src.len { p.src[p.pos] } else { u8(` `) }
+			can_open := can_open_emphasis(ch, before, after)
+			can_close := can_close_emphasis(ch, before, after)
+			result << text_node(run)
+			// Runs that can neither open nor close stay as literal text.
+			if can_open || can_close {
+				delims << EmphDelim{
+					node_idx:  result.len - 1
+					ch:        ch
+					length:    run.len
+					orig_len:  run.len
+					can_open:  can_open
+					can_close: can_close
+				}
+			}
+			continue
+		}
+		nodes := p.parse_one()
+		for n in nodes {
+			result << n
+		}
+	}
+	if delims.len > 0 {
+		resolve_emphasis(mut result, mut delims)
+		// Fully consumed delimiter runs leave empty text nodes behind.
+		result = compact_empty_text_nodes(result)
+	}
+	return merge_text_nodes(result)
+}
+
+// resolve_emphasis pairs closing delimiter runs with earlier opening runs
+// and rewrites `nodes` in place: the nodes between a matched pair are moved
+// under a new emphasis/strong node and the used delimiter characters are
+// trimmed from the two text nodes.
+fn resolve_emphasis(mut nodes []&Node, mut delims []EmphDelim) {
+	mut i := 0
+	for i < delims.len {
+		// Only live runs that may close and still have characters left act as closers.
+		if !delims[i].active || !delims[i].can_close || delims[i].length == 0 {
+			i++
+			continue
+		}
+		// Walk backwards for the nearest usable opener.
+		mut opener := i - 1
+		for opener >= 0 {
+			if !delims[opener].active || !delims[opener].can_open || delims[opener].length == 0 {
+				opener--
+				continue
+			}
+			if delims[opener].ch != delims[i].ch {
+				opener--
+				continue
+			}
+			// Multiple-of-3 restriction: when either run can both open and close,
+			// the pair is rejected if the original lengths sum to a multiple of 3
+			// unless both lengths are themselves multiples of 3.
+			if (delims[i].can_open || delims[opener].can_close)
+				&& (delims[opener].orig_len + delims[i].orig_len) % 3 == 0
+				&& (delims[opener].orig_len % 3 != 0 || delims[i].orig_len % 3 != 0) {
+				opener--
+				continue
+			}
+			// Adjacent runs have nothing between them to wrap.
+			if delims[opener].node_idx + 1 >= delims[i].node_idx {
+				opener--
+				continue
+			}
+			break
+		}
+		if opener < 0 {
+			i++
+			continue
+		}
+
+		// Two characters from each side make strong, otherwise one makes emphasis.
+		use_len := if delims[opener].length >= 2 && delims[i].length >= 2 { 2 } else { 1 }
+		opener_idx := delims[opener].node_idx
+		closer_idx := delims[i].node_idx
+		// Defensive bounds check: retire the closer instead of indexing out of range.
+		if opener_idx < 0 || closer_idx < 0 || opener_idx >= nodes.len || closer_idx >= nodes.len
+			|| opener_idx >= closer_idx {
+			delims[i].active = false
+			i++
+			continue
+		}
+		if nodes[opener_idx].literal.len < use_len || nodes[closer_idx].literal.len < use_len {
+			delims[i].active = false
+			i++
+			continue
+		}
+
+		// Take characters from the end of the opener and the start of the closer.
+		nodes[opener_idx].literal = nodes[opener_idx].literal[..nodes[opener_idx].literal.len - use_len]
+		nodes[closer_idx].literal = nodes[closer_idx].literal[use_len..]
+		delims[opener].length -= use_len
+		delims[i].length -= use_len
+
+		mut emph := new_node(if use_len == 2 { .strong } else { .emphasis })
+		for child in nodes[opener_idx + 1..closer_idx] {
+			emph.append_child(child)
+		}
+
+		// Replace the wrapped nodes with the single new node.
+		n_inner := closer_idx - opener_idx - 1
+		if n_inner > 0 {
+			nodes.delete_many(opener_idx + 1, n_inner)
+			nodes.insert(opener_idx + 1, emph)
+		}
+
+		// n_inner nodes were removed and one inserted, so every index at or
+		// after the closer moves down by n_inner - 1.
+		mut delta := n_inner - 1
+		if delta < 0 {
+			delta = 0
+		}
+		for j := 0; j < delims.len; j++ {
+			if !delims[j].active {
+				continue
+			}
+			// Runs strictly inside the wrapped span now live under `emph`.
+			if delims[j].node_idx > opener_idx && delims[j].node_idx < closer_idx {
+				delims[j].active = false
+				continue
+			}
+			if delta > 0 && delims[j].node_idx >= closer_idx {
+				delims[j].node_idx -= delta
+			}
+		}
+
+		if delims[opener].length == 0 {
+			delims[opener].can_open = false
+		}
+		if delims[i].length == 0 {
+			delims[i].can_close = false
+		}
+		// Rescan from just after the opener so leftover characters on either
+		// side can still form an enclosing pair.
+		i = opener + 1
+	}
+}
+
+// compact_empty_text_nodes returns `nodes` without text nodes whose literal
+// is empty; all other nodes are kept in order.
+fn compact_empty_text_nodes(nodes []&Node) []&Node {
+	mut out := []&Node{}
+	for n in nodes {
+		if n.kind == .text && n.literal.len == 0 {
+			continue
+		}
+		out << n
+	}
+	return out
+}
+
+// parse_one parses one or more inline elements at the current position.
+fn (mut p InlineParser) parse_one() []&Node {
+	if p.pos >= p.src.len {
+		return []
+	}
+	c := p.src[p.pos]
+	// Each arm first tries the structured construct for its trigger byte and
+	// falls back to emitting that byte as literal text.
+	match c {
+		`\\` {
+			return [p.parse_backslash()]
+		}
+		96 { // backtick
+			if node := p.try_code_span() {
+				return [node]
+			}
+			p.pos++
+			return [text_node('`')]
+		}
+		`*`, `_` {
+			// Delimiter runs are normally consumed by parse(); this is the
+			// literal fallback for direct callers.
+			p.pos++
+			return [text_node(c.ascii_str())]
+		}
+		`~` {
+			if p.opts.strikethrough {
+				if node := p.try_strikethrough() {
+					return [node]
+				}
+			}
+			p.pos++
+			return [text_node('~')]
+		}
+		`[` {
+			if nodes := p.try_link_or_footnote() {
+				return nodes
+			}
+			p.pos++
+			return [text_node('[')]
+		}
+		`!` {
+			bang := p.pos
+			if bang + 1 < p.src.len && p.src[bang + 1] == `[` {
+				// Step past `![` before attempting the image.
+				p.pos = bang + 2
+				if nodes := p.try_image_after_bang() {
+					return nodes
+				}
+			}
+			// Not an image: emit the `!` alone; a following `[` is parsed on
+			// the next call.
+			p.pos = bang + 1
+			return [text_node('!')]
+		}
+		`<` {
+			if node := p.try_autolink_or_html() {
+				return [node]
+			}
+			p.pos++
+			return [text_node('<')]
+		}
+		`&` {
+			if node := p.try_entity() {
+				return [node]
+			}
+			p.pos++
+			return [text_node('&')]
+		}
+		`\n` {
+			return [p.parse_newline()]
+		}
+		else {
+			if p.opts.linkify {
+				if node := p.try_linkify() {
+					return [node]
+				}
+			}
+			// Plain byte: emitted as a one-byte text node. Adjacent text nodes
+			// are joined later by merge_text_nodes.
+			p.pos++
+			return [text_node(c.ascii_str())]
+		}
+	}
+}
+
+// text_node creates a text node with the given literal string.
+fn text_node(s string) &Node {
+	mut n := new_node(.text)
+	n.literal = s
+	return n
+}
+
+// merge_text_nodes merges consecutive text nodes into one.
+// The first node of each run is kept and receives the concatenated literal;
+// non-text nodes are passed through unchanged. Each run is joined once, so
+// the cost is linear in the total text length even though parse_one emits
+// one node per plain byte.
+fn merge_text_nodes(nodes []&Node) []&Node {
+	if nodes.len <= 1 {
+		return nodes
+	}
+	mut result := []&Node{cap: nodes.len}
+	mut i := 0
+	for i < nodes.len {
+		result << nodes[i]
+		if nodes[i].kind != .text {
+			i++
+			continue
+		}
+		// Find the end of the run of text nodes that starts at i.
+		mut stop := i + 1
+		for stop < nodes.len && nodes[stop].kind == .text {
+			stop++
+		}
+		if stop - i > 1 {
+			mut parts := []string{cap: stop - i}
+			for k in i .. stop {
+				parts << nodes[k].literal
+			}
+			result[result.len - 1].literal = parts.join('')
+		}
+		i = stop
+	}
+	return result
+}
+
+// can_open_emphasis reports whether a delimiter run can open emphasis.
+fn can_open_emphasis(delim u8, before u8, after u8) bool {
+	// Classify the two neighbouring bytes once, then derive both flanking
+	// properties from those four facts.
+	prev_space := is_unicode_space(before)
+	prev_punct := is_ascii_punct(before)
+	next_space := is_unicode_space(after)
+	next_punct := is_ascii_punct(after)
+	flanks_left := !next_space && (!next_punct || prev_space || prev_punct)
+	flanks_right := !prev_space && (!prev_punct || next_space || next_punct)
+	return match delim {
+		`*` { flanks_left }
+		// `_` additionally must not sit inside a word unless punctuation precedes it.
+		`_` { flanks_left && (!flanks_right || prev_punct) }
+		else { false }
+	}
+}
+
+// can_close_emphasis reports whether a delimiter run can close emphasis.
+// `before` and `after` are the bytes on either side of the run.
+fn can_close_emphasis(delim u8, before u8, after u8) bool {
+	prev_space := is_unicode_space(before)
+	prev_punct := is_ascii_punct(before)
+	next_space := is_unicode_space(after)
+	next_punct := is_ascii_punct(after)
+	flanks_left := !next_space && (!next_punct || prev_space || prev_punct)
+	flanks_right := !prev_space && (!prev_punct || next_space || next_punct)
+	return match delim {
+		`*` { flanks_right }
+		// Mirror image of the opening rule: punctuation must follow an intraword `_`.
+		`_` { flanks_right && (!flanks_left || next_punct) }
+		else { false }
+	}
+}
+
+// parse_backslash handles backslash escapes and hard line breaks.
+// A backslash before a newline yields a hard break, before ASCII punctuation
+// yields that character as text, and anywhere else yields a literal backslash
+// without consuming the following byte.
+fn (mut p InlineParser) parse_backslash() &Node {
+	p.pos++ // consume '\'
+	if p.pos < p.src.len {
+		escaped := p.src[p.pos]
+		if escaped == `\n` {
+			p.pos++
+			return new_node(.hard_break)
+		}
+		if is_ascii_punct(escaped) {
+			p.pos++
+			return text_node(escaped.ascii_str())
+		}
+	}
+	return text_node('\\')
+}
+
+// try_code_span attempts to parse a backtick code span.
+fn (mut p InlineParser) try_code_span() ?&Node {
+	start := p.pos
+	// Measure the opening backtick run; the closer must have the same length.
+	mut n := 0
+	for p.pos < p.src.len && p.src[p.pos] == 96 {
+		n++
+		p.pos++
+	}
+	content_start := p.pos
+	mut search := content_start
+	for search < p.src.len {
+		if p.src[search] == 96 {
+			close_start := search
+			mut close_n := 0
+			for search < p.src.len && p.src[search] == 96 {
+				close_n++
+				search++
+			}
+			if close_n == n {
+				code_raw := p.src[content_start..close_start]
+				// Line endings inside a code span are rendered as spaces.
+				mut code := code_raw.replace('\n', ' ')
+				// Strip one space from each side when both are present and
+				// the content is not whitespace only.
+				if code.len >= 2 && code[0] == ` ` && code[code.len - 1] == ` `
+					&& code.trim_space().len > 0 {
+					code = code[1..code.len - 1]
+				}
+				mut node := new_node(.code_span)
+				node.literal = code
+				p.pos = search
+				return node
+			}
+			// A run of a different length is part of the content; `search`
+			// already points past it.
+		} else {
+			search++
+		}
+	}
+	// No matching closer: restore the cursor so the caller can fall back.
+	p.pos = start
+	return none
+}
+
+// try_emphasis attempts to parse *em*, **strong**, _em_, __strong__.
+// `c` is the delimiter byte at p.pos. On failure the cursor is restored to
+// the start of the delimiter run and none is returned.
+fn (mut p InlineParser) try_emphasis(c u8) ?&Node {
+	start := p.pos
+	mut run := 0
+	for p.pos < p.src.len && p.src[p.pos] == c {
+		run++
+		p.pos++
+	}
+
+	// Prevent splitting an intraword __ run into a synthetic single-underscore opener.
+	if c == `_` && run == 1 && start > 1 && p.src[start - 1] == `_` {
+		before2 := p.src[start - 2]
+		after1 := if start + run < p.src.len { p.src[start + run] } else { u8(` `) }
+		if is_wordish(before2) && is_wordish(after1) {
+			p.pos = start
+			return none
+		}
+	}
+
+	// The start and end of the input are treated as a space for flanking checks.
+	before := if start > 0 { p.src[start - 1] } else { u8(` `) }
+	after := if start + run < p.src.len { p.src[start + run] } else { u8(` `) }
+	opener_can_open := can_open_emphasis(c, before, after)
+	opener_can_close := can_close_emphasis(c, before, after)
+
+	if !opener_can_open {
+		p.pos = start
+		return none
+	}
+
+	// Prefer em first for odd runs (e.g. ***foo*** -> <em><strong>foo</strong></em>).
+	if run % 2 == 1 {
+		p.pos = start + 1
+		if node := p.match_close_delim(c, 1, run, opener_can_close) {
+			return node
+		}
+		if run >= 2 {
+			p.pos = start + 2
+			if node := p.match_close_delim(c, 2, run, opener_can_close) {
+				return node
+			}
+		}
+	} else {
+		// Even runs try strong (two delimiters) before em (one delimiter).
+		if run >= 2 {
+			p.pos = start + 2
+			if node := p.match_close_delim(c, 2, run, opener_can_close) {
+				return node
+			}
+		}
+		p.pos = start + 1
+		if node := p.match_close_delim(c, 1, run, opener_can_close) {
+			return node
+		}
+	}
+
+	p.pos = start
+	return none
+}
+
+// is_wordish reports whether c behaves like a word character for emphasis
+// boundary checks (includes non-ASCII bytes used in UTF-8 sequences).
+@[inline]
+fn is_wordish(c u8) bool {
+	return !is_unicode_space(c) && !is_ascii_punct(c)
+}
+
+// match_close_delim parses content after the opening delimiter run and finds
+// a matching closing delimiter of exactly `count` characters.
+// `opener_run` is the full length of the opening run and `opener_can_close`
+// tells whether that run could also close emphasis. The caller positions
+// p.pos just after the `count` opening delimiters. On failure p.pos is reset
+// to that position and none is returned.
+fn (mut p InlineParser) match_close_delim(c u8, count int, opener_run int, opener_can_close bool) ?&Node {
+	content_start := p.pos
+	mut content_nodes := []&Node{}
+
+	for p.pos < p.src.len {
+		loop_start_pos := p.pos
+		ch := p.src[p.pos]
+		// Check for closing delimiter.
+		if ch == c {
+			close_pos := p.pos
+			mut close_run := 0
+			for p.pos < p.src.len && p.src[p.pos] == c {
+				close_run++
+				p.pos++
+			}
+			if close_run >= count {
+				// Verify right-flanking.
+				before_close := if close_pos > 0 { p.src[close_pos - 1] } else { u8(` `) }
+				after_close := if p.pos < p.src.len { p.src[p.pos] } else { u8(` `) }
+				closer_can_close := can_close_emphasis(c, before_close, after_close)
+				closer_can_open := can_open_emphasis(c, before_close, after_close)
+				if closer_can_close {
+					if count == 1 && opener_run == 1 && close_run > 1 && closer_can_open {
+						// Not accepted as the closer here: rewind and let the
+						// code after this block treat the run as content.
+						p.pos = close_pos
+					} else if count == 1 && opener_run > 1 && close_run > 1 && closer_can_open {
+						// Keep extra delimiters inside the emphasis span so nested
+						// strong parsing can consume them (e.g. foo***bar***baz).
+						inner_end := close_pos + (close_run - count)
+						if inner_end > content_start && inner_end <= p.src.len {
+							inner_nodes := parse_inline(p.src[content_start..inner_end], p.opts,
+								p.ref_map)
+							mut node := new_node(.emphasis)
+							for child in inner_nodes {
+								node.append_child(child)
+							}
+							p.pos = close_pos + close_run
+							return node
+						}
+					} else {
+						// Reject the pair when both runs can open and close and
+						// their combined length is a multiple of three while at
+						// least one of them is not.
+						if opener_can_close && closer_can_open && (opener_run + close_run) % 3 == 0
+							&& (opener_run % 3 != 0 || close_run % 3 != 0) {
+							p.pos = close_pos
+						} else {
+							// With no content collected yet, keep one delimiter
+							// as text and continue searching.
+							if content_nodes.len == 0 {
+								content_nodes << text_node(c.ascii_str())
+								p.pos = close_pos + 1
+								continue
+							}
+							// Rewind extra closing chars beyond `count`.
+							p.pos = close_pos + count
+							kind := if count == 2 { NodeKind.strong } else { NodeKind.emphasis }
+							mut node := new_node(kind)
+							for child in content_nodes {
+								node.append_child(child)
+							}
+							return node
+						}
+					}
+				}
+			}
+			// The run was not accepted as a closer; rewind to its start.
+			p.pos = close_pos
+			if count == 1 && opener_run > 1 && opener_can_close {
+				content_nodes << text_node(c.ascii_str())
+				p.pos++
+				continue
+			}
+		}
+		if ch == `\n` {
+			// Newlines stop emphasis search.
+			break
+		}
+		inner := p.parse_one()
+		if p.pos <= loop_start_pos {
+			// Safety net: force progress to avoid recursive delimiter stalls.
+			content_nodes << text_node(p.src[loop_start_pos].ascii_str())
+			p.pos = loop_start_pos + 1
+			continue
+		}
+		content_nodes << inner
+	}
+	// Not found; reset.
+	p.pos = content_start
+	return none
+}
+
+// try_strikethrough parses ~~text~~.
+fn (mut p InlineParser) try_strikethrough() ?&Node {
+	// Requires a second '~' directly after the one at p.pos.
+	if p.pos + 1 >= p.src.len || p.src[p.pos + 1] != `~` {
+		return none
+	}
+	p.pos += 2
+	close := p.src.index_after_('~~', p.pos)
+	// `close == p.pos` means the closer follows the opener directly
+	// (e.g. "~~~~"); an empty span is not a strikethrough, so treat it like
+	// a missing closer and leave the tildes as literal text.
+	if close <= p.pos {
+		p.pos -= 2
+		return none
+	}
+	inner := p.src[p.pos..close]
+	inner_nodes := parse_inline(inner, p.opts, p.ref_map)
+	mut node := new_node(.strikethrough)
+	for child in inner_nodes {
+		node.append_child(child)
+	}
+	p.pos = close + 2
+	return node
+}
+
+// try_link_or_footnote handles [ and attempts to parse a link or footnote ref.
+// Forms are tried in this order: footnote reference [^label] (only when the
+// footnotes option is set), inline link [text](dest "title"), full or
+// collapsed reference [text][label] / [text][], and shortcut reference
+// [text]. When nothing matches, p.pos is restored and none is returned.
+fn (mut p InlineParser) try_link_or_footnote() ?[]&Node {
+	saved := p.pos
+	p.pos++ // consume '['
+	// Footnote reference [^label].
+	if p.opts.footnotes && p.pos < p.src.len && p.src[p.pos] == `^` {
+		fn_start := p.pos + 1
+		fn_close := p.src.index_after_(']', fn_start)
+		// Requires a non-empty label.
+		if fn_close > fn_start {
+			label := p.src[fn_start..fn_close]
+			mut fn_ref := new_node(.footnote_ref)
+			fn_ref.fn_label = label
+			p.pos = fn_close + 1
+			return [fn_ref]
+		}
+	}
+	text_start := p.pos
+	close := find_bracket_close(p.src, p.pos)
+	if close < 0 {
+		p.pos = saved
+		return none
+	}
+	link_text := p.src[text_start..close]
+	p.pos = close + 1
+
+	// Inline link (url).
+	if p.pos < p.src.len && p.src[p.pos] == `(` {
+		dest, title, end := parse_inline_link_dest_from(p.src, p.pos + 1)
+		if end >= 0 {
+			inner_nodes := parse_inline(link_text, p.opts, p.ref_map)
+			mut node := new_node(.link)
+			node.dest = unescape_string(dest)
+			node.title = unescape_string(title)
+			for child in inner_nodes {
+				node.append_child(child)
+			}
+			p.pos = end + 1
+			return [node]
+		}
+	}
+	// Full reference [text][label].
+	if p.pos < p.src.len && p.src[p.pos] == `[` {
+		ref_start := p.pos + 1
+		ref_close := p.src.index_after_(']', ref_start)
+		if ref_close >= 0 {
+			raw_label := p.src[ref_start..ref_close]
+			// An empty second bracket pair ([text][]) uses the link text as label.
+			label := normalize_label(if raw_label.len > 0 { raw_label } else { link_text })
+			if label in p.ref_map {
+				ref := p.ref_map[label]
+				mut node := new_node(.link)
+				node.dest = ref.dest
+				node.title = ref.title
+				node.label = label
+				inner_nodes := parse_inline(link_text, p.opts, p.ref_map)
+				for child in inner_nodes {
+					node.append_child(child)
+				}
+				p.pos = ref_close + 1
+				return [node]
+			}
+			if raw_label.len > 0 {
+				// Do not downgrade explicit [text][label] to shortcut [text].
+				p.pos = saved
+				return none
+			}
+		}
+	}
+	// Shortcut reference [label].
+	// p.pos already points just past the closing ']' of the link text.
+	label := normalize_label(link_text)
+	if label in p.ref_map {
+		ref := p.ref_map[label]
+		mut node := new_node(.link)
+		node.dest = ref.dest
+		node.title = ref.title
+		node.label = label
+		inner_nodes := parse_inline(link_text, p.opts, p.ref_map)
+		for child in inner_nodes {
+			node.append_child(child)
+		}
+		return [node]
+	}
+	p.pos = saved
+	return none
+}
+
+// try_image_after_bang parses the [alt](url) part of an image after '![' was consumed.
+fn (mut p InlineParser) try_image_after_bang() ?[]&Node {
+	// Supported forms: inline ![alt](dest "title"), full or collapsed
+	// reference ![alt][label] / ![alt][], and shortcut reference ![alt].
+	// On failure p.pos is left wherever parsing stopped; the caller restores it.
+	text_start := p.pos
+	close := find_bracket_close(p.src, p.pos)
+	if close < 0 {
+		return none
+	}
+	alt_text := p.src[text_start..close]
+	p.pos = close + 1
+
+	if p.pos < p.src.len && p.src[p.pos] == `(` {
+		dest, title, end := parse_inline_link_dest_from(p.src, p.pos + 1)
+		if end >= 0 {
+			node := p.new_image_node(unescape_string(dest), unescape_string(title), alt_text)
+			p.pos = end + 1
+			return [node]
+		}
+	}
+	if p.pos < p.src.len && p.src[p.pos] == `[` {
+		ref_start := p.pos + 1
+		ref_close := p.src.index_after_(']', ref_start)
+		if ref_close >= 0 {
+			raw_label := p.src[ref_start..ref_close]
+			// An empty second bracket pair (![alt][]) uses the alt text as label.
+			label := normalize_label(if raw_label.len > 0 { raw_label } else { alt_text })
+			if label in p.ref_map {
+				ref := p.ref_map[label]
+				node := p.new_image_node(ref.dest, ref.title, alt_text)
+				p.pos = ref_close + 1
+				return [node]
+			}
+			if raw_label.len > 0 {
+				// An explicit but undefined label must not fall back to the
+				// shortcut form, matching how links are handled.
+				return none
+			}
+		}
+	}
+	// Shortcut reference ![alt]: p.pos already points just past the ']'.
+	shortcut := normalize_label(alt_text)
+	if shortcut in p.ref_map {
+		ref := p.ref_map[shortcut]
+		return [p.new_image_node(ref.dest, ref.title, alt_text)]
+	}
+	return none
+}
+
+// new_image_node builds an image node with the given destination and title
+// and attaches the inline-parsed alt text as its children.
+fn (p &InlineParser) new_image_node(dest string, title string, alt_text string) &Node {
+	mut node := new_node(.image)
+	node.dest = dest
+	node.title = title
+	for child in parse_inline(alt_text, p.opts, p.ref_map) {
+		node.append_child(child)
+	}
+	return node
+}
+
+// find_bracket_close finds the ] matching the [ at start, handling nesting and escapes.
+// `start` is the index just after the opening bracket. Returns the index of
+// the matching ']' or -1 when the bracket is never closed.
+fn find_bracket_close(s string, start int) int {
+	mut depth := 1
+	mut i := start
+	for i < s.len {
+		// A backslash hides the next byte from bracket counting.
+		if s[i] == `\\` && i + 1 < s.len {
+			i += 2
+			continue
+		}
+		if s[i] == `[` {
+			depth++
+		} else if s[i] == `]` {
+			depth--
+			if depth == 0 {
+				return i
+			}
+		}
+		i++
+	}
+	return -1
+}
+
+// parse_inline_link_dest_from parses (url) or (url "title") starting at s[start]
+// (start is after the opening paren). Returns (dest, title, end_paren_pos) or ("","", -1).
+fn parse_inline_link_dest_from(s string, start int) (string, string, int) {
+	i := skip_ws(s, start)
+	if i >= s.len {
+		return '', '', -1
+	}
+	dest, after_dest := parse_link_dest(s[i..])
+	// parse_link_dest returns the unconsumed remainder; convert that back
+	// into an absolute index into s.
+	j := i + (s[i..].len - after_dest.len)
+	k := skip_ws(s, j)
+	if k < s.len && s[k] == `)` {
+		return dest, '', k
+	}
+	title, after_title := parse_link_title(s[k..])
+	l := k + (s[k..].len - after_title.len)
+	m := skip_ws(s, l)
+	if m < s.len && s[m] == `)` {
+		return dest, title, m
+	}
+	return '', '', -1
+}
+
+// skip_ws returns the position in s after skipping whitespace from start.
+// Only space, tab and newline count as whitespace here.
+fn skip_ws(s string, start int) int {
+	mut i := start
+	for i < s.len && (s[i] == ` ` || s[i] == `\t` || s[i] == `\n`) {
+		i++
+	}
+	return i
+}
+
+// try_autolink_or_html handles <...> for autolinks and raw HTML.
+// Autolinks are tried first. On success p.pos is moved past the closing '>';
+// otherwise p.pos is left unchanged and none is returned.
+fn (mut p InlineParser) try_autolink_or_html() ?&Node {
+	rest := p.src[p.pos..]
+	auto_end := try_autolink(rest)
+	if auto_end >= 0 {
+		content := rest[1..auto_end]
+		mut node := new_node(.autolink)
+		node.literal = content
+		// Only a bare e-mail address gets a mailto: prefix. Content that
+		// already starts with a URI scheme (e.g. <mailto:a@b.c>) is used as
+		// the destination unchanged.
+		is_email := content.contains('@') && !content.contains('://')
+			&& autolink_scheme_len(content) == 0
+		if is_email {
+			node.dest = 'mailto:' + content
+		} else {
+			node.dest = content
+		}
+		p.pos += auto_end + 1
+		return node
+	}
+	raw_end := try_raw_html_tag(rest)
+	if raw_end >= 0 {
+		mut node := new_node(.raw_html)
+		node.literal = rest[..raw_end + 1]
+		p.pos += raw_end + 1
+		return node
+	}
+	return none
+}
+
+// autolink_scheme_len returns the length of the URI scheme that s starts with
+// (excluding the ':'), or 0 when s does not start with a scheme. A scheme is
+// 2-32 characters long, begins with an ASCII letter and continues with ASCII
+// letters, digits, '+', '.' or '-', and is followed by ':'.
+fn autolink_scheme_len(s string) int {
+	if s.len < 3 || !s[0].is_letter() {
+		return 0
+	}
+	mut i := 1
+	for i < s.len && i <= 32 {
+		ch := s[i]
+		if ch == `:` {
+			return if i >= 2 { i } else { 0 }
+		}
+		if !(ch.is_letter() || ch.is_digit() || ch == `+` || ch == `.` || ch == `-`) {
+			return 0
+		}
+		i++
+	}
+	return 0
+}
+
+// try_autolink matches an autolink of the form <uri> or <email> at the start
+// of s, returning the position of '>' or -1.
+fn try_autolink(s string) int {
+	if s.len < 3 || s[0] != `<` {
+		return -1
+	}
+	end := s.index_after_('>', 1)
+	if end < 0 {
+		return -1
+	}
+	inner := s[1..end]
+	if inner.contains(' ') || inner.contains('<') {
+		return -1
+	}
+	// Absolute URI: any valid scheme followed by ':' (e.g. <irc:foo.bar>).
+	// Content containing '://' is still accepted as before.
+	if autolink_scheme_len(inner) > 0 || inner.contains('://') {
+		return end
+	}
+	if inner.contains('@') && !inner.starts_with('@') {
+		return end
+	}
+	return -1
+}
+
+// try_raw_html_tag matches a raw HTML tag starting with '<' and returns the '>' position.
+fn try_raw_html_tag(s string) int {
+	if s.len < 3 || s[0] != `<` {
+		return -1
+	}
+	// HTML comment: <!-- ... -->
+	if s.starts_with('<!--') {
+		end := s.index_after_('-->', 4)
+		if end >= 0 {
+			return end + 2
+		}
+		return -1
+	}
+	// Processing instruction: <? ... ?>
+	if s.starts_with('<?') {
+		end := s.index_after_('?>', 2)
+		if end >= 0 {
+			return end + 1
+		}
+		return -1
+	}
+	// CDATA section: <![CDATA[ ... ]]> (prefix matched case-insensitively).
+	// Only the nine prefix bytes are lowercased, not the whole remaining input.
+	if s.len >= 9 && s[..9].to_lower() == '<![cdata[' {
+		end := s.index_after_(']]>', 9)
+		if end >= 0 {
+			return end + 2
+		}
+		return -1
+	}
+	end := s.index_after_('>', 1)
+	if end < 0 {
+		return -1
+	}
+	inner := s[1..end]
+	if inner.len == 0 {
+		return -1
+	}
+	// A tag must start with a letter, '/' (closing tag) or '!' (declaration).
+	if !is_alpha(inner[0]) && inner[0] != `/` && inner[0] != `!` {
+		return -1
+	}
+	return end
+}
+
+// is_entity_body reports whether body (the text between '&' and ';') has the
+// shape of an entity reference: a name made of ASCII letters and digits, '#'
+// followed by decimal digits, or '#x' / '#X' followed by hex digits.
+fn is_entity_body(body string) bool {
+	if body.len == 0 {
+		return false
+	}
+	if body[0] != `#` {
+		for ch in body {
+			if !ch.is_alnum() {
+				return false
+			}
+		}
+		return true
+	}
+	is_hex := body.len >= 2 && (body[1] == `x` || body[1] == `X`)
+	first_digit := if is_hex { 2 } else { 1 }
+	if body.len <= first_digit {
+		return false
+	}
+	for i in first_digit .. body.len {
+		valid := if is_hex { body[i].is_hex_digit() } else { body[i].is_digit() }
+		if !valid {
+			return false
+		}
+	}
+	return true
+}
+
+// try_entity parses an HTML entity reference &name; or &#n; or &#xn;
+// Returns a text node holding the decoded character(s), or none when the
+// text at p.pos is not a recognised entity.
+fn (mut p InlineParser) try_entity() ?&Node {
+	rest := p.src[p.pos..]
+	semi := rest.index(';') or { return none }
+	if semi > 32 || semi < 2 {
+		return none
+	}
+	// Without this check a candidate such as "&*a* &amp;" would be decoded
+	// because of the entity at its end, swallowing the markup before it.
+	if !is_entity_body(rest[1..semi]) {
+		return none
+	}
+	candidate := rest[..semi + 1]
+	decoded := ehtml.unescape(candidate, all: true)
+	// An unchanged result means the name is not a known entity.
+	if decoded == candidate {
+		return none
+	}
+	p.pos += semi + 1
+	return text_node(decoded)
+}
+
+// parse_newline handles a newline character.
+fn (mut p InlineParser) parse_newline() &Node {
+	// Hard break if preceded by two or more spaces.
+	if p.pos >= 2 && p.src[p.pos - 1] == ` ` && p.src[p.pos - 2] == ` ` {
+		p.pos++
+		return new_node(.hard_break)
+	}
+	p.pos++
+	return new_node(.soft_break)
+}
+
+// linkify_schemes lists the URL prefixes recognised by try_linkify.
+const linkify_schemes = ['https://', 'http://', 'ftp://', 'mailto:']
+
+// try_linkify matches a bare URL (linkify extension).
+// The URL runs until a space, tab, newline, '<', '>' or '"'; trailing
+// '.', ',', ';', '!' and '?' are left out of the link.
+fn (mut p InlineParser) try_linkify() ?&Node {
+	rest := p.src[p.pos..]
+	for scheme in linkify_schemes {
+		if rest.starts_with(scheme) {
+			mut end := scheme.len
+			for end < rest.len {
+				ch := rest[end]
+				if ch == ` ` || ch == `<` || ch == `>` || ch == `"` || ch == `\n` || ch == `\t` {
+					break
+				}
+				end++
+			}
+			for end > scheme.len {
+				last := rest[end - 1]
+				if last == `.` || last == `,` || last == `;` || last == `!` || last == `?` {
+					end--
+				} else {
+					break
+				}
+			}
+			// A scheme with nothing after it (e.g. "http:// ") is not a link.
+			if end <= scheme.len {
+				return none
+			}
+			url := rest[..end]
+			mut node := new_node(.autolink)
+			node.literal = url
+			node.dest = url
+			p.pos += end
+			return node
+		}
+	}
+	return none
+}
diff --git a/vlib/x/markdown/markdown.v b/vlib/x/markdown/markdown.v
new file mode 100644
index 000000000..a8a5f4ba5
--- /dev/null
+++ b/vlib/x/markdown/markdown.v
@@ -0,0 +1,142 @@
+// Copyright 2026 The V Language. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+//
+// Module markdown provides CommonMark-compliant markdown parsing and HTML
+// rendering with support for GitHub Flavored Markdown and additional
+// extensions. It is designed for feature parity with github.com/yuin/goldmark.
+//
+// Basic usage:
+//
+//	import x.markdown
+//
+//	html := markdown.to_html('# Hello\n\nWorld')
+//
+// With GFM extensions:
+//
+//	md := markdown.new(extensions: markdown.gfm())
+//	html := md.convert('| a | b |\n|---|---|\n| 1 | 2 |')
+//
+// With fine-grained options:
+//
+//	md := markdown.new(
+//		extensions: [markdown.Extension(markdown.footnote()), markdown.typographer()],
+//		parser_opts: markdown.ParserOptions{ auto_heading_id: true },
+//		renderer_opts: markdown.RendererOptions{ unsafe_: true, xhtml: true },
+//	)
+//	html := md.convert(source)
+//
+// Parse to AST and walk:
+//
+//	doc := md.parse(source)
+//	doc.walk(fn (node &markdown.Node) bool {
+//		println(node.kind)
+//		return true
+//	})
+module markdown
+
+// ParserOptions configures parser behaviour.
+@[params]
+pub struct ParserOptions {
+pub mut:
+	// auto_heading_id generates an id attribute for every heading node
+	// derived from the heading text content (goldmark WithAutoHeadingID).
+	auto_heading_id bool
+}
+
+// RendererOptions configures HTML renderer behaviour.
+@[params]
+pub struct RendererOptions {
+pub mut:
+	// unsafe_ allows raw HTML from the source to be included in the output.
+	// When false (the default) raw HTML is replaced with an HTML comment.
+	unsafe_ bool
+	// hard_wraps converts every newline inside a paragraph to a <br> tag.
+	hard_wraps bool
+	// xhtml outputs XHTML-style self-closing tags (e.g. <br />).
+	xhtml bool
+}
+
+// Options configures a Markdown processor.
+// Extension flags in the mut section are normally set by calling new() with
+// an extensions slice; they can also be set directly.
+@[params]
+pub struct Options {
+pub mut:
+	// extensions is the list of extensions applied when new() is called.
+	extensions []Extension
+	// parser_opts configures the parser.
+	parser_opts ParserOptions
+	// renderer_opts configures the renderer.
+	renderer_opts RendererOptions
+	// --- feature flags set by extensions ---
+	tables          bool
+	strikethrough   bool
+	linkify         bool
+	task_list       bool
+	footnotes       bool
+	typographer     bool
+	definition_list bool
+}
+
+// LinkRef holds a collected link reference definition (url + optional title).
+struct LinkRef {
+	dest  string
+	title string
+}
+
+// Markdown is the main markdown processor. Create one with new() and reuse it
+// across multiple convert/parse calls; link reference definitions are cached.
+pub struct Markdown {
+pub mut:
+	opts    Options
+	ref_map map[string]LinkRef
+}
+
+// new creates a Markdown processor with the given options.
+// All extensions in opts.extensions are applied immediately.
+pub fn new(opts Options) Markdown {
+	mut m := Markdown{
+		opts:    opts
+		ref_map: map[string]LinkRef{}
+	}
+	// Each extension receives the processor mutably so it can adjust it.
+	for ext in opts.extensions {
+		ext.extend(mut m)
+	}
+	return m
+}
+
+// to_html converts the markdown source to HTML using default settings
+// (CommonMark only, no extensions, raw HTML stripped).
+pub fn to_html(src string) string {
+	mut md := new(Options{})
+	return md.convert(src)
+}
+
+// to_html_opts converts the markdown source to HTML with the given options.
+// A fresh processor is created for every call.
+pub fn to_html_opts(src string, opts Options) string {
+	mut md := new(opts)
+	return md.convert(src)
+}
+
+// convert parses the markdown source and renders it to an HTML string.
+pub fn (mut m Markdown) convert(src string) string { + doc := m.parse(src) + mut r := HTMLRenderer{ + opts: m.opts + ref_map: m.ref_map // Use the updated ref_map after parse() + } + return r.render(doc) +} + +// parse parses the markdown source into an AST and returns the document root. +// Link reference definitions collected during parsing are cached so that +// subsequent parse/convert calls on the same Markdown instance share them. +pub fn (mut m Markdown) parse(src string) &Node { + mut p := new_block_parser(src, m.opts, m.ref_map) + doc := p.parse() + for k, v in p.ref_map { + m.ref_map[k] = v + } + return doc +} diff --git a/vlib/x/markdown/markdown_test.v b/vlib/x/markdown/markdown_test.v new file mode 100644 index 000000000..6293b3571 --- /dev/null +++ b/vlib/x/markdown/markdown_test.v @@ -0,0 +1,343 @@ +// Copyright 2026 The V Language. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module markdown + +fn test_to_html_heading() { + assert to_html('# Hello') == '

    Hello

    \n' + assert to_html('## World') == '

    World

    \n' +} + +fn test_to_html_paragraph() { + assert to_html('Hello world') == '

    Hello world

    \n' +} + +fn test_to_html_thematic_break() { + assert to_html('---') == '
    \n' +} + +fn test_to_html_emphasis() { + html := to_html('*em*') + assert html.contains('') +} + +fn test_to_html_strong() { + html := to_html('**bold**') + assert html.contains('') +} + +fn test_to_html_code_span() { + html := to_html('`code`') + assert html.contains('') + assert html.contains('code') +} + +fn test_to_html_link() { + html := to_html('[link](https://example.com)') + assert html.contains('') + assert html.contains('link') +} + +fn test_html_escape_in_text() { + html := to_html('A < B') + assert html.contains('<') +} + +fn test_named_entities_are_decoded_before_render() { + assert to_html('©') == '

    ©

    \n' + assert to_html('&') == '

    &

    \n' +} + +fn test_unknown_named_entity_is_left_as_literal_text() { + assert to_html('¬_a_real_entity;') == '

    &not_a_real_entity;

    \n' +} + +fn test_numeric_entities_are_decoded() { + assert to_html('© ©') == '

    © ©

    \n' +} + +fn test_empty_input() { + assert to_html('') == '' +} + +fn test_multiline_paragraph() { + html := to_html('line one\nline two') + assert html.contains('

    ') + assert html.contains('line one') +} + +fn test_fenced_code() { + html := to_html('```go\nfn main() {}\n```') + assert html.contains('') + assert html.contains('

  • ') + assert html.contains('item') +} + +fn test_ordered_list() { + html := to_html('1. first') + assert html.contains('
      ') + assert html.contains('
    1. ') + assert html.contains('first') +} + +fn test_ordered_list_marker_requires_whitespace_or_eol() { + assert to_html('1.test') == '

      1.test

      \n' + assert to_html('1)test') == '

      1)test

      \n' +} + +fn test_ordered_list_marker_allows_space_tab_or_eol() { + assert to_html('1. item') == '
        \n
      1. item
      2. \n
      \n' + assert to_html('1)\titem') == '
        \n
      1. item
      2. \n
      \n' + assert to_html('1.') == '
        \n
      1. \n
      \n' +} + +fn test_blockquote() { + html := to_html('> quote') + assert html.contains('
      ') + assert html.contains('quote') +} + +fn test_list_multiple_items() { + html := to_html('- item 1\n- item 2') + assert html.contains('
        ') + assert html.contains('item 1') + assert html.contains('item 2') +} + +fn test_invalid_link_ref_def_does_not_create_reference() { + src := '[bad]: ok') +} + +fn test_full_reference_does_not_fallback_to_shortcut_when_label_is_undefined() { + src := '[text]: https://example.com/text\n\n[text][missing]' + html := to_html(src) + assert html == '

        [text][missing]

        \n' +} + +fn test_shortcut_reference_still_resolves_normally() { + src := '[text]: https://example.com/text\n\n[text]' + html := to_html(src) + assert html == '

        text

        \n' +} + +fn test_gfm_table_header_uses_th_cells() { + src := '| a | b |\n| --- | --- |\n| 1 | 2 |' + html := to_html_opts(src, Options{ + extensions: gfm() + }) + assert html.contains('') + assert html.contains('a') + assert html.contains('b') +} + +fn test_emphasis_underscore_intraword_does_not_emphasize() { + assert to_html('foo_bar_baz') == '

        foo_bar_baz

        \n' + assert to_html('foo_bar_') == '

        foo_bar_

        \n' + assert to_html('_foo_bar') == '

        _foo_bar

        \n' +} + +fn test_emphasis_star_delimiters_still_emphasize() { + assert to_html('a*b*c') == '

        abc

        \n' +} + +fn test_emphasis_triple_delimiters() { + assert to_html('***foo***') == '

        foo

        \n' + assert to_html('___foo___') == '

        foo

        \n' + assert to_html('foo***bar***baz') == '

        foobarbaz

        \n' +} + +fn test_emphasis_nested_mixed_runs() { + assert to_html('**foo *bar***') == '

        foo bar

        \n' + assert to_html('*foo **bar***') == '

        foo bar

        \n' + assert to_html('*foo**bar**baz*') == '

        foobarbaz

        \n' + assert to_html('*foo **bar** baz*') == '

        foo bar baz

        \n' + assert to_html('**foo *bar* baz**') == '

        foo bar baz

        \n' +} + +fn test_emphasis_multiple_of_three_resolution() { + assert to_html('***foo** bar*') == '

        foo bar

        \n' + assert to_html('***foo* bar**') == '

        foo bar

        \n' + assert to_html('***foo**bar*') == '

        foobar

        \n' +} + +fn test_emphasis_underscore_punctuation_flanking() { + assert to_html('foo-_(bar)_') == '

        foo-(bar)

        \n' + assert to_html('foo__bar__baz') == '

        foo__bar__baz

        \n' + assert to_html('foo__bar__') == '

        foo__bar__

        \n' + assert to_html('__foo__bar') == '

        __foo__bar

        \n' +} + +fn test_setext_heading_leading_spaces() { + // CommonMark allows 0-3 leading spaces on the setext underline. + assert to_html('Foo\n ===') == '

        Foo

        \n' + assert to_html('Foo\n ---') == '

        Foo

        \n' + assert to_html('Foo\n ===') == '

        Foo

        \n' +} + +fn test_emphasis_leftover_delimiters_are_literal() { + // Unmatched delimiters become literal text. + assert to_html('*a**b**') == '

        *ab

        \n' + assert to_html('**a**b*') == '

        ab*

        \n' + assert to_html('*foo bar') == '

        *foo bar

        \n' +} + +fn test_emphasis_mixed_star_underscore() { + // * and _ delimiters do not pair with each other. + assert to_html('*foo _bar_ baz*') == '

        foo bar baz

        \n' + assert to_html('__foo *bar* baz__') == '

        foo bar baz

        \n' +} + +fn test_link_ref_def_with_leading_spaces() { + // CommonMark allows 0-3 leading spaces before a link ref def. + assert to_html(' [foo]: https://example.com\n\n[foo]') == '

        foo

        \n' + assert to_html(' [bar]: https://example.org\n\n[bar]') == '

        bar

        \n' + assert to_html(' [baz]: https://v-lang.io\n\n[baz]') == '

        baz

        \n' +} + +fn test_link_ref_def_with_four_leading_spaces_is_not_a_ref() { + // Four leading spaces start an indented code block, not a reference definition. + src := ' [foo]: https://example.com\n\n[foo]' + html := to_html(src) + assert !html.contains('Foo\nbar\n' +} + +fn test_task_list() { + src := '- [ ] unchecked\n- [x] checked\n- [X] also checked' + html := to_html_opts(src, Options{ + task_list: true + }) + assert html.contains('') + assert html.contains('') + assert html.contains('unchecked') + assert html.contains('checked') +} + +fn test_task_list_not_applied_without_extension() { + // Without the extension, task markers are rendered as plain text. + html := to_html('- [ ] item') + assert !html.contains('') +} + +fn test_footnote_definition_inside_list_item_is_preserved() { + src := '- item[^note]\n\n [^note]: footnote in list\n\noutside[^note]' + html := to_html_opts(src, Options{ + footnotes: true + }) + assert html.contains('item1') + assert html.contains('outside1') + assert html.contains('
      • footnote in list') + assert html.contains('
      • ') +} + +fn test_footnote_definition_inside_blockquote_is_preserved() { + src := '> quote[^q]\n>\n> [^q]: footnote in quote' + html := to_html_opts(src, Options{ + footnotes: true + }) + assert html.contains('quote1') + assert html.contains('
      • footnote in quote') + assert html.contains('
      • ') +} + +fn test_link_ref_def_multiline_title() { + // CommonMark allows the title on the next line when the destination is alone. + src := '[foo]: /url\n"a title"\n\n[foo]' + html := to_html(src) + assert html.contains('foo') +} + +fn test_link_ref_def_multiline_title_single_quotes() { + src := "[bar]: /path\n'my title'\n\n[bar]" + html := to_html(src) + assert html.contains('baz') + assert html.contains('some text') +} + +fn test_gfm_helper_sets_core_extension_flags() { + md := new(Options{ + extensions: gfm() + }) + assert md.opts.tables + assert md.opts.strikethrough + assert md.opts.linkify + assert md.opts.task_list +} + +fn test_individual_extension_helpers_set_flags() { + md_footnote := new(Options{ + extensions: [Extension(footnote())] + }) + assert md_footnote.opts.footnotes + + md_typographer := new(Options{ + extensions: [Extension(typographer())] + }) + assert md_typographer.opts.typographer + + md_definition_list := new(Options{ + extensions: [Extension(definition_list())] + }) + assert md_definition_list.opts.definition_list +} + +fn test_emphasis_goldmark_parity_edge_cases() { + assert to_html('_a* __*_* b b') == '

        a* __** b b

        \n' + assert to_html('* bb _ *__*a* a_') == '
          \n
        • bb _ *__a a_
        • \n
        \n' + assert to_html('baa _ a*aba**_ba') == '

        baa _ a*aba**_ba

        \n' + assert to_html('_a_*_b**_aba*') == '

        a_b**_aba

        \n' + assert to_html('x_ ***b*ab*bb_a*a a') == '

        x_ babbb_aa a

        \n' +} diff --git a/vlib/x/markdown/node.v b/vlib/x/markdown/node.v new file mode 100644 index 000000000..720c90e1b --- /dev/null +++ b/vlib/x/markdown/node.v @@ -0,0 +1,144 @@ +// Copyright 2026 The V Language. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module markdown + +import strings + +// NodeKind identifies what kind of AST node a Node represents. +pub enum NodeKind { + // ------- document root ------- + document + // ------- block elements ------- + heading + paragraph + blockquote + list + list_item + code_block + fenced_code + thematic_break + html_block + link_ref_def + // GFM block extensions + table + table_head + table_body + table_row + table_cell + // Definition list (Pandoc-style) + definition_list + definition_term + definition_desc + // Footnote definition block + footnote_def + // ------- inline elements ------- + text + emphasis + strong + code_span + link + image + autolink + raw_html + hard_break + soft_break + // GFM inline extensions + strikethrough + // Footnote reference inline + footnote_ref + // Task list checkbox (inline, first child of a list_item) + task_checkbox +} + +// Alignment is the text alignment of a table cell column. +pub enum Alignment { + none_ + left + center + right +} + +// Node is a node in the parsed markdown AST. +// A document is a tree of Nodes with .document as the root. +@[heap] +pub struct Node { +pub mut: + kind NodeKind + // ----- block-level fields ----- + // heading: 1–6 + level int + // list: true when there are no blank lines between items + is_tight bool + // list: true for ordered (1. 2. 3.), false for bullet (- * +) + is_ordered bool + // list: starting number of an ordered list + list_start int = 1 + // fenced_code: the info string after the opening fence (e.g. 
"go") + fence_info string + // ----- inline-level fields ----- + // text / code_span / raw_html / html_block: literal string content + literal string + // link / image: URL destination + dest string + // link / image: optional title + title string + // link: reference label (for reference-style links) + label string + // task_checkbox: true when the checkbox is checked ([x]) + checked bool + // table_cell: column alignment + align Alignment + // heading: optional explicit or auto-generated id attribute + id string + // footnote_ref / footnote_def: footnote label + fn_label string + // footnote_def: 1-based ordinal assigned during rendering + fn_index int + // ----- tree structure ----- + children []&Node +} + +// new_node allocates and returns a new Node of the given kind. +pub fn new_node(kind NodeKind) &Node { + return &Node{ + kind: kind + } +} + +// append_child appends child as the last child of n. +pub fn (mut n Node) append_child(child &Node) { + n.children << child +} + +// text_content returns the plain-text content of this node and all descendants, +// concatenated in document order. +pub fn (n &Node) text_content() string { + match n.kind { + .text, .code_span, .raw_html { + return n.literal + } + else { + mut sb := strings.new_builder(64) + for child in n.children { + sb.write_string(child.text_content()) + } + return sb.str() + } + } +} + +// walk traverses n and all its descendants in pre-order (root before children). +// The callback f receives each node; return false from f to stop traversal early. +// walk itself returns false if traversal was stopped, true otherwise. +pub fn (n &Node) walk(f fn (&Node) bool) bool { + if !f(n) { + return false + } + for child in n.children { + if !child.walk(f) { + return false + } + } + return true +} diff --git a/vlib/x/markdown/parser.v b/vlib/x/markdown/parser.v new file mode 100644 index 000000000..43127ea75 --- /dev/null +++ b/vlib/x/markdown/parser.v @@ -0,0 +1,1261 @@ +// Copyright 2026 The V Language. 
All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module markdown + +import strings + +// BlockParser parses markdown block structure line by line into an AST. +// After block parsing, inline content is parsed for every leaf node. +struct BlockParser { + opts Options +mut: + lines []string + pos int + ref_map map[string]LinkRef + fn_defs map[string]&Node +} + +// new_block_parser creates a BlockParser for the given source. +fn new_block_parser(src string, opts Options, ref_map map[string]LinkRef) BlockParser { + normalized := src.replace('\r\n', '\n').replace('\r', '\n') + lines := normalized.split('\n') + mut refs := map[string]LinkRef{} + for k, v in ref_map { + refs[k] = v + } + return BlockParser{ + opts: opts + lines: lines + ref_map: refs + fn_defs: map[string]&Node{} + } +} + +// nested_block_parser creates a nested parser that inherits the current options +// and reference definitions. +fn (p &BlockParser) nested_block_parser(lines []string) BlockParser { + mut refs := map[string]LinkRef{} + for k, v in p.ref_map { + refs[k] = v + } + return BlockParser{ + opts: p.opts + lines: lines + ref_map: refs + fn_defs: map[string]&Node{} + } +} + +// merge_nested_state propagates nested parser state back to the parent parser. +fn (mut p BlockParser) merge_nested_state(inner BlockParser) { + for k, v in inner.ref_map { + p.ref_map[k] = v + } + if p.opts.footnotes { + for k, v in inner.fn_defs { + if k !in p.fn_defs { + p.fn_defs[k] = v + } + } + } +} + +// parse parses the full document and returns the AST root node. +fn (mut p BlockParser) parse() &Node { + mut doc := new_node(.document) + p.parse_blocks(mut doc, 0) + // Attach collected footnote definitions as children of the document. + if p.opts.footnotes { + for _, fn_node in p.fn_defs { + doc.append_child(fn_node) + } + } + return doc +} + +// parse_blocks fills parent with block-level children parsed from p.lines[p.pos..]. 
+// indent is the minimum leading-space indent already consumed by a container.
+// Each iteration examines the line at p.pos and tries the block starts below
+// in a fixed order; the first one that matches appends its node to parent
+// and the loop restarts. A line that matches nothing else is handed to
+// parse_paragraph.
+fn (mut p BlockParser) parse_blocks(mut parent Node, indent int) {
+	for p.pos < p.lines.len {
+		line := expand_tabs(p.lines[p.pos])
+
+		// --- blank line ---
+		if is_blank(line) {
+			p.pos++
+			continue
+		}
+
+		// stripped: the line with the container indent removed.
+		// sp:       leading spaces that remain on stripped.
+		// content:  stripped with those leading spaces removed as well.
+		stripped := trim_indent(line, indent)
+		sp := leading_spaces(stripped)
+		content := trim_indent(stripped, sp)
+
+		// --- thematic break (---, ***, ___) ---
+		// is_thematic_break ignores leading whitespace, so the CommonMark limit of
+		// 0-3 leading spaces is enforced here; with 4 or more the line falls
+		// through to the indented code check further down.
+		if sp <= 3 && is_thematic_break(content) {
+			node := new_node(.thematic_break)
+			parent.append_child(node)
+			p.pos++
+			continue
+		}
+
+		// --- ATX heading (# ... ######) ---
+		// try_atx_heading requires `#` as the first byte, so it is given the line
+		// without its (at most 3) leading spaces.
+		if sp <= 3 {
+			if heading := p.try_atx_heading(content) {
+				parent.append_child(heading)
+				p.pos++
+				continue
+			}
+		}
+
+		// --- fenced code block (``` or ~~~) ---
+		if fenced := p.try_fenced_code(stripped, indent) {
+			parent.append_child(fenced)
+			continue
+		}
+
+		// --- HTML block ---
+		if html_blk := p.try_html_block(stripped, indent) {
+			parent.append_child(html_blk)
+			continue
+		}
+
+		// --- link reference definition ---
+		// CommonMark allows 0-3 leading spaces after container indentation.
+		if sp <= 3 && p.try_link_ref_def(content) {
+			continue
+		}
+
+		// --- footnote definition (if footnotes extension enabled) ---
+		if p.opts.footnotes {
+			if p.try_footnote_def(stripped, indent) {
+				continue
+			}
+		}
+
+		// --- blockquote (>) ---
+		if stripped.starts_with('>') {
+			bq := p.parse_blockquote(indent)
+			parent.append_child(bq)
+			continue
+		}
+
+		// --- indented code block (4 spaces) ---
+		if sp >= 4 && !is_blank(stripped) {
+			cb := p.parse_indented_code(indent)
+			parent.append_child(cb)
+			continue
+		}
+
+		// --- list (bullet or ordered) ---
+		if is_list_marker(stripped) {
+			lst := p.parse_list(indent)
+			parent.append_child(lst)
+			continue
+		}
+
+		// --- GFM table (if tables extension enabled) ---
+		if p.opts.tables {
+			if tbl := p.try_table(indent) {
+				parent.append_child(tbl)
+				continue
+			}
+		}
+
+		// --- definition list (if extension enabled) ---
+		if p.opts.definition_list {
+			if dl := p.try_definition_list(indent) {
+				parent.append_child(dl)
+				continue
+			}
+		}
+
+		// --- paragraph (including setext headings) ---
+		// Only .heading and .paragraph results are attached; any other kind
+		// returned by parse_paragraph is dropped.
+		para := p.parse_paragraph(indent)
+		if para.kind == .heading || para.kind == .paragraph {
+			parent.append_child(para)
+		}
+	}
+}
+
+// ---- Thematic break ----
+
+// is_thematic_break returns true if line is a valid thematic break
+// (three or more -, *, or _ with optional spaces).
+// Surrounding whitespace is trimmed before the check, so any amount of
+// leading indentation is accepted here; callers that care about the
+// indentation limit must test it themselves.
+fn is_thematic_break(line string) bool {
+	trimmed := line.trim_space()
+	if trimmed.len < 3 {
+		return false
+	}
+	// The first byte fixes the marker; every other byte must be the same
+	// marker, a space or a tab.
+	marker := trimmed[0]
+	if marker != `-` && marker != `*` && marker != `_` {
+		return false
+	}
+	mut count := 0
+	for ch in trimmed {
+		if ch == marker {
+			count++
+		} else if ch != ` ` && ch != `\t` {
+			return false
+		}
+	}
+	return count >= 3
+}
+
+// ---- ATX headings ----
+
+// try_atx_heading attempts to parse an ATX heading from line.
+// Returns the heading node on success.
+fn (mut p BlockParser) try_atx_heading(line string) ?&Node {
+	// line must start with `#`; callers strip any leading indentation first.
+	if line.len == 0 || line[0] != `#` {
+		return none
+	}
+	// The number of leading `#` bytes is the heading level (1-6).
+	mut level := 0
+	for level < line.len && line[level] == `#` {
+		level++
+	}
+	if level > 6 {
+		return none
+	}
+	// The `#` run must be followed by a space, a tab, or the end of the line.
+	if level < line.len && line[level] != ` ` && line[level] != `\t` {
+		return none
+	}
+	mut content := line[level..].trim_space()
+	// Strip a trailing # sequence, but only when it is the whole content or is
+	// separated from the text by a space or tab; otherwise the #s are text.
+	if content.ends_with('#') {
+		without_closer := content.trim_right('#')
+		if without_closer.len == 0 || without_closer.ends_with(' ')
+			|| without_closer.ends_with('\t') {
+			content = without_closer.trim_right(' \t')
+		}
+	}
+	mut node := new_node(.heading)
+	node.level = level
+	node.literal = content
+	if p.opts.parser_opts.auto_heading_id {
+		node.id = heading_id_from_text(content)
+	}
+	return node
+}
+
+// ---- Fenced code blocks ----
+
+// try_fenced_code attempts to parse a fenced code block starting at p.pos.
+// line is the opening line and must begin with the fence itself; indent is
+// the container indent removed from every following line. On success p.pos
+// is left after the closing fence, or at the end of input when the fence is
+// never closed. The node's fence_info holds the trimmed text after the
+// opening fence and its literal holds the content lines, each terminated by
+// '\n' (empty when the block has no content lines).
+fn (mut p BlockParser) try_fenced_code(line string, indent int) ?&Node {
+	fence_char, fence_len := detect_fence(line)
+	if fence_len < 3 {
+		return none
+	}
+	info := line[fence_len..].trim_space()
+	// info string must not contain a backtick when using backtick fence
+	// (96 is the byte value of a backtick).
+	if fence_char == 96 && info.contains('`') {
+		return none
+	}
+	p.pos++
+	mut code_lines := []string{}
+	for p.pos < p.lines.len {
+		stripped := trim_indent(expand_tabs(p.lines[p.pos]), indent)
+		// Check for closing fence: same fence character, at least as long as the
+		// opening fence, nothing but whitespace after it. CommonMark lets the
+		// closing fence carry up to 3 spaces of indentation.
+		close_sp := leading_spaces(stripped)
+		if close_sp <= 3 {
+			candidate := trim_indent(stripped, close_sp)
+			close_char, close_len := detect_fence(candidate)
+			if close_char == fence_char && close_len >= fence_len
+				&& candidate[close_len..].trim_space().len == 0 {
+				p.pos++
+				break
+			}
+		}
+		code_lines << stripped
+		p.pos++
+	}
+	mut node := new_node(.fenced_code)
+	node.fence_info = info
+	// A block without content lines has empty content, not a lone newline.
+	node.literal = if code_lines.len == 0 { '' } else { code_lines.join('\n') + '\n' }
+	return node
+}
+
+// detect_fence returns (fence_char, fence_length) if line starts with a valid
+// code-fence sequence, or (0, 0) if not.
+fn detect_fence(line string) (u8, int) {
+	if line.len < 3 {
+		return 0, 0
+	}
+	// Only a backtick (byte value 96) or a tilde can open a fence, and it must
+	// be the very first byte of line.
+	c := line[0]
+	if c != 96 && c != `~` {
+		return 0, 0
+	}
+	// Count the run of identical fence bytes at the start of the line.
+	mut n := 0
+	for n < line.len && line[n] == c {
+		n++
+	}
+	if n >= 3 {
+		return c, n
+	}
+	return 0, 0
+}
+
+// ---- Indented code block ----
+
+// parse_indented_code collects lines that are indented by at least (indent+4)
+// spaces (or blank) into an indented code block.
+// Collection starts at p.pos and stops at the first non-blank line with fewer
+// than 4 spaces left after the container indent is removed. Blank lines are
+// stored as empty strings; blank lines at the end of the block are left out
+// of the literal, although p.pos has already moved past them.
+fn (mut p BlockParser) parse_indented_code(indent int) &Node {
+	mut lines := []string{}
+	for p.pos < p.lines.len {
+		raw := expand_tabs(p.lines[p.pos])
+		if is_blank(raw) {
+			// Keep the blank line for now; it only survives if more indented code
+			// follows, because trailing blanks are cut off below.
+			lines << ''
+			p.pos++
+			continue
+		}
+		stripped := trim_indent(raw, indent)
+		sp := leading_spaces(stripped)
+		if sp < 4 {
+			break
+		}
+		lines << trim_indent(stripped, 4)
+		p.pos++
+	}
+	// Trim trailing blank lines: find how many leading entries to keep and
+	// slice once.
+	mut keep := lines.len
+	for keep > 0 && lines[keep - 1] == '' {
+		keep--
+	}
+	mut node := new_node(.code_block)
+	node.literal = lines[..keep].join('\n') + '\n'
+	return node
+}
+
+// ---- HTML blocks ----
+
+// block_level_tags lists HTML tags that start an HTML block (type 6).
+const block_level_tags = ['address', 'article', 'aside', 'base', 'basefont', 'blockquote', 'body',
+	'caption', 'center', 'col', 'colgroup', 'dd', 'details', 'dialog', 'dir', 'div', 'dl', 'dt',
+	'fieldset', 'figcaption', 'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3',
+	'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html', 'iframe', 'legend', 'li', 'link', 'main',
+	'menu', 'menuitem', 'meta', 'nav', 'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'search',
+	'section', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track',
+	'ul']
+
+// try_html_block attempts to parse an HTML block starting at p.pos.
+fn (mut p BlockParser) try_html_block(line string, indent int) ?&Node {
+	// line is the start line as passed by the caller; indent is the container
+	// indent removed from later lines before they are tested.
+	html_type := detect_html_block_type(line)
+	if html_type == 0 {
+		return none
+	}
+	// The node literal is built from the lines exactly as stored in p.lines.
+	mut raw_lines := []string{}
+	raw_lines << p.lines[p.pos]
+	p.pos++
+	if html_type == 6 || html_type == 7 {
+		// Types 6-7 run up to, but do not include, the next blank line.
+		for p.pos < p.lines.len {
+			raw := p.lines[p.pos]
+			if is_blank(trim_indent(expand_tabs(raw), indent)) {
+				break
+			}
+			raw_lines << raw
+			p.pos++
+		}
+	} else if !html_block_end_on_line(html_type, line) {
+		// Types 1-5 end with the line that holds the end marker, and that line
+		// belongs to the block. When the start line already holds the marker
+		// the block is just that one line, so this loop is skipped.
+		for p.pos < p.lines.len {
+			raw := p.lines[p.pos]
+			raw_lines << raw
+			p.pos++
+			if html_block_end_on_line(html_type, trim_indent(expand_tabs(raw), indent)) {
+				break
+			}
+		}
+	}
+	mut node := new_node(.html_block)
+	node.literal = raw_lines.join('\n') + '\n'
+	return node
+}
+
+// html_block_end_on_line reports whether line satisfies the end condition of
+// an HTML block of the given type. Only types 1-5 have an end marker; for
+// any other type the result is false.
+//   1: a closing script, pre, style or textarea tag (case-insensitive)
+//   2: '-->'   3: '?>'   4: '>'   5: ']]>'
+fn html_block_end_on_line(html_type int, line string) bool {
+	match html_type {
+		1 {
+			low := line.to_lower()
+			return low.contains('</script>') || low.contains('</pre>')
+				|| low.contains('</style>') || low.contains('</textarea>')
+		}
+		2 {
+			return line.contains('-->')
+		}
+		3 {
+			return line.contains('?>')
+		}
+		4 {
+			return line.contains('>')
+		}
+		5 {
+			return line.contains(']]>')
+		}
+		else {
+			return false
+		}
+	}
+}
+
+// detect_html_block_type returns the HTML block type (1-7) or 0 if the line
+// does not start an HTML block.
+fn detect_html_block_type(line string) int {
+	stripped := line.trim_left(' \t')
+	if stripped.len == 0 || stripped[0] != `<` {
+		return 0
+	}
+	low := stripped.to_lower()
+	// Type 2: HTML comment
+	if low.starts_with('