Gitly


1 // Copyright 2026 The V Language. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module markdown
5 
6 import strings
7 
// block_level_tags lists the lower-case HTML tag names that start an HTML
// block of type 6. detect_html_block_type compares the (lower-cased) tag name
// at the start of a line against every entry of this fixed-size array.
// vfmt off
const block_level_tags = [
    'address', 'article', 'aside', 'base', 'basefont', 'blockquote', 'body', 'caption',    'center',
    'col', 'colgroup', 'dd', 'details', 'dialog', 'dir', 'div', 'dl', 'dt', 'fieldset',
    'figcaption', 'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5',
    'h6', 'head', 'header', 'hr', 'html', 'iframe', 'legend', 'li', 'link', 'main', 'menu',
    'menuitem', 'meta', 'nav', 'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'search',
    'section', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track',
    'ul'
]!
// vfmt on
20 
// BlockParser parses markdown block structure line by line into an AST.
// After block parsing, inline content is parsed for every leaf node.
struct BlockParser {
    // opts holds the parser options; the block parser reads the extension
    // switches (footnotes, tables, definition_list, task_list) from it.
    opts Options
mut:
    // lines is the input split into individual lines.
    lines   []string
    // pos is the index into `lines` of the next line to be processed.
    pos     int
    // ref_map collects link reference definitions keyed by normalized label.
    ref_map map[string]LinkRef
    // fn_defs collects footnote definition nodes; `parse` appends them to the
    // document when the footnotes option is enabled.
    fn_defs map[string]&Node
}
31 
// BlockParser.new builds a parser for `src`.
// `\r\n` and lone `\r` line endings are normalised to `\n` before the text is
// split into lines. The entries of `ref_map` are copied into a fresh map, so
// the parser never writes to the map handed in by the caller.
fn BlockParser.new(src string, opts Options, ref_map map[string]LinkRef) BlockParser {
    unix_src := src.replace('\r\n', '\n').replace('\r', '\n')
    mut parser := BlockParser{
        opts:    opts
        lines:   unix_src.split('\n')
        ref_map: map[string]LinkRef{}
        fn_defs: map[string]&Node{}
    }
    for label, link in ref_map {
        parser.ref_map[label] = link
    }
    return parser
}
47 
// nested_block_parser returns a parser over `lines` for the content of a
// container block. It shares the parent's options, starts with a copy of the
// parent's link reference definitions and an empty footnote table.
fn (p &BlockParser) nested_block_parser(lines []string) BlockParser {
    mut child := BlockParser{
        opts:    p.opts
        lines:   lines
        ref_map: map[string]LinkRef{}
        fn_defs: map[string]&Node{}
    }
    for label, link in p.ref_map {
        child.ref_map[label] = link
    }
    return child
}
62 
// merge_nested_state copies the state gathered by a nested parser back into p.
// Every link reference of `inner` is written into p.ref_map. Footnote
// definitions are merged only when the footnotes option is on, and a
// definition that p already knows is kept as is.
fn (mut p BlockParser) merge_nested_state(inner BlockParser) {
    for label, link in inner.ref_map {
        p.ref_map[label] = link
    }
    if !p.opts.footnotes {
        return
    }
    for name, def_node in inner.fn_defs {
        if name in p.fn_defs {
            continue
        }
        p.fn_defs[name] = def_node
    }
}
76 
// parse runs the block parser over all lines and returns the document node.
// With the footnotes option enabled, the collected footnote definitions are
// appended to the document after all other blocks.
fn (mut p BlockParser) parse() &Node {
    mut root := new_node(.document)
    p.parse_blocks(mut root, 0)
    if !p.opts.footnotes {
        return root
    }
    for _, def_node in p.fn_defs {
        root.append_child(def_node)
    }
    return root
}
89 
// parse_blocks fills parent with block-level children parsed from p.lines[p.pos..].
// indent is the minimum leading-space indent already consumed by a container.
//
// Each iteration looks at the line at p.pos (tabs expanded, container indent
// removed) and tries the block kinds in a fixed order; the first one that
// matches consumes its lines and the loop starts over. Blank lines are skipped.
// A line that matches nothing else is handed to parse_paragraph.
fn (mut p BlockParser) parse_blocks(mut parent Node, indent int) {
    for p.pos < p.lines.len {
        line := expand_tabs(p.lines[p.pos])

        // --- blank line ---
        if is_blank(line) {
            p.pos++
            continue
        }

        stripped := trim_indent(line, indent)
        sp := leading_spaces(stripped)

        // --- indented code block (4 spaces) ---
        // Checked first: at the start of a new block, four or more spaces of
        // indentation always mean code, so an indented `---`, `# x`, fence or
        // HTML tag must not be picked up by the checks below.
        if sp >= 4 && !is_blank(stripped) {
            cb := p.parse_indented_code(indent)
            parent.append_child(cb)
            continue
        }

        // --- thematic break (---, ***, ___) ---
        if is_thematic_break(stripped) {
            node := new_node(.thematic_break)
            parent.append_child(node)
            p.pos++
            continue
        }

        // --- ATX heading (# ... ######) ---
        if heading := p.try_atx_heading(stripped) {
            parent.append_child(heading)
            p.pos++
            continue
        }

        // --- fenced code block (``` or ~~~) ---
        // try_fenced_code advances p.pos itself.
        if fenced := p.try_fenced_code(stripped, indent) {
            parent.append_child(fenced)
            continue
        }

        // --- HTML block ---
        if html_blk := p.try_html_block(stripped, indent) {
            parent.append_child(html_blk)
            continue
        }

        // --- link reference definition ---
        // CommonMark allows 0-3 leading spaces after container indentation;
        // the definition parser receives the line without them.
        if sp <= 3 && p.try_link_ref_def(trim_indent(stripped, sp)) {
            continue
        }

        // --- footnote definition (if footnotes extension enabled) ---
        if p.opts.footnotes {
            if p.try_footnote_def(stripped, indent) {
                continue
            }
        }

        // --- blockquote (>) ---
        if stripped.starts_with('>') {
            bq := p.parse_blockquote(indent)
            parent.append_child(bq)
            continue
        }

        // --- list (bullet or ordered) ---
        if is_list_marker(stripped) {
            lst := p.parse_list(indent)
            parent.append_child(lst)
            continue
        }

        // --- GFM table (if tables extension enabled) ---
        if p.opts.tables {
            if tbl := p.try_table(indent) {
                parent.append_child(tbl)
                continue
            }
        }

        // --- definition list (if extension enabled) ---
        if p.opts.definition_list {
            if dl := p.try_definition_list(indent) {
                parent.append_child(dl)
                continue
            }
        }

        // --- paragraph (including setext headings) ---
        // Only heading and paragraph results are attached; any other node
        // kind returned by parse_paragraph is dropped.
        para := p.parse_paragraph(indent)
        if para.kind == .heading || para.kind == .paragraph {
            parent.append_child(para)
        }
    }
}
194 
// is_thematic_break returns true if line is a valid thematic break:
// at most three spaces of indentation, followed by three or more of the same
// marker (`-`, `*` or `_`), optionally separated or followed by spaces/tabs.
// A line indented by four or more columns (or by a tab) is never a thematic
// break, because that much indentation denotes indented code.
fn is_thematic_break(line string) bool {
    mut lead := 0
    for lead < line.len && line[lead] == ` ` {
        lead++
    }
    // A tab in the indentation reaches the next tab stop, i.e. column >= 4.
    if lead >= 4 || (lead < line.len && line[lead] == `\t`) {
        return false
    }
    body := line.trim_space()
    if body.len < 3 {
        return false
    }
    marker := body[0]
    if marker != `-` && marker != `*` && marker != `_` {
        return false
    }
    mut count := 0
    for ch in body {
        if ch == marker {
            count++
        } else if ch != ` ` && ch != `\t` {
            // Any other character disqualifies the line.
            return false
        }
    }
    return count >= 3
}
217 
// try_atx_heading attempts to parse an ATX heading from line.
// The opening run of 1-6 `#` characters may be preceded by up to three spaces
// and must be followed by a space, a tab or the end of the line. An optional
// closing run of `#` is removed when it is preceded by whitespace (or makes up
// the whole content). Returns the heading node on success, `none` otherwise.
// The node's `id` is filled in when the auto_heading_id option is set.
fn (mut p BlockParser) try_atx_heading(line string) ?&Node {
    // Skip the permitted indentation; four or more spaces mean indented code.
    mut start := 0
    for start < line.len && start < 4 && line[start] == ` ` {
        start++
    }
    if start >= 4 || start >= line.len || line[start] != `#` {
        return none
    }
    text := line[start..]
    mut level := 0
    for level < text.len && text[level] == `#` {
        level++
    }
    if level > 6 {
        return none
    }
    if level < text.len && text[level] != ` ` && text[level] != `\t` {
        return none
    }
    mut content := text[level..].trim_space()
    // Strip the trailing # sequence, but only if it is a separate token:
    // in `# foo#` the `#` belongs to the heading text.
    if content.ends_with('#') {
        without_hashes := content.trim_right('#')
        if without_hashes.len == 0 || without_hashes.ends_with(' ')
            || without_hashes.ends_with('\t') {
            content = without_hashes.trim_right(' \t')
        }
    }
    mut node := new_node(.heading)
    node.level = level
    node.literal = content
    if p.opts.parser_opts.auto_heading_id {
        node.id = heading_id_from_text(content)
    }
    return node
}
252 
// try_fenced_code attempts to parse a fenced code block starting at p.pos.
// `line` is the current line with the container indent removed; `indent` is
// that container indent and is removed from every following line as well.
// The opening fence may be indented by up to three spaces; the same amount of
// indentation is then removed from each content line. The block ends at a
// closing fence (same character, at least as long, at most three spaces of
// indentation, nothing but whitespace after it) or at the end of input.
// Returns `none` without touching p.pos when `line` is not an opening fence.
fn (mut p BlockParser) try_fenced_code(line string, indent int) ?&Node {
    fence_indent := leading_spaces(line)
    if fence_indent > 3 {
        return none
    }
    opener := line[fence_indent..]
    fence_char, fence_len := detect_fence(opener)
    if fence_len < 3 {
        return none
    }
    info := opener[fence_len..].trim_space()
    // info string must not contain a backtick when using backtick fence
    // (96 is the byte value of the backtick character).
    if fence_char == 96 && info.contains('`') {
        return none
    }
    p.pos++
    mut code_lines := []string{}
    for p.pos < p.lines.len {
        stripped := trim_indent(expand_tabs(p.lines[p.pos]), indent)
        // Check for closing fence.
        lead := leading_spaces(stripped)
        if lead <= 3 {
            close_char, close_len := detect_fence(stripped[lead..])
            if close_char == fence_char && close_len >= fence_len
                && stripped[lead + close_len..].trim_space().len == 0 {
                p.pos++
                break
            }
        }
        // Content line: drop as much indentation as the opening fence had.
        code_lines << trim_indent(stripped, fence_indent)
        p.pos++
    }
    mut node := new_node(.fenced_code)
    node.fence_info = info
    node.literal = code_lines.join('\n') + '\n'
    return node
}
286 
// detect_fence inspects the very first characters of line. When the line
// begins with a run of three or more backticks (byte 96) or tildes, the fence
// character and the run length are returned; in every other case (0, 0).
fn detect_fence(line string) (u8, int) {
    if line.len >= 3 && (line[0] == 96 || line[0] == `~`) {
        marker := line[0]
        mut run := 1
        for run < line.len && line[run] == marker {
            run++
        }
        if run >= 3 {
            return marker, run
        }
    }
    return 0, 0
}
306 
// parse_indented_code gathers an indented code block starting at p.pos.
// Lines that still have four or more leading spaces after the container
// indent is removed contribute their text with four spaces stripped. Blank
// lines are consumed and recorded as empty lines. The first non-blank line
// with less indentation ends the block. Trailing empty lines are not part of
// the resulting literal, which always ends with a single newline.
fn (mut p BlockParser) parse_indented_code(indent int) &Node {
    mut collected := []string{}
    for p.pos < p.lines.len {
        expanded := expand_tabs(p.lines[p.pos])
        if !is_blank(expanded) {
            body := trim_indent(expanded, indent)
            if leading_spaces(body) < 4 {
                break
            }
            collected << trim_indent(body, 4)
        } else {
            // Kept for now; dropped below if nothing but blanks follows.
            collected << ''
        }
        p.pos++
    }
    // Find the end of the block without its trailing blank lines.
    mut keep := collected.len
    for keep > 0 && collected[keep - 1] == '' {
        keep--
    }
    mut node := new_node(.code_block)
    node.literal = collected[..keep].join('\n') + '\n'
    return node
}
335 
// html_block_end_reached reports whether `line` satisfies the end condition of
// an HTML block of type 1-5. Types 6 and 7 end at a blank line instead and
// always yield false here.
fn html_block_end_reached(html_type int, line string) bool {
    match html_type {
        1 {
            low := line.to_lower()
            return low.contains('</script>') || low.contains('</pre>')
                || low.contains('</style>') || low.contains('</textarea>')
        }
        2 {
            return line.contains('-->')
        }
        3 {
            return line.contains('?>')
        }
        4 {
            return line.contains('>')
        }
        5 {
            return line.contains(']]>')
        }
        else {
            return false
        }
    }
}

// try_html_block attempts to parse an HTML block starting at p.pos.
// `line` is the current line with the container indent removed. Returns `none`
// (leaving p.pos alone) when the line does not start an HTML block. Otherwise
// the raw source lines of the block are joined into the node's literal.
//
// Types 1-5 end with the line that contains their end pattern (that line is
// part of the block); when the first line already contains it, the block is
// just that line. Types 6-7 end before the next blank line, which is left
// unconsumed. All types also end at the end of input.
fn (mut p BlockParser) try_html_block(line string, indent int) ?&Node {
    html_type := detect_html_block_type(line)
    if html_type == 0 {
        return none
    }
    mut raw_lines := []string{}
    raw_lines << p.lines[p.pos]
    p.pos++
    ends_at_blank := html_type >= 6
    // The opening line may already close the block, e.g. `<!-- note -->`.
    mut closed := !ends_at_blank && html_block_end_reached(html_type, line)
    for !closed && p.pos < p.lines.len {
        raw := p.lines[p.pos]
        stripped := trim_indent(expand_tabs(raw), indent)
        if ends_at_blank {
            if is_blank(stripped) {
                break
            }
        } else {
            closed = html_block_end_reached(html_type, stripped)
        }
        raw_lines << raw
        p.pos++
    }
    mut node := new_node(.html_block)
    node.literal = raw_lines.join('\n') + '\n'
    return node
}
402 
// detect_html_block_type returns the HTML block type (1-7) or 0 if the line
// does not start an HTML block. Leading spaces and tabs are ignored and the
// comparison is case-insensitive.
//   2: `<!--` comment            3: `<?` processing instruction
//   5: `<![CDATA[`               4: `<!` followed by a letter
//   1: `<script`, `<pre`, `<style`, `<textarea` followed by whitespace, `>`
//      or the end of the line
//   6: opening or closing tag whose name is in block_level_tags, followed by
//      whitespace, `>`, `/>` or the end of the line
//   7: any other line accepted by is_complete_html_tag
fn detect_html_block_type(line string) int {
    stripped := line.trim_left(' \t')
    if stripped.len == 0 || stripped[0] != `<` {
        return 0
    }
    low := stripped.to_lower()
    // Type 2: HTML comment
    if low.starts_with('<!--') {
        return 2
    }
    // Type 3: processing instruction
    if low.starts_with('<?') {
        return 3
    }
    // Type 5: CDATA
    if low.starts_with('<![cdata[') {
        return 5
    }
    // Type 4: <!X
    if low.len > 2 && low[1] == `!` && low[2] >= `a` && low[2] <= `z` {
        return 4
    }
    // Type 1: script/pre/style/textarea
    for _, tag in ['<script', '<pre', '<style', '<textarea'] {
        if low.starts_with(tag) {
            rest := low[tag.len..]
            if rest.len == 0 || rest[0] == ` ` || rest[0] == `\t` || rest[0] == `>`
                || rest[0] == `\n` {
                return 1
            }
        }
    }
    // Type 6: block-level tag. A closing tag (`</div>`) counts as well, so the
    // optional slash is skipped before the name is read.
    name_start := if low.len > 1 && low[1] == `/` { 2 } else { 1 }
    tag_name := extract_tag_name(low[name_start..])
    if tag_name.len > 0 {
        after := low[name_start + tag_name.len..]
        delimited := after.len == 0 || after[0] == ` ` || after[0] == `\t`
            || after[0] == `>` || after.starts_with('/>')
        if delimited {
            for bt in block_level_tags {
                if tag_name == bt {
                    return 6
                }
            }
        }
    }
    // Type 7: complete open/close tag not in type 6
    if is_complete_html_tag(stripped) {
        return 7
    }
    return 0
}
452 
// extract_tag_name returns the lower-cased run of alphanumeric characters and
// hyphens found at the start of s; the result is empty when s begins with any
// other character.
fn extract_tag_name(s string) string {
    for i in 0 .. s.len {
        if !is_alnum(s[i]) && s[i] != `-` {
            return s[..i].to_lower()
        }
    }
    return s.to_lower()
}
461 
// is_complete_html_tag is a loose check for a line that consists of a single
// tag: s must be at least three bytes long, start with `<`, and nothing but
// whitespace may follow its first `>`.
fn is_complete_html_tag(s string) bool {
    if s.len < 3 || s[0] != `<` {
        return false
    }
    gt := s.index('>') or { return false }
    return s[gt + 1..].trim_space().len == 0
}
474 
// try_link_ref_def attempts to parse a link reference definition
// (`[label]: destination "title"`) from `line`, the line at p.pos without its
// leading spaces. CommonMark allows the title to appear on the next line when
// the destination is alone on the first line. A title on the same line must be
// separated from the destination by whitespace.
// Returns true and advances p.pos past the consumed line(s) if successful;
// returns false and leaves p.pos untouched otherwise. The first definition of
// a label wins: later definitions are consumed but do not replace it.
fn (mut p BlockParser) try_link_ref_def(line string) bool {
    if !line.starts_with('[') {
        return false
    }
    // Find closing bracket, skipping backslash-escaped characters.
    mut i := 1
    for i < line.len && line[i] != `]` {
        if line[i] == `\\` {
            i++
        }
        i++
    }
    if i + 1 >= line.len || line[i] != `]` || line[i + 1] != `:` {
        return false
    }
    label := normalize_label(line[1..i])
    rest := line[i + 2..].trim_left(' \t')
    if label.len == 0 {
        return false
    }
    // Parse destination; an unchanged remainder means nothing was parsed.
    dest, after_dest := parse_link_dest(rest)
    if after_dest == rest {
        return false
    }
    // Whatever follows the destination must be separated from it by
    // whitespace: `[foo]: <bar>(baz)` is not a definition.
    if after_dest.len > 0 && after_dest[0] != ` ` && after_dest[0] != `\t` {
        return false
    }
    // Parse optional title.  The title may appear on the same line or, if the
    // destination is the only content on the first line, on the very next line.
    mut title := ''
    mut extra_lines := 0 // number of additional lines consumed for the title
    title_str := after_dest.trim_left(' \t')
    if title_str.len > 0 {
        // Title (or unwanted content) is on the same line as the destination.
        parsed_title, title_rest := parse_link_title(title_str)
        if title_rest == title_str {
            // Not a valid title; reject the whole definition.
            return false
        }
        if title_rest.trim_space().len > 0 {
            // Trailing content after the title – invalid.
            return false
        }
        title = parsed_title
    } else {
        // Destination was alone on its line; look for a title on the next line.
        next_idx := p.pos + 1
        if next_idx < p.lines.len {
            next_line := expand_tabs(p.lines[next_idx]).trim_left(' \t')
            if next_line.len > 0 {
                parsed_title, title_rest := parse_link_title(next_line)
                if title_rest != next_line && title_rest.trim_space().len == 0 {
                    // Valid title on the next line; consume it.
                    title = parsed_title
                    extra_lines = 1
                }
                // Otherwise the next line is left alone and the definition
                // simply has no title.
            }
        }
    }
    if label !in p.ref_map {
        p.ref_map[label] = LinkRef{
            dest:  dest
            title: title
        }
    }
    p.pos += 1 + extra_lines
    return true
}
544 
// parse_link_dest reads a link destination from the start of s and returns
// (dest, rest). On failure it returns ('', s), i.e. the input is handed back
// unchanged as `rest`.
// Two forms are understood:
//   - `<...>`: everything up to the first `>` is the destination;
//   - bare: the destination runs up to the first space, tab or newline, or up
//     to a `)` that has no matching `(`. A backslash protects the character
//     after it. An unclosed `(` or an empty destination is a failure.
fn parse_link_dest(s string) (string, string) {
    if s.len == 0 {
        return '', s
    }
    if s[0] == `<` {
        close_idx := s.index_after_('>', 1)
        if close_idx < 0 {
            return '', s
        }
        return s[1..close_idx], s[close_idx + 1..]
    }
    mut depth := 0 // currently open parentheses
    mut end := 0
    for end < s.len {
        ch := s[end]
        if ch == ` ` || ch == `\t` || ch == `\n` {
            break
        }
        if ch == `\\` && end + 1 < s.len {
            // Escaped character: skip it together with the backslash.
            end += 2
            continue
        }
        if ch == `(` {
            depth++
        } else if ch == `)` {
            if depth == 0 {
                break
            }
            depth--
        }
        end++
    }
    if end == 0 || depth != 0 {
        return '', s
    }
    return s[..end], s[end..]
}
587 
// parse_link_title reads a link title from the start of s and returns
// (title, rest). A title is delimited by "..." or '...' or (...); a backslash
// protects the character after it from being taken as the closing delimiter.
// The returned title has been passed through unescape_string. When s does not
// start with a delimiter or the title is never closed, ('', s) is returned.
fn parse_link_title(s string) (string, string) {
    if s.len == 0 {
        return '', s
    }
    mut closer := u8(0)
    if s[0] == `"` || s[0] == `'` {
        closer = s[0]
    } else if s[0] == `(` {
        closer = `)`
    } else {
        return '', s
    }

    mut i := 1
    for i < s.len {
        if s[i] == closer {
            return unescape_string(s[1..i]), s[i + 1..]
        }
        // Step over an escaped character in one go.
        i += if s[i] == `\\` && i + 1 < s.len { 2 } else { 1 }
    }
    return '', s
}
617 
// parse_blockquote parses the blockquote that starts at p.pos.
// Lines are consumed up to and including the next blank line (or the end of
// input). For each line the `>` marker and one following space, if present,
// are removed; lines without a marker are taken over unchanged as lazy
// continuation lines. The collected text is parsed by a nested parser and the
// nested parser's references and footnotes are merged back into p.
fn (mut p BlockParser) parse_blockquote(indent int) &Node {
    mut bq_lines := []string{}
    for p.pos < p.lines.len {
        stripped := trim_indent(expand_tabs(p.lines[p.pos]), indent)
        // Every inspected line is consumed, the terminating blank included.
        p.pos++
        if is_blank(stripped) {
            break
        }
        quoted := if stripped.starts_with('> ') {
            stripped[2..]
        } else if stripped.starts_with('>') {
            stripped[1..]
        } else {
            stripped
        }
        bq_lines << quoted
    }
    mut node := new_node(.blockquote)
    mut inner := p.nested_block_parser(bq_lines)
    inner.parse_blocks(mut node, 0)
    p.merge_nested_state(inner)
    return node
}
648 
// ListMarker holds parsed list marker information as produced by
// parse_list_marker.
struct ListMarker {
    // is_ordered is true for numbered markers (`1.` / `1)`), false for bullets.
    is_ordered  bool
    // bullet_char is the bullet (`-`, `*`, `+`) of a bullet list, or the
    // delimiter (`.` or `)`) that follows the number of an ordered marker.
    bullet_char u8
    // start is the number of an ordered marker; it is left at 0 for bullets.
    start       int
    indent      int // total indent of content after marker
}
656 
// is_list_marker reports whether parse_list_marker recognises a bullet or
// ordered list marker at the start of line. An empty line never does.
fn is_list_marker(line string) bool {
    if line.len > 0 {
        _, found := parse_list_marker(line)
        return found
    }
    return false
}
665 
// parse_list_marker parses a list marker from the beginning of line (leading
// spaces are skipped) and returns it together with a success flag.
//   - Bullet markers are `-`, `*` or `+`, followed by a space or tab, or
//     standing alone on the line (an empty item).
//   - Ordered markers are one to nine digits followed by `.` or `)`, then a
//     space, a tab or the end of the line. Longer digit runs are rejected so
//     the start number always fits into an int.
// On failure an empty ListMarker and false are returned.
fn parse_list_marker(line string) (ListMarker, bool) {
    sp := leading_spaces(line)
    rest := line[sp..]
    if rest.len == 0 {
        return ListMarker{}, false
    }
    // Bullet list: -, *, +
    if rest[0] == `-` || rest[0] == `*` || rest[0] == `+` {
        if rest.len == 1 {
            // A bare marker is an empty list item.
            return ListMarker{
                is_ordered:  false
                bullet_char: rest[0]
                indent:      sp + 2
            }, true
        }
        if rest[1] != ` ` && rest[1] != `\t` {
            // Marker glued to text (`-foo`, `**bold**`) is not a list.
            return ListMarker{}, false
        }
        content_indent := sp + 1 + (if rest[1] == `\t` { 3 } else { 1 })
        return ListMarker{
            is_ordered:  false
            bullet_char: rest[0]
            indent:      content_indent
        }, true
    }
    // Ordered list: 1. or 1)
    mut num_end := 0
    for num_end < rest.len && is_digit(rest[num_end]) {
        num_end++
    }
    if num_end > 0 && num_end <= 9 && num_end < rest.len
        && (rest[num_end] == `.` || rest[num_end] == `)`) {
        marker_end := num_end + 1
        if marker_end < rest.len && rest[marker_end] != ` ` && rest[marker_end] != `\t` {
            return ListMarker{}, false
        }
        start := rest[..num_end].int()
        mut content_indent := sp + marker_end + 1
        if marker_end < rest.len && rest[marker_end] == `\t` {
            // A tab after the marker moves the content to the next tab stop.
            content_indent = sp + marker_end + (4 - ((sp + marker_end) % 4))
        }
        return ListMarker{
            is_ordered:  true
            start:       start
            bullet_char: rest[num_end]
            indent:      content_indent
        }, true
    }
    return ListMarker{}, false
}
718 
// parse_list parses a list (bullet or ordered) starting at p.pos and returns
// a list node. The first item's marker fixes the list type; the list ends at
// the first non-blank line that is not a list marker, or whose marker differs
// in kind, bullet character or ordered delimiter (`.` vs `)`).
// The list starts out tight and becomes loose as soon as a blank line
// separates two of its items. If the line at p.pos carries no marker an empty
// list node is returned and nothing is consumed.
fn (mut p BlockParser) parse_list(indent int) &Node {
    // Determine list type from the first item's marker.
    if p.pos >= p.lines.len {
        return new_node(.list)
    }

    first_line := trim_indent(expand_tabs(p.lines[p.pos]), indent)
    marker, ok := parse_list_marker(first_line)
    if !ok {
        return new_node(.list)
    }

    mut list := new_node(.list)
    list.is_ordered = marker.is_ordered
    list.list_start = if marker.is_ordered { marker.start } else { 1 }
    list.is_tight = true

    // had_blank: a blank line lies between the previous item and p.pos.
    mut had_blank := false

    for p.pos < p.lines.len {
        stripped := trim_indent(expand_tabs(p.lines[p.pos]), indent)

        if is_blank(stripped) {
            had_blank = true
            p.pos++
            continue
        }

        cur_marker, marker_ok := parse_list_marker(stripped)
        if !marker_ok {
            break
        }
        // Different list type, bullet or ordered delimiter → stop.
        // For ordered markers bullet_char holds the delimiter.
        if cur_marker.is_ordered != marker.is_ordered
            || cur_marker.bullet_char != marker.bullet_char {
            break
        }

        if had_blank {
            list.is_tight = false
        }

        item := p.parse_list_item(indent)
        list.append_child(item)

        // parse_list_item swallows the blank lines that follow an item, so
        // look back at the last consumed line to see whether one was there.
        had_blank = p.pos > 0 && p.pos < p.lines.len
            && is_blank(expand_tabs(p.lines[p.pos - 1]))
    }

    return list
}
773 
// parse_list_item parses the list item whose marker is on the line at p.pos
// and returns a list_item node.
//
// The marker (optionally preceded by spaces) and the whitespace after it are
// removed from the first line; the column where the content starts becomes
// the item's content indent. Following lines belong to the item when they are
// blank, indented at least to the content indent, or lazy continuation text.
// The item stops before a line with less than two leading spaces that starts
// a list marker, and before an under-indented thematic break or `#` line.
// The collected lines are parsed by a nested parser whose references and
// footnotes are merged back. With the task_list option, a leading `[ ]`,
// `[x]` or `[X]` becomes a task_checkbox node placed first among the children.
fn (mut p BlockParser) parse_list_item(base_indent int) &Node {
    if p.pos >= p.lines.len {
        return new_node(.list_item)
    }

    mut item := new_node(.list_item)

    // Get first line of the item
    first_stripped := trim_indent(expand_tabs(p.lines[p.pos]), base_indent)

    // Find where the marker ends. Start after any indentation in front of the
    // marker, otherwise an indented marker would be left in the content.
    mut marker_idx := leading_spaces(first_stripped)
    if marker_idx < first_stripped.len && (first_stripped[marker_idx] == `-`
        || first_stripped[marker_idx] == `*` || first_stripped[marker_idx] == `+`) {
        // Bullet marker
        marker_idx++
    } else {
        // Ordered marker: skip number and . or )
        for marker_idx < first_stripped.len && is_digit(first_stripped[marker_idx]) {
            marker_idx++
        }
        if marker_idx < first_stripped.len
            && (first_stripped[marker_idx] == `.` || first_stripped[marker_idx] == `)`) {
            marker_idx++
        }
    }
    // Skip whitespace after marker
    for marker_idx < first_stripped.len
        && (first_stripped[marker_idx] == ` ` || first_stripped[marker_idx] == `\t`) {
        marker_idx++
    }

    // Get content after marker (empty for a bare marker).
    first_content := first_stripped[marker_idx..]

    // Detect task list checkbox: [ ] or [x] or [X] at the start of content.
    mut task_checked := false
    mut has_task := false
    mut task_content_start := 0
    if p.opts.task_list && first_content.len >= 3 && first_content[0] == `[` {
        if (first_content[1] == ` ` || first_content[1] == `x` || first_content[1] == `X`)
            && first_content[2] == `]` {
            if first_content.len == 3 || first_content[3] == ` ` || first_content[3] == `\t` {
                has_task = true
                task_checked = first_content[1] != ` `
                // Skip the checkbox and one optional following whitespace char.
                task_content_start = if first_content.len > 3 { 4 } else { 3 }
            }
        }
    }
    actual_first_content := if has_task {
        first_content[task_content_start..]
    } else {
        first_content
    }

    // Collect lines belonging to this item
    mut item_lines := [actual_first_content]
    p.pos++

    // Content indent is where subsequent lines must be indented to: the
    // column at which the first line's content starts.
    content_indent := marker_idx

    for p.pos < p.lines.len {
        stripped_base := trim_indent(expand_tabs(p.lines[p.pos]), base_indent)

        if is_blank(stripped_base) {
            item_lines << ''
            p.pos++
            continue
        }

        sp := leading_spaces(stripped_base)

        // If line has enough indent, include it
        if sp >= content_indent {
            // Remove the content indent
            item_lines << trim_indent(stripped_base, content_indent)
            p.pos++
            continue
        }

        // If line starts a new list item at base indent level, stop
        if sp < 2 {
            _, mk := parse_list_marker(stripped_base)
            if mk {
                break
            }
        }

        // Check if it might be a sub-block (less indented but meaningful)
        if is_thematic_break(stripped_base) || stripped_base.starts_with('#') {
            break
        }

        // Lazy continuation line: kept with its own indentation.
        item_lines << stripped_base
        p.pos++
    }

    // Trim trailing blank lines
    mut keep := item_lines.len
    for keep > 0 && item_lines[keep - 1] == '' {
        keep--
    }
    item_lines = item_lines[..keep].clone()

    // Recursively parse the item's content with fresh parser
    mut inner := p.nested_block_parser(item_lines)
    inner.parse_blocks(mut item, 0)

    // Merge back any new link references and footnote definitions.
    p.merge_nested_state(inner)

    // Prepend task checkbox node if detected (must be first child).
    if has_task {
        mut chk := new_node(.task_checkbox)
        chk.checked = task_checked
        mut new_children := [chk]
        for child in item.children {
            new_children << child
        }
        item.children = new_children
    }

    return item
}
913 
// try_table attempts to parse a GFM table starting at p.pos.
// The line at p.pos must contain a `|` and be followed by an alignment
// separator row that yields at least one column; otherwise `none` is returned
// and p.pos is not moved. Every following non-blank line containing a `|`
// becomes a body row. The table_body node is attached only when there is at
// least one body row.
fn (mut p BlockParser) try_table(indent int) ?&Node {
    // A table needs two lines: header and separator.
    if p.lines.len - p.pos < 2 {
        return none
    }
    sep_line := trim_indent(expand_tabs(p.lines[p.pos + 1]), indent)
    if !is_table_separator(sep_line) {
        return none
    }
    header_line := trim_indent(expand_tabs(p.lines[p.pos]), indent)
    if !header_line.contains('|') {
        return none
    }
    aligns := parse_table_alignments(sep_line)
    if aligns.len == 0 {
        return none
    }

    mut tbl := new_node(.table)

    // Header section with its single row.
    mut head := new_node(.table_head)
    head.append_child(parse_table_row(header_line, aligns))
    tbl.append_child(head)

    // Header and separator are consumed.
    p.pos += 2

    // Body section.
    mut body := new_node(.table_body)
    for p.pos < p.lines.len {
        row_line := trim_indent(expand_tabs(p.lines[p.pos]), indent)
        if is_blank(row_line) || !row_line.contains('|') {
            break
        }
        body.append_child(parse_table_row(row_line, aligns))
        p.pos++
    }
    if body.children.len > 0 {
        tbl.append_child(body)
    }
    return tbl
}
963 
// is_table_separator returns true if line is a table alignment separator row.
// Every non-empty cell must consist of one or more `-` characters, optionally
// preceded and/or followed by a single `:`. Cells that are empty after
// trimming are skipped. A line made up solely of pipes and whitespace is
// rejected.
fn is_table_separator(line string) bool {
    if line.trim('| \t').len == 0 {
        return false
    }
    cells := split_table_cells(line)
    if cells.len == 0 {
        return false
    }
    for cell in cells {
        c := cell.trim_space()
        if c.len == 0 {
            continue
        }
        // Peel the optional alignment colons one at a time. Slicing both ends
        // in a single step (c[1..c.len - 1]) panics for the one-character
        // cell ":" because the slice start would exceed its end.
        mut dashes := c
        if dashes.starts_with(':') {
            dashes = dashes[1..]
        }
        if dashes.ends_with(':') {
            dashes = dashes[..dashes.len - 1]
        }
        // At least one dash is required between the colons.
        if dashes.len == 0 {
            return false
        }
        for ch in dashes {
            if ch != `-` {
                return false
            }
        }
    }
    return true
}
999 
// parse_table_alignments derives one Alignment per non-empty cell of a
// separator line: a colon on both sides means centered, a leading colon means
// left, a trailing colon means right, and no colon means no explicit alignment.
// Cells that are empty after trimming do not produce an entry.
fn parse_table_alignments(line string) []Alignment {
    mut result := []Alignment{}
    for raw_cell in split_table_cells(line) {
        spec := raw_cell.trim_space()
        if spec.len == 0 {
            continue
        }
        colon_left := spec.starts_with(':')
        colon_right := spec.ends_with(':')
        align := if colon_left && colon_right {
            Alignment.center
        } else if colon_left {
            Alignment.left
        } else if colon_right {
            Alignment.right
        } else {
            Alignment.none_
        }
        result << align
    }
    return result
}
1023 
// parse_table_row builds a table_row node from a single row line.
// Each cell produced by split_table_cells becomes a table_cell child whose
// literal is the trimmed cell text. The cell takes the alignment at the same
// index in `aligns`, or .none_ when the row has more cells than `aligns`.
fn parse_table_row(line string, aligns []Alignment) &Node {
    mut row := new_node(.table_row)
    for idx, text in split_table_cells(line) {
        mut cell := new_node(.table_cell)
        cell.literal = text.trim_space()
        cell.align = if idx >= aligns.len { Alignment.none_ } else { aligns[idx] }
        row.append_child(cell)
    }
    return row
}
1036 
// split_table_cells splits a table row line by pipe characters.
// Surrounding whitespace and one optional leading / trailing delimiter pipe
// are removed first. A backslash-escaped pipe (`\|`) is never a delimiter and
// is emitted as a literal `|` inside the current cell. Cell text is returned
// untrimmed; the result always contains at least one (possibly empty) cell.
fn split_table_cells(line string) []string {
    trimmed := line.trim_space()
    // Work out the [start, end) window that excludes the outer delimiter pipes.
    mut start := 0
    mut end := trimmed.len
    if trimmed.starts_with('|') {
        start = 1
    }
    if end > start && trimmed[end - 1] == `|` {
        // A trailing `\|` is cell content, not the closing delimiter, so it
        // must be kept for the escape handling below.
        escaped := end - 2 >= start && trimmed[end - 2] == `\\`
        if !escaped {
            end--
        }
    }
    inner := trimmed[start..end]

    mut cells := []string{}
    mut current := strings.new_builder(32)
    for i := 0; i < inner.len; i++ {
        if inner[i] == `\\` && i + 1 < inner.len && inner[i + 1] == `|` {
            // Escaped pipe: emit the pipe and skip the backslash.
            current.write_u8(`|`)
            i++
        } else if inner[i] == `|` {
            // Builder.str() leaves the builder empty, so it can be reused
            // for the next cell without allocating a new one.
            cells << current.str()
        } else {
            current.write_u8(inner[i])
        }
    }
    cells << current.str()
    return cells
}
1067 
// try_definition_list attempts to parse a definition list starting at p.pos.
// The list is only recognised when the line after p.pos starts with ':'.
// Each non-blank line becomes a definition_term; the ':'-prefixed lines that
// directly follow it become its definition_desc children. A blank line is
// consumed and ends the list. Returns none when no term was collected.
fn (mut p BlockParser) try_definition_list(indent int) ?&Node {
    if p.pos + 1 >= p.lines.len {
        return none
    }
    lookahead := trim_indent(expand_tabs(p.lines[p.pos + 1]), indent)
    if !lookahead.starts_with(':') {
        return none
    }
    mut list := new_node(.definition_list)
    for p.pos < p.lines.len {
        term_text := trim_indent(expand_tabs(p.lines[p.pos]), indent)
        if is_blank(term_text) {
            // The terminating blank line belongs to the list.
            p.pos++
            break
        }
        // The current line is the term.
        mut term := new_node(.definition_term)
        term.literal = term_text
        p.pos++
        // Gather every directly following ':' line as a description.
        for p.pos < p.lines.len {
            candidate := trim_indent(expand_tabs(p.lines[p.pos]), indent)
            if !candidate.starts_with(':') {
                break
            }
            mut desc := new_node(.definition_desc)
            desc.literal = candidate[1..].trim_left(' \t')
            term.append_child(desc)
            p.pos++
        }
        list.append_child(term)
    }
    if list.children.len == 0 {
        return none
    }
    return list
}
1110 
// try_footnote_def attempts to parse a footnote definition starting at p.pos.
// `line` must have the form `[^label]: content` with a non-empty label;
// otherwise false is returned and the parser state is left untouched.
// On success the definition line and its continuation lines are consumed,
// the footnote is recorded in p.fn_defs (the first definition of a label
// wins) and true is returned.
fn (mut p BlockParser) try_footnote_def(line string, indent int) bool {
    if !line.starts_with('[^') {
        return false
    }
    end := line.index_after_(']', 2)
    if end < 0 || end + 1 >= line.len || line[end + 1] != `:` {
        return false
    }
    label := line[2..end]
    if label.len == 0 {
        return false
    }
    content := line[end + 2..].trim_left(' \t')
    mut fn_node := new_node(.footnote_def)
    fn_node.fn_label = label
    fn_node.literal = content
    p.pos++
    // Collect continuation lines (blank, or indented by at least 4 spaces).
    for p.pos < p.lines.len {
        raw := expand_tabs(p.lines[p.pos])
        stripped := trim_indent(raw, indent)
        if is_blank(stripped) {
            fn_node.literal += '\n' + stripped.trim_left(' ')
            p.pos++
        } else if leading_spaces(stripped) >= 4 {
            // Remove exactly one indentation level (at most 4 spaces).
            // trim_left takes a cutset and would strip every leading space,
            // destroying nested indentation inside the footnote body.
            mut cut := 0
            for cut < 4 && cut < stripped.len && stripped[cut] == ` ` {
                cut++
            }
            fn_node.literal += '\n' + stripped[cut..]
            p.pos++
        } else {
            break
        }
    }
    // Keep the first definition when a label is defined more than once.
    if label !in p.fn_defs {
        p.fn_defs[label] = fn_node
    }
    return true
}
1145 
// parse_paragraph collects consecutive lines into a paragraph node.
// If a setext underline (=== or ---) follows at least one collected line, the
// collected text is returned as a heading instead (level 1 for '=', level 2
// for '-'). A blank line ends the paragraph and is consumed; a line that
// starts another block ends it without being consumed.
fn (mut p BlockParser) parse_paragraph(indent int) &Node {
    mut collected := []string{}
    for p.pos < p.lines.len {
        line := trim_indent(expand_tabs(p.lines[p.pos]), indent)

        if is_blank(line) {
            p.pos++
            break
        }
        // Neither check applies to the very first line of the paragraph.
        if collected.len > 0 {
            if is_setext_underline(line) {
                underline := line.trim_left(' \t')
                text := collected.join('\n').trim_space()
                mut heading := new_node(.heading)
                heading.level = if underline[0] == `=` { 1 } else { 2 }
                heading.literal = text
                if p.opts.parser_opts.auto_heading_id {
                    heading.id = heading_id_from_text(text)
                }
                p.pos++
                return heading
            }
            // Thematic breaks, headings, block quotes, code fences and list
            // markers interrupt the paragraph.
            interrupts := is_thematic_break(line) || line.starts_with('#')
                || line.starts_with('>') || line.starts_with('```')
                || line.starts_with('~~~') || is_list_marker(line)
            if interrupts {
                break
            }
        }
        collected << line
        p.pos++
    }
    mut para := new_node(.paragraph)
    para.literal = collected.join('\n').trim_space()
    return para
}
1190 
// is_setext_underline returns true if line is a setext heading underline:
// 0-3 leading spaces, then one contiguous run of `=` or of `-`, followed only
// by optional trailing spaces or tabs. Lines with gaps inside the run (such as
// `= =` or `--- -`) are rejected, as CommonMark requires.
fn is_setext_underline(line string) bool {
    // Allow up to 3 leading spaces; a fourth leaves a space as the marker
    // candidate below and is therefore rejected.
    mut i := 0
    for i < 3 && i < line.len && line[i] == ` ` {
        i++
    }
    // Empty or all-space lines carry no marker.
    if i >= line.len {
        return false
    }
    marker := line[i]
    if marker != `=` && marker != `-` {
        return false
    }
    // Consume the contiguous run of marker characters.
    for i < line.len && line[i] == marker {
        i++
    }
    // Only trailing whitespace may follow the run.
    for i < line.len {
        if line[i] != ` ` && line[i] != `\t` {
            return false
        }
        i++
    }
    return true
}
1216 
// unescape_string decodes CommonMark backslash escapes in s.
// A backslash followed by a character accepted by is_ascii_punct is dropped
// and the character kept; every other byte, including a backslash that
// escapes nothing, is copied through unchanged.
fn unescape_string(s string) string {
    // Nothing to decode without a backslash.
    if !s.contains('\\') {
        return s
    }
    mut buf := strings.new_builder(s.len)
    for idx := 0; idx < s.len; idx++ {
        ch := s[idx]
        escapes_next := ch == `\\` && idx + 1 < s.len && is_ascii_punct(s[idx + 1])
        if escapes_next {
            // Skip the backslash and emit the escaped character instead.
            idx++
            buf.write_u8(s[idx])
        } else {
            buf.write_u8(ch)
        }
    }
    return buf.str()
}
1235