v2 / vlib / x / markdown / parser.v
1234 lines · 1147 sloc · 29.87 KB · 46c3d7f13d605a08603985fe4e6f82f2a8771775
Raw
1// Copyright 2026 The V Language. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module markdown
5
6import strings
7
// block_level_tags lists HTML tags that start an HTML block (type 6).
// detect_html_block_type compares a lower-cased tag name against these
// entries, so every name here is written in lower case.
// vfmt off
const block_level_tags = [
	'address', 'article', 'aside', 'base', 'basefont', 'blockquote', 'body', 'caption', 'center',
	'col', 'colgroup', 'dd', 'details', 'dialog', 'dir', 'div', 'dl', 'dt', 'fieldset',
	'figcaption', 'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5',
	'h6', 'head', 'header', 'hr', 'html', 'iframe', 'legend', 'li', 'link', 'main', 'menu',
	'menuitem', 'meta', 'nav', 'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'search',
	'section', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track',
	'ul'
]!
// vfmt on
20
// BlockParser parses markdown block structure line by line into an AST.
// After block parsing, inline content is parsed for every leaf node.
struct BlockParser {
	// opts holds the parser configuration (extension switches such as
	// footnotes, tables, definition_list and task_list are read from it).
	opts Options
mut:
	// lines is the input being parsed, one entry per source line.
	lines []string
	// pos is the index into lines of the next line to examine.
	pos int
	// ref_map collects link reference definitions keyed by normalized label;
	// the first definition of a label wins.
	ref_map map[string]LinkRef
	// fn_defs collects footnote definition nodes keyed by footnote label;
	// the first definition of a label wins.
	fn_defs map[string]&Node
}
31
// BlockParser.new builds a BlockParser over src.
// Line endings are normalized first (`\r\n` and lone `\r` become `\n`) and the
// text is then split into lines. The given ref_map is copied, so definitions
// recorded while parsing never modify the caller's map.
fn BlockParser.new(src string, opts Options, ref_map map[string]LinkRef) BlockParser {
	unix_src := src.replace('\r\n', '\n').replace('\r', '\n')
	return BlockParser{
		opts:    opts
		lines:   unix_src.split('\n')
		ref_map: ref_map.clone()
		fn_defs: map[string]&Node{}
	}
}
47
// nested_block_parser returns a fresh parser over lines for the content of a
// container block. It shares the current options, starts with a private copy
// of the reference definitions known so far, and has no footnote definitions.
fn (p &BlockParser) nested_block_parser(lines []string) BlockParser {
	inherited_refs := p.ref_map.clone()
	return BlockParser{
		opts:    p.opts
		lines:   lines
		ref_map: inherited_refs
		fn_defs: map[string]&Node{}
	}
}
62
// merge_nested_state copies state gathered by a nested parser back into p.
// Link references from inner overwrite entries with the same label. Footnote
// definitions are merged only when the footnotes option is on, and an existing
// definition in p is never replaced.
fn (mut p BlockParser) merge_nested_state(inner BlockParser) {
	for label, link in inner.ref_map {
		p.ref_map[label] = link
	}
	if !p.opts.footnotes {
		return
	}
	for label, def_node in inner.fn_defs {
		if label in p.fn_defs {
			continue
		}
		p.fn_defs[label] = def_node
	}
}
76
// parse runs block parsing over the whole input and returns the document
// root. When the footnotes option is enabled, every collected footnote
// definition is appended to the root after the regular blocks.
fn (mut p BlockParser) parse() &Node {
	mut root := new_node(.document)
	p.parse_blocks(mut root, 0)
	if !p.opts.footnotes {
		return root
	}
	for _, def_node in p.fn_defs {
		root.append_child(def_node)
	}
	return root
}
89
// parse_blocks fills parent with block-level children parsed from p.lines[p.pos..].
// indent is the minimum leading-space indent already consumed by a container.
//
// Each non-blank line is offered to the block recognizers in a fixed order:
// thematic break, ATX heading, fenced code, HTML block, link reference
// definition, footnote definition, blockquote, indented code, list, table,
// definition list and finally paragraph. The first recognizer that accepts the
// line consumes it (and any following lines it owns) by advancing p.pos.
fn (mut p BlockParser) parse_blocks(mut parent Node, indent int) {
	for p.pos < p.lines.len {
		line := expand_tabs(p.lines[p.pos])

		// --- blank line ---
		if is_blank(line) {
			p.pos++
			continue
		}

		stripped := trim_indent(line, indent)
		sp := leading_spaces(stripped)
		content := trim_indent(stripped, sp)

		// --- thematic break (---, ***, ___) ---
		// Only with 0-3 leading spaces; a deeper indent is indented code.
		if sp <= 3 && is_thematic_break(stripped) {
			node := new_node(.thematic_break)
			parent.append_child(node)
			p.pos++
			continue
		}

		// --- ATX heading (# ... ######) ---
		if heading := p.try_atx_heading(stripped) {
			parent.append_child(heading)
			p.pos++
			continue
		}

		// --- fenced code block (``` or ~~~) ---
		if fenced := p.try_fenced_code(stripped, indent) {
			parent.append_child(fenced)
			continue
		}

		// --- HTML block ---
		// Only with 0-3 leading spaces; a deeper indent is indented code.
		if sp <= 3 {
			if html_blk := p.try_html_block(stripped, indent) {
				parent.append_child(html_blk)
				continue
			}
		}

		// --- link reference definition ---
		// CommonMark allows 0-3 leading spaces after container indentation.
		if sp <= 3 && p.try_link_ref_def(content) {
			continue
		}

		// --- footnote definition (if footnotes extension enabled) ---
		if p.opts.footnotes {
			if p.try_footnote_def(stripped, indent) {
				continue
			}
		}

		// --- blockquote (>) ---
		if stripped.starts_with('>') {
			bq := p.parse_blockquote(indent)
			parent.append_child(bq)
			continue
		}

		// --- indented code block (4 spaces) ---
		if sp >= 4 && !is_blank(stripped) {
			cb := p.parse_indented_code(indent)
			parent.append_child(cb)
			continue
		}

		// --- list (bullet or ordered) ---
		if is_list_marker(stripped) {
			lst := p.parse_list(indent)
			parent.append_child(lst)
			continue
		}

		// --- GFM table (if tables extension enabled) ---
		if p.opts.tables {
			if tbl := p.try_table(indent) {
				parent.append_child(tbl)
				continue
			}
		}

		// --- definition list (if extension enabled) ---
		if p.opts.definition_list {
			if dl := p.try_definition_list(indent) {
				parent.append_child(dl)
				continue
			}
		}

		// --- paragraph (including setext headings) ---
		para := p.parse_paragraph(indent)
		if para.kind == .heading || para.kind == .paragraph {
			parent.append_child(para)
		}
	}
}
194
// is_thematic_break returns true if line is a valid thematic break:
// at most three leading spaces, then three or more of the same marker
// character (`-`, `*` or `_`), optionally separated and followed by spaces
// or tabs. Nothing else may appear on the line.
// A line indented by four or more spaces (or by a tab) is never a thematic
// break; CommonMark treats it as indented code or paragraph continuation.
fn is_thematic_break(line string) bool {
	mut start := 0
	for start < line.len && line[start] == ` ` {
		start++
	}
	if start > 3 || start >= line.len || line[start] == `\t` {
		return false
	}
	body := line[start..].trim_space()
	if body.len < 3 {
		return false
	}
	marker := body[0]
	if marker != `-` && marker != `*` && marker != `_` {
		return false
	}
	mut count := 0
	for i := 0; i < body.len; i++ {
		ch := body[i]
		if ch == marker {
			count++
		} else if ch != ` ` && ch != `\t` {
			return false
		}
	}
	return count >= 3
}
217
// try_atx_heading attempts to parse an ATX heading from line.
// The line may start with up to three spaces, followed by one to six `#`
// characters that must be followed by a space, a tab or the end of the line.
// An optional closing run of `#` is removed when it is preceded by whitespace
// or makes up the whole content.
// Returns the heading node on success and none otherwise. The node's id is
// filled in only when the auto_heading_id parser option is set.
fn (mut p BlockParser) try_atx_heading(line string) ?&Node {
	// Skip the permitted 0-3 spaces of indentation.
	mut start := 0
	for start < 3 && start < line.len && line[start] == ` ` {
		start++
	}
	mut hashes_end := start
	for hashes_end < line.len && line[hashes_end] == `#` {
		hashes_end++
	}
	level := hashes_end - start
	if level == 0 || level > 6 {
		return none
	}
	if hashes_end < line.len && line[hashes_end] != ` ` && line[hashes_end] != `\t` {
		return none
	}
	mut content := line[hashes_end..].trim_space()
	// Strip trailing # sequence, but only if it is a separate token.
	if content.ends_with('#') {
		without_hashes := content.trim_right('#')
		if without_hashes.len == 0 || without_hashes.ends_with(' ')
			|| without_hashes.ends_with('\t') {
			content = without_hashes.trim_right(' \t')
		}
	}
	mut node := new_node(.heading)
	node.level = level
	node.literal = content
	if p.opts.parser_opts.auto_heading_id {
		node.id = heading_id_from_text(content)
	}
	return node
}
252
// try_fenced_code attempts to parse a fenced code block whose opening fence
// is line (the line at p.pos with container indentation removed).
// On success p.pos is advanced past the closing fence, or to the end of the
// input when the fence is never closed, and a fenced_code node is returned
// whose literal is the body lines joined with '\n' plus a trailing '\n'.
// Returns none, leaving p.pos untouched, when line is not an opening fence.
fn (mut p BlockParser) try_fenced_code(line string, indent int) ?&Node {
	open_char, open_len := detect_fence(line)
	if open_len < 3 {
		return none
	}
	info := line[open_len..].trim_space()
	// A backtick fence (byte 96) may not carry a backtick in its info string.
	if open_char == 96 && info.contains('`') {
		return none
	}
	p.pos++
	mut body := []string{}
	for p.pos < p.lines.len {
		current := trim_indent(expand_tabs(p.lines[p.pos]), indent)
		p.pos++
		close_char, close_len := detect_fence(current)
		// A closing fence uses the same character, is at least as long as the
		// opener, and is followed by nothing but whitespace.
		is_closer := close_char == open_char && close_len >= open_len
			&& current[close_len..].trim_space().len == 0
		if is_closer {
			break
		}
		body << current
	}
	mut node := new_node(.fenced_code)
	node.fence_info = info
	node.literal = body.join('\n') + '\n'
	return node
}
286
// detect_fence inspects the start of line for a code fence: a run of at least
// three backticks (byte 96) or three tildes beginning at the first byte.
// It returns the fence character and the length of the run, or (0, 0) when
// line does not start with a fence.
fn detect_fence(line string) (u8, int) {
	if line.len >= 3 && (line[0] == 96 || line[0] == `~`) {
		marker := line[0]
		mut run := 1
		for run < line.len && line[run] == marker {
			run++
		}
		if run >= 3 {
			return marker, run
		}
	}
	return 0, 0
}
306
// parse_indented_code gathers an indented code block starting at p.pos.
// Lines indented by at least four spaces beyond indent are added with those
// four spaces removed; blank lines are added as empty strings. Collection
// stops at the first non-blank line with less indentation. Blank lines at the
// end of the block are dropped from the literal (they stay consumed).
fn (mut p BlockParser) parse_indented_code(indent int) &Node {
	mut code := []string{}
	for p.pos < p.lines.len {
		expanded := expand_tabs(p.lines[p.pos])
		if !is_blank(expanded) {
			relative := trim_indent(expanded, indent)
			if leading_spaces(relative) < 4 {
				break
			}
			code << trim_indent(relative, 4)
		} else {
			// Kept for now; removed below if nothing indented follows.
			code << ''
		}
		p.pos++
	}
	// Trim trailing blank lines.
	for code.len > 0 && code.last() == '' {
		code.delete_last()
	}
	mut node := new_node(.code_block)
	node.literal = code.join('\n') + '\n'
	return node
}
335
// try_html_block attempts to parse an HTML block starting at p.pos.
// line is the current line with container indentation removed. Returns none,
// without consuming anything, when line does not open an HTML block.
//
// Types 1-5 end with the line that contains their end marker; that line may be
// the opening line itself, in which case the block is a single line. Types 6
// and 7 end before the next blank line, which is not consumed. The node
// literal keeps the raw source lines, joined with '\n' plus a trailing '\n'.
fn (mut p BlockParser) try_html_block(line string, indent int) ?&Node {
	html_type := detect_html_block_type(line)
	if html_type == 0 {
		return none
	}
	mut raw_lines := [p.lines[p.pos]]
	p.pos++
	// The opening line can already satisfy the end condition, e.g.
	// `<!-- note -->` or `<!DOCTYPE html>`.
	mut closed := html_type <= 5 && html_block_end_reached(html_type, line)
	for !closed && p.pos < p.lines.len {
		raw := p.lines[p.pos]
		stripped := trim_indent(expand_tabs(raw), indent)
		if html_type >= 6 {
			if is_blank(stripped) {
				break
			}
		} else {
			closed = html_block_end_reached(html_type, stripped)
		}
		raw_lines << raw
		p.pos++
	}
	mut node := new_node(.html_block)
	node.literal = raw_lines.join('\n') + '\n'
	return node
}

// html_block_end_reached reports whether line contains the end marker of an
// HTML block of type 1-5. It returns false for every other type.
fn html_block_end_reached(html_type int, line string) bool {
	match html_type {
		1 {
			low := line.to_lower()
			return low.contains('</script>') || low.contains('</pre>')
				|| low.contains('</style>') || low.contains('</textarea>')
		}
		2 {
			return line.contains('-->')
		}
		3 {
			return line.contains('?>')
		}
		4 {
			return line.contains('>')
		}
		5 {
			return line.contains(']]>')
		}
		else {
			return false
		}
	}
}
402
// detect_html_block_type returns the HTML block type (1-7) or 0 if the line
// does not start an HTML block. Leading spaces and tabs are ignored and the
// comparison is case-insensitive.
//   1: <script, <pre, <style or <textarea
//   2: <!-- comment
//   3: <? processing instruction
//   4: <! followed by a letter
//   5: <![CDATA[
//   6: opening or closing tag whose name is in block_level_tags
//   7: any other line that looks like one complete tag
fn detect_html_block_type(line string) int {
	stripped := line.trim_left(' \t')
	if stripped.len == 0 || stripped[0] != `<` {
		return 0
	}
	low := stripped.to_lower()
	// Type 2: HTML comment
	if low.starts_with('<!--') {
		return 2
	}
	// Type 3: processing instruction
	if low.starts_with('<?') {
		return 3
	}
	// Type 5: CDATA
	if low.starts_with('<![cdata[') {
		return 5
	}
	// Type 4: <!X
	if low.len > 2 && low[1] == `!` && low[2] >= `a` && low[2] <= `z` {
		return 4
	}
	// Type 1: script/pre/style/textarea
	for _, tag in ['<script', '<pre', '<style', '<textarea'] {
		if low.starts_with(tag) {
			rest := low[tag.len..]
			if rest.len == 0 || rest[0] == ` ` || rest[0] == `\t` || rest[0] == `>`
				|| rest[0] == `\n` {
				return 1
			}
		}
	}
	// Type 6: block-level tag, either `<name` or `</name`. The name must be
	// followed by whitespace, `>`, `/>` or the end of the line so that e.g.
	// `<div:x>` is not mistaken for a `div`.
	name_start := if low.len > 1 && low[1] == `/` { 2 } else { 1 }
	tag_name := extract_tag_name(low[name_start..])
	if tag_name.len > 0 {
		after := low[name_start + tag_name.len..]
		terminated := after.len == 0 || after[0] == ` ` || after[0] == `\t`
			|| after[0] == `>` || after.starts_with('/>')
		if terminated {
			for bt in block_level_tags {
				if tag_name == bt {
					return 6
				}
			}
		}
	}
	// Type 7: complete open/close tag not in type 6
	if is_complete_html_tag(stripped) {
		return 7
	}
	return 0
}
452
// extract_tag_name returns the lower-cased tag name at the start of s: the
// longest prefix made of alphanumeric bytes and `-`. The result is empty when
// s does not begin with such a byte.
fn extract_tag_name(s string) string {
	for i in 0 .. s.len {
		if !is_alnum(s[i]) && s[i] != `-` {
			return s[..i].to_lower()
		}
	}
	return s.to_lower()
}
461
// is_complete_html_tag reports whether s looks like a single complete tag:
// it is at least three bytes long, starts with `<`, and only whitespace
// follows the first `>`.
fn is_complete_html_tag(s string) bool {
	if s.len < 3 || s[0] != `<` {
		return false
	}
	close_idx := s.index('>') or { return false }
	tail := s[close_idx + 1..]
	return tail.len == 0 || tail.trim_space().len == 0
}
474
// try_link_ref_def attempts to parse a link reference definition of the form
// `[label]: destination "optional title"` from line, the current line with
// its indentation removed.
// The title may follow the destination on the same line, or stand alone on
// the next line when the destination ends the first line. Anything on the
// first line that is not a valid title rejects the definition.
// On success the definition is stored unless the label is already known, p.pos
// is advanced past the consumed line(s) and true is returned. On failure p.pos
// is left unchanged and false is returned.
fn (mut p BlockParser) try_link_ref_def(line string) bool {
	if !line.starts_with('[') {
		return false
	}
	// Locate the `]` that closes the label, stepping over backslash escapes.
	mut close_idx := 1
	for close_idx < line.len && line[close_idx] != `]` {
		close_idx += if line[close_idx] == `\\` { 2 } else { 1 }
	}
	if close_idx >= line.len || line[close_idx] != `]` {
		return false
	}
	colon_idx := close_idx + 1
	if colon_idx >= line.len || line[colon_idx] != `:` {
		return false
	}
	label := normalize_label(line[1..close_idx])
	if label.len == 0 {
		return false
	}
	remainder := line[colon_idx + 1..].trim_left(' \t')
	// parse_link_dest hands back its input untouched when no destination is found.
	dest, after_dest := parse_link_dest(remainder)
	if after_dest == remainder {
		return false
	}
	mut title := ''
	mut consumed := 1 // lines used by this definition
	same_line := after_dest.trim_left(' \t')
	if same_line.len > 0 {
		// Whatever follows the destination must be exactly one valid title.
		parsed, leftover := parse_link_title(same_line)
		if leftover == same_line || leftover.trim_space().len > 0 {
			return false
		}
		title = parsed
	} else if p.pos + 1 < p.lines.len {
		// The destination ended the line; a title may sit alone on the next one.
		candidate := expand_tabs(p.lines[p.pos + 1]).trim_left(' \t')
		if candidate.len > 0 {
			parsed, leftover := parse_link_title(candidate)
			if leftover != candidate && leftover.trim_space().len == 0 {
				title = parsed
				consumed = 2
			}
		}
	}
	// The first definition of a label takes precedence.
	if label !in p.ref_map {
		p.ref_map[label] = LinkRef{
			dest:  dest
			title: title
		}
	}
	p.pos += consumed
	return true
}
544
// parse_link_dest reads a link destination from the start of s and returns
// (dest, rest), where rest is the unread remainder of s.
// Two forms are accepted: `<...>` (everything up to the first `>`), and a bare
// run of characters that ends at a space, tab, newline or an unbalanced `)`.
// In the bare form a backslash skips the following byte and parentheses must
// be balanced. On failure the result is ('', s).
fn parse_link_dest(s string) (string, string) {
	if s.len == 0 {
		return '', s
	}
	if s[0] == `<` {
		// Angle-bracket form: <url>
		close_idx := s.index_after_('>', 1)
		if close_idx < 0 {
			return '', s
		}
		return s[1..close_idx], s[close_idx + 1..]
	}
	mut depth := 0
	mut pos := 0
	for pos < s.len {
		ch := s[pos]
		if ch == ` ` || ch == `\t` || ch == `\n` {
			break
		}
		if ch == `\\` && pos + 1 < s.len {
			// Escaped byte: never interpreted as a delimiter.
			pos += 2
			continue
		}
		if ch == `(` {
			depth++
		} else if ch == `)` {
			if depth == 0 {
				break
			}
			depth--
		}
		pos++
	}
	if pos == 0 || depth != 0 {
		return '', s
	}
	return s[..pos], s[pos..]
}
587
// parse_link_title reads a link title from the start of s and returns
// (title, rest). A title is delimited by `"..."`, `'...'` or `(...)`; a
// backslash inside it skips the following byte, and the returned title has its
// backslash escapes decoded. When s does not start with a complete title the
// result is ('', s).
fn parse_link_title(s string) (string, string) {
	if s.len == 0 {
		return '', s
	}
	opener := s[0]
	mut closer := u8(0)
	if opener == `"` || opener == `'` {
		closer = opener
	} else if opener == `(` {
		closer = `)`
	} else {
		return '', s
	}

	mut pos := 1
	for pos < s.len {
		ch := s[pos]
		if ch == closer {
			return unescape_string(s[1..pos]), s[pos + 1..]
		}
		// Step over an escaped byte so it cannot close the title.
		pos += if ch == `\\` && pos + 1 < s.len { 2 } else { 1 }
	}
	return '', s
}
617
// parse_blockquote collects the lines of a blockquote starting at p.pos and
// returns a blockquote node holding their parsed content.
// A leading `> ` or `>` is removed from each line; lines without the marker
// are taken over unchanged as lazy continuation. The first blank line ends the
// quote and is consumed. The collected lines are parsed by a nested parser
// whose link references and footnotes are merged back into p.
fn (mut p BlockParser) parse_blockquote(indent int) &Node {
	mut quoted := []string{}
	for p.pos < p.lines.len {
		current := trim_indent(expand_tabs(p.lines[p.pos]), indent)
		p.pos++
		if is_blank(current) {
			break
		}
		unquoted := if current.starts_with('> ') {
			current[2..]
		} else if current.starts_with('>') {
			current[1..]
		} else {
			// Lazy continuation line.
			current
		}
		quoted << unquoted
	}
	mut node := new_node(.blockquote)
	mut inner := p.nested_block_parser(quoted)
	inner.parse_blocks(mut node, 0)
	p.merge_nested_state(inner)
	return node
}
648
// ListMarker holds parsed list marker information, as produced by
// parse_list_marker.
struct ListMarker {
	// is_ordered is true for numbered markers (`1.` / `1)`), false for bullets.
	is_ordered bool
	// bullet_char is the bullet (`-`, `*`, `+`) for bullet lists; for ordered
	// lists it holds the delimiter that follows the number (`.` or `)`).
	bullet_char u8
	// start is the number of an ordered marker; it is left at 0 for bullets.
	start int
	indent int // total indent of content after marker
}
656
// is_list_marker reports whether line begins with a bullet or ordered list
// marker, as judged by parse_list_marker. An empty line is never a marker.
fn is_list_marker(line string) bool {
	if line.len > 0 {
		_, found := parse_list_marker(line)
		return found
	}
	return false
}
665
// parse_list_marker parses a list marker from the beginning of line, after
// any leading spaces. It returns the marker and true on success, or an empty
// ListMarker and false when line does not start with a marker.
//
// Bullet markers are `-`, `*` or `+`, followed by a space or tab, or standing
// alone on the line (empty item). Ordered markers are one to nine digits
// followed by `.` or `)` and then a space, a tab or the end of the line.
// The nine digit limit follows CommonMark and keeps the number within int.
fn parse_list_marker(line string) (ListMarker, bool) {
	sp := leading_spaces(line)
	rest := line[sp..]
	if rest.len == 0 {
		return ListMarker{}, false
	}
	// Bullet list: -, *, +
	if rest[0] == `-` || rest[0] == `*` || rest[0] == `+` {
		if rest.len == 1 {
			// A bare marker is an empty list item.
			return ListMarker{
				is_ordered:  false
				bullet_char: rest[0]
				indent:      sp + 2
			}, true
		}
		if rest[1] != ` ` && rest[1] != `\t` {
			return ListMarker{}, false
		}
		content_indent := sp + 1 + (if rest[1] == `\t` { 3 } else { 1 })
		return ListMarker{
			is_ordered:  false
			bullet_char: rest[0]
			indent:      content_indent
		}, true
	}
	// Ordered list: 1. or 1)
	mut num_end := 0
	for num_end < rest.len && is_digit(rest[num_end]) {
		num_end++
	}
	if num_end == 0 || num_end > 9 || num_end >= rest.len {
		return ListMarker{}, false
	}
	delimiter := rest[num_end]
	if delimiter != `.` && delimiter != `)` {
		return ListMarker{}, false
	}
	marker_end := num_end + 1
	if marker_end < rest.len && rest[marker_end] != ` ` && rest[marker_end] != `\t` {
		return ListMarker{}, false
	}
	mut content_indent := sp + marker_end + 1
	if marker_end < rest.len && rest[marker_end] == `\t` {
		// A tab after the marker advances to the next multiple of four columns.
		content_indent = sp + marker_end + (4 - ((sp + marker_end) % 4))
	}
	return ListMarker{
		is_ordered:  true
		start:       rest[..num_end].int()
		bullet_char: delimiter
		indent:      content_indent
	}, true
}
718
// parse_list parses a list (bullet or ordered) starting at p.pos and returns
// a list node with one list_item child per item.
// The first marker fixes the list kind. The list ends at the first non-blank
// line that is not a marker of the same kind: a different bullet character, a
// different ordered delimiter (`.` vs `)`), or a switch between bullet and
// ordered all start a new list. The list is marked as not tight when a blank
// line separates two of its items.
// An empty list node is returned if p.pos does not point at a list marker.
fn (mut p BlockParser) parse_list(indent int) &Node {
	mut list := new_node(.list)
	if p.pos >= p.lines.len {
		return list
	}

	// Determine list type from the first item's marker.
	first_line := trim_indent(expand_tabs(p.lines[p.pos]), indent)
	marker, ok := parse_list_marker(first_line)
	if !ok {
		return list
	}

	list.is_ordered = marker.is_ordered
	list.list_start = if marker.is_ordered { marker.start } else { 1 }
	list.is_tight = true

	mut had_blank := false

	for p.pos < p.lines.len {
		stripped := trim_indent(expand_tabs(p.lines[p.pos]), indent)

		if is_blank(stripped) {
			had_blank = true
			p.pos++
			continue
		}

		cur_marker, marker_ok := parse_list_marker(stripped)
		if !marker_ok {
			break
		}
		// Different list type, bullet or ordered delimiter → stop.
		// (bullet_char holds the delimiter for ordered markers.)
		if cur_marker.is_ordered != marker.is_ordered
			|| cur_marker.bullet_char != marker.bullet_char {
			break
		}

		if had_blank {
			list.is_tight = false
		}
		had_blank = false

		item := p.parse_list_item(indent)
		list.append_child(item)

		// parse_list_item consumes the blank lines that follow an item, so
		// look back to learn whether a blank line precedes the next item.
		if p.pos > 0 && is_blank(expand_tabs(p.lines[p.pos - 1])) {
			had_blank = true
		}
	}

	return list
}
773
// parse_list_item parses the list item whose marker line is at p.pos and
// returns a list_item node. base_indent is the indentation of the enclosing
// container.
//
// The content of the item is the text after the marker plus every following
// line that is blank, indented to the content column, or a lazy continuation.
// The item ends at a list marker indented by fewer than two spaces, at a
// thematic break or at a line starting with `#`. The collected lines are
// parsed by a nested parser whose link references and footnotes are merged
// back into p. With the task_list option, a leading `[ ]`, `[x]` or `[X]`
// becomes a task_checkbox node placed first among the children.
fn (mut p BlockParser) parse_list_item(base_indent int) &Node {
	if p.pos >= p.lines.len {
		return new_node(.list_item)
	}

	mut item := new_node(.list_item)

	// Get first line of the item
	first_raw := expand_tabs(p.lines[p.pos])
	first_stripped := trim_indent(first_raw, base_indent)

	// The marker may be preceded by leading spaces (parse_list_marker accepts
	// them), so scanning starts after that indentation. marker_idx ends up at
	// the column of the first content byte.
	first_leading := leading_spaces(first_stripped)
	mut marker_idx := first_leading
	if marker_idx < first_stripped.len && (first_stripped[marker_idx] == `-`
		|| first_stripped[marker_idx] == `*` || first_stripped[marker_idx] == `+`) {
		// Bullet marker
		marker_idx++
	} else {
		// Ordered marker: skip number and . or )
		for marker_idx < first_stripped.len && is_digit(first_stripped[marker_idx]) {
			marker_idx++
		}
		if marker_idx < first_stripped.len
			&& (first_stripped[marker_idx] == `.` || first_stripped[marker_idx] == `)`) {
			marker_idx++
		}
	}
	// Skip whitespace after marker
	for marker_idx < first_stripped.len
		&& (first_stripped[marker_idx] == ` ` || first_stripped[marker_idx] == `\t`) {
		marker_idx++
	}

	// Get content after marker
	first_content := if marker_idx < first_stripped.len {
		first_stripped[marker_idx..]
	} else {
		''
	}

	// Detect task list checkbox: [ ] or [x] or [X] at the start of content.
	mut task_checked := false
	mut has_task := false
	mut task_content_start := 0
	if p.opts.task_list && first_content.len >= 3 && first_content[0] == `[` {
		if (first_content[1] == ` ` || first_content[1] == `x` || first_content[1] == `X`)
			&& first_content[2] == `]` {
			if first_content.len == 3 || first_content[3] == ` ` || first_content[3] == `\t` {
				has_task = true
				task_checked = first_content[1] != ` `
				// Skip the checkbox and one optional following whitespace char.
				task_content_start = if first_content.len > 3 { 4 } else { 3 }
			}
		}
	}
	actual_first_content := if has_task {
		first_content[task_content_start..]
	} else {
		first_content
	}

	// Collect lines belonging to this item
	mut item_lines := [actual_first_content]
	p.pos++

	// Content indent is where subsequent lines must be indented to. marker_idx
	// already includes the spaces in front of the marker.
	content_indent := marker_idx

	for p.pos < p.lines.len {
		raw := expand_tabs(p.lines[p.pos])
		stripped_base := trim_indent(raw, base_indent)

		if is_blank(stripped_base) {
			item_lines << ''
			p.pos++
			continue
		}

		sp := leading_spaces(stripped_base)

		// If line has enough indent, include it
		if sp >= content_indent {
			// Remove the content indent
			item_lines << trim_indent(stripped_base, content_indent)
			p.pos++
			continue
		}

		// If line starts a new list item at base indent level, stop
		if sp < 2 {
			_, mk := parse_list_marker(stripped_base)
			if mk {
				break
			}
		}

		// Check if it might be a sub-block (less indented but meaningful)
		if is_thematic_break(stripped_base) || stripped_base.starts_with('#') {
			break
		}

		// Lazy continuation line.
		item_lines << stripped_base
		p.pos++
	}

	// Trim trailing blank lines
	for item_lines.len > 0 && item_lines.last() == '' {
		item_lines.delete_last()
	}

	// Recursively parse the item's content with fresh parser
	mut inner := p.nested_block_parser(item_lines)
	inner.parse_blocks(mut item, 0)

	// Merge back any new link references and footnote definitions.
	p.merge_nested_state(inner)

	// Prepend task checkbox node if detected (must be first child).
	if has_task {
		mut chk := new_node(.task_checkbox)
		chk.checked = task_checked
		mut new_children := [chk]
		for child in item.children {
			new_children << child
		}
		item.children = new_children
	}

	return item
}
913
// try_table attempts to parse a GFM table starting at p.pos.
// A table requires a header row, an alignment row (|---|), then data rows.
// The header must contain a `|` and have exactly as many cells as the
// alignment row; otherwise the lines are not a table and none is returned
// without consuming anything. Data rows continue until a blank line or a line
// without `|`. The table_body child is added only when there is at least one
// data row.
fn (mut p BlockParser) try_table(indent int) ?&Node {
	if p.pos + 1 >= p.lines.len {
		return none
	}
	header_line := trim_indent(expand_tabs(p.lines[p.pos]), indent)
	sep_line := trim_indent(expand_tabs(p.lines[p.pos + 1]), indent)

	if !is_table_separator(sep_line) {
		return none
	}
	if !header_line.contains('|') {
		return none
	}

	aligns := parse_table_alignments(sep_line)
	if aligns.len == 0 {
		return none
	}
	// GFM: the header row must match the delimiter row in the number of
	// cells. This also keeps `a | b` followed by `---` a setext heading.
	if split_table_cells(header_line).len != aligns.len {
		return none
	}

	mut tbl := new_node(.table)

	// Header row.
	mut head := new_node(.table_head)
	header_row := parse_table_row(header_line, aligns)
	head.append_child(header_row)
	tbl.append_child(head)

	p.pos += 2

	// Body rows.
	mut body := new_node(.table_body)
	for p.pos < p.lines.len {
		raw := expand_tabs(p.lines[p.pos])
		stripped := trim_indent(raw, indent)
		if is_blank(stripped) || !stripped.contains('|') {
			break
		}
		body.append_child(parse_table_row(stripped, aligns))
		p.pos++
	}
	if body.children.len > 0 {
		tbl.append_child(body)
	}
	return tbl
}
963
// is_table_separator returns true if line is a table alignment separator row.
// Every non-empty cell must consist of one or more `-`, optionally preceded
// and/or followed by a single `:`. Empty cells are skipped. A line made only
// of pipes and whitespace is not a separator.
fn is_table_separator(line string) bool {
	if line.trim('| \t').len == 0 {
		return false
	}
	cells := split_table_cells(line)
	if cells.len == 0 {
		return false
	}
	for cell in cells {
		mut dashes := cell.trim_space()
		if dashes.len == 0 {
			continue
		}
		// Strip one alignment colon from each end. The strips run one after the
		// other so that a lone ':' is reduced to '' instead of being sliced
		// with start > end, which would panic.
		if dashes.starts_with(':') {
			dashes = dashes[1..]
		}
		if dashes.ends_with(':') {
			dashes = dashes[..dashes.len - 1]
		}
		if dashes.len == 0 {
			return false
		}
		for ch in dashes {
			if ch != `-` {
				return false
			}
		}
	}
	return true
}
999
// parse_table_alignments derives one Alignment per non-empty cell of a
// separator line: a colon on both ends means center, only a leading colon
// left, only a trailing colon right, and no colon none_.
fn parse_table_alignments(line string) []Alignment {
	mut aligns := []Alignment{}
	for raw_cell in split_table_cells(line) {
		spec := raw_cell.trim_space()
		if spec.len == 0 {
			continue
		}
		colon_first := spec.starts_with(':')
		colon_last := spec.ends_with(':')
		align := if colon_first && colon_last {
			Alignment.center
		} else if colon_first {
			Alignment.left
		} else if colon_last {
			Alignment.right
		} else {
			Alignment.none_
		}
		aligns << align
	}
	return aligns
}
1023
// parse_table_row turns a table row line into a table_row node with one
// table_cell child per cell. Each cell carries its trimmed text as literal and
// the alignment of its column; columns beyond aligns get none_.
fn parse_table_row(line string, aligns []Alignment) &Node {
	mut row := new_node(.table_row)
	for idx, text in split_table_cells(line) {
		mut cell := new_node(.table_cell)
		cell.align = Alignment.none_
		if idx < aligns.len {
			cell.align = aligns[idx]
		}
		cell.literal = text.trim_space()
		row.append_child(cell)
	}
	return row
}
1036
// split_table_cells splits a table row into its raw cell texts.
// The line is trimmed, one leading and one trailing `|` are dropped, and the
// rest is split at every `|`. The sequence `\|` does not split; it yields a
// literal `|` inside the cell. Cell texts are not trimmed, and the result
// always has at least one (possibly empty) cell.
fn split_table_cells(line string) []string {
	mut inner := line.trim_space()
	// Strip leading/trailing pipe.
	if inner.starts_with('|') {
		inner = inner[1..]
	}
	if inner.ends_with('|') {
		inner = inner[..inner.len - 1]
	}
	mut cells := []string{}
	mut current := strings.new_builder(32)
	mut i := 0
	for i < inner.len {
		ch := inner[i]
		if ch == `\\` && i + 1 < inner.len && inner[i + 1] == `|` {
			// Escaped pipe: part of the cell text.
			current.write_u8(`|`)
			i += 2
			continue
		}
		if ch == `|` {
			cells << current.str()
			current = strings.new_builder(32)
		} else {
			current.write_u8(ch)
		}
		i++
	}
	cells << current.str()
	return cells
}
1067
// try_definition_list attempts to parse a definition list starting at p.pos.
// The attempt is made only when the line after p.pos starts with `:`.
// Each non-blank line becomes a definition_term; the `:` lines directly below
// it become definition_desc children of that term, with the colon and
// following whitespace removed. A blank line ends the list and is consumed.
// Returns none when the lookahead fails or no term was collected.
fn (mut p BlockParser) try_definition_list(indent int) ?&Node {
	lookahead := p.pos + 1
	if lookahead >= p.lines.len {
		return none
	}
	if !trim_indent(expand_tabs(p.lines[lookahead]), indent).starts_with(':') {
		return none
	}
	mut dl := new_node(.definition_list)
	for p.pos < p.lines.len {
		term_text := trim_indent(expand_tabs(p.lines[p.pos]), indent)
		p.pos++
		if is_blank(term_text) {
			break
		}
		mut term := new_node(.definition_term)
		term.literal = term_text
		// Attach every definition line (:) that follows the term.
		for p.pos < p.lines.len {
			candidate := trim_indent(expand_tabs(p.lines[p.pos]), indent)
			if !candidate.starts_with(':') {
				break
			}
			mut desc := new_node(.definition_desc)
			desc.literal = candidate[1..].trim_left(' \t')
			term.append_child(desc)
			p.pos++
		}
		dl.append_child(term)
	}
	if dl.children.len == 0 {
		return none
	}
	return dl
}
1110
// try_footnote_def attempts to parse a footnote definition of the form
// `[^label]: text` from line, the current line with container indentation
// removed. Following lines that are blank or indented by at least four spaces
// are appended to the footnote text, each on a new line with its leading
// spaces removed.
// On success the definition is recorded unless the label is already defined,
// p.pos is moved past the consumed lines and true is returned. Otherwise
// nothing is consumed and false is returned.
fn (mut p BlockParser) try_footnote_def(line string, indent int) bool {
	if !line.starts_with('[^') {
		return false
	}
	close_idx := line.index_after_(']', 2)
	if close_idx < 0 {
		return false
	}
	colon_idx := close_idx + 1
	if colon_idx >= line.len || line[colon_idx] != `:` {
		return false
	}
	label := line[2..close_idx]
	if label.len == 0 {
		return false
	}
	mut parts := [line[colon_idx + 1..].trim_left(' \t')]
	p.pos++
	// Collect continuation lines (blank, or indented by at least 4 spaces).
	for p.pos < p.lines.len {
		candidate := trim_indent(expand_tabs(p.lines[p.pos]), indent)
		if !is_blank(candidate) && leading_spaces(candidate) < 4 {
			break
		}
		parts << candidate.trim_left(' ')
		p.pos++
	}
	mut fn_node := new_node(.footnote_def)
	fn_node.fn_label = label
	fn_node.literal = parts.join('\n')
	// The first definition of a label takes precedence.
	if label !in p.fn_defs {
		p.fn_defs[label] = fn_node
	}
	return true
}
1145
// parse_paragraph collects the lines of a paragraph starting at p.pos.
// The paragraph ends at a blank line (which is consumed) or, once it has at
// least one line, in front of a thematic break, a line starting with `#`,
// `>`, ``` or ~~~, or a list marker. If a setext underline (=== or ---)
// follows the collected lines, a heading node of level 1 or 2 is returned
// instead of a paragraph and the underline is consumed.
fn (mut p BlockParser) parse_paragraph(indent int) &Node {
	mut collected := []string{}
	for p.pos < p.lines.len {
		current := trim_indent(expand_tabs(p.lines[p.pos]), indent)

		if is_blank(current) {
			p.pos++
			break
		}
		if collected.len > 0 {
			// A setext underline turns what was gathered so far into a heading.
			if is_setext_underline(current) {
				text := collected.join('\n').trim_space()
				mut heading := new_node(.heading)
				heading.level = if current.trim_left(' \t')[0] == `=` { 1 } else { 2 }
				heading.literal = text
				if p.opts.parser_opts.auto_heading_id {
					heading.id = heading_id_from_text(text)
				}
				p.pos++
				return heading
			}
			// Other block starters interrupt a paragraph.
			interrupts := is_thematic_break(current) || current.starts_with('#')
				|| current.starts_with('>') || current.starts_with('```')
				|| current.starts_with('~~~') || is_list_marker(current)
			if interrupts {
				break
			}
		}
		collected << current
		p.pos++
	}
	mut node := new_node(.paragraph)
	node.literal = collected.join('\n').trim_space()
	return node
}
1190
// is_setext_underline returns true if line is a setext heading underline:
// 0-3 leading spaces, then one or more `=` or one or more `-` in a single
// unbroken run, followed only by optional spaces or tabs.
// Lines such as `= =` or `--- -` are not underlines, because CommonMark does
// not allow whitespace inside the run.
fn is_setext_underline(line string) bool {
	// Allow up to 3 leading spaces.
	mut i := 0
	for i < 3 && i < line.len && line[i] == ` ` {
		i++
	}
	if i >= line.len {
		return false
	}
	marker := line[i]
	if marker != `=` && marker != `-` {
		return false
	}
	// Consume the contiguous run of marker characters.
	for i < line.len && line[i] == marker {
		i++
	}
	// Only trailing whitespace may remain.
	for i < line.len {
		if line[i] != ` ` && line[i] != `\t` {
			return false
		}
		i++
	}
	return true
}
1216
// unescape_string decodes backslash escapes in s: a backslash followed by an
// ASCII punctuation byte is replaced by that byte. Any other backslash is kept
// as it is. Strings without a backslash are returned unchanged.
fn unescape_string(s string) string {
	if !s.contains('\\') {
		return s
	}
	mut out := strings.new_builder(s.len)
	mut i := 0
	for i < s.len {
		is_escape := s[i] == `\\` && i + 1 < s.len && is_ascii_punct(s[i + 1])
		if is_escape {
			// Drop the backslash and emit the escaped byte below.
			i++
		}
		out.write_u8(s[i])
		i++
	}
	return out.str()
}
1235