| 1 | // Copyright 2026 The V Language. All rights reserved. |
| 2 | // Use of this source code is governed by an MIT license |
| 3 | // that can be found in the LICENSE file. |
| 4 | module markdown |
| 5 | |
| 6 | import strings |
| 7 | |
// unicode_space is the table of code points this module treats as whitespace.
// Besides ASCII whitespace and the Unicode space/line/paragraph separators it
// also contains NEL (U+0085) and the zero-width / format characters
// U+180E, U+200B..U+200D, U+2060 and U+FEFF.
// The first element is a rune literal, which fixes the element type.
// vfmt off
const unicode_space = [
	` `,    // U+0020 space
	`\t`,   // U+0009 tab
	0x000A, // line feed
	0x000B, // vertical tab
	0x000C, // form feed
	0x000D, // carriage return
	0x0085, // next line (NEL)
	0x00A0, // no-break space
	0x1680, // ogham space mark
	0x180E, // mongolian vowel separator
	0x2000, // en quad
	0x2001, // em quad
	0x2002, // en space
	0x2003, // em space
	0x2004, // three-per-em space
	0x2005, // four-per-em space
	0x2006, // six-per-em space
	0x2007, // figure space
	0x2008, // punctuation space
	0x2009, // thin space
	0x200A, // hair space
	0x200B, // zero width space
	0x200C, // zero width non-joiner
	0x200D, // zero width joiner
	0x2028, // line separator
	0x2029, // paragraph separator
	0x202F, // narrow no-break space
	0x205F, // medium mathematical space
	0x2060, // word joiner
	0x3000, // ideographic space
	0xFEFF, // zero width non-breaking space (BOM)
]!

// ascii_punct is the table of the 32 ASCII punctuation characters
// (U+0021..U+002F, U+003A..U+0040, U+005B..U+0060, U+007B..U+007E).
const ascii_punct = [
	`!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`,
	`)`, `*`, `+`, `,`, `-`, `.`, `/`, `:`,
	`;`, `<`, `=`, `>`, `?`, `@`, `[`, `\\`,
	`]`, `^`, `_`, `\``, `{`, `|`, `}`, `~`,
]!

// alpha is the table of ASCII letters: lower case a-z followed by upper case A-Z.
const alpha = [
	`a`, `b`, `c`, `d`, `e`, `f`, `g`, `h`, `i`, `j`, `k`, `l`, `m`,
	`n`, `o`, `p`, `q`, `r`, `s`, `t`, `u`, `v`, `w`, `x`, `y`, `z`,
	`A`, `B`, `C`, `D`, `E`, `F`, `G`, `H`, `I`, `J`, `K`, `L`, `M`,
	`N`, `O`, `P`, `Q`, `R`, `S`, `T`, `U`, `V`, `W`, `X`, `Y`, `Z`,
]!
// vfmt on
| 58 | |
// html_escape returns s with the HTML-significant characters `&`, `<`, `>`
// and `"` replaced by the entities `&amp;`, `&lt;`, `&gt;` and `&quot;`.
// All other bytes are copied through unchanged. When s contains none of the
// four characters it is returned as-is, without building a new string.
fn html_escape(s string) string {
	// Fast path: nothing to escape.
	if s.index_any('&<>"') == -1 {
		return s
	}
	// Every replacement is longer than the byte it replaces, so reserve a
	// little headroom beyond the input length.
	mut sb := strings.new_builder(s.len + 8)
	for i := 0; i < s.len; i++ {
		match s[i] {
			`&` { sb.write_string('&amp;') }
			`<` { sb.write_string('&lt;') }
			`>` { sb.write_string('&gt;') }
			`"` { sb.write_string('&quot;') }
			else { sb.write_u8(s[i]) }
		}
	}
	return sb.str()
}
| 76 | |
// url_safe_chars is the set of bytes url_encode copies through verbatim:
// ASCII letters and digits plus the punctuation listed at the end of the
// string. `%` is part of the set, so percent signs in the input are never
// themselves encoded.
const url_safe_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~:/?#[]@!$&()*+,;=%'

// url_encode returns s with every byte that is not listed in url_safe_chars
// replaced by `%XX`, where XX is the byte value as two upper-case hexadecimal
// digits. The input is processed byte by byte, so a multi-byte UTF-8
// character becomes one `%XX` group per byte.
fn url_encode(s string) string {
	mut encoded := strings.new_builder(s.len)
	for b in s {
		if url_safe_chars.index_u8(b) < 0 {
			// Not in the safe set: emit the percent-escape.
			encoded.write_string('%${b:02X}')
			continue
		}
		encoded.write_u8(b)
	}
	return encoded.str()
}
| 94 | |
// normalize_label produces the canonical form of a link reference label:
//   * leading and trailing characters listed in unicode_space are removed,
//   * every internal run of such characters becomes a single ASCII space,
//   * the ASCII letters A-Z are lowered to a-z (other characters, including
//     non-ASCII upper-case letters, are copied unchanged).
fn normalize_label(s string) string {
	mut sb := strings.new_builder(s.len)
	mut wrote_any := false // becomes true once the first non-space rune is out
	mut gap_pending := false // a whitespace run was seen after some output
	for r in s.runes() {
		if r in unicode_space {
			// Remember the gap only if there is text before it; this is what
			// drops leading whitespace. A trailing gap is never flushed.
			gap_pending = wrote_any
			continue
		}
		if gap_pending {
			sb.write_u8(` `)
			gap_pending = false
		}
		if r >= `A` && r <= `Z` {
			sb.write_u8(u8(r + 32))
		} else {
			sb.write_string(r.str())
		}
		wrote_any = true
	}
	return sb.str()
}
| 123 | |
// ascii_lower maps the ASCII upper-case letters A-Z to a-z and returns every
// other byte unchanged.
@[inline]
fn ascii_lower(c u8) u8 {
	// Upper- and lower-case ASCII letters are exactly 32 apart.
	return if c >= `A` && c <= `Z` { c + 32 } else { c }
}
| 132 | |
// is_unicode_space reports whether the code point c is one of the entries of
// the unicode_space table.
@[inline]
fn is_unicode_space(c rune) bool {
	return c in unicode_space
}
| 138 | |
// is_ascii_punct reports whether c is one of the ASCII punctuation characters
// listed in ascii_punct. Code points above U+007F are never punctuation here.
@[inline]
fn is_ascii_punct(c rune) bool {
	if c > 0x7f {
		return false
	}
	// c fits in one byte from here on, so the narrowing cast is lossless.
	return u8(c) in ascii_punct
}
| 144 | |
// digits is the table of ASCII decimal digits `0` through `9`.
const digits = [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`]!

// is_digit reports whether c is an ASCII decimal digit (`0`..`9`).
@[inline]
fn is_digit(c u8) bool {
	// The ten digits are contiguous in ASCII, so a range test is equivalent
	// to a lookup in the digits table.
	return c >= `0` && c <= `9`
}
| 151 | |
// is_alpha reports whether c is an ASCII letter (a-z or A-Z).
@[inline]
fn is_alpha(c u8) bool {
	// Both letter ranges are contiguous in ASCII, so two range tests cover
	// exactly the 52 entries of the alpha table.
	return (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`)
}
| 157 | |
// is_alnum reports whether c is an ASCII decimal digit or an ASCII letter.
@[inline]
fn is_alnum(c u8) bool {
	return is_digit(c) || is_alpha(c)
}
| 163 | |
// heading_id_from_text builds a slug suitable for an id attribute from plain
// heading text (the original author's note says it is meant to match
// goldmark's AutoHeadingID output):
//   * ASCII letters and digits are kept, letters lowered,
//   * `-`, `_` and any unicode_space character act as separators; a run of
//     separators yields a single `-`,
//   * every other character is dropped,
//   * the slug never starts or ends with `-`.
// The result is empty when text contains no ASCII letter or digit.
fn heading_id_from_text(text string) string {
	mut slug := strings.new_builder(text.len)
	mut has_output := false // true once the first slug character is written
	mut dash_pending := false // a separator was seen after some output
	for r in text.runes() {
		if r <= 0x7f && is_alnum(u8(r)) {
			// Flush the separator only now, so trailing separators never
			// reach the output.
			if dash_pending {
				slug.write_u8(`-`)
				dash_pending = false
			}
			slug.write_u8(ascii_lower(u8(r)))
			has_output = true
			continue
		}
		if r == `-` || r == `_` || r in unicode_space {
			// Separators before the first slug character are ignored.
			dash_pending = has_output
		}
		// Anything else (punctuation, non-ASCII text) is dropped.
	}
	return slug.str()
}
| 185 | |
// count_leading returns the length of the run of byte c at the very start of
// s: 0 when s is empty or starts with another byte, s.len when s consists
// solely of c.
@[inline]
fn count_leading(s string, c u8) int {
	for idx, b in s {
		if b != c {
			// idx is the position of the first mismatch, i.e. the run length.
			return idx
		}
	}
	return s.len
}
| 196 | |
// expand_tabs replaces every tab character in s (not only leading ones) with
// the number of spaces needed to reach the next tab stop; tab stops are 4
// columns apart. The column counter advances by one per byte. A string
// without tabs is returned unchanged.
fn expand_tabs(s string) string {
	if !s.contains('\t') {
		return s
	}
	mut expanded := strings.new_builder(s.len)
	mut column := 0
	for b in s {
		if b != `\t` {
			expanded.write_u8(b)
			column++
			continue
		}
		// Distance to the next multiple of 4; a tab sitting exactly on a
		// tab stop still advances a full 4 columns.
		pad := 4 - (column % 4)
		for _ in 0 .. pad {
			expanded.write_u8(` `)
		}
		column += pad
	}
	return expanded.str()
}
| 218 | |
// trim_indent returns s without its leading spaces, removing at most n of
// them. Tabs are not treated as indentation, and a non-positive n leaves s
// untouched.
@[inline]
fn trim_indent(s string, n int) string {
	mut cut := 0
	for cut < n && cut < s.len {
		if s[cut] != ` ` {
			break
		}
		cut++
	}
	return s[cut..]
}
| 228 | |
// is_blank reports whether s is empty or made up exclusively of spaces and
// tabs. Any other byte, including line breaks, makes the string non-blank.
@[inline]
fn is_blank(s string) bool {
	for b in s {
		if !(b == ` ` || b == `\t`) {
			return false
		}
	}
	return true
}
| 239 | |
// leading_spaces returns how many space characters s starts with. Counting
// stops at the first byte that is not a space, so a tab ends the run.
@[inline]
fn leading_spaces(s string) int {
	for idx, b in s {
		if b != ` ` {
			return idx
		}
	}
	return s.len
}
| 249 | |
// quote_opens reports whether a quote character located at byte offset pos of
// s should be rendered as an opening quote: it is at the start of the string,
// or the character immediately before it is whitespace (is_unicode_space) or
// ASCII punctuation (is_ascii_punct).
// The preceding character is decoded as a whole UTF-8 code point. Looking at
// its last byte alone would mistake the continuation bytes 0x85 and 0xA0
// (the tail of e.g. `Å` or `à`) for NEL / NBSP and would miss multi-byte
// spaces such as U+2003.
fn quote_opens(s string, pos int) bool {
	if pos == 0 {
		return true
	}
	// Walk back over UTF-8 continuation bytes (10xxxxxx) to the first byte of
	// the character ending at pos; a character is at most 4 bytes long.
	mut start := pos - 1
	for start > 0 && pos - start < 4 && (s[start] & 0xC0) == 0x80 {
		start--
	}
	decoded := s[start..pos].runes()
	if decoded.len == 0 {
		return false
	}
	prev := decoded[decoded.len - 1]
	return is_unicode_space(prev) || is_ascii_punct(prev)
}

// smart_punctuate applies typographic substitutions to s and returns the
// result:
//   * `---` becomes an em dash (U+2014), `--` an en dash (U+2013),
//   * `...` becomes an ellipsis (U+2026),
//   * `'` and `"` become curly quotes: the opening form at the start of the
//     string or after whitespace / ASCII punctuation, the closing form
//     otherwise.
// A single `-`, one or two `.` and all other bytes are copied unchanged.
fn smart_punctuate(s string) string {
	mut out := strings.new_builder(s.len)
	mut j := 0
	for j < s.len {
		c := s[j]
		if c == `-` {
			// Check the longer pattern first so `---` is not read as `--` + `-`.
			if j + 2 < s.len && s[j + 1] == `-` && s[j + 2] == `-` {
				out.write_string('\u2014') // em dash
				j += 3
				continue
			} else if j + 1 < s.len && s[j + 1] == `-` {
				out.write_string('\u2013') // en dash
				j += 2
				continue
			}
		} else if c == `.` {
			if j + 2 < s.len && s[j + 1] == `.` && s[j + 2] == `.` {
				out.write_string('\u2026') // ellipsis
				j += 3
				continue
			}
		} else if c == `'` {
			if quote_opens(s, j) {
				out.write_string('\u2018') // left single quote
			} else {
				out.write_string('\u2019') // right single quote
			}
			j++
			continue
		} else if c == `"` {
			if quote_opens(s, j) {
				out.write_string('\u201C') // left double quote
			} else {
				out.write_string('\u201D') // right double quote
			}
			j++
			continue
		}
		// No substitution applied: copy the byte through as-is.
		out.write_u8(c)
		j++
	}
	return out.str()
}
| 298 | |