v2 / vlib / x / markdown / util.v
297 lines · 276 sloc · 7.29 KB · 46c3d7f13d605a08603985fe4e6f82f2a8771775
Raw
1// Copyright 2026 The V Language. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module markdown
5
6import strings
7
// unicode_space is the table of code points this module treats as
// whitespace. Membership is tested with `in` (see is_unicode_space).
// vfmt off
const unicode_space = [
	` `,    // U+0020 space
	`\t`,   // U+0009 tab
	0x000A, // line feed (LF)
	0x000B, // vertical tab
	0x000C, // form feed (FF)
	0x000D, // carriage return (CR)
	0x0085, // next line
	0x00A0, // no-break space
	0x1680, // ogham space mark
	0x180E, // mongolian vowel separator
	0x2000, // en quad
	0x2001, // em quad
	0x2002, // en space
	0x2003, // em space
	0x2004, // three-per-em space
	0x2005, // four-per-em space
	0x2006, // six-per-em space
	0x2007, // figure space
	0x2008, // punctuation space
	0x2009, // thin space
	0x200A, // hair space
	0x200B, // zero width space
	0x200C, // zero width non-joiner
	0x200D, // zero width joiner
	0x2028, // line separator
	0x2029, // paragraph separator
	0x202F, // narrow no-break space
	0x205F, // medium mathematical space
	0x2060, // word joiner
	0x3000, // ideographic space
	0xFEFF, // zero width non-breaking space
]!
43
// ascii_punct is the table of ASCII punctuation characters, in code-point
// order, one row per contiguous ASCII range.
const ascii_punct = [
	`!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `-`, `.`, `/`, // 0x21-0x2F
	`:`, `;`, `<`, `=`, `>`, `?`, `@`, // 0x3A-0x40
	`[`, `\\`, `]`, `^`, `_`, `\``, // 0x5B-0x60
	`{`, `|`, `}`, `~`, // 0x7B-0x7E
]!
49
// alpha is the table of ASCII letters: lower case a-z first, then upper
// case A-Z.
const alpha = [
	`a`, `b`, `c`, `d`, `e`, `f`, `g`, `h`, `i`, `j`, `k`, `l`, `m`, `n`, `o`, `p`, `q`, `r`, `s`, `t`, `u`, `v`, `w`, `x`, `y`, `z`,
	`A`, `B`, `C`, `D`, `E`, `F`, `G`, `H`, `I`, `J`, `K`, `L`, `M`, `N`, `O`, `P`, `Q`, `R`, `S`, `T`, `U`, `V`, `W`, `X`, `Y`, `Z`,
]!
// vfmt on
58
// html_escape returns s with the HTML-significant characters replaced by
// their entity equivalents:
//   `&` -> `&amp;`, `<` -> `&lt;`, `>` -> `&gt;`, `"` -> `&quot;`
// All other bytes are copied through unchanged. When s contains none of the
// four characters, s itself is returned.
fn html_escape(s string) string {
	// Fast path: nothing to escape.
	if s.index_any('&<>"') == -1 {
		return s
	}
	// Every replacement is longer than the byte it replaces, so reserve a
	// little headroom over the input length.
	mut sb := strings.new_builder(s.len + 8)
	for c in s {
		match c {
			`&` { sb.write_string('&amp;') }
			`<` { sb.write_string('&lt;') }
			`>` { sb.write_string('&gt;') }
			`"` { sb.write_string('&quot;') }
			else { sb.write_u8(c) }
		}
	}
	return sb.str()
}
76
// url_safe_chars is the set of bytes url_encode copies through verbatim:
// ASCII letters, digits and the listed punctuation. `%` is part of the set,
// so existing percent-escapes are not encoded a second time.
const url_safe_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~:/?#[]@!$&()*+,;=%'

// url_encode returns s with every byte that is not in url_safe_chars
// replaced by `%XX`, where XX is the byte value as two upper-case hex digits.
// The input is processed byte by byte, so a multi-byte UTF-8 character
// becomes one escape per byte.
fn url_encode(s string) string {
	mut sb := strings.new_builder(s.len)
	for c in s {
		if url_safe_chars.index_u8(c) < 0 {
			sb.write_string('%${c:02X}')
			continue
		}
		sb.write_u8(c)
	}
	return sb.str()
}
94
// normalize_label normalises a link reference label for matching:
//   - leading and trailing whitespace (code points in unicode_space) is removed,
//   - every internal run of whitespace becomes a single ASCII space,
//   - every other code point is converted to lower case.
// Lower-casing uses rune.to_lower(), so non-ASCII letters are folded too
// (e.g. Cyrillic or accented Latin capitals). This is a simple per-rune
// lower-case mapping; multi-character case folds such as `ẞ` -> `ss` are
// not performed.
fn normalize_label(s string) string {
	mut out := strings.new_builder(s.len)
	mut wrote_any := false // true once a non-space rune has been emitted
	mut pending_space := false // a whitespace run was seen after emitted text
	for ch in s.runes() {
		if ch in unicode_space {
			// Defer the separator: it is only written if more text follows,
			// which trims both leading and trailing whitespace.
			pending_space = wrote_any
			continue
		}
		if pending_space {
			out.write_u8(` `)
			pending_space = false
		}
		out.write_rune(ch.to_lower())
		wrote_any = true
	}
	return out.str()
}
123
// ascii_lower maps `A`..`Z` to `a`..`z`; every other byte is returned
// unchanged.
@[inline]
fn ascii_lower(c u8) u8 {
	return if c >= `A` && c <= `Z` { c + 32 } else { c }
}
132
// is_unicode_space reports whether c is one of the code points listed in
// unicode_space.
@[inline]
fn is_unicode_space(c rune) bool {
	for sp in unicode_space {
		if sp == c {
			return true
		}
	}
	return false
}
138
// is_ascii_punct reports whether c is an ASCII punctuation character, i.e.
// a code point no greater than 0x7f that appears in ascii_punct.
@[inline]
fn is_ascii_punct(c rune) bool {
	// Reject non-ASCII first so the narrowing to u8 below cannot wrap.
	if c > 0x7f {
		return false
	}
	return u8(c) in ascii_punct
}
144
// digits is the table of ASCII decimal digits `0`..`9`.
const digits = [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`]!

// is_digit reports whether c is an ASCII decimal digit (`0`..`9`).
@[inline]
fn is_digit(c u8) bool {
	return c >= `0` && c <= `9`
}
151
// is_alpha reports whether c is an ASCII letter (`a`..`z` or `A`..`Z`).
@[inline]
fn is_alpha(c u8) bool {
	is_lower := c >= `a` && c <= `z`
	is_upper := c >= `A` && c <= `Z`
	return is_lower || is_upper
}
157
// is_alnum reports whether c is an ASCII letter or an ASCII decimal digit.
@[inline]
fn is_alnum(c u8) bool {
	if is_alpha(c) {
		return true
	}
	return is_digit(c)
}
163
// heading_id_from_text builds a slug-style id from plain text:
//   - ASCII letters and digits are kept, letters lower-cased,
//   - each run of `-`, `_` and whitespace (unicode_space) that sits between
//     two kept characters becomes a single `-`,
//   - every other character is dropped,
//   - the result never starts or ends with `-`.
// The result is empty when text contains no ASCII letter or digit.
fn heading_id_from_text(text string) string {
	mut slug := strings.new_builder(text.len)
	mut started := false // true once the first slug character is written
	mut dash_pending := false // a separator was seen after the last slug character
	for r in text.runes() {
		if r <= 0x7f && is_alnum(u8(r)) {
			// Emit the deferred separator only now that more text follows.
			if dash_pending {
				slug.write_u8(`-`)
				dash_pending = false
			}
			slug.write_u8(ascii_lower(u8(r)))
			started = true
			continue
		}
		if started && (r == `-` || r == `_` || r in unicode_space) {
			dash_pending = true
		}
		// Anything else (punctuation, non-ASCII text) is dropped.
	}
	return slug.str()
}
185
// count_leading returns the length of the run of byte c at the start of s
// (0 when s is empty or does not start with c).
@[inline]
fn count_leading(s string, c u8) int {
	for idx, b in s {
		if b != c {
			return idx
		}
	}
	// Every byte matched.
	return s.len
}
196
// expand_tabs replaces every tab in s with the number of spaces needed to
// reach the next tab stop (tab stops every 4 columns). Columns are counted
// in bytes from the start of s. A string without tabs is returned as-is.
fn expand_tabs(s string) string {
	if !s.contains('\t') {
		return s
	}
	mut sb := strings.new_builder(s.len)
	mut col := 0
	for b in s {
		if b != `\t` {
			sb.write_u8(b)
			col++
			continue
		}
		// Pad to the next multiple of 4; a tab at a tab stop yields 4 spaces.
		pad := 4 - (col % 4)
		sb.write_string(' '.repeat(pad))
		col += pad
	}
	return sb.str()
}
218
// trim_indent returns s without its leading spaces, removing at most n of
// them. Tabs are not treated as indentation. s is returned whole when n is
// zero or negative.
@[inline]
fn trim_indent(s string, n int) string {
	// Never look past the end of s or beyond the n-th byte.
	limit := if n < s.len { n } else { s.len }
	mut cut := 0
	for cut < limit && s[cut] == ` ` {
		cut++
	}
	return s[cut..]
}
228
// is_blank reports whether s consists solely of spaces and tabs. The empty
// string is blank.
@[inline]
fn is_blank(s string) bool {
	for b in s {
		if b == ` ` || b == `\t` {
			continue
		}
		return false
	}
	return true
}
239
// leading_spaces returns how many space characters s starts with. Tabs are
// not counted and end the run.
@[inline]
fn leading_spaces(s string) int {
	// Whatever trim_left strips from the front is exactly the leading run.
	return s.len - s.trim_left(' ').len
}
249
// smart_punctuate applies typographic substitutions to s:
//   `---` -> em dash (U+2014), `--` -> en dash (U+2013),
//   `...` -> ellipsis (U+2026), `'` and `"` -> curly quotes.
// A quote becomes an opening quote when it is the first byte of s or when
// the character before it is whitespace or ASCII punctuation; otherwise it
// becomes a closing quote. All other bytes are copied through unchanged.
fn smart_punctuate(s string) string {
	src := s.bytes()
	mut out := strings.new_builder(s.len)
	mut j := 0
	for j < src.len {
		c := src[j]
		if c == `-` {
			// Longest match first so `---` is not read as `--` followed by `-`.
			if j + 2 < src.len && src[j + 1] == `-` && src[j + 2] == `-` {
				out.write_string('\u2014') // em dash
				j += 3
				continue
			}
			if j + 1 < src.len && src[j + 1] == `-` {
				out.write_string('\u2013') // en dash
				j += 2
				continue
			}
		} else if c == `.` {
			if j + 2 < src.len && src[j + 1] == `.` && src[j + 2] == `.` {
				out.write_string('\u2026') // ellipsis
				j += 3
				continue
			}
		} else if c == `'` {
			if smart_quote_opens(src, j) {
				out.write_string('\u2018') // left single quote
			} else {
				out.write_string('\u2019') // right single quote
			}
			j++
			continue
		} else if c == `"` {
			if smart_quote_opens(src, j) {
				out.write_string('\u201C') // left double quote
			} else {
				out.write_string('\u201D') // right double quote
			}
			j++
			continue
		}
		// No substitution applied (including a lone `-` or `.`): copy the byte.
		out.write_u8(c)
		j++
	}
	return out.str()
}

// smart_quote_opens reports whether a quote at byte offset pos of src should
// be rendered as an opening quote: at the start of the text, or directly
// after whitespace or ASCII punctuation.
fn smart_quote_opens(src []u8, pos int) bool {
	if pos == 0 {
		return true
	}
	// Decode the whole preceding character. Looking only at the previous
	// byte would mistake the trailing byte of a multi-byte character (e.g.
	// 0xA0 in `à`, 0x85 in `Å`) for U+00A0 / U+0085 whitespace.
	prev := utf8_rune_before(src, pos)
	return is_unicode_space(prev) || is_ascii_punct(prev)
}

// utf8_rune_before decodes the UTF-8 encoded code point that ends right
// before byte offset pos of src (pos must be > 0). It returns U+FFFD when
// the preceding bytes do not form a well-shaped UTF-8 sequence.
fn utf8_rune_before(src []u8, pos int) rune {
	// Step back over at most three continuation bytes (10xxxxxx) to reach
	// the lead byte of the sequence.
	mut start := pos - 1
	for start > 0 && pos - start < 4 && (src[start] & 0xC0) == 0x80 {
		start--
	}
	lead := src[start]
	n := pos - start
	if n == 1 {
		if lead < 0x80 {
			return rune(lead)
		}
		// A lone lead or continuation byte is not a character.
		return rune(0xFFFD)
	}
	// The lead byte must announce exactly n bytes; keep its payload bits.
	mut cp := rune(0)
	if n == 2 && (lead & 0xE0) == 0xC0 {
		cp = rune(lead & 0x1F)
	} else if n == 3 && (lead & 0xF0) == 0xE0 {
		cp = rune(lead & 0x0F)
	} else if n == 4 && (lead & 0xF8) == 0xF0 {
		cp = rune(lead & 0x07)
	} else {
		return rune(0xFFFD)
	}
	// Each continuation byte contributes its low six bits.
	for k in start + 1 .. pos {
		cp = (cp << 6) | rune(src[k] & 0x3F)
	}
	return cp
}
298