Gitly


1 // Copyright 2026 The V Language. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module markdown
5 
6 import strings
7 
// unicode_space is the fixed table of code points this module treats as
// whitespace. It holds the ASCII space and control characters, the Unicode
// space and line/paragraph separators, and the zero-width / joiner code
// points listed below.
// vfmt off
const unicode_space = [
	` `,    // U+0020 space
	`\t`,   // U+0009 tab
	0x000A, // U+000A line feed
	0x000B, // U+000B vertical tab
	0x000C, // U+000C form feed
	0x000D, // U+000D carriage return
	0x0085, // U+0085 next line
	0x00A0, // U+00A0 no-break space
	0x1680, // U+1680 ogham space mark
	0x180E, // U+180E mongolian vowel separator
	0x2000, // U+2000 en quad
	0x2001, // U+2001 em quad
	0x2002, // U+2002 en space
	0x2003, // U+2003 em space
	0x2004, // U+2004 three-per-em space
	0x2005, // U+2005 four-per-em space
	0x2006, // U+2006 six-per-em space
	0x2007, // U+2007 figure space
	0x2008, // U+2008 punctuation space
	0x2009, // U+2009 thin space
	0x200A, // U+200A hair space
	0x200B, // U+200B zero width space
	0x200C, // U+200C zero width non-joiner
	0x200D, // U+200D zero width joiner
	0x2028, // U+2028 line separator
	0x2029, // U+2029 paragraph separator
	0x202F, // U+202F narrow no-break space
	0x205F, // U+205F medium mathematical space
	0x2060, // U+2060 word joiner
	0x3000, // U+3000 ideographic space
	0xFEFF, // U+FEFF zero width no-break space
]!
43 
// ascii_punct is the table of the 32 ASCII punctuation characters, in
// ascending code order.
const ascii_punct = [
	`!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`,
	`)`, `*`, `+`, `,`, `-`, `.`, `/`, `:`,
	`;`, `<`, `=`, `>`, `?`, `@`, `[`, `\\`,
	`]`, `^`, `_`, `\``, `{`, `|`, `}`, `~`,
]!
49 
// alpha is the table of ASCII letters: lower case a-z followed by upper
// case A-Z.
const alpha = [
	`a`, `b`, `c`, `d`, `e`, `f`, `g`, `h`, `i`, `j`, `k`, `l`, `m`, `n`, `o`, `p`, `q`, `r`, `s`,
	`t`, `u`, `v`, `w`, `x`, `y`, `z`,
	`A`, `B`, `C`, `D`, `E`, `F`, `G`, `H`, `I`, `J`, `K`, `L`, `M`, `N`, `O`, `P`, `Q`, `R`, `S`,
	`T`, `U`, `V`, `W`, `X`, `Y`, `Z`,
]!
57 // vfmt on
58 
// html_escape returns s with the HTML-significant characters `&`, `<`, `>`
// and `"` replaced by the entities `&amp;`, `&lt;`, `&gt;` and `&quot;`.
// A string that contains none of these characters is returned as is, without
// building a copy.
fn html_escape(s string) string {
	if s.index_any('&<>"') == -1 {
		return s
	}
	// Every replacement is longer than the byte it replaces, so reserve a
	// little headroom up front.
	mut sb := strings.new_builder(s.len + 8)
	for c in s {
		match c {
			`&` { sb.write_string('&amp;') }
			`<` { sb.write_string('&lt;') }
			`>` { sb.write_string('&gt;') }
			`"` { sb.write_string('&quot;') }
			else { sb.write_u8(c) }
		}
	}
	return sb.str()
}
76 
// url_safe_chars is the set of bytes that url_encode copies through verbatim.
const url_safe_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~:/?#[]@!$&()*+,;=%'

// url_encode returns s with every byte that is not listed in url_safe_chars
// replaced by `%XX`, where XX is the byte value as two upper-case hex digits.
// `%` is itself in the safe set, so escape sequences already present in s are
// not encoded a second time.
fn url_encode(s string) string {
	mut encoded := strings.new_builder(s.len)
	for b in s {
		if url_safe_chars.index_u8(b) < 0 {
			encoded.write_string('%${b:02X}')
			continue
		}
		encoded.write_u8(b)
	}
	return encoded.str()
}
94 
// normalize_label normalises a link reference label:
//   - whitespace (any rune in unicode_space) at the start and end is dropped,
//   - every internal whitespace run becomes one ASCII space,
//   - ASCII letters A-Z are lower-cased; all other runes are copied unchanged.
fn normalize_label(s string) string {
	mut sb := strings.new_builder(s.len)
	mut wrote_any := false
	mut gap := false // whitespace was seen since the last rune written
	for r in s.runes() {
		if r in unicode_space {
			gap = true
			continue
		}
		// Emit the separator lazily, only once another rune follows, so no
		// leading or trailing space is ever produced.
		if gap && wrote_any {
			sb.write_u8(` `)
		}
		gap = false
		wrote_any = true
		if r >= `A` && r <= `Z` {
			sb.write_u8(u8(r + 32))
		} else {
			sb.write_string(r.str())
		}
	}
	return sb.str()
}
123 
// ascii_lower maps the ASCII letters A-Z to a-z and returns every other byte
// unchanged.
@[inline]
fn ascii_lower(c u8) u8 {
	return if c >= `A` && c <= `Z` { c + 32 } else { c }
}
132 
// is_unicode_space reports whether c is one of the code points listed in
// unicode_space.
@[inline]
fn is_unicode_space(c rune) bool {
	for sp in unicode_space {
		if sp == c {
			return true
		}
	}
	return false
}
138 
// is_ascii_punct reports whether c is one of the characters listed in
// ascii_punct. Code points above 0x7f are never punctuation here.
@[inline]
fn is_ascii_punct(c rune) bool {
	if c > 0x7f {
		return false
	}
	return u8(c) in ascii_punct
}
144 
// digits lists the ASCII decimal digits.
const digits = [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`]!

// is_digit reports whether c is one of the ASCII decimal digits `0` to `9`.
@[inline]
fn is_digit(c u8) bool {
	return c >= `0` && c <= `9`
}
151 
// is_alpha reports whether c is an ASCII letter (a-z or A-Z).
@[inline]
fn is_alpha(c u8) bool {
	return (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`)
}
157 
// is_alnum reports whether c is an ASCII letter or an ASCII decimal digit.
@[inline]
fn is_alnum(c u8) bool {
	if is_alpha(c) {
		return true
	}
	return is_digit(c)
}
163 
// heading_id_from_text builds a slug-style id attribute from plain text,
// intended to match goldmark's AutoHeadingID output:
//   - ASCII letters and digits are kept, letters lower-cased,
//   - each run of `-`, `_` and whitespace (unicode_space) between kept
//     characters becomes a single `-`,
//   - every other character is dropped,
//   - the result never starts or ends with `-`.
fn heading_id_from_text(text string) string {
	mut slug := strings.new_builder(text.len)
	mut started := false
	mut sep_pending := false // a separator was seen since the last kept character
	for r in text.runes() {
		if r <= 0x7f && is_alnum(u8(r)) {
			// The dash is written lazily so leading and trailing separators
			// never reach the output.
			if sep_pending && started {
				slug.write_u8(`-`)
			}
			slug.write_u8(ascii_lower(u8(r)))
			started = true
			sep_pending = false
		} else if r == `-` || r == `_` || r in unicode_space {
			sep_pending = true
		}
		// Anything else (punctuation, non-ASCII text) is dropped.
	}
	return slug.str()
}
185 
// count_leading returns the length of the run of byte c at the start of s
// (0 when s is empty or does not begin with c).
@[inline]
fn count_leading(s string, c u8) int {
	for idx, b in s {
		if b != c {
			return idx
		}
	}
	return s.len
}
196 
// expand_tabs replaces every tab in s (not only leading ones) with the number
// of spaces needed to reach the next tab stop; tab stops are 4 columns apart.
// Columns are counted in code points, so a multi-byte UTF-8 character before
// a tab advances the column by one, not by its byte length.
// A string without tabs is returned as is.
fn expand_tabs(s string) string {
	if !s.contains('\t') {
		return s
	}
	mut sb := strings.new_builder(s.len)
	mut col := 0
	for b in s {
		if b == `\t` {
			pad := 4 - (col % 4)
			for _ in 0 .. pad {
				sb.write_u8(` `)
			}
			col += pad
			continue
		}
		sb.write_u8(b)
		// UTF-8 continuation bytes (0b10xxxxxx) belong to the code point that
		// was already counted when its lead byte was seen.
		if (b & 0xC0) != 0x80 {
			col++
		}
	}
	return sb.str()
}
218 
// trim_indent returns s without its leading spaces, removing at most n of
// them. Tabs and other characters stop the trimming.
@[inline]
fn trim_indent(s string, n int) string {
	mut cut := 0
	for cut < s.len && cut < n {
		if s[cut] != ` ` {
			break
		}
		cut++
	}
	return s[cut..]
}
228 
// is_blank reports whether s consists solely of spaces and tabs. The empty
// string is blank.
@[inline]
fn is_blank(s string) bool {
	for b in s {
		if b == ` ` || b == `\t` {
			continue
		}
		return false
	}
	return true
}
239 
// leading_spaces returns the number of space characters at the start of s.
// Tabs are not counted and end the run. It delegates to count_leading so the
// scan lives in one place.
@[inline]
fn leading_spaces(s string) int {
	return count_leading(s, ` `)
}
249 
// smart_punctuate applies typographic substitutions to s:
//   `---` → em dash, `--` → en dash, `...` → ellipsis,
//   `'` and `"` → curly single / double quotes.
// Dashes are matched longest first, so `---` never becomes an en dash plus a
// hyphen. The text is scanned rune by rune, so the character before a quote
// is classified as a whole code point rather than as the last byte of its
// UTF-8 encoding. s is assumed to be valid UTF-8.
fn smart_punctuate(s string) string {
	src := s.runes()
	mut out := strings.new_builder(s.len)
	mut j := 0
	for j < src.len {
		c := src[j]
		if c == `-` {
			if j + 2 < src.len && src[j + 1] == `-` && src[j + 2] == `-` {
				out.write_string('\u2014') // em dash
				j += 3
				continue
			} else if j + 1 < src.len && src[j + 1] == `-` {
				out.write_string('\u2013') // en dash
				j += 2
				continue
			}
		} else if c == `.` {
			if j + 2 < src.len && src[j + 1] == `.` && src[j + 2] == `.` {
				out.write_string('\u2026') // ellipsis
				j += 3
				continue
			}
		} else if c == `'` {
			if smart_quote_opens(src, j) {
				out.write_string('\u2018') // left single quote
			} else {
				out.write_string('\u2019') // right single quote
			}
			j++
			continue
		} else if c == `"` {
			if smart_quote_opens(src, j) {
				out.write_string('\u201C') // left double quote
			} else {
				out.write_string('\u201D') // right double quote
			}
			j++
			continue
		}
		// Lone hyphens, lone dots and every other rune pass through unchanged.
		out.write_rune(c)
		j++
	}
	return out.str()
}

// smart_quote_opens reports whether the straight quote at src[j] should be
// rendered as an opening (left) quote. It is a heuristic: a quote opens at the
// start of the text, after whitespace, or after ASCII punctuation when a
// non-space, non-punctuation character follows.
fn smart_quote_opens(src []rune, j int) bool {
	if j == 0 {
		return true
	}
	prev := src[j - 1]
	if is_unicode_space(prev) {
		return true
	}
	if !is_ascii_punct(prev) {
		// Directly after a letter or digit: closing quote or apostrophe.
		return false
	}
	// After punctuation, look ahead: `("word` opens, while the quote in
	// `end."` or `end.")` closes.
	if j + 1 >= src.len {
		return false
	}
	next := src[j + 1]
	return !is_unicode_space(next) && !is_ascii_punct(next)
}
298

1	// Copyright 2026 The V Language. All rights reserved.
2	// Use of this source code is governed by an MIT license
3	// that can be found in the LICENSE file.
4	module markdown
5
6	import strings
7
// unicode_space is the fixed table of code points this module treats as
// whitespace. It holds the ASCII space and control characters, the Unicode
// space and line/paragraph separators, and the zero-width / joiner code
// points listed below.
// vfmt off
const unicode_space = [
	` `,    // U+0020 space
	`\t`,   // U+0009 tab
	0x000A, // U+000A line feed
	0x000B, // U+000B vertical tab
	0x000C, // U+000C form feed
	0x000D, // U+000D carriage return
	0x0085, // U+0085 next line
	0x00A0, // U+00A0 no-break space
	0x1680, // U+1680 ogham space mark
	0x180E, // U+180E mongolian vowel separator
	0x2000, // U+2000 en quad
	0x2001, // U+2001 em quad
	0x2002, // U+2002 en space
	0x2003, // U+2003 em space
	0x2004, // U+2004 three-per-em space
	0x2005, // U+2005 four-per-em space
	0x2006, // U+2006 six-per-em space
	0x2007, // U+2007 figure space
	0x2008, // U+2008 punctuation space
	0x2009, // U+2009 thin space
	0x200A, // U+200A hair space
	0x200B, // U+200B zero width space
	0x200C, // U+200C zero width non-joiner
	0x200D, // U+200D zero width joiner
	0x2028, // U+2028 line separator
	0x2029, // U+2029 paragraph separator
	0x202F, // U+202F narrow no-break space
	0x205F, // U+205F medium mathematical space
	0x2060, // U+2060 word joiner
	0x3000, // U+3000 ideographic space
	0xFEFF, // U+FEFF zero width no-break space
]!
43
// ascii_punct is the table of the 32 ASCII punctuation characters, in
// ascending code order.
const ascii_punct = [
	`!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`,
	`)`, `*`, `+`, `,`, `-`, `.`, `/`, `:`,
	`;`, `<`, `=`, `>`, `?`, `@`, `[`, `\\`,
	`]`, `^`, `_`, `\``, `{`, `|`, `}`, `~`,
]!
49
// alpha is the table of ASCII letters: lower case a-z followed by upper
// case A-Z.
const alpha = [
	`a`, `b`, `c`, `d`, `e`, `f`, `g`, `h`, `i`, `j`, `k`, `l`, `m`, `n`, `o`, `p`, `q`, `r`, `s`,
	`t`, `u`, `v`, `w`, `x`, `y`, `z`,
	`A`, `B`, `C`, `D`, `E`, `F`, `G`, `H`, `I`, `J`, `K`, `L`, `M`, `N`, `O`, `P`, `Q`, `R`, `S`,
	`T`, `U`, `V`, `W`, `X`, `Y`, `Z`,
]!
57	// vfmt on
58
// html_escape returns s with the HTML-significant characters `&`, `<`, `>`
// and `"` replaced by the entities `&amp;`, `&lt;`, `&gt;` and `&quot;`.
// A string that contains none of these characters is returned as is, without
// building a copy.
fn html_escape(s string) string {
	if s.index_any('&<>"') == -1 {
		return s
	}
	// Every replacement is longer than the byte it replaces, so reserve a
	// little headroom up front.
	mut sb := strings.new_builder(s.len + 8)
	for c in s {
		match c {
			`&` { sb.write_string('&amp;') }
			`<` { sb.write_string('&lt;') }
			`>` { sb.write_string('&gt;') }
			`"` { sb.write_string('&quot;') }
			else { sb.write_u8(c) }
		}
	}
	return sb.str()
}
76
// url_safe_chars is the set of bytes that url_encode copies through verbatim.
const url_safe_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~:/?#[]@!$&()*+,;=%'

// url_encode returns s with every byte that is not listed in url_safe_chars
// replaced by `%XX`, where XX is the byte value as two upper-case hex digits.
// `%` is itself in the safe set, so escape sequences already present in s are
// not encoded a second time.
fn url_encode(s string) string {
	mut encoded := strings.new_builder(s.len)
	for b in s {
		if url_safe_chars.index_u8(b) < 0 {
			encoded.write_string('%${b:02X}')
			continue
		}
		encoded.write_u8(b)
	}
	return encoded.str()
}
94
// normalize_label normalises a link reference label:
//   - whitespace (any rune in unicode_space) at the start and end is dropped,
//   - every internal whitespace run becomes one ASCII space,
//   - ASCII letters A-Z are lower-cased; all other runes are copied unchanged.
fn normalize_label(s string) string {
	mut sb := strings.new_builder(s.len)
	mut wrote_any := false
	mut gap := false // whitespace was seen since the last rune written
	for r in s.runes() {
		if r in unicode_space {
			gap = true
			continue
		}
		// Emit the separator lazily, only once another rune follows, so no
		// leading or trailing space is ever produced.
		if gap && wrote_any {
			sb.write_u8(` `)
		}
		gap = false
		wrote_any = true
		if r >= `A` && r <= `Z` {
			sb.write_u8(u8(r + 32))
		} else {
			sb.write_string(r.str())
		}
	}
	return sb.str()
}
123
// ascii_lower maps the ASCII letters A-Z to a-z and returns every other byte
// unchanged.
@[inline]
fn ascii_lower(c u8) u8 {
	return if c >= `A` && c <= `Z` { c + 32 } else { c }
}
132
// is_unicode_space reports whether c is one of the code points listed in
// unicode_space.
@[inline]
fn is_unicode_space(c rune) bool {
	for sp in unicode_space {
		if sp == c {
			return true
		}
	}
	return false
}
138
// is_ascii_punct reports whether c is one of the characters listed in
// ascii_punct. Code points above 0x7f are never punctuation here.
@[inline]
fn is_ascii_punct(c rune) bool {
	if c > 0x7f {
		return false
	}
	return u8(c) in ascii_punct
}
144
// digits lists the ASCII decimal digits.
const digits = [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`]!

// is_digit reports whether c is one of the ASCII decimal digits `0` to `9`.
@[inline]
fn is_digit(c u8) bool {
	return c >= `0` && c <= `9`
}
151
// is_alpha reports whether c is an ASCII letter (a-z or A-Z).
@[inline]
fn is_alpha(c u8) bool {
	return (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`)
}
157
// is_alnum reports whether c is an ASCII letter or an ASCII decimal digit.
@[inline]
fn is_alnum(c u8) bool {
	if is_alpha(c) {
		return true
	}
	return is_digit(c)
}
163
// heading_id_from_text builds a slug-style id attribute from plain text,
// intended to match goldmark's AutoHeadingID output:
//   - ASCII letters and digits are kept, letters lower-cased,
//   - each run of `-`, `_` and whitespace (unicode_space) between kept
//     characters becomes a single `-`,
//   - every other character is dropped,
//   - the result never starts or ends with `-`.
fn heading_id_from_text(text string) string {
	mut slug := strings.new_builder(text.len)
	mut started := false
	mut sep_pending := false // a separator was seen since the last kept character
	for r in text.runes() {
		if r <= 0x7f && is_alnum(u8(r)) {
			// The dash is written lazily so leading and trailing separators
			// never reach the output.
			if sep_pending && started {
				slug.write_u8(`-`)
			}
			slug.write_u8(ascii_lower(u8(r)))
			started = true
			sep_pending = false
		} else if r == `-` || r == `_` || r in unicode_space {
			sep_pending = true
		}
		// Anything else (punctuation, non-ASCII text) is dropped.
	}
	return slug.str()
}
185
// count_leading returns the length of the run of byte c at the start of s
// (0 when s is empty or does not begin with c).
@[inline]
fn count_leading(s string, c u8) int {
	for idx, b in s {
		if b != c {
			return idx
		}
	}
	return s.len
}
196
// expand_tabs replaces every tab in s (not only leading ones) with the number
// of spaces needed to reach the next tab stop; tab stops are 4 columns apart.
// Columns are counted in code points, so a multi-byte UTF-8 character before
// a tab advances the column by one, not by its byte length.
// A string without tabs is returned as is.
fn expand_tabs(s string) string {
	if !s.contains('\t') {
		return s
	}
	mut sb := strings.new_builder(s.len)
	mut col := 0
	for b in s {
		if b == `\t` {
			pad := 4 - (col % 4)
			for _ in 0 .. pad {
				sb.write_u8(` `)
			}
			col += pad
			continue
		}
		sb.write_u8(b)
		// UTF-8 continuation bytes (0b10xxxxxx) belong to the code point that
		// was already counted when its lead byte was seen.
		if (b & 0xC0) != 0x80 {
			col++
		}
	}
	return sb.str()
}
218
// trim_indent returns s without its leading spaces, removing at most n of
// them. Tabs and other characters stop the trimming.
@[inline]
fn trim_indent(s string, n int) string {
	mut cut := 0
	for cut < s.len && cut < n {
		if s[cut] != ` ` {
			break
		}
		cut++
	}
	return s[cut..]
}
228
// is_blank reports whether s consists solely of spaces and tabs. The empty
// string is blank.
@[inline]
fn is_blank(s string) bool {
	for b in s {
		if b == ` ` || b == `\t` {
			continue
		}
		return false
	}
	return true
}
239
// leading_spaces returns the number of space characters at the start of s.
// Tabs are not counted and end the run. It delegates to count_leading so the
// scan lives in one place.
@[inline]
fn leading_spaces(s string) int {
	return count_leading(s, ` `)
}
249
// smart_punctuate applies typographic substitutions to s:
//   `---` → em dash, `--` → en dash, `...` → ellipsis,
//   `'` and `"` → curly single / double quotes.
// Dashes are matched longest first, so `---` never becomes an en dash plus a
// hyphen. The text is scanned rune by rune, so the character before a quote
// is classified as a whole code point rather than as the last byte of its
// UTF-8 encoding. s is assumed to be valid UTF-8.
fn smart_punctuate(s string) string {
	src := s.runes()
	mut out := strings.new_builder(s.len)
	mut j := 0
	for j < src.len {
		c := src[j]
		if c == `-` {
			if j + 2 < src.len && src[j + 1] == `-` && src[j + 2] == `-` {
				out.write_string('\u2014') // em dash
				j += 3
				continue
			} else if j + 1 < src.len && src[j + 1] == `-` {
				out.write_string('\u2013') // en dash
				j += 2
				continue
			}
		} else if c == `.` {
			if j + 2 < src.len && src[j + 1] == `.` && src[j + 2] == `.` {
				out.write_string('\u2026') // ellipsis
				j += 3
				continue
			}
		} else if c == `'` {
			if smart_quote_opens(src, j) {
				out.write_string('\u2018') // left single quote
			} else {
				out.write_string('\u2019') // right single quote
			}
			j++
			continue
		} else if c == `"` {
			if smart_quote_opens(src, j) {
				out.write_string('\u201C') // left double quote
			} else {
				out.write_string('\u201D') // right double quote
			}
			j++
			continue
		}
		// Lone hyphens, lone dots and every other rune pass through unchanged.
		out.write_rune(c)
		j++
	}
	return out.str()
}

// smart_quote_opens reports whether the straight quote at src[j] should be
// rendered as an opening (left) quote. It is a heuristic: a quote opens at the
// start of the text, after whitespace, or after ASCII punctuation when a
// non-space, non-punctuation character follows.
fn smart_quote_opens(src []rune, j int) bool {
	if j == 0 {
		return true
	}
	prev := src[j - 1]
	if is_unicode_space(prev) {
		return true
	}
	if !is_ascii_punct(prev) {
		// Directly after a letter or digit: closing quote or apostrophe.
		return false
	}
	// After punctuation, look ahead: `("word` opens, while the quote in
	// `end."` or `end.")` closes.
	if j + 1 >= src.len {
		return false
	}
	next := src[j + 1]
	return !is_unicode_space(next) && !is_ascii_punct(next)
}
298