ggdgsdbsdbbb / highlight / markdown.v
195 lines · 161 sloc · 4.09 KB · 523f5776deba67f05a71f7c2b987f9f192523bed
Raw
1module highlight
2
3import markdown
4import regex.pcre
5
// allowed_tags lists the HTML tag names that the sanitizer keeps.
// Tag names are lowercased before being looked up here, so every
// entry is written in lowercase.
const allowed_tags = [
	// document structure and block-level containers
	'a', 'abbr', 'b', 'blockquote', 'body', 'br', 'center', 'code',
	'dd', 'details', 'div', 'dl', 'dt', 'em', 'font',
	// headings
	'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
	// inline elements, lists and media
	'hr', 'i', 'img', 'kbd', 'label', 'li', 'ol', 'p', 'pre',
	'small', 'source', 'span', 'strong', 'sub', 'summary', 'sup',
	// tables
	'table', 'tbody', 'tr', 'td', 'th', 'thead',
	// remaining entries
	'ul', 'u', 'video',
]
// allowed_attributes lists the attribute names that the sanitizer keeps on
// an allowed tag. Attribute names are lowercased before being looked up here.
const allowed_attributes = [
	'align', 'color', 'controls', 'height',
	'href', 'id', 'src', 'style',
	'target', 'title', 'type', 'width',
]
// unallowed_schemas lists the prefixes that an attribute value must not
// start with; an attribute whose value starts with one of them is removed.
const unallowed_schemas = ['javascript:']
71
// convert_markdown_to_html sanitizes the markdown source `code` with
// `sanitize_markdown_code` and returns the result of rendering the
// sanitized text through `markdown.to_html`.
pub fn convert_markdown_to_html(code string) string {
	return markdown.to_html(sanitize_markdown_code(code))
}
77
// sanitize_markdown_code strips HTML comments from `code` and then
// sanitizes the HTML tags that remain, returning the cleaned text.
//
// This is a temporary solution, kept only while the markdown module does
// not support sanitizing HTML tags with DOM parsing.
pub fn sanitize_markdown_code(code string) string {
	without_comments := remove_comments(code)
	sanitized := sanitize_html_tags(without_comments)

	return sanitized
}
88
// sanitize_html_tags runs two sanitizing passes over `code`: first over
// tags that have a closing counterpart, then over tags matched without one.
// Both patterns capture, in order: tag name, attributes, tag content
// (the content group is always empty for the unpaired pattern).
fn sanitize_html_tags(code string) string {
	paired_tags_re := r'<[\s]*?([a-zA-Z0-9]+)(.*?)>(.*?)<\/\s*?[a-zA-Z0-9]+\s*?>'
	unpaired_tags_re := r'<(\w*)\s+(.*?)()>'

	after_paired_pass := sanitize_html_tags_with_re(paired_tags_re, code)
	after_unpaired_pass := sanitize_html_tags_with_re(unpaired_tags_re, after_paired_pass)

	return after_unpaired_pass
}
100
// sanitize_html_tags_with_re scans `code` for tags matched by the pattern
// `re` and returns a copy in which:
// - a match whose tag name is not in `allowed_tags` is removed entirely;
// - the attributes of an allowed tag are filtered by `sanitize_html_attributes`;
// - the content of an allowed tag is recursively filtered by `sanitize_html_tags`.
//
// `re` must capture the tag name in group 1 and the attributes in group 2;
// group 3, when present, is treated as the tag content.
// If `re` fails to compile, the error is printed and `code` is returned
// unchanged.
fn sanitize_html_tags_with_re(re string, code string) string {
	mut result := code

	tags_re := pcre.new_regex(re, 0) or {
		println(err)
		return result
	}

	mut last_found_index := 0

	for {
		matched := tags_re.match_str(result, last_found_index, 0) or { break }
		// No state changes when the full match cannot be read, so retrying
		// the same search would never terminate.
		tag := matched.get(0) or { break }

		matched_start_index := result.index_after_(tag, last_found_index)
		if matched_start_index < 0 {
			break
		}
		// By default resume scanning right after this match.
		last_found_index = matched_start_index + tag.len

		tag_parts := matched.get_all()
		if tag_parts.len < 3 {
			// The pattern did not provide the name/attributes groups;
			// there is nothing to inspect for this match.
			continue
		}

		tag_name := tag_parts[1].trim_space().to_lower()
		tag_attributes := tag_parts[2].trim_space()
		tag_content := if tag_parts.len > 3 { tag_parts[3].trim_space() } else { '' }

		if !allowed_tags.contains(tag_name) {
			result = result.replace(tag, '')
			// The text that followed the tag now starts where the tag was.
			last_found_index = matched_start_index
			// The tag is gone: its attributes and content must not be
			// processed, otherwise the scan position would be moved back
			// for text that no longer exists.
			continue
		}

		sanitized_attributes := sanitize_html_attributes(tag_attributes)
		sanitized_content := sanitize_html_tags(tag_content)

		// Apply both rewrites to one copy of the tag so that the content fix
		// is not lost when the attributes were changed as well.
		mut sanitized_tag := tag
		if sanitized_attributes != tag_attributes {
			sanitized_tag = sanitized_tag.replace(tag_attributes, sanitized_attributes)
		}
		if sanitized_content != tag_content {
			sanitized_tag = sanitized_tag.replace(tag_content, sanitized_content)
		}

		if sanitized_tag != tag {
			result = result.replace(tag, sanitized_tag)
			last_found_index = matched_start_index + sanitized_tag.len
		}
	}

	return result
}
150
// sanitize_html_attributes returns `attributes` with every `name="value"`
// pair removed when its name is not in `allowed_attributes` or its value
// starts with one of the `unallowed_schemas` (compared case-insensitively).
//
// Only double-quoted values are recognised by the pattern used here.
// If the pattern fails to compile, the error is printed and `attributes`
// is returned unchanged.
fn sanitize_html_attributes(attributes string) string {
	mut result := attributes

	attributes_query := r'(\w+).*?=["](.*?)["]'
	attributes_re := pcre.new_regex(attributes_query, 0) or {
		println(err)
		return result
	}

	mut last_found_index := 0

	for {
		matched := attributes_re.match_str(result, last_found_index, 0) or { break }
		attribute := matched.get(0) or { break }

		matched_start_index := result.index_after_(attribute, last_found_index)
		if matched_start_index < 0 {
			break
		}
		// `matched_start_index` is already an absolute offset into `result`,
		// so the scan position is assigned, not accumulated.
		last_found_index = matched_start_index + attribute.len

		attribute_parts := matched.get_all()
		if attribute_parts.len < 3 {
			continue
		}

		attribute_name := attribute_parts[1].trim_space().to_lower()
		// Lowercased only for the scheme check, so that e.g. `JavaScript:`
		// is rejected just like `javascript:`.
		attribute_value := attribute_parts[2].trim_space().to_lower()

		is_allowed_attribute := allowed_attributes.contains(attribute_name)
		is_unallowed_schemas := unallowed_schemas.any(attribute_value.starts_with(it))

		if !is_allowed_attribute || is_unallowed_schemas {
			result = result.replace(attribute, '')
			// The text that followed the attribute now starts where it was.
			last_found_index = matched_start_index
		}
	}

	return result
}
184
// remove_comments returns `code` with every `<!-- ... -->` HTML comment cut
// out. When a comment is opened but never closed, everything from the
// opening marker to the end of the text is dropped.
fn remove_comments(code string) string {
	comment_open := '<!--'
	comment_close := '-->'
	mut stripped := code

	for {
		open_at := stripped.index(comment_open) or { break }
		close_at := stripped.index_after(comment_close, open_at + comment_open.len) or {
			return stripped[..open_at]
		}

		head := stripped[..open_at]
		tail := stripped[close_at + comment_close.len..]
		stripped = head + tail
	}

	return stripped
}
196