| 1 | module highlight |
| 2 | |
| 3 | import markdown |
| 4 | import regex.pcre |
| 5 | |
// allowed_tags lists the lower-case HTML tag names that are kept by the
// sanitizer; a matched tag whose name is not in this list is removed.
const allowed_tags = [
	'a', 'abbr', 'b', 'blockquote', 'body', 'br', 'center', 'code',
	'dd', 'details', 'div', 'dl', 'dt', 'em', 'font',
	'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
	'i', 'img', 'kbd', 'label', 'li', 'ol', 'p', 'pre',
	'small', 'source', 'span', 'strong', 'sub', 'summary', 'sup',
	'table', 'tbody', 'tr', 'td', 'th', 'thead',
	'ul', 'u', 'video',
]
// allowed_attributes lists the lower-case attribute names that may stay on a
// sanitized tag; any other matched attribute is stripped.
const allowed_attributes = [
	'align', 'color', 'controls', 'height',
	'href', 'id', 'src', 'style',
	'target', 'title', 'type', 'width',
]
// unallowed_schemas lists value prefixes that cause an attribute to be
// stripped even when its name is allowed.
const unallowed_schemas = ['javascript:']
| 71 | |
// convert_markdown_to_html sanitizes the raw markdown in `code` and returns
// the HTML produced by `markdown.to_html` for the sanitized text.
pub fn convert_markdown_to_html(code string) string {
	return markdown.to_html(sanitize_markdown_code(code))
}
| 77 | |
// sanitize_markdown_code strips HTML comments from `code` and then sanitizes
// the HTML tags that remain, returning the cleaned text.
//
// This is a temporary solution until the markdown module supports
// sanitizing HTML tags with DOM parsing.
pub fn sanitize_markdown_code(code string) string {
	without_comments := remove_comments(code)

	return sanitize_html_tags(without_comments)
}
| 88 | |
// sanitize_html_tags runs the tag sanitizer over `code` twice: first with the
// pattern for paired tags (opening tag, content, closing tag), then with the
// pattern for a lone tag that carries attributes.
fn sanitize_html_tags(code string) string {
	// Capture groups in both patterns: tag name, attributes, tag content
	// (the content group is empty in the unpaired pattern).
	tag_patterns := [
		r'<[\s]*?([a-zA-Z0-9]+)(.*?)>(.*?)<\/\s*?[a-zA-Z0-9]+\s*?>',
		r'<(\w*)\s+(.*?)()>',
	]

	mut sanitized := code
	for pattern in tag_patterns {
		sanitized = sanitize_html_tags_with_re(pattern, sanitized)
	}

	return sanitized
}
| 100 | |
// sanitize_html_tags_with_re scans `code` for matches of the pattern `re` and
// sanitizes every match in turn:
//   * a match whose tag name is not in `allowed_tags` is removed entirely;
//   * otherwise its attributes go through `sanitize_html_attributes` and its
//     content (third capture group, when present) goes through
//     `sanitize_html_tags`.
//
// `re` is expected to capture the tag name, the attributes and (optionally)
// the tag content, in that order. If the pattern fails to compile, the error
// is printed and `code` is returned unchanged.
//
// NOTE(review): this is regex-based filtering, not HTML parsing; tags that
// neither pattern matches are left untouched.
fn sanitize_html_tags_with_re(re string, code string) string {
	mut result := code

	tags_re := pcre.new_regex(re, 0) or {
		println(err)
		return result
	}

	mut last_found_index := 0

	for {
		matched := tags_re.match_str(result, last_found_index, 0) or { break }
		// Nothing advances the search offset on this path, so retrying would
		// repeat the same match forever; stop scanning instead.
		tag := matched.get(0) or { break }

		matched_start_index := result.index_after_(tag, last_found_index)
		if matched_start_index < 0 {
			break
		}
		matched_end_index := matched_start_index + tag.len

		tag_parts := matched.get_all()
		tag_name := tag_parts[1].trim_space().to_lower()
		tag_attributes := tag_parts[2].trim_space()
		tag_content := if tag_parts.len > 3 { tag_parts[3].trim_space() } else { '' }

		if !allowed_tags.contains(tag_name) {
			// Cut out exactly the matched occurrence and resume at the same
			// offset. Skipping the attribute/content steps matters: they
			// would otherwise adjust the offset for text that no longer
			// exists and could move it before the removal point.
			result = result[..matched_start_index] + result[matched_end_index..]
			last_found_index = matched_start_index
			continue
		}

		// Build the replacement on a local copy so the attribute and content
		// rewrites compose; rewriting `result` after each step would leave
		// the second step looking for the original, already replaced text.
		mut sanitized_tag := tag

		// Content first: it is still present verbatim in `sanitized_tag`.
		sanitized_content := sanitize_html_tags(tag_content)
		if sanitized_content != tag_content {
			sanitized_tag = sanitized_tag.replace(tag_content, sanitized_content)
		}

		sanitized_attributes := sanitize_html_attributes(tag_attributes)
		if sanitized_attributes != tag_attributes {
			sanitized_tag = sanitized_tag.replace(tag_attributes, sanitized_attributes)
		}

		if sanitized_tag != tag {
			result = result[..matched_start_index] + sanitized_tag + result[matched_end_index..]
		}

		// Continue right after the (possibly shortened) tag.
		last_found_index = matched_start_index + sanitized_tag.len
	}

	return result
}
| 150 | |
// sanitize_html_attributes returns `attributes` with every matched
// `name="value"` pair removed when either
//   * `name` (compared lower-case) is not in `allowed_attributes`, or
//   * `value` starts with one of `unallowed_schemas`, compared
//     case-insensitively so that e.g. `JavaScript:` is rejected as well.
//
// Only attributes with a double-quoted value are recognised by the pattern
// used here; anything else in `attributes` is passed through unchanged.
// If the pattern fails to compile, the error is printed and `attributes` is
// returned unchanged.
fn sanitize_html_attributes(attributes string) string {
	mut result := attributes

	attributes_query := r'(\w+).*?=["](.*?)["]'
	attributes_re := pcre.new_regex(attributes_query, 0) or {
		println(err)
		return result
	}

	mut last_found_index := 0

	for {
		matched := attributes_re.match_str(result, last_found_index, 0) or { break }
		attribute := matched.get(0) or { break }

		matched_start_index := result.index_after_(attribute, last_found_index)
		if matched_start_index < 0 {
			break
		}
		matched_end_index := matched_start_index + attribute.len

		attribute_parts := matched.get_all()
		attribute_name := attribute_parts[1].trim_space().to_lower()
		// Lower-cased only for the scheme comparison; the text kept in
		// `result` is never rewritten, only removed.
		attribute_value := attribute_parts[2].trim_space().to_lower()

		is_allowed_attribute := allowed_attributes.contains(attribute_name)
		is_unallowed_schemas := unallowed_schemas.any(attribute_value.starts_with(it))

		if is_allowed_attribute && !is_unallowed_schemas {
			// Keep the attribute and resume right after it. This must be an
			// assignment: accumulating onto the previous offset overshoots
			// and skips the attributes that follow.
			last_found_index = matched_end_index
			continue
		}

		// Drop exactly the matched occurrence and resume where it started.
		result = result[..matched_start_index] + result[matched_end_index..]
		last_found_index = matched_start_index
	}

	return result
}
| 184 | |
// remove_comments deletes every `<!-- ... -->` HTML comment from `code`.
// The search restarts from the beginning after each removal. When an opening
// `<!--` has no closing `-->`, everything from that opening marker onward is
// dropped.
fn remove_comments(code string) string {
	opening := '<!--'
	closing := '-->'
	mut stripped := code

	for {
		open_at := stripped.index(opening) or { break }
		close_at := stripped.index_after(closing, open_at + opening.len) or {
			return stripped[..open_at]
		}

		before_comment := stripped[..open_at]
		after_comment := stripped[close_at + closing.len..]
		stripped = before_comment + after_comment
	}

	return stripped
}
| 196 | |