module highlight

import x.markdown
import regex.pcre

// HTML tag names that are kept by the sanitizer; any other matched tag is removed.
const allowed_tags = [
	'a',
	'abbr',
	'b',
	'blockquote',
	'body',
	'br',
	'center',
	'code',
	'dd',
	'details',
	'div',
	'dl',
	'dt',
	'em',
	'font',
	'h1',
	'h2',
	'h3',
	'h4',
	'h5',
	'h6',
	'hr',
	'i',
	'img',
	'kbd',
	'label',
	'li',
	'ol',
	'p',
	'pre',
	'small',
	'source',
	'span',
	'strong',
	'sub',
	'summary',
	'sup',
	'table',
	'tbody',
	'tr',
	'td',
	'th',
	'thead',
	'ul',
	'u',
	'video',
]

// Attribute names that are kept by the sanitizer; any other matched attribute is removed.
const allowed_attributes = [
	'align',
	'color',
	'controls',
	'height',
	'href',
	'id',
	'src',
	'style',
	'target',
	'title',
	'type',
	'width',
]

// Lowercase URL scheme prefixes; an attribute whose value starts with one of
// them (compared case-insensitively) is removed.
const unallowed_schemas = [
	'javascript:',
]

// convert_markdown_to_html sanitizes `code` and renders it to HTML with the
// markdown module, stripping trailing newlines from the rendered output.
pub fn convert_markdown_to_html(code string) string {
	markdown_code := sanitize_markdown_code(code)
	return markdown.to_html(markdown_code).trim_right('\n')
}

// temporary solution while markdown module doesn't support
// sanitizing HTML tags with DOM parsing
//
// sanitize_markdown_code removes HTML comments from `code` and then removes
// disallowed HTML tags and attributes from what is left.
pub fn sanitize_markdown_code(code string) string {
	mut result := code

	result = remove_comments(result)
	result = sanitize_html_tags(result)

	return result
}

// sanitize_html_tags runs the tag sanitizer twice over `code`: first with a
// pattern for paired tags (`<x ...>...</y>`), then with a pattern for
// unpaired tags that carry attributes (`<x ...>`).
fn sanitize_html_tags(code string) string {
	mut result := code

	// tag name, attributes, tag content(optional)
	paired_tags_re := r'<[\s]*?([a-zA-Z0-9]+)(.*?)>(.*?)<\/\s*?[a-zA-Z0-9]+\s*?>'
	unpaired_tags_re := r'<(\w*)\s+(.*?)()>'

	result = sanitize_html_tags_with_re(paired_tags_re, result)
	result = sanitize_html_tags_with_re(unpaired_tags_re, result)

	return result
}

// sanitize_html_tags_with_re scans `code` for matches of the pattern `re` and
// sanitizes each one. The pattern must capture the tag name in group 1 and the
// attributes in group 2, and may capture the tag content in group 3.
//
// For every match:
//   - a tag whose name is not in `allowed_tags` is removed entirely;
//   - otherwise its attributes are passed through `sanitize_html_attributes`
//     and its content is recursively passed through `sanitize_html_tags`.
//
// If the pattern fails to compile the error is printed and `code` is returned
// unchanged.
fn sanitize_html_tags_with_re(re string, code string) string {
	mut result := code
	tags_re := pcre.new_regex(re, 0) or {
		println(err)
		return result
	}
	mut last_found_index := 0

	for {
		matched := tags_re.match_str(result, last_found_index, 0) or { break }
		// Nothing has advanced yet at this point, so retrying with `continue`
		// would repeat the very same match forever; stop scanning instead.
		tag := matched.get(0) or { break }
		matched_start_index := result.index_after_(tag, last_found_index)
		last_found_index = matched_start_index + tag.len
		tag_parts := matched.get_all()
		tag_name := tag_parts[1].trim_space().to_lower()
		tag_attributes := tag_parts[2].trim_space()
		tag_content := if tag_parts.len > 3 { tag_parts[3].trim_space() } else { '' }

		is_allowed_tag := allowed_tags.contains(tag_name)
		if !is_allowed_tag {
			result = result.replace(tag, '')
			// Resume where the removed tag used to start. The tag is gone, so
			// there are no attributes or content left to sanitize.
			last_found_index = matched_start_index
			continue
		}

		// `current_tag` tracks the text of this tag as it currently appears in
		// `result`, so the content rewrite below still finds it after the
		// attributes have been rewritten.
		mut current_tag := tag

		sanitized_attributes := sanitize_html_attributes(tag_attributes)
		is_attributes_length_equal := tag_attributes.len == sanitized_attributes.len
		if !is_attributes_length_equal {
			updated_tag := current_tag.replace(tag_attributes, sanitized_attributes)
			result = result.replace(current_tag, updated_tag)
			current_tag = updated_tag
			// The tag shrank, so the resume position moves back by the same amount.
			difference := tag_attributes.len - sanitized_attributes.len
			last_found_index -= difference
		}

		sanitized_content := sanitize_html_tags(tag_content)
		is_content_length_equal := tag_content.len == sanitized_content.len
		if !is_content_length_equal {
			updated_tag := current_tag.replace(tag_content, sanitized_content)
			result = result.replace(current_tag, updated_tag)
			current_tag = updated_tag
			difference := tag_content.len - sanitized_content.len
			last_found_index -= difference
		}
	}

	return result
}

// sanitize_html_attributes removes from `attributes` every `name="value"` pair
// whose name is not in `allowed_attributes` or whose value starts with one of
// `unallowed_schemas` (compared case-insensitively). Whitespace around removed
// pairs is left in place.
//
// NOTE(review): the pattern only recognises double-quoted values, so
// single-quoted or unquoted attributes are not inspected here - confirm
// whether that is acceptable for the inputs this sanitizer receives.
fn sanitize_html_attributes(attributes string) string {
	mut result := attributes
	attributes_query := r'(\w+).*?=["](.*?)["]'
	attributes_re := pcre.new_regex(attributes_query, 0) or {
		println(err)
		return result
	}
	mut last_found_index := 0

	for {
		matched := attributes_re.match_str(result, last_found_index, 0) or { break }
		attribute := matched.get(0) or { break }
		matched_start_index := result.index_after_(attribute, last_found_index)
		// `matched_start_index` is already an absolute offset into `result`,
		// so the resume position is assigned, not accumulated.
		last_found_index = matched_start_index + attribute.len
		attribute_parts := matched.get_all()
		attribute_name := attribute_parts[1].trim_space().to_lower()
		attribute_value := attribute_parts[2].trim_space()
		// Compare schemes case-insensitively so `JavaScript:` is caught as well.
		lowered_value := attribute_value.to_lower()

		is_allowed_attribute := allowed_attributes.contains(attribute_name)
		is_unallowed_schemas := unallowed_schemas.any(lowered_value.starts_with(it))
		if !is_allowed_attribute || is_unallowed_schemas {
			result = result.replace(attribute, '')
			// Step back over the removed text so scanning resumes right after it.
			last_found_index -= attribute.len
		}
	}

	return result
}

// remove_comments strips every HTML comment (`<!-- ... -->`) from `code`.
// An opening `<!--` without a matching `-->` discards everything from the
// opening marker to the end of the input.
fn remove_comments(code string) string {
	mut result := code

	for {
		// 4 == '<!--'.len, 3 == '-->'.len
		start := result.index('<!--') or { break }
		end := result.index_after('-->', start + 4) or { return result[..start] }
		result = result[..start] + result[end + 3..]
	}

	return result
}