| 1 | module html |
| 2 | |
| 3 | import os |
| 4 | import strings |
| 5 | |
// LexicalAttributes holds the tokenizer state that `Parser.split_parse` carries
// from one byte (and from one call) to the next.
struct LexicalAttributes {
mut:
	// current_tag is the tag currently being assembled; `generate_tag` moves it into `Parser.tags`.
	current_tag &Tag = unsafe { nil }
	// open_tag is true between a `<` and the matching `>` (or the end of a comment).
	open_tag bool
	// open_code is true while the raw content of a code tag (see `code_tags`) is being read.
	open_code bool
	// open_string is 0 outside a quoted string, 1 inside a `"` string and 2 inside a `'` string.
	open_string int
	// open_comment is true from the moment the lexeme `!--` is seen until the closing `-->`.
	open_comment bool
	// is_attribute is not referenced by the parser code visible in this file.
	is_attribute bool
	// opened_code_type is the name of the code tag whose content is currently being read.
	opened_code_type string
	// line_count is incremented once per input line by `parse_html`.
	line_count int
	// outside_tag is set by `init` and cleared as soon as the first `<` is met.
	outside_tag bool
	// text_after_tag is set when non-blank text directly follows a closing tag;
	// it makes `split_parse` emit a separate `text` tag for that text.
	text_after_tag bool
	// lexeme_builder accumulates the bytes of the lexeme currently being read.
	lexeme_builder strings.Builder = strings.new_builder(100)
	// code_tags lists the tag names whose content is kept as raw text instead of being parsed.
	// `add_code_tag` adds further names.
	code_tags map[string]bool = {
		'script': true
		'style':  true
	}
}
| 24 | |
// Parser is responsible for reading the HTML strings and converting them into a `DocumentObjectModel`.
pub struct Parser {
mut:
	// dom is the document model; it is filled from `tags` by `parse_html` and `get_dom`.
	dom DocumentObjectModel
	// lexical_attributes is the tokenizer state shared by all `split_parse` calls.
	lexical_attributes LexicalAttributes = LexicalAttributes{
		current_tag: &Tag{}
	}
	// filename defaults to 'direct-parse'; it is not read by the code visible in this file.
	filename string = 'direct-parse'
	// initialized is set by `init` so that the setup runs only once.
	initialized bool
	// tags is the flat list of tags produced by the tokenizer, in input order.
	tags []&Tag
	// debug_file receives the output of `print_debug` and is handed over to `dom`.
	debug_file os.File
}
| 37 | |
// add_code_tag registers `name` as a code tag, i.e. a tag whose content is kept
// as raw text instead of being parsed.
// For example, after `add_code_tag('script')` the content of every `script` tag is
// skipped by the tokenizer: the content is still stored, but any `<` or `>` inside it
// can no longer confuse the parser.
// An empty `name` is ignored.
pub fn (mut parser Parser) add_code_tag(name string) {
	if name != '' {
		parser.lexical_attributes.code_tags[name] = true
	}
}
| 48 | |
// builder_str returns the content of the lexeme builder from position 0 onwards,
// i.e. the lexeme read so far, as a string.
@[inline]
fn (parser Parser) builder_str() string {
	return parser.lexical_attributes.lexeme_builder.after(0)
}
| 53 | |
// print_debug appends `data` as one line to the parser's debug file.
// Empty strings are skipped; a failed write panics with the reported error.
// The function is only compiled in when the `debug_html` flag is defined.
@[if debug_html ?]
fn (mut parser Parser) print_debug(data string) {
	if data.len == 0 {
		return
	}
	parser.debug_file.writeln(data) or { panic(err) }
}
| 60 | |
// verify_end_comment reports whether the lexeme read so far ends with `--`,
// which together with a following `>` terminates an HTML comment.
// When `remove` is true and the lexeme does end with `--`, those two bytes are
// dropped from the lexeme builder.
fn (mut parser Parser) verify_end_comment(remove bool) bool {
	lexeme := parser.builder_str()
	// `ends_with` is false for a lexeme shorter than two bytes, where indexing
	// the last two bytes directly would go out of range.
	is_end_comment := lexeme.ends_with('--')
	if is_end_comment && remove {
		parser.lexical_attributes.lexeme_builder.go_back(2)
	}
	return is_end_comment
}
| 71 | |
// blank_string returns true when every byte of `data` is a line feed, a tab or a space.
// An empty string is therefore blank as well.
fn blank_string(data string) bool {
	for ch in data {
		// 10 = line feed, 9 = tab, 32 = space
		if ch != 10 && ch != 9 && ch != 32 {
			return false
		}
	}
	return true
}
| 81 | |
// init sets up the parser state before the first input is processed.
// Calling it again after that has no effect.
fn (mut parser Parser) init() {
	if parser.initialized {
		return
	}
	// Start with a fresh document model that shares the parser's debug file.
	parser.dom = DocumentObjectModel{
		debug_file: parser.debug_file
		root:       &Tag{}
	}
	parser.dom.close_tags['/!document'] = true
	// `add_code_tag` ignores empty names, so this call leaves `code_tags` untouched.
	parser.add_code_tag('')
	// Reset the tokenizer: no tags collected yet, an empty tag in progress and
	// no `<` seen so far.
	parser.tags = []&Tag{}
	parser.lexical_attributes.current_tag = &Tag{}
	parser.lexical_attributes.outside_tag = true
	parser.initialized = true
}
| 98 | |
// generate_tag finishes the tag in progress and starts a new, empty one.
// Nothing happens while a tag is still open. The finished tag is appended to
// `parser.tags` only if it has a name or some content.
fn (mut parser Parser) generate_tag() {
	if parser.lexical_attributes.open_tag {
		return
	}
	finished := parser.lexical_attributes.current_tag
	if finished.name.len > 0 || finished.content.len > 0 {
		parser.tags << finished
	}
	parser.lexical_attributes.current_tag = &Tag{}
}
| 109 | |
// split_parse feeds the HTML fragment `data` through the tokenizer one byte at a time
// and appends every finished tag to `parser.tags`.
// The tokenizer state lives in `parser.lexical_attributes`, so a later call continues
// where the previous one stopped; `parse_html` relies on this by calling it once per line.
// For every byte the first matching state below decides what happens:
// code tag content, comment, quoted string, inside a tag, a `<`, or plain text.
pub fn (mut parser Parser) split_parse(data string) {
	parser.init()
	for chr in data {
		is_quote := chr == `"` || chr == `'`
		// 1 for a double quote, 2 for a single quote, 0 for any other byte.
		string_code := match chr {
			`"` { 1 }
			`'` { 2 }
			else { 0 }
		}

		if parser.lexical_attributes.open_code {
			// Raw content of a code tag: keep every byte and only look for the closing tag.
			parser.lexical_attributes.lexeme_builder.write_u8(chr)
			if parser.lexical_attributes.open_string > 0
				&& parser.lexical_attributes.open_string == string_code {
				parser.lexical_attributes.open_string = 0
			} else if chr == `>` {
				name_close_tag := '</${parser.lexical_attributes.opened_code_type}>'
				// Compare only the tail of the buffer with the closing tag. Copying and
				// lowercasing the whole buffer on every `>` would make long code blocks
				// quadratic in their length.
				buffered := parser.lexical_attributes.lexeme_builder.len
				if buffered >= name_close_tag.len {
					tail := parser.lexical_attributes.lexeme_builder.after(buffered - name_close_tag.len)
					if tail.to_lower() == name_close_tag {
						parser.lexical_attributes.open_code = false
						// Drop the closing tag from the buffer, so that what remains is the
						// code text; it becomes the tag content at the next `<`.
						parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len)
						parser.lexical_attributes.current_tag.closed = true
						parser.lexical_attributes.current_tag.close_type = .new_tag
					}
				}
			}
		} else if parser.lexical_attributes.open_comment {
			if chr == `>` && parser.verify_end_comment(false) {
				// `-->` reached: discard the comment text and leave the tag.
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
				parser.lexical_attributes.open_comment = false
				parser.lexical_attributes.open_tag = false
			} else {
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			}
		} else if parser.lexical_attributes.open_string > 0 {
			if parser.lexical_attributes.open_string == string_code {
				// Matching closing quote: the buffer now holds the quoted string.
				parser.lexical_attributes.open_string = 0
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
				temp_lexeme := parser.builder_str()
				if parser.lexical_attributes.current_tag.last_attribute != '' {
					lattr := parser.lexical_attributes.current_tag.last_attribute
					// Strip the surrounding quotes to get the attribute value.
					nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
					parser.lexical_attributes.current_tag.attributes[lattr] = nval
					if lattr == 'class' {
						// Every whitespace-separated class name is also stored in the class set.
						for class_name in nval.split_any('\t\r\n \x0D') {
							if class_name != '' {
								parser.lexical_attributes.current_tag.class_set.add(class_name)
							}
						}
					}
					parser.lexical_attributes.current_tag.last_attribute = ''
				} else {
					// No attribute name is pending: the quoted string itself (quotes included)
					// is stored as an attribute name with an empty value.
					parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = ''
				}
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
			} else {
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			}
		} else if parser.lexical_attributes.open_tag {
			if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote {
				// A quote at the start of a lexeme opens a quoted string.
				parser.lexical_attributes.open_string = string_code
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			} else if chr == `>` {
				complete_lexeme := parser.builder_str().to_lower()
				// A lexeme ending in `/` marks a self-closed tag, e.g. `<br/>`.
				parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0
					&& complete_lexeme[complete_lexeme.len - 1] == `/`)
				// A lexeme starting with `/` is a closing tag; remember its name.
				if complete_lexeme.len > 0 && complete_lexeme[0] == `/` {
					parser.dom.close_tags[complete_lexeme] = true
				}
				if parser.lexical_attributes.current_tag.name == '' {
					parser.lexical_attributes.current_tag.name = complete_lexeme
				} else if complete_lexeme != '/' {
					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
				}
				parser.lexical_attributes.open_tag = false
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
				// The content of a code tag is read verbatim until its closing tag.
				if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags {
					parser.lexical_attributes.open_code = true
					parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name
				}
			} else if chr !in [u8(9), ` `, `=`, `\n`] { // Tab, space, = and \n
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			} else if chr != 10 {
				// Tab, space or `=` ends the current lexeme: it is the tag name if the tag
				// has none yet, otherwise an attribute name.
				// NOTE(review): a `\n` is dropped without ending the lexeme, so a name that is
				// directly followed by a line break runs into the next lexeme
				// (`<div\nclass="a">` yields the name `divclass`). Left as is, because
				// changing it alters the tokenizer output for other inputs.
				complete_lexeme := parser.builder_str().to_lower()
				if parser.lexical_attributes.current_tag.name == '' {
					parser.lexical_attributes.current_tag.name = complete_lexeme
				} else {
					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
					parser.lexical_attributes.current_tag.last_attribute = ''
					// Only `=` makes the lexeme the pending name for the next quoted value.
					if chr == `=` {
						parser.lexical_attributes.current_tag.last_attribute = complete_lexeme
					}
				}
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
			}
			if parser.builder_str() == '!--' {
				parser.lexical_attributes.open_comment = true
			}
		} else if chr == `<` {
			// A new tag starts: decide who owns the text collected since the last tag.
			temp_string := parser.builder_str()
			if parser.lexical_attributes.lexeme_builder.len >= 1 {
				if parser.lexical_attributes.current_tag.name.len > 1
					&& parser.lexical_attributes.current_tag.name[0] == 47
					&& !blank_string(temp_string) {
					// Non-blank text after a closing tag (47 is `/`) gets its own `text` tag.
					parser.lexical_attributes.text_after_tag = true
				} else {
					parser.lexical_attributes.current_tag.content = temp_string // verify later who has this content
				}
			}
			parser.lexical_attributes.lexeme_builder.go_back_to(0)
			parser.generate_tag()
			parser.lexical_attributes.open_tag = true
			parser.lexical_attributes.outside_tag = false

			if parser.lexical_attributes.text_after_tag {
				parser.tags << &Tag{
					name:    'text'
					content: temp_string
				}
				parser.lexical_attributes.text_after_tag = false
			}
		} else {
			// Plain text between tags.
			parser.lexical_attributes.lexeme_builder.write_u8(chr)
		}
	}

	// No `<` has been seen so far, so `data` was text only: store it in a single
	// `text` tag, extending that tag when an earlier call already created it.
	if parser.lexical_attributes.outside_tag {
		temp_string := parser.lexical_attributes.lexeme_builder.str()

		if parser.tags.len == 0 {
			parser.tags << &Tag{
				name:    'text'
				content: temp_string
			}
		} else if parser.tags.len == 1 {
			mut tag := parser.tags.first()

			if tag.name == 'text' {
				tag.content += temp_string
			}
		}
	}
}
| 264 | |
// parse_html parses the complete HTML document `data`:
// it tokenizes the input line by line and then builds the DOM from the collected tags.
pub fn (mut parser Parser) parse_html(data string) {
	parser.init()
	rows := data.split_into_lines()
	last_index := rows.len - 1
	for i, row in rows {
		parser.lexical_attributes.line_count++
		// `split_into_lines()` removes the line breaks. They are put back on every line
		// but the last, because dropping them could break JS code or glue separate
		// pieces of text together.
		chunk := if i < last_index { row + '\n' } else { row }
		parser.split_parse(chunk)
	}
	parser.generate_tag()
	parser.dom.debug_file = parser.debug_file
	parser.dom.construct(parser.tags)
}
| 279 | |
// finalize finishes the parsing stage by calling `generate_tag`, which stores
// the tag still in progress (if it has a name or content) in the parser's tag list.
@[inline]
pub fn (mut parser Parser) finalize() {
	parser.generate_tag()
}
| 285 | |
// get_dom returns the parser's current DOM representation.
// If the DOM has not been constructed yet, the tag in progress is finished and
// the DOM is built from the collected tags first.
pub fn (mut parser Parser) get_dom() DocumentObjectModel {
	if parser.dom.constructed {
		return parser.dom
	}
	parser.generate_tag()
	parser.dom.construct(parser.tags)
	return parser.dom
}
| 294 | |