v / vlib / net / html / parser.v
293 lines · 274 sloc · 9.69 KB · 8e35f4d9848f7ad35d857a187dddbfd2eca5e19d
Raw
1module html
2
3import os
4import strings
5
// LexicalAttributes is the mutable scanner state that a `Parser` carries
// from one input byte to the next.
struct LexicalAttributes {
mut:
	// Tag that is currently being assembled.
	current_tag &Tag = unsafe { nil }
	// True from a `<` until the `>` (or the end of a comment) that closes it.
	open_tag bool
	// True while the body of a code tag is being copied verbatim.
	open_code bool
	// 0 when no quoted string is open, 1 inside `"..."`, 2 inside `'...'`.
	open_string int
	// True while inside a `<!-- ... -->` comment.
	open_comment bool
	// NOTE(review): not referenced by the parser code in this file - confirm it is still needed.
	is_attribute bool
	// Name of the code tag whose body is currently being read.
	opened_code_type string
	// Incremented once per input line by `parse_html`.
	line_count int
	// Set to true by `init`, cleared when the first `<` is seen.
	outside_tag bool
	// Set when non-blank text follows a closing tag, so a separate `text` tag gets emitted.
	text_after_tag bool
	// Collects the bytes of the lexeme that is being scanned.
	lexeme_builder strings.Builder = strings.new_builder(100)
	// Names of the tags whose content is skipped by the lexer instead of being parsed.
	code_tags map[string]bool = {
		'script': true
		'style':  true
	}
}
24
// Parser reads HTML strings and turns them into a `DocumentObjectModel`.
pub struct Parser {
mut:
	// DOM that is built from `tags`; it is recreated by `init`.
	dom DocumentObjectModel
	// Scanner state shared between successive `split_parse` calls.
	lexical_attributes LexicalAttributes = LexicalAttributes{
		current_tag: &Tag{}
	}
	filename string = 'direct-parse'
	// True once `init` has run, which makes later `init` calls return immediately.
	initialized bool
	// Flat list of the tags produced so far; it is passed to `dom.construct`.
	tags []&Tag
	// Target of `print_debug`; it is also handed over to the DOM.
	debug_file os.File
}
37
// add_code_tag registers `name` as a code tag, i.e. a tag whose content the parser skips over.
// For example, after `add_code_tag('script')` the body of every `<script>` tag is kept as plain
// content, so any `<` or `>` inside of it can not confuse the parser.
// An empty `name` is ignored.
pub fn (mut parser Parser) add_code_tag(name string) {
	if name.len > 0 {
		parser.lexical_attributes.code_tags[name] = true
	}
}
48
// builder_str returns everything accumulated in the lexeme builder so far,
// leaving the builder itself untouched.
@[inline]
fn (parser Parser) builder_str() string {
	lexeme := parser.lexical_attributes.lexeme_builder.after(0)
	return lexeme
}
53
// print_debug appends `data` as a line to the parser's debug file.
// It is compiled in only when the `debug_html` flag is defined, ignores empty strings,
// and panics when the write fails.
@[if debug_html ?]
fn (mut parser Parser) print_debug(data string) {
	if data.len == 0 {
		return
	}
	parser.debug_file.writeln(data) or { panic(err) }
}
60
// verify_end_comment reports whether the current lexeme ends with `--`, which is
// the text that has to precede the `>` of a comment terminator (`-->`).
// When `remove` is true and the lexeme does end with `--`, those two bytes are
// also dropped from the lexeme builder.
// A lexeme shorter than two bytes is never an end of comment.
fn (mut parser Parser) verify_end_comment(remove bool) bool {
	// `ends_with` is false for lexemes shorter than the suffix, so no index can go out of range.
	is_end_comment := parser.builder_str().ends_with('--')
	if is_end_comment && remove {
		parser.lexical_attributes.lexeme_builder.go_back(2)
	}
	return is_end_comment
}
71
// blank_string returns true when `data` is empty or consists only of ASCII whitespace:
// tab (9), line feed (10), form feed (12), carriage return (13) and space (32).
fn blank_string(data string) bool {
	for chr in data {
		// The first non-whitespace byte settles the answer.
		if chr !in [u8(9), 10, 12, 13, 32] {
			return false
		}
	}
	return true
}
81
// init prepares the parser for its first input: it creates an empty DOM and tag list,
// and resets the lexer state. Every call after the first one returns immediately.
fn (mut parser Parser) init() {
	if parser.initialized {
		return
	}
	// Start from a fresh DOM with an empty root tag.
	parser.dom = DocumentObjectModel{
		debug_file: parser.debug_file
		root:       &Tag{}
	}
	parser.dom.close_tags['/!document'] = true
	// An empty name is ignored by `add_code_tag`, so this call registers nothing.
	parser.add_code_tag('')
	parser.tags = []&Tag{}
	// Lexer state: nothing has been read yet, so the parser is outside of any tag.
	parser.lexical_attributes.current_tag = &Tag{}
	parser.lexical_attributes.outside_tag = true
	parser.initialized = true
}
98
// generate_tag stores the tag that is being assembled in `parser.tags`, provided that it has
// a name or some content, and then starts a new empty tag.
// It does nothing while a tag is still open, i.e. before its `>` has been read.
fn (mut parser Parser) generate_tag() {
	if parser.lexical_attributes.open_tag {
		return
	}
	has_name := parser.lexical_attributes.current_tag.name != ''
	has_content := parser.lexical_attributes.current_tag.content.len > 0
	if has_name || has_content {
		parser.tags << parser.lexical_attributes.current_tag
	}
	parser.lexical_attributes.current_tag = &Tag{}
}
109
// split_parse parses the HTML fragment `data`, one byte at a time.
// The lexer state is kept in `parser.lexical_attributes` between calls, so a document can
// be passed in several consecutive fragments (`parse_html` does that line by line).
// The recognized tags are appended to `parser.tags`.
// For every byte, the first matching state below decides how it is handled:
// 1. inside a code tag (`script`, `style`, or one added with `add_code_tag`),
// 2. inside a comment,
// 3. inside a quoted string in a tag,
// 4. inside a tag,
// 5. a `<` outside of a tag, which starts a new tag,
// 6. any other byte, which is collected as text.
pub fn (mut parser Parser) split_parse(data string) {
	parser.init()
	for chr in data {
		is_quote := chr == `"` || chr == `'`
		// 1 for a double quote, 2 for a single quote, 0 for any other byte.
		// It is compared with `open_string`, to find the quote that closes a string.
		string_code := match chr {
			`"` { 1 }
			`'` { 2 }
			else { 0 }
		}

		if parser.lexical_attributes.open_code {
			// Code tag content is copied verbatim, until its closing tag is complete.
			parser.lexical_attributes.lexeme_builder.write_u8(chr)
			if parser.lexical_attributes.open_string > 0
				&& parser.lexical_attributes.open_string == string_code {
				parser.lexical_attributes.open_string = 0
			} else if chr == `>` {
				// The code tag is finished only when the text ends with `</name>`.
				name_close_tag := '</${parser.lexical_attributes.opened_code_type}>'
				if parser.builder_str().to_lower().ends_with(name_close_tag) {
					parser.lexical_attributes.open_code = false
					// Remove the closing tag from the builder, so that the code text
					// alone is taken as the content of the tag at the next `<`.
					parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len)
					parser.lexical_attributes.current_tag.closed = true
					parser.lexical_attributes.current_tag.close_type = .new_tag
				}
			}
		} else if parser.lexical_attributes.open_comment {
			if chr == `>` && parser.verify_end_comment(false) {
				// `-->` was reached: discard the whole comment.
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
				parser.lexical_attributes.open_comment = false
				parser.lexical_attributes.open_tag = false
			} else {
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			}
		} else if parser.lexical_attributes.open_string > 0 {
			if parser.lexical_attributes.open_string == string_code {
				// The matching quote closes the string.
				parser.lexical_attributes.open_string = 0
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
				temp_lexeme := parser.builder_str()
				if parser.lexical_attributes.current_tag.last_attribute != '' {
					// The string is the value of the attribute that preceded the `=`.
					lattr := parser.lexical_attributes.current_tag.last_attribute
					// Strip the surrounding quotes.
					nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
					parser.lexical_attributes.current_tag.attributes[lattr] = nval
					if lattr == 'class' {
						// Class names are separated by ASCII whitespace:
						// tab, carriage return, line feed, space and form feed.
						for class_name in nval.split_any('\t\r\n \x0C') {
							if class_name != '' {
								parser.lexical_attributes.current_tag.class_set.add(class_name)
							}
						}
					}
					parser.lexical_attributes.current_tag.last_attribute = ''
				} else {
					// A quoted string without a preceding `name=` is stored,
					// quotes included, as an attribute with an empty value.
					parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = ''
				}
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
			} else {
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			}
		} else if parser.lexical_attributes.open_tag {
			if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote {
				// A quote at the start of a lexeme opens a string.
				parser.lexical_attributes.open_string = string_code
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			} else if chr == `>` {
				complete_lexeme := parser.builder_str().to_lower()
				// A lexeme that ends with `/`, like in `<br/>`, marks the tag as closed.
				parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0
					&& complete_lexeme[complete_lexeme.len - 1] == `/`)
				// A lexeme that starts with `/`, like in `</div>`, is recorded as a close tag.
				if complete_lexeme.len > 0 && complete_lexeme[0] == `/` {
					parser.dom.close_tags[complete_lexeme] = true
				}
				if parser.lexical_attributes.current_tag.name == '' {
					parser.lexical_attributes.current_tag.name = complete_lexeme
				} else if complete_lexeme != '/' {
					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
				}
				parser.lexical_attributes.open_tag = false
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
				// If the tag is a code tag, its content is read verbatim from now on.
				if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags {
					parser.lexical_attributes.open_code = true
					parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name
				}
			} else if chr !in [u8(9), ` `, `=`, `\n`] { // Tab, space, = and \n
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			} else if chr != 10 {
				// A tab, a space or `=` ends the current lexeme, while `\n` is just skipped.
				complete_lexeme := parser.builder_str().to_lower()
				if parser.lexical_attributes.current_tag.name == '' {
					parser.lexical_attributes.current_tag.name = complete_lexeme
				} else {
					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
					parser.lexical_attributes.current_tag.last_attribute = ''
					if chr == `=` {
						// Remember the attribute name, so that the value after `=` is assigned to it.
						parser.lexical_attributes.current_tag.last_attribute = complete_lexeme
					}
				}
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
			}
			if parser.builder_str() == '!--' {
				parser.lexical_attributes.open_comment = true
			}
		} else if chr == `<` {
			// A new tag starts; the text collected so far has to be attributed first.
			temp_string := parser.builder_str()
			if parser.lexical_attributes.lexeme_builder.len >= 1 {
				if parser.lexical_attributes.current_tag.name.len > 1
					&& parser.lexical_attributes.current_tag.name[0] == 47
					&& !blank_string(temp_string) {
					// Non-blank text after a closing tag (its name starts with `/`)
					// becomes a separate `text` tag below.
					parser.lexical_attributes.text_after_tag = true
				} else {
					parser.lexical_attributes.current_tag.content = temp_string // verify later who has this content
				}
			}
			parser.lexical_attributes.lexeme_builder.go_back_to(0)
			parser.generate_tag()
			parser.lexical_attributes.open_tag = true
			parser.lexical_attributes.outside_tag = false

			if parser.lexical_attributes.text_after_tag {
				parser.tags << &Tag{
					name:    'text'
					content: temp_string
				}
				parser.lexical_attributes.text_after_tag = false
			}
		} else {
			parser.lexical_attributes.lexeme_builder.write_u8(chr)
		}
	}

	// If `data` has no tags, but has only text.
	if parser.lexical_attributes.outside_tag {
		temp_string := parser.lexical_attributes.lexeme_builder.str()

		if parser.tags.len == 0 {
			parser.tags << &Tag{
				name:    'text'
				content: temp_string
			}
		} else if parser.tags.len == 1 {
			// Text from an earlier text-only fragment is already stored: append to it.
			mut tag := parser.tags.first()

			if tag.name == 'text' {
				tag.content += temp_string
			}
		}
	}
}
264
// parse_html parses the given HTML string line by line, and then constructs the DOM
// from the resulting tags.
pub fn (mut parser Parser) parse_html(data string) {
	parser.init()
	lines := data.split_into_lines()
	last_index := lines.len - 1
	for index, line in lines {
		parser.lexical_attributes.line_count++
		// `split_into_lines()` removes the line endings. The `\n` is added back to every line
		// except the last one, because dropping it could break JS code, or glue together
		// text that was on separate lines.
		chunk := if index < last_index { '${line}\n' } else { line }
		parser.split_parse(chunk)
	}
	parser.generate_tag()
	parser.dom.debug_file = parser.debug_file
	parser.dom.construct(parser.tags)
}
279
// finalize finishes the parsing stage, by storing the tag that is still being assembled
// (if there is one) in the parser's list of tags.
@[inline]
pub fn (mut parser Parser) finalize() {
	parser.generate_tag()
}
285
// get_dom returns the parser's current DOM representation.
// If the DOM has not been constructed yet, the pending tag is stored first,
// and the DOM is then constructed from all the tags parsed so far.
pub fn (mut parser Parser) get_dom() DocumentObjectModel {
	if parser.dom.constructed {
		return parser.dom
	}
	parser.generate_tag()
	parser.dom.construct(parser.tags)
	return parser.dom
}
294