Gitly


1 module html
2 
3 import os
4 import strings
5 
// LexicalAttributes holds the lexer state that `split_parse` carries from one byte
// (and from one call) to the next.
struct LexicalAttributes {
mut:
    // Tag that is currently being assembled; `generate_tag` stores it and starts a new one.
    current_tag      &Tag = unsafe { nil }
    // True between a `<` and the `>` that ends the tag.
    open_tag         bool
    // True while the content of a code tag (see `code_tags`) is being copied verbatim.
    open_code        bool
    // 0 when no quoted string is open, 1 inside a `"` string, 2 inside a `'` string.
    open_string      int
    // True from the moment `!--` is read inside a tag until the closing `-->`.
    open_comment     bool
    // NOTE(review): not read or written anywhere in this file.
    is_attribute     bool
    // Name of the code tag whose content is currently being read.
    opened_code_type string
    // Incremented by `parse_html` once per input line.
    line_count       int
    // True until the first `<` has been read.
    outside_tag      bool
    // Set when non-blank text follows a closing tag; makes `split_parse` emit a `text` tag.
    text_after_tag   bool
    // Accumulates the bytes of the lexeme that is currently being read.
    lexeme_builder   strings.Builder = strings.new_builder(100)
    // Names of the tags whose content is taken verbatim instead of being parsed.
    code_tags        map[string]bool = {
        'script': true
        'style':  true
    }
}
24 
// Parser is responsible for reading the HTML strings and converting them into a `DocumentObjectModel`.
pub struct Parser {
mut:
    // DOM that is built from `tags` by `parse_html` / `get_dom`.
    dom                DocumentObjectModel
    // Lexer state used by `split_parse`.
    lexical_attributes LexicalAttributes = LexicalAttributes{
        current_tag: &Tag{}
    }
    // NOTE(review): not used anywhere in this file.
    filename           string = 'direct-parse'
    // Set by `init` so that the one-time setup runs only once.
    initialized        bool
    // Tags produced by the lexer, in the order in which they were completed.
    tags               []&Tag
    // File that `print_debug` writes to; it is also handed to the DOM.
    debug_file         os.File
}
37 
// add_code_tag registers `name` as a code tag, i.e. a tag whose content the parser
// copies verbatim instead of parsing it.
// `script` and `style` are registered by default. Registering a custom tag, for example
// `add_code_tag('mycode')`, keeps the `<` and `>` characters inside `<mycode>` from
// confusing the parser, while the content itself is still kept.
// An empty `name` is ignored.
pub fn (mut parser Parser) add_code_tag(name string) {
    if name.len > 0 {
        parser.lexical_attributes.code_tags[name] = true
    }
}
48 
// builder_str returns the lexeme accumulated so far as a string.
// It uses `after(0)`, so the lexeme builder itself is left untouched.
@[inline]
fn (parser Parser) builder_str() string {
    return parser.lexical_attributes.lexeme_builder.after(0)
}
53 
// print_debug writes `data` as one line to the parser's debug file.
// Empty strings are skipped; a failed write panics.
// The function is only compiled in when the `debug_html` flag is defined.
@[if debug_html ?]
fn (mut parser Parser) print_debug(data string) {
    if data.len == 0 {
        return
    }
    parser.debug_file.writeln(data) or { panic(err) }
}
60 
// verify_end_comment reports whether the lexeme accumulated so far ends with `--`,
// i.e. whether a following `>` closes an HTML comment.
// When `remove` is true and the terminator is present, the trailing `--` is removed
// from the lexeme builder.
// A lexeme shorter than two bytes cannot end a comment, so `false` is returned for it
// instead of indexing past the start of the string.
fn (mut parser Parser) verify_end_comment(remove bool) bool {
    lexeme := parser.builder_str()
    if lexeme.len < 2 {
        return false
    }
    is_end_comment := lexeme.ends_with('--')
    if is_end_comment && remove {
        parser.lexical_attributes.lexeme_builder.go_back(2)
    }
    return is_end_comment
}
71 
// blank_string reports whether `data` consists only of line feeds, tabs and spaces.
// An empty string counts as blank.
fn blank_string(data string) bool {
    for b in data {
        if b != `\n` && b != `\t` && b != ` ` {
            return false
        }
    }
    return true
}
81 
// init prepares the parser for its first use: it creates the DOM with an empty root
// tag, clears the tag list, registers `/!document` as a known closing tag and resets
// the lexer to the "no tag seen yet" state.
// Calls after the first one return immediately.
fn (mut parser Parser) init() {
    if parser.initialized {
        return
    }
    parser.dom = DocumentObjectModel{
        debug_file: parser.debug_file
        root:       &Tag{}
    }
    parser.tags = []&Tag{}
    parser.dom.close_tags['/!document'] = true
    parser.lexical_attributes.current_tag = &Tag{}
    parser.lexical_attributes.outside_tag = true
    parser.initialized = true
}
98 
// generate_tag finishes the tag that is currently being assembled and starts a new one.
// Nothing happens while a tag is still open (between `<` and `>`). The finished tag is
// appended to `parser.tags` only when it has a name or some content.
fn (mut parser Parser) generate_tag() {
    if parser.lexical_attributes.open_tag {
        return
    }
    is_empty := parser.lexical_attributes.current_tag.name == ''
        && parser.lexical_attributes.current_tag.content.len == 0
    if !is_empty {
        parser.tags << parser.lexical_attributes.current_tag
    }
    parser.lexical_attributes.current_tag = &Tag{}
}
109 
// split_parse runs the lexer over one fragment of HTML, one byte at a time.
// The lexer state is kept on the parser, so a document can be supplied through several
// consecutive calls (`parse_html` calls it once per line). Completed tags are appended
// to `parser.tags`; the tag that is still being read stays in
// `lexical_attributes.current_tag` until `generate_tag` stores it.
pub fn (mut parser Parser) split_parse(data string) {
    parser.init()
    for chr in data {
        is_quote := chr == `"` || chr == `'`
        // 1 for a double quote, 2 for a single quote, 0 for every other byte.
        string_code := match chr {
            `"` { 1 }
            `'` { 2 }
            else { 0 }
        }

        if parser.lexical_attributes.open_code {
            // Inside a code tag: copy everything verbatim until its closing tag shows up.
            parser.lexical_attributes.lexeme_builder.write_u8(chr)
            if parser.lexical_attributes.open_string > 0
                && parser.lexical_attributes.open_string == string_code {
                parser.lexical_attributes.open_string = 0
            } else if chr == `>` {
                name_close_tag := '</${parser.lexical_attributes.opened_code_type}>'
                if parser.builder_str().to_lower().ends_with(name_close_tag) {
                    parser.lexical_attributes.open_code = false
                    // Drop the closing tag from the builder so that only the code text is
                    // left; it is stored as the tag content when the next `<` is read.
                    parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len)
                    parser.lexical_attributes.current_tag.closed = true
                    parser.lexical_attributes.current_tag.close_type = .new_tag
                }
            }
        } else if parser.lexical_attributes.open_comment {
            if chr == `>` && parser.verify_end_comment(false) {
                // `-->` reached: throw the comment text away and leave the comment.
                parser.lexical_attributes.lexeme_builder.go_back_to(0)
                parser.lexical_attributes.open_comment = false
                parser.lexical_attributes.open_tag = false
            } else {
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
            }
        } else if parser.lexical_attributes.open_string > 0 {
            if parser.lexical_attributes.open_string == string_code {
                // The matching closing quote ends the quoted lexeme.
                parser.lexical_attributes.open_string = 0
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
                temp_lexeme := parser.builder_str()
                if parser.lexical_attributes.current_tag.last_attribute != '' {
                    // The string is the value of the attribute that preceded the `=`.
                    lattr := parser.lexical_attributes.current_tag.last_attribute
                    // Strip the surrounding quotes.
                    nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
                    parser.lexical_attributes.current_tag.attributes[lattr] = nval
                    if lattr == 'class' {
                        for class_name in nval.split_any('\t\r\n \x0D') {
                            if class_name != '' {
                                parser.lexical_attributes.current_tag.class_set.add(class_name)
                            }
                        }
                    }
                    parser.lexical_attributes.current_tag.last_attribute = ''
                } else {
                    // No pending attribute name: the quoted text (quotes included) is
                    // stored as a valueless attribute.
                    parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = ''
                }
                parser.lexical_attributes.lexeme_builder.go_back_to(0)
            } else {
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
            }
        } else if parser.lexical_attributes.open_tag {
            if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote {
                // A quote at the start of a lexeme opens a quoted string.
                parser.lexical_attributes.open_string = string_code
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
            } else if chr == `>` {
                complete_lexeme := parser.builder_str().to_lower()
                // A lexeme ending in `/` marks a self-closing tag such as `<br/>`.
                parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0
                    && complete_lexeme[complete_lexeme.len - 1] == `/`)
                // A lexeme starting with `/` is a closing tag; remember its name.
                if complete_lexeme.len > 0 && complete_lexeme[0] == `/` {
                    parser.dom.close_tags[complete_lexeme] = true
                }
                if parser.lexical_attributes.current_tag.name == '' {
                    parser.lexical_attributes.current_tag.name = complete_lexeme
                } else if complete_lexeme != '/' {
                    parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
                }
                parser.lexical_attributes.open_tag = false
                parser.lexical_attributes.lexeme_builder.go_back_to(0)
                // The content of a code tag is read verbatim from here on.
                if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags {
                    parser.lexical_attributes.open_code = true
                    parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name
                }
            } else if chr !in [u8(9), ` `, `=`, `\n`] {
                // Any byte other than tab, space, `=` and line feed belongs to the lexeme.
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
            } else if chr != `\n` || parser.lexical_attributes.lexeme_builder.len > 0 {
                // Tab, space and `=` always end the current lexeme. A line feed ends it
                // only when there is something to end, so a tag that spans several
                // lines does not get its name and attributes glued together.
                complete_lexeme := parser.builder_str().to_lower()
                if parser.lexical_attributes.current_tag.name == '' {
                    parser.lexical_attributes.current_tag.name = complete_lexeme
                } else {
                    parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
                    parser.lexical_attributes.current_tag.last_attribute = ''
                    // After `=` the next quoted string is the value of this attribute.
                    if chr == `=` {
                        parser.lexical_attributes.current_tag.last_attribute = complete_lexeme
                    }
                }
                parser.lexical_attributes.lexeme_builder.go_back_to(0)
            }
            if parser.builder_str() == '!--' {
                parser.lexical_attributes.open_comment = true
            }
        } else if chr == `<` {
            // A new tag starts; whatever was collected since the previous tag is text.
            temp_string := parser.builder_str()
            if parser.lexical_attributes.lexeme_builder.len >= 1 {
                if parser.lexical_attributes.current_tag.name.len > 1
                    && parser.lexical_attributes.current_tag.name[0] == 47
                    && !blank_string(temp_string) {
                    // Non-blank text after a closing tag (name starts with `/`) becomes
                    // a separate `text` tag below.
                    parser.lexical_attributes.text_after_tag = true
                } else {
                    parser.lexical_attributes.current_tag.content = temp_string
                }
            }
            parser.lexical_attributes.lexeme_builder.go_back_to(0)
            parser.generate_tag()
            parser.lexical_attributes.open_tag = true
            parser.lexical_attributes.outside_tag = false

            if parser.lexical_attributes.text_after_tag == true {
                parser.tags << &Tag{
                    name:    'text'
                    content: temp_string
                }
                parser.lexical_attributes.text_after_tag = false
            }
        } else {
            parser.lexical_attributes.lexeme_builder.write_u8(chr)
        }
    }

    // No `<` has been read so far, so the input consists of text only.
    if parser.lexical_attributes.outside_tag {
        temp_string := parser.lexical_attributes.lexeme_builder.str()

        if parser.tags.len == 0 {
            parser.tags << &Tag{
                name:    'text'
                content: temp_string
            }
        } else if parser.tags.len == 1 {
            mut tag := parser.tags.first()

            // Text from a later call is appended to the `text` tag made by an earlier one.
            if tag.name == 'text' {
                tag.content += temp_string
            }
        }
    }
}
264 
// parse_html parses the complete HTML document in `data` line by line and then
// builds the parser's DOM from the resulting tags.
pub fn (mut parser Parser) parse_html(data string) {
    parser.init()
    lines := data.split_into_lines()
    last_index := lines.len - 1
    for index, line in lines {
        parser.lexical_attributes.line_count++
        // `split_into_lines()` strips the line endings. A `\n` is put back after every
        // line except the last one, because dropping it may break JS code or make
        // adjacent text stick together.
        fragment := if index < last_index { '${line}\n' } else { line }
        parser.split_parse(fragment)
    }
    parser.generate_tag()
    parser.dom.debug_file = parser.debug_file
    parser.dom.construct(parser.tags)
}
279 
// finalize finishes the parsing stage: it calls `generate_tag`, which stores the tag
// that is still being assembled (unless a tag is currently open).
@[inline]
pub fn (mut parser Parser) finalize() {
    parser.generate_tag()
}
285 
// get_dom returns the parser's current DOM representation.
// If the DOM has not been constructed yet, the pending tag is stored and the DOM is
// built from the collected tags first.
pub fn (mut parser Parser) get_dom() DocumentObjectModel {
    if parser.dom.constructed {
        return parser.dom
    }
    parser.generate_tag()
    parser.dom.construct(parser.tags)
    return parser.dom
}
294

1	module html
2
3	import os
4	import strings
5
// LexicalAttributes holds the lexer state that `split_parse` carries from one byte
// (and from one call) to the next.
struct LexicalAttributes {
mut:
    // Tag that is currently being assembled; `generate_tag` stores it and starts a new one.
    current_tag &Tag = unsafe { nil }
    // True between a `<` and the `>` that ends the tag.
    open_tag bool
    // True while the content of a code tag (see `code_tags`) is being copied verbatim.
    open_code bool
    // 0 when no quoted string is open, 1 inside a `"` string, 2 inside a `'` string.
    open_string int
    // True from the moment `!--` is read inside a tag until the closing `-->`.
    open_comment bool
    // NOTE(review): not read or written anywhere in this file.
    is_attribute bool
    // Name of the code tag whose content is currently being read.
    opened_code_type string
    // Incremented by `parse_html` once per input line.
    line_count int
    // True until the first `<` has been read.
    outside_tag bool
    // Set when non-blank text follows a closing tag; makes `split_parse` emit a `text` tag.
    text_after_tag bool
    // Accumulates the bytes of the lexeme that is currently being read.
    lexeme_builder strings.Builder = strings.new_builder(100)
    // Names of the tags whose content is taken verbatim instead of being parsed.
    code_tags map[string]bool = {
        'script': true
        'style': true
    }
}
24
// Parser is responsible for reading the HTML strings and converting them into a `DocumentObjectModel`.
pub struct Parser {
mut:
    // DOM that is built from `tags` by `parse_html` / `get_dom`.
    dom DocumentObjectModel
    // Lexer state used by `split_parse`.
    lexical_attributes LexicalAttributes = LexicalAttributes{
        current_tag: &Tag{}
    }
    // NOTE(review): not used anywhere in this file.
    filename string = 'direct-parse'
    // Set by `init` so that the one-time setup runs only once.
    initialized bool
    // Tags produced by the lexer, in the order in which they were completed.
    tags []&Tag
    // File that `print_debug` writes to; it is also handed to the DOM.
    debug_file os.File
}
37
// add_code_tag registers `name` as a code tag, i.e. a tag whose content the parser
// copies verbatim instead of parsing it.
// `script` and `style` are registered by default. Registering a custom tag, for example
// `add_code_tag('mycode')`, keeps the `<` and `>` characters inside `<mycode>` from
// confusing the parser, while the content itself is still kept.
// An empty `name` is ignored.
pub fn (mut parser Parser) add_code_tag(name string) {
    if name.len > 0 {
        parser.lexical_attributes.code_tags[name] = true
    }
}
48
// builder_str returns the lexeme accumulated so far as a string.
// It uses `after(0)`, so the lexeme builder itself is left untouched.
@[inline]
fn (parser Parser) builder_str() string {
    return parser.lexical_attributes.lexeme_builder.after(0)
}
53
// print_debug writes `data` as one line to the parser's debug file.
// Empty strings are skipped; a failed write panics.
// The function is only compiled in when the `debug_html` flag is defined.
@[if debug_html ?]
fn (mut parser Parser) print_debug(data string) {
    if data.len == 0 {
        return
    }
    parser.debug_file.writeln(data) or { panic(err) }
}
60
// verify_end_comment reports whether the lexeme accumulated so far ends with `--`,
// i.e. whether a following `>` closes an HTML comment.
// When `remove` is true and the terminator is present, the trailing `--` is removed
// from the lexeme builder.
// A lexeme shorter than two bytes cannot end a comment, so `false` is returned for it
// instead of indexing past the start of the string.
fn (mut parser Parser) verify_end_comment(remove bool) bool {
    lexeme := parser.builder_str()
    if lexeme.len < 2 {
        return false
    }
    is_end_comment := lexeme.ends_with('--')
    if is_end_comment && remove {
        parser.lexical_attributes.lexeme_builder.go_back(2)
    }
    return is_end_comment
}
71
// blank_string reports whether `data` consists only of line feeds, tabs and spaces.
// An empty string counts as blank.
fn blank_string(data string) bool {
    for b in data {
        if b != `\n` && b != `\t` && b != ` ` {
            return false
        }
    }
    return true
}
81
// init prepares the parser for its first use: it creates the DOM with an empty root
// tag, clears the tag list, registers `/!document` as a known closing tag and resets
// the lexer to the "no tag seen yet" state.
// Calls after the first one return immediately.
fn (mut parser Parser) init() {
    if parser.initialized {
        return
    }
    parser.dom = DocumentObjectModel{
        debug_file: parser.debug_file
        root: &Tag{}
    }
    parser.tags = []&Tag{}
    parser.dom.close_tags['/!document'] = true
    parser.lexical_attributes.current_tag = &Tag{}
    parser.lexical_attributes.outside_tag = true
    parser.initialized = true
}
98
// generate_tag finishes the tag that is currently being assembled and starts a new one.
// Nothing happens while a tag is still open (between `<` and `>`). The finished tag is
// appended to `parser.tags` only when it has a name or some content.
fn (mut parser Parser) generate_tag() {
    if parser.lexical_attributes.open_tag {
        return
    }
    is_empty := parser.lexical_attributes.current_tag.name == ''
        && parser.lexical_attributes.current_tag.content.len == 0
    if !is_empty {
        parser.tags << parser.lexical_attributes.current_tag
    }
    parser.lexical_attributes.current_tag = &Tag{}
}
109
// split_parse runs the lexer over one fragment of HTML, one byte at a time.
// The lexer state is kept on the parser, so a document can be supplied through several
// consecutive calls (`parse_html` calls it once per line). Completed tags are appended
// to `parser.tags`; the tag that is still being read stays in
// `lexical_attributes.current_tag` until `generate_tag` stores it.
pub fn (mut parser Parser) split_parse(data string) {
    parser.init()
    for chr in data {
        is_quote := chr == `"` || chr == `'`
        // 1 for a double quote, 2 for a single quote, 0 for every other byte.
        string_code := match chr {
            `"` { 1 }
            `'` { 2 }
            else { 0 }
        }

        if parser.lexical_attributes.open_code {
            // Inside a code tag: copy everything verbatim until its closing tag shows up.
            parser.lexical_attributes.lexeme_builder.write_u8(chr)
            if parser.lexical_attributes.open_string > 0
                && parser.lexical_attributes.open_string == string_code {
                parser.lexical_attributes.open_string = 0
            } else if chr == `>` {
                name_close_tag := '</${parser.lexical_attributes.opened_code_type}>'
                if parser.builder_str().to_lower().ends_with(name_close_tag) {
                    parser.lexical_attributes.open_code = false
                    // Drop the closing tag from the builder so that only the code text is
                    // left; it is stored as the tag content when the next `<` is read.
                    parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len)
                    parser.lexical_attributes.current_tag.closed = true
                    parser.lexical_attributes.current_tag.close_type = .new_tag
                }
            }
        } else if parser.lexical_attributes.open_comment {
            if chr == `>` && parser.verify_end_comment(false) {
                // `-->` reached: throw the comment text away and leave the comment.
                parser.lexical_attributes.lexeme_builder.go_back_to(0)
                parser.lexical_attributes.open_comment = false
                parser.lexical_attributes.open_tag = false
            } else {
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
            }
        } else if parser.lexical_attributes.open_string > 0 {
            if parser.lexical_attributes.open_string == string_code {
                // The matching closing quote ends the quoted lexeme.
                parser.lexical_attributes.open_string = 0
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
                temp_lexeme := parser.builder_str()
                if parser.lexical_attributes.current_tag.last_attribute != '' {
                    // The string is the value of the attribute that preceded the `=`.
                    lattr := parser.lexical_attributes.current_tag.last_attribute
                    // Strip the surrounding quotes.
                    nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
                    parser.lexical_attributes.current_tag.attributes[lattr] = nval
                    if lattr == 'class' {
                        for class_name in nval.split_any('\t\r\n \x0D') {
                            if class_name != '' {
                                parser.lexical_attributes.current_tag.class_set.add(class_name)
                            }
                        }
                    }
                    parser.lexical_attributes.current_tag.last_attribute = ''
                } else {
                    // No pending attribute name: the quoted text (quotes included) is
                    // stored as a valueless attribute.
                    parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = ''
                }
                parser.lexical_attributes.lexeme_builder.go_back_to(0)
            } else {
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
            }
        } else if parser.lexical_attributes.open_tag {
            if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote {
                // A quote at the start of a lexeme opens a quoted string.
                parser.lexical_attributes.open_string = string_code
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
            } else if chr == `>` {
                complete_lexeme := parser.builder_str().to_lower()
                // A lexeme ending in `/` marks a self-closing tag such as `<br/>`.
                parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0
                    && complete_lexeme[complete_lexeme.len - 1] == `/`)
                // A lexeme starting with `/` is a closing tag; remember its name.
                if complete_lexeme.len > 0 && complete_lexeme[0] == `/` {
                    parser.dom.close_tags[complete_lexeme] = true
                }
                if parser.lexical_attributes.current_tag.name == '' {
                    parser.lexical_attributes.current_tag.name = complete_lexeme
                } else if complete_lexeme != '/' {
                    parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
                }
                parser.lexical_attributes.open_tag = false
                parser.lexical_attributes.lexeme_builder.go_back_to(0)
                // The content of a code tag is read verbatim from here on.
                if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags {
                    parser.lexical_attributes.open_code = true
                    parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name
                }
            } else if chr !in [u8(9), ` `, `=`, `\n`] {
                // Any byte other than tab, space, `=` and line feed belongs to the lexeme.
                parser.lexical_attributes.lexeme_builder.write_u8(chr)
            } else if chr != `\n` || parser.lexical_attributes.lexeme_builder.len > 0 {
                // Tab, space and `=` always end the current lexeme. A line feed ends it
                // only when there is something to end, so a tag that spans several
                // lines does not get its name and attributes glued together.
                complete_lexeme := parser.builder_str().to_lower()
                if parser.lexical_attributes.current_tag.name == '' {
                    parser.lexical_attributes.current_tag.name = complete_lexeme
                } else {
                    parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
                    parser.lexical_attributes.current_tag.last_attribute = ''
                    // After `=` the next quoted string is the value of this attribute.
                    if chr == `=` {
                        parser.lexical_attributes.current_tag.last_attribute = complete_lexeme
                    }
                }
                parser.lexical_attributes.lexeme_builder.go_back_to(0)
            }
            if parser.builder_str() == '!--' {
                parser.lexical_attributes.open_comment = true
            }
        } else if chr == `<` {
            // A new tag starts; whatever was collected since the previous tag is text.
            temp_string := parser.builder_str()
            if parser.lexical_attributes.lexeme_builder.len >= 1 {
                if parser.lexical_attributes.current_tag.name.len > 1
                    && parser.lexical_attributes.current_tag.name[0] == 47
                    && !blank_string(temp_string) {
                    // Non-blank text after a closing tag (name starts with `/`) becomes
                    // a separate `text` tag below.
                    parser.lexical_attributes.text_after_tag = true
                } else {
                    parser.lexical_attributes.current_tag.content = temp_string
                }
            }
            parser.lexical_attributes.lexeme_builder.go_back_to(0)
            parser.generate_tag()
            parser.lexical_attributes.open_tag = true
            parser.lexical_attributes.outside_tag = false

            if parser.lexical_attributes.text_after_tag == true {
                parser.tags << &Tag{
                    name: 'text'
                    content: temp_string
                }
                parser.lexical_attributes.text_after_tag = false
            }
        } else {
            parser.lexical_attributes.lexeme_builder.write_u8(chr)
        }
    }

    // No `<` has been read so far, so the input consists of text only.
    if parser.lexical_attributes.outside_tag {
        temp_string := parser.lexical_attributes.lexeme_builder.str()

        if parser.tags.len == 0 {
            parser.tags << &Tag{
                name: 'text'
                content: temp_string
            }
        } else if parser.tags.len == 1 {
            mut tag := parser.tags.first()

            // Text from a later call is appended to the `text` tag made by an earlier one.
            if tag.name == 'text' {
                tag.content += temp_string
            }
        }
    }
}
264
// parse_html parses the complete HTML document in `data` line by line and then
// builds the parser's DOM from the resulting tags.
pub fn (mut parser Parser) parse_html(data string) {
    parser.init()
    lines := data.split_into_lines()
    last_index := lines.len - 1
    for index, line in lines {
        parser.lexical_attributes.line_count++
        // `split_into_lines()` strips the line endings. A `\n` is put back after every
        // line except the last one, because dropping it may break JS code or make
        // adjacent text stick together.
        fragment := if index < last_index { '${line}\n' } else { line }
        parser.split_parse(fragment)
    }
    parser.generate_tag()
    parser.dom.debug_file = parser.debug_file
    parser.dom.construct(parser.tags)
}
279
// finalize finishes the parsing stage: it calls `generate_tag`, which stores the tag
// that is still being assembled (unless a tag is currently open).
@[inline]
pub fn (mut parser Parser) finalize() {
    parser.generate_tag()
}
285
// get_dom returns the parser's current DOM representation.
// If the DOM has not been constructed yet, the pending tag is stored and the DOM is
// built from the collected tags first.
pub fn (mut parser Parser) get_dom() DocumentObjectModel {
    if parser.dom.constructed {
        return parser.dom
    }
    parser.generate_tag()
    parser.dom.construct(parser.tags)
    return parser.dom
}
294