Gitly


1 // Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module decoder
5 
6 import toml.ast
7 import toml.ast.walker
8 import toml.token
9 import toml.scanner
10 import strconv
11 
12 // utf8_max is the largest valid Unicode scalar value (inclusive upper bound).
13 const utf8_max = 0x10FFFF
14 
// Decoder decodes special sequences in a tree of TOML `ast.Value`s.
pub struct Decoder {
pub:
	// scanner is used by `excerpt` to produce excerpts of the scanned text.
	// It defaults to `nil`.
	scanner &scanner.Scanner = unsafe { nil }
}
20 
// decode decodes `n` and all of its children in place.
// The tree is traversed with `walker.walk_and_modify`, which is handed this
// decoder; any error it reports is returned to the caller.
pub fn (d Decoder) decode(mut n ast.Value) ! {
	walker.walk_and_modify(d, mut n) or { return err }
}
25 
// modify decodes `value` in place when it is a quoted string, a number or
// a date-time. Every other kind of value is left untouched.
// Errors from the individual decoders are propagated to the caller.
fn (d Decoder) modify(mut value ast.Value) ! {
	if value is ast.Quoted {
		mut quoted := &(value as ast.Quoted)
		d.decode_quoted(mut quoted)!
	} else if value is ast.Number {
		mut number := &(value as ast.Number)
		d.decode_number(mut number)!
	} else if value is ast.DateTime {
		mut date_time := &(value as ast.DateTime)
		d.decode_date_time(mut date_time)!
	}
}
43 
// excerpt returns a string of the surroundings of the token position `tp`,
// as produced by the decoder's scanner.
fn (d Decoder) excerpt(tp token.Pos) string {
	// Second argument handed to the scanner's `excerpt`; presumably the
	// amount of surrounding text to include - confirm against the scanner.
	span := 10
	return d.scanner.excerpt(tp.pos, span)
}
48 
// decode_quoted decodes the quoted TOML string `q` in place by delegating
// to `decode_quoted_escapes`; an error from that call is returned as is.
fn (d Decoder) decode_quoted(mut q ast.Quoted) ! {
	decode_quoted_escapes(mut q) or { return err }
}
53 
// decode_number normalizes the text of `n` in place:
// the signed spellings `-nan` and `+nan` are rewritten to plain `nan`.
// Every other number text is left unchanged.
fn (d Decoder) decode_number(mut n ast.Number) ! {
	if n.text in ['-nan', '+nan'] {
		n.text = 'nan'
	}
}
60 
// decode_quoted_escapes decodes the escape sequences of a *basic* TOML string
// in place: `q.text` is replaced with the decoded text.
// Delimiters in TOML have significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
// "/""" delimits *basic* strings
// Literal strings are returned untouched.
// Allowed escapes in *basic* strings are:
// \b         - backspace       (U+0008)
// \t         - tab             (U+0009)
// \n         - linefeed        (U+000A)
// \f         - form feed       (U+000C)
// \r         - carriage return (U+000D)
// \"         - quote           (U+0022)
// \\         - backslash       (U+005C)
// \uXXXX     - Unicode         (U+XXXX)
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
// In multiline strings a backslash followed by whitespace swallows that
// whitespace and all whitespace following it.
// Unicode escapes that can not be decoded, or that do not denote a Unicode
// scalar value, are kept verbatim. Unknown escapes are kept verbatim too.
// The only error returned is one propagated from creating the scanner.
pub fn decode_quoted_escapes(mut q ast.Quoted) ! {
	// See https://toml.io/en/v1.0.0#string for more info on string types.
	is_basic := q.quote == `\"`
	if !is_basic {
		return
	}

	mut eat_whitespace := false
	// TODO: use string builder
	mut decoded_s := ''
	// Set up a scanner over the raw text for easier navigation.
	mut s := scanner.new_simple_text(q.text)!

	for ch := s.next(); ch != scanner.end_of_text; ch = s.next() {
		ch_byte := u8(ch)
		if eat_whitespace && ch_byte.is_space() {
			continue
		}
		eat_whitespace = false

		if ch == `\\` {
			// `at()` does not consume: the escaped character is still pending.
			ch_next := s.at()
			ch_next_byte := u8(ch_next)

			if q.is_multiline {
				if ch_next_byte.is_space() {
					eat_whitespace = true
					continue
				}
			}
			// Single-character escapes map directly onto their replacement.
			simple := match rune(ch_next) {
				`\\` { '\\' }
				`"` { '"' }
				`n` { '\n' }
				`t` { '\t' }
				`b` { '\b' }
				`r` { '\r' }
				`f` { '\f' }
				else { '' }
			}
			if simple != '' {
				decoded_s += simple
				// Consume the escaped character.
				s.next()
				continue
			}

			escape := ch_byte.ascii_str() + ch_next_byte.ascii_str()
			// Decode Unicode escapes: `\uXXXX` takes exactly 4 hex digits,
			// `\UXXXXXXXX` takes exactly 8.
			if escape.to_lower() == '\\u' {
				hex_digits := if escape == '\\U' { 8 } else { 4 }
				mut has_all_digits := true
				for i in 1 .. hex_digits + 1 {
					if !u8(s.peek(i)).is_hex_digit() {
						has_all_digits = false
						break
					}
				}
				if has_all_digits {
					pos := s.state().pos
					// The `u`/`U` marker plus exactly the digits verified above.
					// Gated slicing never panics, even at the end of the text.
					sequence := s.text#[pos..pos + hex_digits + 1]
					decoded, unicode_val, sequence_length := decode_unicode_escape(sequence) or {
						// Keep the undecodable escape verbatim. The `u`/`U` is
						// consumed so it is not emitted a second time.
						decoded_s += escape
						s.next()
						continue
					}
					// Only Unicode scalar values are valid: this excludes the
					// surrogate range and everything above `utf8_max`.
					is_scalar_value := (unicode_val >= 0x0000 && unicode_val <= 0xD7FF)
						|| (unicode_val >= 0xE000 && unicode_val <= utf8_max)
					if !is_scalar_value {
						decoded_s += escape
						s.next()
						continue
					}
					decoded_s += decoded
					// Skip the `u`/`U` marker and its hex digits - nothing more.
					s.skip_n(sequence_length + 1)
					continue
				}
			}
		}
		decoded_s += ch_byte.ascii_str()
	}
	q.text = decoded_s
}
196 
// decode_unicode_escape decodes the Unicode escape sequence `esc_unicode`.
// The sequence is expected to be prefixed with either `u` (followed by
// 4 hex digits) or `U` (followed by 8 hex digits). Anything after the
// expected hex digits is ignored.
// decode_unicode_escape returns the decoded rune as a string, its integer
// value and the number of hex digits consumed (4 or 8).
// An error is returned if the sequence is too short or if the hex digits
// can not be parsed.
fn decode_unicode_escape(esc_unicode string) !(string, int, int) {
	is_long_esc_type := esc_unicode.starts_with('U')
	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
	// Reject truncated input up front instead of panicking on an
	// out-of-range slice below.
	if esc_unicode.len < hex_digits_len + 1 {
		return error('Unicode escape sequence `${esc_unicode}` is too short, expected ${hex_digits_len} hex digits')
	}
	// Drop the `u`/`U` prefix and keep only the hex digits.
	sequence := esc_unicode[1..hex_digits_len + 1]
	i64_val := strconv.parse_int(sequence, 16, 0)!
	rn := rune(i64_val)
	return '${rn}', int(i64_val), hex_digits_len
}
217 
// decode_date_time normalizes the fractional part of `dt.text` in place.
// When the text contains a `.` and the fraction following it is at most one
// character long, the fraction is right-padded with zeros to 4 characters.
// A `Z` suffix or a `+`/`-` offset after the fraction is preserved.
// Texts without a `.` and texts with longer fractions are left untouched.
fn (d Decoder) decode_date_time(mut dt ast.DateTime) ! {
	if !dt.text.contains('.') {
		return
	}
	date_time_part := dt.text.all_before('.')
	tail := dt.text.all_after('.')
	has_z := tail.contains('Z')

	// Split a trailing `+...` / `-...` offset from the fraction.
	mut fraction := tail
	mut utc_offset := ''
	if tail.contains('+') {
		utc_offset = '+' + tail.all_after('+')
		fraction = tail.all_before('+')
	} else if tail.contains('-') {
		utc_offset = '-' + tail.all_after('-')
		fraction = tail.all_before('-')
	}
	if has_z {
		fraction = fraction.replace('Z', '')
	}
	// Only fractions of a single char (or none at all) are expanded.
	if fraction.len > 1 {
		return
	}
	padding := '0'.repeat(4 - fraction.len)
	z_suffix := if has_z { 'Z' } else { '' }
	dt.text = date_time_part + '.' + fraction + padding + z_suffix + utc_offset
}
244

1	// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
2	// Use of this source code is governed by an MIT license
3	// that can be found in the LICENSE file.
4	module decoder
5
6	import toml.ast
7	import toml.ast.walker
8	import toml.token
9	import toml.scanner
10	import strconv
11
12	// utf8_max is the largest valid Unicode scalar value (inclusive upper bound).
13	const utf8_max = 0x10FFFF
14
// Decoder decodes special sequences in a tree of TOML `ast.Value`s.
pub struct Decoder {
pub:
	// scanner is used by `excerpt` to produce excerpts of the scanned text.
	// It defaults to `nil`.
	scanner &scanner.Scanner = unsafe { nil }
}
20
// decode decodes `n` and all of its children in place.
// The tree is traversed with `walker.walk_and_modify`, which is handed this
// decoder; any error it reports is returned to the caller.
pub fn (d Decoder) decode(mut n ast.Value) ! {
	walker.walk_and_modify(d, mut n) or { return err }
}
25
// modify decodes `value` in place when it is a quoted string, a number or
// a date-time. Every other kind of value is left untouched.
// Errors from the individual decoders are propagated to the caller.
fn (d Decoder) modify(mut value ast.Value) ! {
	if value is ast.Quoted {
		mut quoted := &(value as ast.Quoted)
		d.decode_quoted(mut quoted)!
	} else if value is ast.Number {
		mut number := &(value as ast.Number)
		d.decode_number(mut number)!
	} else if value is ast.DateTime {
		mut date_time := &(value as ast.DateTime)
		d.decode_date_time(mut date_time)!
	}
}
43
// excerpt returns a string of the surroundings of the token position `tp`,
// as produced by the decoder's scanner.
fn (d Decoder) excerpt(tp token.Pos) string {
	// Second argument handed to the scanner's `excerpt`; presumably the
	// amount of surrounding text to include - confirm against the scanner.
	span := 10
	return d.scanner.excerpt(tp.pos, span)
}
48
// decode_quoted decodes the quoted TOML string `q` in place by delegating
// to `decode_quoted_escapes`; an error from that call is returned as is.
fn (d Decoder) decode_quoted(mut q ast.Quoted) ! {
	decode_quoted_escapes(mut q) or { return err }
}
53
// decode_number normalizes the text of `n` in place:
// the signed spellings `-nan` and `+nan` are rewritten to plain `nan`.
// Every other number text is left unchanged.
fn (d Decoder) decode_number(mut n ast.Number) ! {
	if n.text in ['-nan', '+nan'] {
		n.text = 'nan'
	}
}
60
// decode_quoted_escapes decodes the escape sequences of a *basic* TOML string
// in place: `q.text` is replaced with the decoded text.
// Delimiters in TOML have significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
// "/""" delimits *basic* strings
// Literal strings are returned untouched.
// Allowed escapes in *basic* strings are:
// \b         - backspace       (U+0008)
// \t         - tab             (U+0009)
// \n         - linefeed        (U+000A)
// \f         - form feed       (U+000C)
// \r         - carriage return (U+000D)
// \"         - quote           (U+0022)
// \\         - backslash       (U+005C)
// \uXXXX     - Unicode         (U+XXXX)
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
// In multiline strings a backslash followed by whitespace swallows that
// whitespace and all whitespace following it.
// Unicode escapes that can not be decoded, or that do not denote a Unicode
// scalar value, are kept verbatim. Unknown escapes are kept verbatim too.
// The only error returned is one propagated from creating the scanner.
pub fn decode_quoted_escapes(mut q ast.Quoted) ! {
	// See https://toml.io/en/v1.0.0#string for more info on string types.
	is_basic := q.quote == `\"`
	if !is_basic {
		return
	}

	mut eat_whitespace := false
	// TODO: use string builder
	mut decoded_s := ''
	// Set up a scanner over the raw text for easier navigation.
	mut s := scanner.new_simple_text(q.text)!

	for ch := s.next(); ch != scanner.end_of_text; ch = s.next() {
		ch_byte := u8(ch)
		if eat_whitespace && ch_byte.is_space() {
			continue
		}
		eat_whitespace = false

		if ch == `\\` {
			// `at()` does not consume: the escaped character is still pending.
			ch_next := s.at()
			ch_next_byte := u8(ch_next)

			if q.is_multiline {
				if ch_next_byte.is_space() {
					eat_whitespace = true
					continue
				}
			}
			// Single-character escapes map directly onto their replacement.
			simple := match rune(ch_next) {
				`\\` { '\\' }
				`"` { '"' }
				`n` { '\n' }
				`t` { '\t' }
				`b` { '\b' }
				`r` { '\r' }
				`f` { '\f' }
				else { '' }
			}
			if simple != '' {
				decoded_s += simple
				// Consume the escaped character.
				s.next()
				continue
			}

			escape := ch_byte.ascii_str() + ch_next_byte.ascii_str()
			// Decode Unicode escapes: `\uXXXX` takes exactly 4 hex digits,
			// `\UXXXXXXXX` takes exactly 8.
			if escape.to_lower() == '\\u' {
				hex_digits := if escape == '\\U' { 8 } else { 4 }
				mut has_all_digits := true
				for i in 1 .. hex_digits + 1 {
					if !u8(s.peek(i)).is_hex_digit() {
						has_all_digits = false
						break
					}
				}
				if has_all_digits {
					pos := s.state().pos
					// The `u`/`U` marker plus exactly the digits verified above.
					// Gated slicing never panics, even at the end of the text.
					sequence := s.text#[pos..pos + hex_digits + 1]
					decoded, unicode_val, sequence_length := decode_unicode_escape(sequence) or {
						// Keep the undecodable escape verbatim. The `u`/`U` is
						// consumed so it is not emitted a second time.
						decoded_s += escape
						s.next()
						continue
					}
					// Only Unicode scalar values are valid: this excludes the
					// surrogate range and everything above `utf8_max`.
					is_scalar_value := (unicode_val >= 0x0000 && unicode_val <= 0xD7FF)
						|| (unicode_val >= 0xE000 && unicode_val <= utf8_max)
					if !is_scalar_value {
						decoded_s += escape
						s.next()
						continue
					}
					decoded_s += decoded
					// Skip the `u`/`U` marker and its hex digits - nothing more.
					s.skip_n(sequence_length + 1)
					continue
				}
			}
		}
		decoded_s += ch_byte.ascii_str()
	}
	q.text = decoded_s
}
196
// decode_unicode_escape decodes the Unicode escape sequence `esc_unicode`.
// The sequence is expected to be prefixed with either `u` (followed by
// 4 hex digits) or `U` (followed by 8 hex digits). Anything after the
// expected hex digits is ignored.
// decode_unicode_escape returns the decoded rune as a string, its integer
// value and the number of hex digits consumed (4 or 8).
// An error is returned if the sequence is too short or if the hex digits
// can not be parsed.
fn decode_unicode_escape(esc_unicode string) !(string, int, int) {
	is_long_esc_type := esc_unicode.starts_with('U')
	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
	// Reject truncated input up front instead of panicking on an
	// out-of-range slice below.
	if esc_unicode.len < hex_digits_len + 1 {
		return error('Unicode escape sequence `${esc_unicode}` is too short, expected ${hex_digits_len} hex digits')
	}
	// Drop the `u`/`U` prefix and keep only the hex digits.
	sequence := esc_unicode[1..hex_digits_len + 1]
	i64_val := strconv.parse_int(sequence, 16, 0)!
	rn := rune(i64_val)
	return '${rn}', int(i64_val), hex_digits_len
}
217
// decode_date_time normalizes the fractional part of `dt.text` in place.
// When the text contains a `.` and the fraction following it is at most one
// character long, the fraction is right-padded with zeros to 4 characters.
// A `Z` suffix or a `+`/`-` offset after the fraction is preserved.
// Texts without a `.` and texts with longer fractions are left untouched.
fn (d Decoder) decode_date_time(mut dt ast.DateTime) ! {
	if !dt.text.contains('.') {
		return
	}
	date_time_part := dt.text.all_before('.')
	tail := dt.text.all_after('.')
	has_z := tail.contains('Z')

	// Split a trailing `+...` / `-...` offset from the fraction.
	mut fraction := tail
	mut utc_offset := ''
	if tail.contains('+') {
		utc_offset = '+' + tail.all_after('+')
		fraction = tail.all_before('+')
	} else if tail.contains('-') {
		utc_offset = '-' + tail.all_after('-')
		fraction = tail.all_before('-')
	}
	if has_z {
		fraction = fraction.replace('Z', '')
	}
	// Only fractions of a single char (or none at all) are expanded.
	if fraction.len > 1 {
		return
	}
	padding := '0'.repeat(4 - fraction.len)
	z_suffix := if has_z { 'Z' } else { '' }
	dt.text = date_time_part + '.' + fraction + padding + z_suffix + utc_offset
}
244