v2 / vlib / toml / decoder / decoder.v
243 lines · 224 sloc · 6.67 KB · 8672f831461dec8ec646927d6a63a727afcacea8
Raw
1// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module decoder
5
6import toml.ast
7import toml.ast.walker
8import toml.token
9import toml.scanner
10import strconv
11
// utf8_max is the largest value (inclusive) of the Unicode scalar value range.
const utf8_max = 0x10FFFF
14
// Decoder decodes special sequences in a tree of TOML `ast.Value`s.
pub struct Decoder {
pub:
	// scanner is used by `excerpt` to fetch the text surrounding a token position.
	// It defaults to `nil`.
	scanner &scanner.Scanner = unsafe { nil }
}
20
// decode decodes certain `ast.Value`s and all of its children.
// It hands the decoder and `n` to `walker.walk_and_modify` and propagates any
// error that call returns.
pub fn (d Decoder) decode(mut n ast.Value) ! {
	walker.walk_and_modify(d, mut n)!
}
25
// modify decodes `value` in place when it is an `ast.Quoted`, `ast.Number` or
// `ast.DateTime`; every other kind of value is left untouched.
// Errors from the individual decode functions are propagated to the caller.
// NOTE(review): presumably this is the per-node callback that
// `walker.walk_and_modify` invokes - confirm against `toml.ast.walker`.
fn (d Decoder) modify(mut value ast.Value) ! {
	match value {
		ast.Quoted {
			// Take a reference to the concrete variant so it can be changed in place.
			mut v := &(value as ast.Quoted)
			d.decode_quoted(mut v)!
		}
		ast.Number {
			mut v := &(value as ast.Number)
			d.decode_number(mut v)!
		}
		ast.DateTime {
			mut v := &(value as ast.DateTime)
			d.decode_date_time(mut v)!
		}
		else {}
	}
}
43
// excerpt returns a string of the token's surroundings.
// It forwards the position `tp.pos` and a fixed margin of `10` to the
// excerpt function of the decoder's scanner.
fn (d Decoder) excerpt(tp token.Pos) string {
	return d.scanner.excerpt(tp.pos, 10)
}
48
// decode_quoted decodes the escape sequences of `q` in place by calling
// `decode_quoted_escapes`, and propagates any error it returns.
fn (d Decoder) decode_quoted(mut q ast.Quoted) ! {
	decode_quoted_escapes(mut q)!
}
53
// decode_number normalizes the text of `n`: a signed not-a-number
// (`-nan` or `+nan`) is rewritten to plain `nan`. All other numbers are
// left as they are.
fn (d Decoder) decode_number(mut n ast.Number) ! {
	if n.text in ['-nan', '+nan'] {
		n.text = 'nan'
	}
}
60
// decode_quoted_escapes decodes the escape sequences of `q` in place.
// Delimiters in TOML have significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
// "/""" delimits *basic* strings
// Only *basic* strings are processed, literal strings are returned untouched.
// Allowed escapes in *basic* strings are:
// \b         - backspace       (U+0008)
// \t         - tab             (U+0009)
// \n         - linefeed        (U+000A)
// \f         - form feed       (U+000C)
// \r         - carriage return (U+000D)
// \"         - quote           (U+0022)
// \\         - backslash       (U+005C)
// \uXXXX     - Unicode         (U+XXXX)
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
// In multi-line strings a backslash followed by whitespace additionally
// removes that whitespace up to the next non-whitespace character.
// Sequences that are not valid escapes (unknown escapes, too few hex digits,
// values outside the Unicode scalar value ranges) are kept verbatim.
// An error is only returned if the scanner for `q.text` can not be created.
pub fn decode_quoted_escapes(mut q ast.Quoted) ! {
	// See https://toml.io/en/v1.0.0#string for more info on string types.
	is_basic := q.quote == `\"`
	if !is_basic {
		return
	}

	// Set while skipping the whitespace that follows a line ending backslash.
	mut eat_whitespace := false
	// TODO: use string builder
	mut decoded_s := ''
	// Use a scanner over the raw text for easier navigation.
	mut s := scanner.new_simple_text(q.text)!

	for ch := s.next(); ch != scanner.end_of_text; ch = s.next() {
		ch_byte := u8(ch)
		if eat_whitespace && ch_byte.is_space() {
			continue
		}
		eat_whitespace = false

		if ch == `\\` {
			// `at` looks at the character following the backslash without consuming it.
			ch_next := s.at()
			ch_next_byte := u8(ch_next)

			if q.is_multiline && ch_next_byte.is_space() {
				eat_whitespace = true
				continue
			}

			// Single character escapes map directly to their replacement.
			simple_escape := match rune(ch_next) {
				`\\` { '\\' }
				`"` { '"' }
				`n` { '\n' }
				`t` { '\t' }
				`b` { '\b' }
				`r` { '\r' }
				`f` { '\f' }
				else { '' }
			}
			if simple_escape != '' {
				decoded_s += simple_escape
				s.next()
				continue
			}

			escape := ch_byte.ascii_str() + ch_next_byte.ascii_str()
			// Decode unicode escapes
			if escape.to_lower() == '\\u' {
				is_valid_short := u8(s.peek(1)).is_hex_digit() && u8(s.peek(2)).is_hex_digit()
					&& u8(s.peek(3)).is_hex_digit() && u8(s.peek(4)).is_hex_digit()
				is_valid_long := is_valid_short && u8(s.peek(5)).is_hex_digit()
					&& u8(s.peek(6)).is_hex_digit() && u8(s.peek(7)).is_hex_digit()
					&& u8(s.peek(8)).is_hex_digit()
				// The marker decides how many hex digits belong to the escape:
				// `\UXXXXXXXX` needs 8 of them, `\uXXXX` needs 4. Hex digits that
				// merely follow a short escape are ordinary text.
				is_long_escape := ch_next_byte == `U`
				has_all_digits := if is_long_escape { is_valid_long } else { is_valid_short }
				if has_all_digits {
					hex_len := if is_long_escape { 8 } else { 4 }
					// `pos` is the index of the `u`/`U` marker. The slice is gated (`#[`)
					// so it can never panic, even at the very end of the text.
					pos := s.state().pos
					sequence := s.text#[pos..pos + hex_len + 1]
					mut decoded := ''
					mut sequence_length := 0
					mut unicode_val := 0
					decoded, unicode_val, sequence_length = decode_unicode_escape(sequence) or {
						// Keep the invalid escape verbatim. The marker is consumed here
						// so the next iteration does not emit it a second time.
						decoded_s += escape
						s.next()
						continue
					}
					// Check if the Unicode value is actually in the valid Unicode scalar value ranges.
					is_scalar_value := (unicode_val >= 0x0000 && unicode_val <= 0xD7FF)
						|| (unicode_val >= 0xE000 && unicode_val <= utf8_max)
					if !is_scalar_value {
						decoded_s += escape
						s.next()
						continue
					}
					decoded_s += decoded
					// Skip the marker and its hex digits only; text after the escape
					// is handled by the following iterations.
					s.skip_n(sequence_length + 1)
					continue
				}
			}
		}
		decoded_s += ch_byte.ascii_str()
	}
	q.text = decoded_s
}
196
// decode_unicode_escape decodes the Unicode escape sequence `esc_unicode`.
// The sequence is expected to be prefixed with either `u` or `U`, followed by
// 4 (`u`) or 8 (`U`) hexadecimal digits. Anything after those digits is ignored.
// decode_unicode_escape returns the decoded rune as a string, its integer
// value and the number of hex digits that were decoded.
// An error is returned if `esc_unicode` is too short to hold the prefix and
// all expected digits, or if the digits can not be parsed as a hexadecimal number.
fn decode_unicode_escape(esc_unicode string) !(string, int, int) {
	is_long_esc_type := esc_unicode.starts_with('U')
	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
	// Guard the slicing below: slicing past the end of a string panics, while
	// an error can be handled by the caller.
	if esc_unicode.len < hex_digits_len + 1 {
		return error('decoder: Unicode escape "${esc_unicode}" is too short, expected ${hex_digits_len} hex digits after the prefix')
	}
	// Drop the `u`/`U` prefix and keep exactly the expected number of digits.
	sequence := esc_unicode[1..hex_digits_len + 1]

	// Left pad short sequences with zeros so both escape types are parsed as 8 digits.
	mut unicode_point := sequence
	if unicode_point.len < 8 {
		unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
	}
	i64_val := strconv.parse_int(unicode_point, 16, 0)!
	rn := rune(i64_val)
	return '${rn}', int(i64_val), hex_digits_len
}
217
// decode_date_time normalizes the fractional seconds of `dt` in place.
// Texts without a `.` are left untouched, as are texts whose fraction is
// longer than one character. Otherwise the fraction is right padded with
// zeros to 4 characters, keeping a `Z` suffix or a `+`/`-` offset.
fn (d Decoder) decode_date_time(mut dt ast.DateTime) ! {
	if !dt.text.contains('.') {
		return
	}
	head := dt.text.all_before('.')
	tail := dt.text.all_after('.')
	has_zulu := tail.contains('Z')

	// Split an offset, if there is one, from the fraction.
	mut fraction := tail
	mut tz_offset := ''
	if tail.contains('+') {
		tz_offset = '+' + tail.all_after('+')
		fraction = tail.all_before('+')
	} else if tail.contains('-') {
		tz_offset = '-' + tail.all_after('-')
		fraction = tail.all_before('-')
	}
	if has_zulu {
		fraction = fraction.replace('Z', '')
	}
	// Only expand fractions that are at most 1 char.
	if fraction.len > 1 {
		return
	}
	zulu := if has_zulu { 'Z' } else { '' }
	padding := '0'.repeat(4 - fraction.len)
	dt.text = head + '.' + fraction + padding + zulu + tz_offset
}
244