| 1 | // Copyright (c) 2021 Lars Pontoppidan. All rights reserved. |
| 2 | // Use of this source code is governed by an MIT license |
| 3 | // that can be found in the LICENSE file. |
| 4 | module decoder |
| 5 | |
| 6 | import toml.ast |
| 7 | import toml.ast.walker |
| 8 | import toml.token |
| 9 | import toml.scanner |
| 10 | import strconv |
| 11 | |
// utf8_max is the largest (inclusive) value of the Unicode scalar value ranges.
// It is the upper bound that decoded `\u`/`\U` escape values are checked against.
const utf8_max = 0x10FFFF
| 14 | |
// Decoder decodes special sequences in a tree of TOML `ast.Value`s.
pub struct Decoder {
pub:
	// scanner is read by `excerpt` to produce text excerpts.
	// It defaults to nil, so set it before relying on `excerpt`.
	scanner &scanner.Scanner = unsafe { nil }
}
| 20 | |
// decode decodes certain `ast.Value`s and all of their children.
// It hands the decoder and `n` to `walker.walk_and_modify` and propagates
// any error that call returns.
// NOTE(review): presumably the walker calls `modify` on every node - confirm in `toml.ast.walker`.
pub fn (d Decoder) decode(mut n ast.Value) ! {
	walker.walk_and_modify(d, mut n)!
}
| 25 | |
// modify decodes `value` when it is a quoted string, a number or a date-time.
// Every other kind of `ast.Value` is left untouched.
// Errors returned by the individual decoders are propagated to the caller.
fn (d Decoder) modify(mut value ast.Value) ! {
	match value {
		ast.Quoted {
			// Take a reference to the variant; presumably so the decoded text
			// ends up in `value` itself rather than in a copy - TODO confirm.
			mut v := &(value as ast.Quoted)
			d.decode_quoted(mut v)!
		}
		ast.Number {
			mut v := &(value as ast.Number)
			d.decode_number(mut v)!
		}
		ast.DateTime {
			mut v := &(value as ast.DateTime)
			d.decode_date_time(mut v)!
		}
		// Nothing to decode for the remaining value kinds.
		else {}
	}
}
| 43 | |
// excerpt asks the decoder's scanner for an excerpt of the text
// surrounding the token position `tp`.
fn (d Decoder) excerpt(tp token.Pos) string {
	surrounding := 10
	return d.scanner.excerpt(tp.pos, surrounding)
}
| 48 | |
// decode_quoted decodes the escape sequences of the quoted string `q`
// by delegating to `decode_quoted_escapes`, which rewrites `q.text`.
// Any error returned by `decode_quoted_escapes` is propagated.
fn (d Decoder) decode_quoted(mut q ast.Quoted) ! {
	decode_quoted_escapes(mut q)!
}
| 53 | |
// decode_number normalizes the text of `n`: the signed spellings
// `-nan` and `+nan` are both rewritten to plain `nan`.
// All other number texts are left as they are.
fn (d Decoder) decode_number(mut n ast.Number) ! {
	if n.text in ['-nan', '+nan'] {
		n.text = 'nan'
	}
}
| 60 | |
// decode_quoted_escapes decodes the escape sequences in `q.text` and stores
// the decoded result back into `q.text`.
// Delimiters in TOML have significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
// "/""" delimits *basic* strings
// Only *basic* strings are decoded; for any other quote `q` is left untouched.
// Escapes decoded in *basic* strings are:
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
// \uXXXX - Unicode (U+XXXX)
// \UXXXXXXXX - Unicode (U+XXXXXXXX)
// In multi-line strings a backslash followed by whitespace is dropped together
// with the whole whitespace run that follows it.
// Any other backslash sequence is copied through verbatim. That includes
// `\u`/`\U` with too few hex digits or with a value outside the Unicode
// scalar value ranges.
// The only error returned is one propagated from `scanner.new_simple_text`.
pub fn decode_quoted_escapes(mut q ast.Quoted) ! {
	// See https://toml.io/en/v1.0.0#string for more info on string types.
	is_basic := q.quote == `\"`
	if !is_basic {
		return
	}

	// Walk the raw text with a scanner for easier navigation and lookahead.
	mut s := scanner.new_simple_text(q.text)!
	mut eat_whitespace := false
	// TODO: use string builder
	mut decoded_s := ''

	for ch := s.next(); ch != scanner.end_of_text; ch = s.next() {
		ch_byte := u8(ch)
		if eat_whitespace && ch_byte.is_space() {
			continue
		}
		eat_whitespace = false

		if ch == `\\` {
			// `at` looks at the character after the backslash without consuming it.
			ch_next := s.at()
			ch_next_byte := u8(ch_next)

			// Backslash + whitespace in a multi-line string: drop the backslash
			// and swallow the whitespace run that follows.
			if q.is_multiline && ch_next_byte.is_space() {
				eat_whitespace = true
				continue
			}

			// Single character escapes; an empty result means "not one of these".
			unescaped := match rune(ch_next) {
				`\\`, `"` { ch_next_byte.ascii_str() }
				`n` { '\n' }
				`t` { '\t' }
				`b` { '\b' }
				`r` { '\r' }
				`f` { '\f' }
				else { '' }
			}
			if unescaped != '' {
				decoded_s += unescaped
				// Consume the escape character that was only peeked at so far.
				s.next()
				continue
			}

			// Decode unicode escapes
			if ch_next == `u` || ch_next == `U` {
				// The escape letter decides the length: `\uXXXX` takes exactly
				// 4 hex digits, `\UXXXXXXXX` exactly 8.
				hex_digits := if ch_next == `U` { 8 } else { 4 }
				mut all_hex := true
				for i in 1 .. hex_digits + 1 {
					if !u8(s.peek(i)).is_hex_digit() {
						all_hex = false
						break
					}
				}
				if all_hex {
					pos := s.state().pos
					// `u`/`U` plus its hex digits. The slice is in bounds because
					// every one of these characters was just peeked successfully.
					sequence := s.text[pos..pos + hex_digits + 1]
					decoded, unicode_val, sequence_length := decode_unicode_escape(sequence) or {
						// Not decodable: emit only the backslash. The `u`/`U` has not
						// been consumed, so the loop copies the rest through as-is.
						decoded_s += ch_byte.ascii_str()
						continue
					}
					// Only accept values in the valid Unicode scalar value ranges
					// (this excludes surrogates, negative and too large values).
					is_scalar := (unicode_val >= 0x0000 && unicode_val <= 0xD7FF)
						|| (unicode_val >= 0xE000 && unicode_val <= utf8_max)
					if is_scalar {
						decoded_s += decoded
						// Skip the `u`/`U` and the hex digits, nothing more.
						s.skip_n(sequence_length + 1)
						continue
					}
				}
			}
		}
		// Plain character, or the backslash of a sequence that is not decoded.
		decoded_s += ch_byte.ascii_str()
	}
	q.text = decoded_s
}
| 196 | |
// decode_unicode_escape decodes the Unicode escape sequence `esc_unicode`.
// The sequence is expected to be prefixed with either `u` (followed by
// 4 hex digits) or `U` (followed by 8 hex digits); anything after those
// digits is ignored.
// decode_unicode_escape returns the decoded rune as a string, its integer
// value and the number of hex digits that were consumed.
// It returns an error if `esc_unicode` is too short to hold the prefix and
// all of its hex digits, or if the digits can not be parsed as hexadecimal.
// The returned value is not range checked; callers validate it themselves.
fn decode_unicode_escape(esc_unicode string) !(string, int, int) {
	is_long_esc_type := esc_unicode.starts_with('U')
	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
	// Slicing past the end of a string panics, so report short input as an error instead.
	if esc_unicode.len < hex_digits_len + 1 {
		return error('unicode escape `${esc_unicode}` is too short, expected a prefix followed by ${hex_digits_len} hex digits')
	}
	// Drop the `u`/`U` prefix and keep exactly the hex digits.
	hex_digits := esc_unicode[1..hex_digits_len + 1]
	i64_val := strconv.parse_int(hex_digits, 16, 0)!
	rn := rune(i64_val)
	return '${rn}', int(i64_val), hex_digits_len
}
| 217 | |
// decode_date_time expands the fractional seconds of `dt` when they are
// at most one character long: the fraction is right-padded with `0` to a
// width of four. A `Z` marker and a `+`/`-` offset found after the `.` are
// preserved. Texts without a `.` or with a longer fraction are left unchanged.
fn (d Decoder) decode_date_time(mut dt ast.DateTime) ! {
	if !dt.text.contains('.') {
		return
	}
	head := dt.text.all_before('.')
	tail := dt.text.all_after('.')

	// Split a trailing offset off the fraction; `+` takes precedence over `-`.
	mut fraction := tail
	mut offset := ''
	for sign in ['+', '-'] {
		if tail.contains(sign) {
			offset = sign + tail.all_after(sign)
			fraction = tail.all_before(sign)
			break
		}
	}

	has_z := tail.contains('Z')
	if has_z {
		fraction = fraction.replace('Z', '')
	}
	// Only fractions of at most one character get expanded.
	if fraction.len > 1 {
		return
	}
	z_suffix := if has_z { 'Z' } else { '' }
	dt.text = head + '.' + fraction + '0'.repeat(4 - fraction.len) + z_suffix + offset
}
| 244 | |