| 1 | // Copyright (c) 2021 Lars Pontoppidan. All rights reserved. |
| 2 | // Use of this source code is governed by an MIT license |
| 3 | // that can be found in the LICENSE file. |
| 4 | module checker |
| 5 | |
| 6 | import toml.ast |
| 7 | import toml.ast.walker |
| 8 | import toml.util |
| 9 | import toml.token |
| 10 | import toml.scanner |
| 11 | import encoding.utf8 |
| 12 | import time |
| 13 | import strconv |
| 14 | |
// allowed_basic_escape_chars lists the characters that may directly follow
// a backslash (`\`) in a TOML *basic* string.
pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`]
| 16 | |
// utf8_max is the largest (inclusive) value accepted as a Unicode code point
// when validating escaped Unicode sequences.
const utf8_max = 0x10FFFF
| 19 | |
// allowed_basic_escape_char_list returns, as bytes, the characters that are
// accepted directly after a backslash in a basic string: `u`, `U`, `b`, `t`,
// `n`, `f`, `r`, `"` and `\`.
fn allowed_basic_escape_char_list() []u8 {
	return 'uUbtnfr"\\'.bytes()
}
| 23 | |
// toml_parse_time parses `s` with `time.parse_rfc3339`, after first completing
// partial inputs:
// - a string longer than 3 bytes with `:` as its third byte is treated as a
//   time-only value and is prefixed with the arbitrary date `0001-01-01T`;
// - a string of exactly 10 bytes is treated as a date-only value and is
//   suffixed with a zero time and zero offset (`T00:00:00Z`);
// - anything else is parsed as-is.
fn toml_parse_time(s string) !time.Time {
	looks_like_time := s.len > 3 && s[2] == `:`
	completed := if looks_like_time {
		'0001-01-01T' + s
	} else if s.len == 10 {
		s + 'T00:00:00Z'
	} else {
		s
	}
	return time.parse_rfc3339(completed)
}
| 35 | |
// Checker checks a tree of TOML `ast.Value`'s for common errors.
pub struct Checker {
pub:
	// scanner is queried for source excerpts that are embedded in error
	// messages; it defaults to a nil reference.
	scanner &scanner.Scanner = unsafe { nil }
}
| 41 | |
// check walks the `ast.Value` `n` and all of its children with `c` as the
// visitor, propagating any error produced during the walk.
pub fn (c &Checker) check(n &ast.Value) ! {
	walker.walk(c, n) or { return err }
}
| 47 | |
// visit dispatches `value` to the check that matches its concrete type.
// Values of any other type are accepted without checks.
fn (c &Checker) visit(value &ast.Value) ! {
	if value is ast.Bool {
		c.check_boolean(value)!
	} else if value is ast.Number {
		c.check_number(value)!
	} else if value is ast.Quoted {
		c.check_quoted(value)!
	} else if value is ast.DateTime {
		c.check_date_time(value)!
	} else if value is ast.Date {
		c.check_date(value)!
	} else if value is ast.Time {
		c.check_time(value)!
	}
}
| 71 | |
// excerpt returns a string of the surroundings of the token at `tp`,
// as produced by the checker's scanner with a margin argument of 10.
fn (c &Checker) excerpt(tp token.Pos) string {
	offset := tp.pos
	return c.scanner.excerpt(offset, 10)
}
| 76 | |
// is_hex_bin_oct_prefixed returns true if `hbo` is longer than two bytes
// and starts with one of the lowercase prefixes `0x`, `0o` or `0b`.
// Example: assert is_hex_bin_oct_prefixed('0xAF') == true
// Example: assert is_hex_bin_oct_prefixed('xAF') == false
fn is_hex_bin_oct_prefixed(hbo string) bool {
	if hbo.len <= 2 || hbo[0] != `0` {
		return false
	}
	return match hbo[1] {
		`x`, `o`, `b` { true }
		else { false }
	}
}
| 84 | |
// has_repeating returns true if `str` contains two identical adjacent
// bytes whose value is one of the characters listed in `repeats`.
// Example: assert has_repeating('hello__v.', [`.`,`_`]) == true
// Example: assert has_repeating('hello_v.', [`.`,`_`]) == false
fn has_repeating(str string, repeats []rune) bool {
	for next := 1; next < str.len; next++ {
		current := str[next - 1]
		if current in repeats && current == str[next] {
			return true
		}
	}
	return false
}
| 99 | |
// check_number returns an error if `num` is not a valid TOML number.
// The checks run in a fixed order and the first failing one determines the
// returned error: underscore placement, sign / leading-zero rules, repeated
// special characters, hex/octal/binary digit validation, exponent rules and
// finally float specific rules.
fn (c &Checker) check_number(num ast.Number) ! {
	lit := num.text
	lit_lower_case := lit.to_lower()
	// These spellings of zero are accepted without any further checks.
	if lit in ['0', '0.0', '+0', '-0', '+0.0', '-0.0', '0e0', '+0e0', '-0e0', '0e00'] {
		return
	}

	// Underscores may neither lead, trail nor be doubled.
	if lit.contains('_') {
		if lit.starts_with('_') || lit.ends_with('_') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not start or end with `_` in ...${c.excerpt(num.pos)}...')
		}
		if lit.contains('__') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not have more than one underscore (`_`) in ...${c.excerpt(num.pos)}...')
		}
	}

	// Classify the literal. `is_float` only looks for a `.` in the part
	// before any (case-insensitive) `e`.
	mut hex_bin_oct := is_hex_bin_oct_prefixed(lit)
	mut is_bin, mut is_oct, mut is_hex := false, false, false
	is_float := lit_lower_case.all_before('e').contains('.')
	has_exponent_notation := lit_lower_case.contains('e')
	float_decimal_index := lit.index_('.')
	// mut is_first_digit := u8(lit[0]).is_digit()
	mut ascii := u8(lit[0]).ascii_str()
	is_sign_prefixed := lit[0] in [`+`, `-`]
	mut lit_sans_sign := lit
	if is_sign_prefixed { // +/- ...
		lit_sans_sign = lit[1..]
		// Re-evaluate the prefix test on the unsigned part:
		// a signed hex, octal or binary literal is rejected.
		hex_bin_oct = is_hex_bin_oct_prefixed(lit_sans_sign)
		if hex_bin_oct {
			ascii = u8(lit[0]).ascii_str()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (hex, octal and binary) can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
		}
		// A signed literal may only start with `0` when it continues with `0.`.
		if lit.len > 1 && lit_sans_sign.starts_with('0') && !lit_sans_sign.starts_with('0.') {
			ascii = u8(lit_sans_sign[0]).ascii_str()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
		}
	} else {
		if !hex_bin_oct {
			// Unsigned, non-prefixed integer starting with `0`: report an
			// upper-case `0B`/`0O`/`0X` prefix specifically, otherwise
			// report the leading zero.
			if !is_float && lit[0] == `0` {
				if lit[1] in [`B`, `O`, `X`] {
					return error(@MOD + '.' + @STRUCT + '.' + @FN +
						' numbers like "${lit}" only lowercase notation in ...${c.excerpt(num.pos)}...')
				}
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' numbers like "${lit}" can not start with a zero in ...${c.excerpt(num.pos)}...')
			}

			// Float starting with `0` whose decimal point is not directly
			// after that zero (e.g. index 2 or later).
			if is_float && lit[0] == `0` && float_decimal_index > 1 {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' numbers like "${lit}" can not start with a zero in ...${c.excerpt(num.pos)}...')
			}
		}
	}

	// None of `_`, `.`, `b`, `o`, `x` may appear twice in a row.
	if has_repeating(lit, [`_`, `.`, `b`, `o`, `x`]) {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' numbers like "${lit}" can not have ${scanner.digit_extras} as repeating characters in ...${c.excerpt(num.pos)}...')
	}

	// Validate the digits of `0b` / `0o` / `0x` prefixed literals.
	if hex_bin_oct {
		is_bin = lit_sans_sign.starts_with('0b')
		is_oct = lit_sans_sign.starts_with('0o')
		is_hex = lit_sans_sign.starts_with('0x')

		lit_sans_sign_and_type_prefix := lit_sans_sign[2..]

		// The digits following the type prefix may not start or end with `_` either.
		if lit_sans_sign_and_type_prefix.starts_with('_')
			|| lit_sans_sign_and_type_prefix.ends_with('_') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not start or end with `_` in ...${c.excerpt(num.pos)}...')
		}

		if is_bin {
			if !c.is_valid_binary_literal(lit_sans_sign_and_type_prefix) {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' "${lit}" is not a valid binary number in ...${c.excerpt(num.pos)}...')
			}
		} else if is_oct {
			if !c.is_valid_octal_literal(lit_sans_sign_and_type_prefix) {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' "${lit}" is not a valid octal number in ...${c.excerpt(num.pos)}...')
			}
		} else {
			if !c.is_valid_hex_literal(lit_sans_sign_and_type_prefix) {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' "${lit}" is not a valid hexadecimal number in ...${c.excerpt(num.pos)}...')
			}
		}
	}

	// Exponent rules. NOTE(review): these run for any literal containing an
	// `e`/`E`, including hex literals, where `e` can also be a digit.
	if has_exponent_notation {
		if lit_lower_case.all_after('e').starts_with('_')
			|| lit_lower_case.all_before('e').ends_with('_') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' the exponent in "${lit}" can not start nor end with an underscore in ...${c.excerpt(num.pos)}...')
		}
		if lit_lower_case.all_after('e').contains('.') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (with exponent) can not have a decimal point in ...${c.excerpt(num.pos)}...')
		}
		// Hex literals are exempt from the single-`e` rule,
		// presumably because `e` is also a hexadecimal digit.
		if !is_hex && lit_lower_case.count('e') > 1 {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (with exponent) can only have one exponent in ...${c.excerpt(num.pos)}...')
		}
	}

	if is_float {
		// Float specific rules.
		if lit.count('.') > 1 {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (float) can only have one decimal point in ...${c.excerpt(num.pos)}...')
		}
		// The last byte may not be one of the scanner's `digit_extras`.
		last := lit[lit.len - 1]
		if last in scanner.digit_extras {
			ascii = u8(last).ascii_str()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (float) can not end with `${ascii}` in ...${c.excerpt(num.pos)}...')
		}
		if lit.contains('_.') || lit.contains('._') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (float) can not have underscores before or after the decimal point in ...${c.excerpt(num.pos)}...')
		}
		// NOTE(review): the `.e` test uses `lit`, not `lit_lower_case`,
		// so an upper-case `.E` is not matched by it.
		if lit_lower_case.contains('e.') || lit.contains('.e') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (float) can not have decimal points on either side of the exponent notation in ...${c.excerpt(num.pos)}...')
		}
		// Check if it contains other chars than the allowed
		for r in lit {
			if r !in [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `.`, `e`, `E`, `-`, `+`,
				`_`] {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' numbers like "${lit}" (float) can not contain `${u8(r).ascii_str()}` in ...${c.excerpt(num.pos)}...')
			}
		}
	} else {
		// Non-float literal starting with `0` that is not followed by a
		// lowercase `b`, `o` or `x` type prefix character.
		if lit.len > 1 && lit.starts_with('0') && lit[1] !in [`b`, `o`, `x`] {
			ascii = u8(lit[0]).ascii_str()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
		}
	}
}
| 246 | |
// is_valid_binary_literal returns true if every byte of `num` is `0`, `1`
// or the digit separator `_`. `num` is expected without its `0b` prefix.
fn (c &Checker) is_valid_binary_literal(num string) bool {
	for i := 0; i < num.len; i++ {
		digit := num[i]
		if digit != `_` && (digit < `0` || digit > `1`) {
			return false
		}
	}
	return true
}
| 259 | |
// is_valid_octal_literal returns true if every byte of `num` is in the
// range `0`..`7` or is the digit separator `_`. `num` is expected without
// its `0o` prefix.
fn (c &Checker) is_valid_octal_literal(num string) bool {
	for i := 0; i < num.len; i++ {
		digit := num[i]
		if digit != `_` && (digit < `0` || digit > `7`) {
			return false
		}
	}
	return true
}
| 272 | |
// is_valid_hex_literal returns true if every byte of `num` is a hexadecimal
// digit or the digit separator `_`. `num` is expected without its `0x` prefix.
fn (c &Checker) is_valid_hex_literal(num string) bool {
	for i := 0; i < num.len; i++ {
		digit := num[i]
		if digit != `_` && !digit.is_hex_digit() {
			return false
		}
	}
	return true
}
| 285 | |
// check_boolean returns an error unless the text of `b` is exactly
// `true` or `false`.
fn (c &Checker) check_boolean(b ast.Bool) ! {
	lit := b.text
	if lit != 'true' && lit != 'false' {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' boolean values like "${lit}" can only be `true` or `false` literals, not `${lit}` in ...${c.excerpt(b.pos)}...')
	}
}
| 295 | |
// check_date_time returns an error if `dt` is not a valid TOML date-time string (RFC 3339).
// The literal is split into a date and a time part, each part is validated
// with `check_date` / `check_time`, and finally the literal as a whole is
// parsed as an RFC 3339 timestamp.
// See also https://ijmacd.github.io/rfc3339-iso8601 for a more
// visual representation of the RFC 3339 format.
fn (c &Checker) check_date_time(dt ast.DateTime) ! {
	lit := dt.text
	// RFC 3339 Date-Times can be split via 4 separators (` `, `_`, `T` and `t`).
	// The first separator found, in that order of preference, is used.
	separator := if lit.contains(' ') {
		' '
	} else if lit.contains('_') {
		'_'
	} else if lit.contains('T') {
		'T'
	} else if lit.contains('t') {
		't'
	} else {
		''
	}
	if separator.len == 0 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Date-Time format string in ...${c.excerpt(dt.pos)}...')
	}
	// Exactly one separator must be present, yielding one date and one time part.
	halves := lit.split(separator)
	if halves.len != 2 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" contains too many date/time separators in ...${c.excerpt(dt.pos)}...')
	}
	date_text := halves[0]
	time_text := halves[1]
	// Re-use the date and time validation code for detailed testing of each part.
	date_pos := token.Pos{
		len:     date_text.len
		line_nr: dt.pos.line_nr
		pos:     dt.pos.pos
		col:     dt.pos.col
	}
	c.check_date(ast.Date{
		text: date_text
		pos:  date_pos
	})!
	time_pos := token.Pos{
		len:     time_text.len
		line_nr: dt.pos.line_nr
		pos:     dt.pos.pos + date_text.len
		col:     dt.pos.col + date_text.len
	}
	c.check_time(ast.Time{
		text: time_text
		pos:  time_pos
	})!
	// TOML supports local date-times (without an offset) while RFC 3339 does
	// not, so a `Z` is appended when no offset marker is found from byte 19 on.
	has_time_offset := lit#[19..].contains_any('-+Z')
	lit_with_offset := if has_time_offset { lit } else { lit + 'Z' }

	toml_parse_time(lit_with_offset) or {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Date-Time format string "${err}". In ...${c.excerpt(dt.pos)}...')
	}
}
| 360 | |
// check_date returns an error if `date` is not a valid TOML date string (RFC 3339).
// The literal must have the shape `YYYY-MM-DD`; February is additionally
// checked against the leap year rules before the literal is parsed.
fn (c &Checker) check_date(date ast.Date) ! {
	lit := date.text
	fields := lit.split('-')
	if fields.len != 3 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Date format string in ...${c.excerpt(date.pos)}...')
	}
	yyyy := fields[0]
	mm := fields[1]
	dd := fields[2]
	// Field widths are checked in year, month, day order.
	if yyyy.len != 4 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" does not have a valid RFC 3339 year indication in ...${c.excerpt(date.pos)}...')
	}
	if mm.len != 2 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" does not have a valid RFC 3339 month indication in ...${c.excerpt(date.pos)}...')
	}
	if dd.len != 2 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" does not have a valid RFC 3339 day indication in ...${c.excerpt(date.pos)}...')
	}
	if mm.int() == 2 {
		day := dd.int()
		if day > 29 {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${lit}" is not a valid RFC 3339 date: February can not have more that 28 or 29 days in it ...${c.excerpt(date.pos)}...')
		}
		if day == 29 {
			// The 29th of February only exists in leap years.
			year := yyyy.int()
			is_leap_year := year % 4 == 0 && (year % 100 != 0 || year % 400 == 0)
			if !is_leap_year {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' "${lit}" is not a valid RFC 3339 date: ${yyyy} is not a leap year so February can not have 29 days in it ...${c.excerpt(date.pos)}...')
			}
		}
	}
	toml_parse_time(lit) or {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Date format string "${err}". In ...${c.excerpt(date.pos)}...')
	}
}
| 404 | |
// check_time returns an error if `t` is not a valid TOML time string (RFC 3339).
// The literal is split on `+` (or otherwise `-`) into the time itself and an
// optional offset. The time must be `HH:MM:SS` (optionally followed by `Z`
// and/or a fraction), the offset must be `HH:MM` with the hours in 00..24 and
// the minutes in 00..59. Finally the time part is parsed as RFC 3339.
fn (c &Checker) check_time(t ast.Time) ! {
	lit := t.text
	// Split any offsets from the time
	offset_splitter := if lit.contains('+') { '+' } else { '-' }
	parts := lit.split(offset_splitter)
	// Fractional seconds are ignored for the length check.
	hhmmss := parts[0].all_before('.')
	// Check for 2 digits in all fields
	mut check_length := 8
	if hhmmss.to_upper().ends_with('Z') {
		check_length++
	}
	if hhmmss.len != check_length {
		starts_with_zero := hhmmss.starts_with('0')
		if !starts_with_zero {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${lit}" must be zero prefixed in ...${c.excerpt(t.pos)}...')
		}
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Time format string in ...${c.excerpt(t.pos)}...')
	}

	if parts.len > 1 {
		// Offset
		offset_parts := parts[1].split(':')
		if offset_parts.len != 2 {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${parts[1]}" is not a valid RFC 3339 time offset specifier in ...${c.excerpt(t.pos)}...')
		}
		hh := offset_parts[0].int()
		if hh < 0 || hh > 24 {
			// Point the excerpt at the offset, which follows the time part.
			pos := token.Pos{
				...t.pos
				pos: t.pos.pos + check_length
			}
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${hh}" hour specifier in "${parts[1]}" should be between 00 and 24 in ...${c.excerpt(pos)}...')
		}
		mm := offset_parts[1].int()
		if mm < 0 || mm > 59 {
			pos := token.Pos{
				...t.pos
				pos: t.pos.pos + check_length
			}
			// `mm` is the minute field of the offset, so it is reported as such.
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${mm}" minute specifier in "${parts[1]}" should be between 00 and 59 in ...${c.excerpt(pos)}...')
		}
	}
	// Simulate a time offset if it is missing, so the time can be parsed:
	// TOML supports local times, RFC 3339 does not.
	mut has_time_offset := false
	for ch in parts[0]#[8..] {
		if ch in [u8(`-`), `+`, `Z`] {
			has_time_offset = true
			break
		}
	}

	mut part_with_offset := parts[0]
	if !has_time_offset {
		part_with_offset += 'Z'
	}

	toml_parse_time(part_with_offset) or {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Time format string "${err}". In ...${c.excerpt(t.pos)}...')
	}
}
| 472 | |
// check_quoted returns an error if `q` is not a valid quoted TOML string.
// A multi-line string whose text still ends with an unescaped triple quote
// is rejected; after that the escape sequences and the UTF-8 encoding of
// the text are checked.
pub fn (c &Checker) check_quoted(q ast.Quoted) ! {
	lit := q.text
	quote := q.quote.ascii_str()
	if q.is_multiline {
		closing := quote.repeat(3)
		ends_with_closing := lit.ends_with(closing)
		ends_with_escaped_closing := lit.ends_with('\\' + closing)
		if ends_with_closing && !ends_with_escaped_closing {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' string values like "${lit}" has unbalanced quote literals `${quote}` in ...${c.excerpt(q.pos)}...')
		}
	}
	c.check_quoted_escapes(q)!
	c.check_utf8_validity(q)!
}
| 485 | |
// check_quoted_escapes returns an error for any disallowed escape sequences.
// Delimiters in TOML has significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
// "/""" delimits *basic* strings
// Allowed escapes in *basic* strings are:
// \b         - backspace       (U+0008)
// \t         - tab             (U+0009)
// \n         - linefeed        (U+000A)
// \f         - form feed       (U+000C)
// \r         - carriage return (U+000D)
// \"         - quote           (U+0022)
// \\         - backslash       (U+005C)
// \uXXXX     - Unicode         (U+XXXX)
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
// In multi-line basic strings a backslash may additionally be followed by
// whitespace (line continuation). Escapes are only validated for basic
// strings; in other strings only `\\` pairs are skipped.
fn (c &Checker) check_quoted_escapes(q ast.Quoted) ! {
	// Setup a scanner in stack memory for easier navigation.
	mut s := scanner.new_simple_text(q.text)!

	// See https://toml.io/en/v1.0.0#string for more info on string types.
	is_basic := q.quote == `\"`
	contains_newlines := q.text.contains('\n')
	for {
		ch := s.next()
		if ch == scanner.end_of_text {
			break
		}
		ch_byte := u8(ch)
		if ch == `\\` {
			next_ch := u8(s.at())

			// An escaped backslash: consume both characters.
			if next_ch == `\\` {
				s.next()
				continue
			}

			escape := ch_byte.ascii_str() + next_ch.ascii_str()
			if is_basic {
				if q.is_multiline {
					if next_ch == ` ` {
						if !contains_newlines {
							st := s.state()
							return error(@MOD + '.' + @STRUCT + '.' + @FN +
								' can not escape whitespaces in multi-line strings (`\\ `) at `${escape}` (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
						}
						// Rest of line must only be space chars from this point on
						for {
							ch_ := s.next()
							if ch_ == `\n` {
								break
							}
							if !(ch_ == ` ` || ch_ == `\t`) {
								st := s.state()
								return error(@MOD + '.' + @STRUCT + '.' + @FN +
									' invalid character `${u8(ch_).ascii_str()}` after `${escape}` at (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
							}
						}
					}
					// Backslash followed by whitespace is accepted in multi-line strings.
					if next_ch in [`\t`, `\r`, `\n`, ` `] {
						s.next()
						continue
					}
				}
				if next_ch !in allowed_basic_escape_char_list() {
					st := s.state()
					return error(@MOD + '.' + @STRUCT + '.' + @FN +
						' unknown basic string escape character `${next_ch.ascii_str()}` in `${escape}` (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
				}
			}
			// Check Unicode escapes
			if is_basic && escape.to_lower() == '\\u' {
				// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters.
				// Pass in at most 11 characters starting at the `u`/`U`. The gated
				// slice (`#[..]`) clamps the end to the end of the text, so a
				// sequence close to the end of the string can never make the
				// slice bounds exceed the text.
				pos := s.state().pos
				c.check_unicode_escape(s.text#[pos..pos + 11]) or {
					st := s.state()
					return error(@MOD + '.' + @STRUCT + '.' + @FN +
						' escaped Unicode is invalid. ${err.msg().capitalize()} (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
				}
			}
		}
	}
}
| 578 | |
// check_utf8_validity returns an error if the text of `q` is not valid UTF-8.
fn (c &Checker) check_utf8_validity(q ast.Quoted) ! {
	lit := q.text
	if utf8.validate_str(lit) {
		return
	}
	return error(@MOD + '.' + @STRUCT + '.' + @FN +
		' the string value "${lit}" is not valid UTF-8 in ...${c.excerpt(q.pos)}...')
}
| 587 | |
// validate_utf8_codepoint_string returns an error if `str` is not a valid Unicode code point.
// `str` is expected to be a `string` containing *only* hex values.
// Any preludes or prefixes like `0x` could potentially yield wrong results.
// A string that can not be parsed as a hexadecimal number is treated like
// an out-of-range value.
fn validate_utf8_codepoint_string(str string) ! {
	code_point := strconv.parse_int(str, 16, 64) or { i64(-1) }
	if code_point < 0 || code_point > utf8_max {
		return error('Unicode code point `${str}` is outside the valid Unicode scalar value ranges.')
	}
	// Within 0..utf8_max only the range between 0xD7FF and 0xE000 (exclusive)
	// is not a Unicode scalar value.
	// TODO: should probably be transferred / implemented in `utf8.validate(...)` also?
	in_excluded_gap := code_point > 0xD7FF && code_point < 0xE000
	if in_excluded_gap {
		return error('Unicode code point `${str}` is not a valid Unicode scalar value.')
	}
	raw := str.bytes()
	if !utf8.validate(raw.data, raw.len) {
		return error('Unicode code point `${str}` is not a valid UTF-8 code point.')
	}
}
| 606 | |
// check_unicode_escape returns an error if `esc_unicode` is not
// a valid Unicode escape sequence. `esc_unicode` is expected to be
// prefixed with either `u` (followed by 4 hex digits) or `U` (followed
// by 8 hex digits). Characters after the expected digits are ignored.
fn (c &Checker) check_unicode_escape(esc_unicode string) ! {
	// Makes sure the input to this function is actually valid.
	if esc_unicode.len < 5 || esc_unicode[0] !in [`u`, `U`] {
		return error('`${esc_unicode}` is not a valid escaped Unicode sequence.')
	}
	hex_digits_len := if esc_unicode[0] == `U` { 8 } else { 4 }
	digits := esc_unicode[1..]
	if digits.len < hex_digits_len {
		return error('Unicode escape sequence `${esc_unicode}` should be at least ${hex_digits_len} in length.')
	}
	// TODO: not enforced in BurnSushi testsuite??
	// if !sequence.is_upper() {
	//	return error('Unicode escape sequence `${esc_unicode}` is not in all uppercase.')
	//}
	validate_utf8_codepoint_string(digits[..hex_digits_len].to_upper())!
}
| 633 | |
// check_comment returns an error if the contents of `comment` isn't
// a valid TOML comment: it may not contain carriage returns or other
// illegal ASCII control characters, and it must be valid UTF-8.
pub fn (c &Checker) check_comment(comment ast.Comment) ! {
	lit := comment.text
	// Setup a scanner in stack memory for easier navigation.
	mut s := scanner.new_simple_text(lit)!
	mut ch := s.next()
	for ch != scanner.end_of_text {
		ch_byte := u8(ch)
		// Carriage returns get their own, more specific, error.
		if ch_byte == `\r` {
			st := s.state()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' carriage return character `${ch_byte.hex()}` is not allowed in comments (${st.line_nr},${st.col}).')
		}
		// Any other control character rejected by the `util` helper.
		if util.is_illegal_ascii_control_character(ch_byte) {
			st := s.state()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' control character `${ch_byte.hex()}` is not allowed (${st.line_nr},${st.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(st.pos, 10)}...')
		}
		ch = s.next()
	}

	// Check for bad UTF-8 encoding
	if utf8.validate_str(lit) {
		return
	}
	return error(@MOD + '.' + @STRUCT + '.' + @FN +
		' comment "${lit}" is not valid UTF-8 in ...${c.excerpt(comment.pos)}...')
}
| 666 | |