Gitly


1 // Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module checker
5 
6 import toml.ast
7 import toml.ast.walker
8 import toml.util
9 import toml.token
10 import toml.scanner
11 import encoding.utf8
12 import time
13 import strconv
14 
// allowed_basic_escape_chars lists the characters that may follow a backslash
// in a TOML *basic* string (see the escape table documented on `check_quoted_escapes`).
pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`]

// utf8_max is the largest inclusive value of the Unicode scalar value ranges.
const utf8_max = 0x10FFFF
19 
// allowed_basic_escape_char_list returns, as bytes, the characters that are
// allowed to follow a backslash in a TOML basic string.
fn allowed_basic_escape_char_list() []u8 {
    return 'uUbtnfr"\\'.bytes()
}
23 
// toml_parse_time parses `s` with `time.parse_rfc3339`, after completing
// partial values: a time-only string (a `:` at index 2) gets an arbitrary
// date prepended, and a 10 character long string is treated as a date and
// gets a zero time and zero timezone appended.
fn toml_parse_time(s string) !time.Time {
    is_time_only := s.len > 3 && s[2] == `:`
    is_date_only := s.len == 10
    mut full := s
    if is_time_only {
        full = '0001-01-01T' + s
    } else if is_date_only {
        full = s + 'T00:00:00Z'
    }
    return time.parse_rfc3339(full)!
}
35 
// Checker checks a tree of TOML `ast.Value`'s for common errors.
pub struct Checker {
pub:
    // scanner is used to produce source excerpts for error messages; it defaults to nil.
    scanner &scanner.Scanner = unsafe { nil }
}
41 
// check checks the `ast.Value` and all its children
// for common errors. It walks the tree with `walker.walk`, passing the
// checker itself as the visitor, and propagates the first error returned.
pub fn (c &Checker) check(n &ast.Value) ! {
    walker.walk(c, n)!
}
47 
// visit dispatches `value` to the check method matching its concrete type.
// Value types without a dedicated check are accepted as-is.
fn (c &Checker) visit(value &ast.Value) ! {
    match value {
        ast.Bool {
            c.check_boolean(value)!
        }
        ast.Number {
            c.check_number(value)!
        }
        ast.Quoted {
            c.check_quoted(value)!
        }
        ast.DateTime {
            c.check_date_time(value)!
        }
        ast.Date {
            c.check_date(value)!
        }
        ast.Time {
            c.check_time(value)!
        }
        // All other value types have no checks.
        else {}
    }
}
71 
// excerpt returns the scanner's excerpt of the text around the
// position `tp`, requested with a padding argument of 10.
fn (c &Checker) excerpt(tp token.Pos) string {
    padding := 10
    return c.scanner.excerpt(tp.pos, padding)
}
76 
// is_hex_bin_oct_prefixed reports whether `hbo` starts with one of the
// integer type prefixes `0x`, `0o` or `0b` *and* has at least one more
// character following the prefix.
// Example: assert is_hex_bin_oct_prefixed('0xAF') == true
// Example: assert is_hex_bin_oct_prefixed('xAF') == false
fn is_hex_bin_oct_prefixed(hbo string) bool {
    if hbo.len <= 2 {
        // A bare prefix (or anything shorter) does not count.
        return false
    }
    return hbo[..2] in ['0x', '0o', '0b']
}
84 
// has_repeating reports whether any character listed in `repeats` occurs
// twice in a row in `str`. The string is compared byte by byte.
// Example: assert has_repeating('hello__v.', [`.`,`_`]) == true
// Example: assert has_repeating('hello_v.', [`.`,`_`]) == false
fn has_repeating(str string, repeats []rune) bool {
    for idx in 1 .. str.len {
        prev := str[idx - 1]
        if prev == str[idx] && prev in repeats {
            return true
        }
    }
    return false
}
99 
// check_number returns an error if `num` is not a valid TOML number.
// The literal is checked for misplaced underscores, signs and leading zeros,
// for valid digits in `0x`/`0o`/`0b` prefixed integers, and for well-formed
// exponent and fraction parts in floats.
fn (c &Checker) check_number(num ast.Number) ! {
    lit := num.text
    lit_lower_case := lit.to_lower()
    // Zero, in these spellings, is accepted without further checks.
    if lit in ['0', '0.0', '+0', '-0', '+0.0', '-0.0', '0e0', '+0e0', '-0e0', '0e00'] {
        return
    }

    if lit.contains('_') {
        if lit.starts_with('_') || lit.ends_with('_') {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" can not start or end with `_` in ...${c.excerpt(num.pos)}...')
        }
        if lit.contains('__') {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" can not have more than one underscore (`_`) in ...${c.excerpt(num.pos)}...')
        }
    }

    mut hex_bin_oct := is_hex_bin_oct_prefixed(lit)
    is_float := lit_lower_case.all_before('e').contains('.')
    float_decimal_index := lit.index_('.')
    mut ascii := u8(lit[0]).ascii_str()
    is_sign_prefixed := lit[0] in [`+`, `-`]
    mut lit_sans_sign := lit
    if is_sign_prefixed { // +/- ...
        lit_sans_sign = lit[1..]
        hex_bin_oct = is_hex_bin_oct_prefixed(lit_sans_sign)
        if hex_bin_oct {
            ascii = u8(lit[0]).ascii_str()
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" (hex, octal and binary) can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
        }
        if lit.len > 1 && lit_sans_sign.starts_with('0') && !lit_sans_sign.starts_with('0.') {
            ascii = u8(lit_sans_sign[0]).ascii_str()
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
        }
    } else {
        if !hex_bin_oct {
            if !is_float && lit[0] == `0` {
                // `lit` has at least 2 characters here, a lone `0` returned early above.
                if lit[1] in [`B`, `O`, `X`] {
                    return error(@MOD + '.' + @STRUCT + '.' + @FN +
                        ' numbers like "${lit}" only lowercase notation in ...${c.excerpt(num.pos)}...')
                }
                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                    ' numbers like "${lit}" can not start with a zero in ...${c.excerpt(num.pos)}...')
            }

            if is_float && lit[0] == `0` && float_decimal_index > 1 {
                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                    ' numbers like "${lit}" can not start with a zero in ...${c.excerpt(num.pos)}...')
            }
        }
    }

    // Signed literals with a type prefix were rejected above, so from here on
    // a prefixed literal is always unsigned.
    is_hex := hex_bin_oct && lit_sans_sign.starts_with('0x')
    // `b` is a regular digit in hexadecimal literals (e.g. `0xbb`), so it must
    // not be treated as an illegally repeated notation character there.
    repeat_chars := if is_hex { [`_`, `.`, `o`, `x`] } else { [`_`, `.`, `b`, `o`, `x`] }
    if has_repeating(lit, repeat_chars) {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' numbers like "${lit}" can not have ${scanner.digit_extras} as repeating characters in ...${c.excerpt(num.pos)}...')
    }

    if hex_bin_oct {
        is_bin := lit_sans_sign.starts_with('0b')
        is_oct := lit_sans_sign.starts_with('0o')

        lit_sans_sign_and_type_prefix := lit_sans_sign[2..]

        if lit_sans_sign_and_type_prefix.starts_with('_')
            || lit_sans_sign_and_type_prefix.ends_with('_') {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" can not start or end with `_` in ...${c.excerpt(num.pos)}...')
        }

        if is_bin {
            if !c.is_valid_binary_literal(lit_sans_sign_and_type_prefix) {
                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                    ' "${lit}" is not a valid binary number in ...${c.excerpt(num.pos)}...')
            }
        } else if is_oct {
            if !c.is_valid_octal_literal(lit_sans_sign_and_type_prefix) {
                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                    ' "${lit}" is not a valid octal number in ...${c.excerpt(num.pos)}...')
            }
        } else {
            if !c.is_valid_hex_literal(lit_sans_sign_and_type_prefix) {
                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                    ' "${lit}" is not a valid hexadecimal number in ...${c.excerpt(num.pos)}...')
            }
        }
    }

    // An `e` in a prefixed integer is either a hex digit, or was already rejected
    // as an invalid binary/octal digit above - it never denotes an exponent.
    // Without this guard a valid literal like `0xe_f` is rejected.
    has_exponent_notation := !hex_bin_oct && lit_lower_case.contains('e')
    if has_exponent_notation {
        if lit_lower_case.all_after('e').starts_with('_')
            || lit_lower_case.all_before('e').ends_with('_') {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' the exponent in "${lit}" can not start nor end with an underscore in ...${c.excerpt(num.pos)}...')
        }
        if lit_lower_case.all_after('e').contains('.') {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" (with exponent) can not have a decimal point in ...${c.excerpt(num.pos)}...')
        }
        if lit_lower_case.count('e') > 1 {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" (with exponent) can only have one exponent in ...${c.excerpt(num.pos)}...')
        }
    }

    if is_float {
        if lit.count('.') > 1 {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" (float) can only have one decimal point in ...${c.excerpt(num.pos)}...')
        }
        last := lit[lit.len - 1]
        if last in scanner.digit_extras {
            ascii = u8(last).ascii_str()
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" (float) can not end with `${ascii}` in ...${c.excerpt(num.pos)}...')
        }
        if lit.contains('_.') || lit.contains('._') {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" (float) can not have underscores before or after the decimal point in ...${c.excerpt(num.pos)}...')
        }
        // Both sides are tested on the lower cased literal, so that `1.E5` is
        // rejected just like `1.e5`.
        if lit_lower_case.contains('e.') || lit_lower_case.contains('.e') {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" (float) can not have decimal points on either side of the exponent notation in ...${c.excerpt(num.pos)}...')
        }
        // Check if it contains other chars than the allowed
        for r in lit {
            if r !in [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `.`, `e`, `E`, `-`, `+`,
                `_`] {
                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                    ' numbers like "${lit}" (float) can not contain `${u8(r).ascii_str()}` in ...${c.excerpt(num.pos)}...')
            }
        }
    } else {
        if lit.len > 1 && lit.starts_with('0') && lit[1] !in [`b`, `o`, `x`] {
            ascii = u8(lit[0]).ascii_str()
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' numbers like "${lit}" can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
        }
    }
}
246 
// is_valid_binary_literal reports whether `num` consists solely of the
// binary digits `0` and `1`, optionally separated by underscores.
fn (c &Checker) is_valid_binary_literal(num string) bool {
    for digit in num {
        is_separator := digit == `_`
        is_binary_digit := digit == `0` || digit == `1`
        if !is_separator && !is_binary_digit {
            return false
        }
    }
    return true
}
259 
// is_valid_octal_literal reports whether `num` consists solely of the
// octal digits `0` to `7`, optionally separated by underscores.
fn (c &Checker) is_valid_octal_literal(num string) bool {
    for digit in num {
        is_separator := digit == `_`
        if !is_separator && (digit < `0` || digit > `7`) {
            return false
        }
    }
    return true
}
272 
// is_valid_hex_literal reports whether `num` consists solely of
// hexadecimal digits, optionally separated by underscores.
fn (c &Checker) is_valid_hex_literal(num string) bool {
    for digit in num {
        is_separator := digit == `_`
        if !is_separator && !digit.is_hex_digit() {
            return false
        }
    }
    return true
}
285 
// check_boolean returns an error unless the text of `b` is exactly
// one of the literals `true` or `false`.
fn (c &Checker) check_boolean(b ast.Bool) ! {
    lit := b.text
    if lit != 'true' && lit != 'false' {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' boolean values like "${lit}" can only be `true` or `false` literals, not `${lit}` in ...${c.excerpt(b.pos)}...')
    }
}
295 
// check_date_time returns an error if `dt` is not a valid TOML date-time string (RFC 3339).
// The literal is split into a date and a time part, which are validated with
// `check_date` and `check_time` respectively, before the complete literal
// is parsed as a whole.
// See also https://ijmacd.github.io/rfc3339-iso8601 for a more
// visual representation of the RFC 3339 format.
fn (c &Checker) check_date_time(dt ast.DateTime) ! {
    lit := dt.text
    // RFC 3339 Date-Times can be split via 4 separators (` `, `_`, `T` and `t`).
    if !lit.to_lower().contains_any(' _t') {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" is not a valid RFC 3339 Date-Time format string in ...${c.excerpt(dt.pos)}...')
    }
    // Split on the first separator kind found, tried in order of precedence.
    mut split := []string{}
    for separator in [' ', '_', 'T', 't'] {
        if lit.contains(separator) {
            split = lit.split(separator)
            break
        }
    }
    // Validate the split into date and time parts.
    if split.len != 2 {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" contains too many date/time separators in ...${c.excerpt(dt.pos)}...')
    }
    // Re-use date and time validation code for detailed testing of each part
    c.check_date(ast.Date{
        text: split[0]
        pos:  token.Pos{
            len:     split[0].len
            line_nr: dt.pos.line_nr
            pos:     dt.pos.pos
            col:     dt.pos.col
        }
    })!
    // The time part starts after the date part *and* the 1 byte separator.
    time_part_start := split[0].len + 1
    c.check_time(ast.Time{
        text: split[1]
        pos:  token.Pos{
            len:     split[1].len
            line_nr: dt.pos.line_nr
            pos:     dt.pos.pos + time_part_start
            col:     dt.pos.col + time_part_start
        }
    })!
    // Simulate a time offset if it's missing, so the literal can be parsed:
    // TOML supports local date-times, RFC 3339 does not.
    // Offsets can only occur after the 19 characters of `YYYY-MM-DDTHH:MM:SS`.
    // RFC 3339 allows `z` as an alternative to `Z`, so both count as an offset.
    has_time_offset := lit#[19..].contains_any('-+Zz')
    lit_with_offset := if has_time_offset { lit } else { lit + 'Z' }

    toml_parse_time(lit_with_offset) or {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" is not a valid RFC 3339 Date-Time format string "${err}". In ...${c.excerpt(dt.pos)}...')
    }
}
360 
// check_date returns an error if `date` is not a valid TOML date string (RFC 3339).
// The literal must have the form `YYYY-MM-DD`, the number of days in
// February is checked against leap years, and finally the literal must be
// parsable by `toml_parse_time`.
fn (c &Checker) check_date(date ast.Date) ! {
    lit := date.text
    fields := lit.split('-')
    if fields.len != 3 {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" is not a valid RFC 3339 Date format string in ...${c.excerpt(date.pos)}...')
    }
    yyyy, mm, dd := fields[0], fields[1], fields[2]
    // Each field has a fixed width.
    if yyyy.len != 4 {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" does not have a valid RFC 3339 year indication in ...${c.excerpt(date.pos)}...')
    }
    if mm.len != 2 {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" does not have a valid RFC 3339 month indication in ...${c.excerpt(date.pos)}...')
    }
    if dd.len != 2 {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" does not have a valid RFC 3339 day indication in ...${c.excerpt(date.pos)}...')
    }
    // February needs special care.
    if mm.int() == 2 {
        day := dd.int()
        if day > 29 {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' "${lit}" is not a valid RFC 3339 date: February can not have more that 28 or 29 days in it ...${c.excerpt(date.pos)}...')
        }
        if day == 29 {
            year := yyyy.int()
            is_leap_year := year % 4 == 0 && (year % 100 != 0 || year % 400 == 0)
            if !is_leap_year {
                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                    ' "${lit}" is not a valid RFC 3339 date: ${yyyy} is not a leap year so February can not have 29 days in it ...${c.excerpt(date.pos)}...')
            }
        }
    }
    toml_parse_time(lit) or {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" is not a valid RFC 3339 Date format string "${err}". In ...${c.excerpt(date.pos)}...')
    }
}
404 
// check_time returns an error if `t` is not a valid TOML time string (RFC 3339).
// The `HH:MM:SS` part must be exactly 8 characters long (9 with a trailing
// `Z`/`z`), a numeric offset - if present - must have the form `HH:MM`, and
// the time part must be parsable by `toml_parse_time`.
fn (c &Checker) check_time(t ast.Time) ! {
    lit := t.text
    // Split any offsets from the time
    offset_splitter := if lit.contains('+') { '+' } else { '-' }
    parts := lit.split(offset_splitter)
    // Fractional seconds are not part of the length check below.
    hhmmss := parts[0].all_before('.')
    // Check for 2 digits in all fields
    mut check_length := 8
    if hhmmss.to_upper().ends_with('Z') {
        check_length++
    }
    if hhmmss.len != check_length {
        starts_with_zero := hhmmss.starts_with('0')
        if !starts_with_zero {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' "${lit}" must be zero prefixed in ...${c.excerpt(t.pos)}...')
        }
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" is not a valid RFC 3339 Time format string in ...${c.excerpt(t.pos)}...')
    }

    if parts.len > 1 {
        // Offset
        offset_parts := parts[1].split(':')
        if offset_parts.len != 2 {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' "${parts[1]}" is not a valid RFC 3339 time offset specifier in ...${c.excerpt(t.pos)}...')
        }
        hh := offset_parts[0].int()
        if hh < 0 || hh > 24 {
            pos := token.Pos{
                ...t.pos
                pos: t.pos.pos + check_length
            }
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' "${hh}" hour specifier in "${parts[1]}" should be between 00 and 24 in ...${c.excerpt(pos)}...')
        }
        // The second field of an offset holds *minutes*.
        mm := offset_parts[1].int()
        if mm < 0 || mm > 59 {
            pos := token.Pos{
                ...t.pos
                pos: t.pos.pos + check_length
            }
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' "${mm}" minute specifier in "${parts[1]}" should be between 00 and 59 in ...${c.excerpt(pos)}...')
        }
    }
    // Simulate a time offset if it's missing, so the time can be parsed:
    // TOML supports local times, RFC 3339 does not.
    // A lowercase `z` passes the length check above, so it must count as an
    // offset here too - otherwise a second `Z` is appended to the literal.
    has_time_offset := parts[0]#[8..].contains_any('-+Zz')
    part_with_offset := if has_time_offset { parts[0] } else { parts[0] + 'Z' }

    toml_parse_time(part_with_offset) or {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' "${lit}" is not a valid RFC 3339 Time format string "${err}". In ...${c.excerpt(t.pos)}...')
    }
}
472 
// check_quoted returns an error if `q` is not a valid quoted TOML string.
// A multi-line string whose text ends in three unescaped quote characters is
// rejected as unbalanced, then the escape sequences and the UTF-8 encoding
// of the text are validated.
pub fn (c &Checker) check_quoted(q ast.Quoted) ! {
    lit := q.text
    quote := q.quote.ascii_str()
    if q.is_multiline {
        triple_quote := quote.repeat(3)
        ends_with_quotes := lit.ends_with(triple_quote)
        quotes_are_escaped := lit.ends_with('\\' + triple_quote)
        if ends_with_quotes && !quotes_are_escaped {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' string values like "${lit}" has unbalanced quote literals `${quote}` in ...${c.excerpt(q.pos)}...')
        }
    }
    c.check_quoted_escapes(q)!
    c.check_utf8_validity(q)!
}
485 
// check_quoted_escapes returns an error for any disallowed escape sequences.
// Delimiters in TOML has significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
// "/""" delimits *basic* strings
// Allowed escapes in *basic* strings are:
// \b         - backspace       (U+0008)
// \t         - tab             (U+0009)
// \n         - linefeed        (U+000A)
// \f         - form feed       (U+000C)
// \r         - carriage return (U+000D)
// \"         - quote           (U+0022)
// \\         - backslash       (U+005C)
// \uXXXX     - Unicode         (U+XXXX)
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
// In multi-line basic strings a backslash may additionally be followed by
// whitespace, see the checks in the function body.
fn (c &Checker) check_quoted_escapes(q ast.Quoted) ! {
    // Setup a scanner in stack memory for easier navigation.
    mut s := scanner.new_simple_text(q.text)!

    // See https://toml.io/en/v1.0.0#string for more info on string types.
    is_basic := q.quote == `\"`
    contains_newlines := q.text.contains('\n')
    for {
        ch := s.next()
        if ch == scanner.end_of_text {
            break
        }
        ch_byte := u8(ch)
        if ch == `\\` {
            next_ch := u8(s.at())

            // An escaped backslash: consume it, so it does not start a new escape.
            if next_ch == `\\` {
                s.next()
                continue
            }

            escape := ch_byte.ascii_str() + next_ch.ascii_str()
            if is_basic {
                if q.is_multiline {
                    if next_ch == ` ` {
                        if !contains_newlines {
                            st := s.state()
                            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                                ' can not escape whitespaces in multi-line strings (`\\ `) at `${escape}` (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
                        }
                        // Rest of line must only be space chars from this point on
                        for {
                            ch_ := s.next()
                            if ch_ == `\n` {
                                break
                            }
                            if !(ch_ == ` ` || ch_ == `\t`) {
                                st := s.state()
                                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                                    ' invalid character `${u8(ch_).ascii_str()}` after `${escape}` at (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
                            }
                        }
                    }
                    // A backslash followed by whitespace is accepted in multi-line strings.
                    if next_ch in [`\t`, `\r`, `\n`, ` `] {
                        s.next()
                        continue
                    }
                }
                if next_ch !in allowed_basic_escape_char_list() {
                    st := s.state()
                    return error(@MOD + '.' + @STRUCT + '.' + @FN +
                        ' unknown basic string escape character `${next_ch.ascii_str()}` in `${escape}` (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
                }
            }
            // Check Unicode escapes
            if is_basic && escape.to_lower() == '\\u' {
                // Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters.
                // Up to 11 characters, starting at the `u`/`U`, are passed on; more than the
                // longest possible sequence of 9 chars. The gated slice (`#[..]`) clips the
                // range to the end of the text, so a sequence near the end of the string
                // can never cause an out of bounds access.
                pos := s.state().pos
                c.check_unicode_escape(s.text#[pos..pos + 11]) or {
                    st := s.state()
                    return error(@MOD + '.' + @STRUCT + '.' + @FN +
                        ' escaped Unicode is invalid. ${err.msg().capitalize()} (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
                }
            }
        }
    }
}
578 
// check_utf8_validity returns an error if the text of `q` is not valid UTF-8.
fn (c &Checker) check_utf8_validity(q ast.Quoted) ! {
    lit := q.text
    if utf8.validate_str(lit) {
        return
    }
    return error(@MOD + '.' + @STRUCT + '.' + @FN +
        ' the string value "${lit}" is not valid UTF-8 in ...${c.excerpt(q.pos)}...')
}
587 
// validate_utf8_codepoint_string returns an error if `str` is not a valid Unicode code point.
// `str` is expected to be a `string` containing *only* hex values.
// Any preludes or prefixes like `0x` could potentially yield wrong results.
fn validate_utf8_codepoint_string(str string) ! {
    // Input that can not be parsed is mapped to -1, which the range check below rejects.
    code_point := strconv.parse_int(str, 16, 64) or { i64(-1) }
    if code_point < 0 || code_point > utf8_max {
        return error('Unicode code point `${str}` is outside the valid Unicode scalar value ranges.')
    }
    // Within 0..utf8_max only the range 0xD800..0xDFFF is not a Unicode scalar value.
    // TODO: should probably be transferred / implemented in `utf8.validate(...)` also?
    is_outside_scalar_ranges := code_point >= 0xD800 && code_point <= 0xDFFF
    if is_outside_scalar_ranges {
        return error('Unicode code point `${str}` is not a valid Unicode scalar value.')
    }
    raw := str.bytes()
    if !utf8.validate(raw.data, raw.len) {
        return error('Unicode code point `${str}` is not a valid UTF-8 code point.')
    }
}
606 
// check_unicode_escape returns an error if `esc_unicode` is not
// a valid Unicode escape sequence. `esc_unicode` is expected to be
// prefixed with either `u` (followed by 4 hex digits) or `U` (followed
// by 8 hex digits). Characters after the hex digits are ignored.
fn (c &Checker) check_unicode_escape(esc_unicode string) ! {
    // Makes sure the input to this function is actually valid.
    has_prefix := esc_unicode.to_lower().starts_with('u')
    if esc_unicode.len < 5 || !has_prefix {
        return error('`${esc_unicode}` is not a valid escaped Unicode sequence.')
    }
    hex_digits_len := if esc_unicode.starts_with('U') { 8 } else { 4 }
    digits := esc_unicode[1..]
    if digits.len < hex_digits_len {
        return error('Unicode escape sequence `${esc_unicode}` should be at least ${hex_digits_len} in length.')
    }
    // TODO: an all-uppercase sequence is not enforced (nor, it seems, by the BurnSushi testsuite).
    validate_utf8_codepoint_string(digits[..hex_digits_len].to_upper())!
}
633 
// check_comment returns an error if the contents of `comment` isn't
// a valid TOML comment: it may contain neither carriage returns nor
// illegal ASCII control characters, and must be valid UTF-8.
pub fn (c &Checker) check_comment(comment ast.Comment) ! {
    lit := comment.text
    // A scanner over the comment text makes position reporting easy.
    mut s := scanner.new_simple_text(lit)!
    for {
        current := s.next()
        if current == scanner.end_of_text {
            break
        }
        ch_byte := u8(current)
        // Carriage returns get their own, more specific, error message.
        if ch_byte == `\r` {
            st := s.state()
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' carriage return character `${ch_byte.hex()}` is not allowed in comments (${st.line_nr},${st.col}).')
        }
        // Which control characters are illegal is decided by the `util` helper.
        if util.is_illegal_ascii_control_character(ch_byte) {
            st := s.state()
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' control character `${ch_byte.hex()}` is not allowed (${st.line_nr},${st.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(st.pos, 10)}...')
        }
    }

    // Finally, the comment as a whole must be properly encoded.
    if utf8.validate_str(lit) {
        return
    }
    return error(@MOD + '.' + @STRUCT + '.' + @FN +
        ' comment "${lit}" is not valid UTF-8 in ...${c.excerpt(comment.pos)}...')
}
666