v2 / vlib / toml / checker / checker.v
665 lines · 621 sloc · 22.09 KB · 58a3b6f56a43444feffe35dd64d72d104fbfb07c
Raw
1// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module checker
5
6import toml.ast
7import toml.ast.walker
8import toml.util
9import toml.token
10import toml.scanner
11import encoding.utf8
12import time
13import strconv
14
// allowed_basic_escape_chars lists the characters that are accepted directly
// after a backslash (`\`) in TOML *basic* strings.
pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`]

// utf8_max is the largest (inclusive) value of the Unicode scalar value ranges.
const utf8_max = 0x10FFFF
19
// allowed_basic_escape_char_list returns, as bytes, the characters that are
// accepted directly after a backslash in basic strings.
fn allowed_basic_escape_char_list() []u8 {
	escape_chars := [u8(`u`), `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`]
	return escape_chars
}
23
// toml_parse_time parses `s` with `time.parse_rfc3339`, after completing
// partial values:
// - a value with `:` as its third byte is treated as a time only and gets
//   the arbitrary date `0001-01-01` prepended.
// - a value of exactly 10 bytes is treated as a date only and gets a zero
//   time with a zero (`Z`) timezone appended.
// Anything else is handed to the parser unchanged.
fn toml_parse_time(s string) !time.Time {
	is_time_only := s.len > 3 && s[2] == `:`
	rfc3339_text := if is_time_only {
		'0001-01-01T' + s
	} else if s.len == 10 {
		s + 'T00:00:00Z'
	} else {
		s
	}
	return time.parse_rfc3339(rfc3339_text)!
}
35
// Checker checks a tree of TOML `ast.Value`'s for common errors.
pub struct Checker {
pub:
	// scanner is used by the checks to produce excerpts of the scanned text
	// for error messages. It defaults to `nil`.
	scanner &scanner.Scanner = unsafe { nil }
}
41
// check walks the `ast.Value` `n` and all of its children with this
// checker as the visitor, and returns the first error reported.
pub fn (c &Checker) check(n &ast.Value) ! {
	walker.walk(c, n) or { return err }
}
47
// visit runs the check that matches the concrete type of `value`.
// Value types without a dedicated check are accepted as they are.
// An error returned by the selected check is propagated to the caller.
fn (c &Checker) visit(value &ast.Value) ! {
	match value {
		ast.DateTime {
			c.check_date_time(value)!
		}
		ast.Date {
			c.check_date(value)!
		}
		ast.Time {
			c.check_time(value)!
		}
		ast.Quoted {
			c.check_quoted(value)!
		}
		ast.Number {
			c.check_number(value)!
		}
		ast.Bool {
			c.check_boolean(value)!
		}
		else {}
	}
}
71
// excerpt returns a string with the surroundings of the token at `tp`,
// as produced by the checker's scanner.
fn (c &Checker) excerpt(tp token.Pos) string {
	token_start := tp.pos
	return c.scanner.excerpt(token_start, 10)
}
76
// is_hex_bin_oct_prefixed returns true if `hbo` is longer than 2 bytes
// and starts with one of the prefixes `0x`, `0o` or `0b`.
// Example: assert is_hex_bin_oct_prefixed('0xAF') == true
// Example: assert is_hex_bin_oct_prefixed('xAF') == false
fn is_hex_bin_oct_prefixed(hbo string) bool {
	if hbo.len <= 2 {
		// A prefix without at least one following character does not count.
		return false
	}
	prefix := hbo[..2]
	return prefix in ['0x', '0o', '0b']
}
84
// has_repeating returns true if `str` contains two identical, directly
// adjacent characters, and that character is one of those in `repeats`.
// Example: assert has_repeating('hello__v.', [`.`,`_`]) == true
// Example: assert has_repeating('hello_v.', [`.`,`_`]) == false
fn has_repeating(str string, repeats []rune) bool {
	for idx in 1 .. str.len {
		previous := str[idx - 1]
		if previous == str[idx] && previous in repeats {
			return true
		}
	}
	return false
}
99
// check_number returns an error if `num` is not a valid TOML number.
//
// The text of `num` is checked in this order:
// 1. a fixed list of zero spellings is accepted right away.
// 2. underscores can not start or end the literal, nor be doubled.
// 3. a sign can not precede a hex/octal/binary literal, and leading zeros are rejected.
// 4. notation characters (`_`, `.`, `b`, `o`, `x`) can not be repeated directly
//    after each other. `b` is exempt in `0x` literals, where it is a digit.
// 5. hex/octal/binary literals can only contain digits valid for their base.
// 6. exponent and float specific placement rules for `_`, `.` and `e`.
//
// All error messages include an excerpt of the text around `num.pos`.
fn (c &Checker) check_number(num ast.Number) ! {
	lit := num.text
	lit_lower_case := lit.to_lower()
	if lit in ['0', '0.0', '+0', '-0', '+0.0', '-0.0', '0e0', '+0e0', '-0e0', '0e00'] {
		return
	}

	if lit.contains('_') {
		if lit.starts_with('_') || lit.ends_with('_') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not start or end with `_` in ...${c.excerpt(num.pos)}...')
		}
		if lit.contains('__') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not have more than one underscore (`_`) in ...${c.excerpt(num.pos)}...')
		}
	}

	mut hex_bin_oct := is_hex_bin_oct_prefixed(lit)
	mut is_bin, mut is_oct, mut is_hex := false, false, false
	// Only a `.` in front of any `e` makes the literal a float.
	is_float := lit_lower_case.all_before('e').contains('.')
	has_exponent_notation := lit_lower_case.contains('e')
	float_decimal_index := lit.index_('.')
	mut ascii := u8(lit[0]).ascii_str()
	is_sign_prefixed := lit[0] in [`+`, `-`]
	mut lit_sans_sign := lit
	if is_sign_prefixed { // +/- ...
		lit_sans_sign = lit[1..]
		hex_bin_oct = is_hex_bin_oct_prefixed(lit_sans_sign)
		if hex_bin_oct {
			ascii = u8(lit[0]).ascii_str()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (hex, octal and binary) can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
		}
		if lit.len > 1 && lit_sans_sign.starts_with('0') && !lit_sans_sign.starts_with('0.') {
			ascii = u8(lit_sans_sign[0]).ascii_str()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
		}
	} else {
		if !hex_bin_oct {
			if !is_float && lit[0] == `0` {
				if lit[1] in [`B`, `O`, `X`] {
					return error(@MOD + '.' + @STRUCT + '.' + @FN +
						' numbers like "${lit}" only lowercase notation in ...${c.excerpt(num.pos)}...')
				}
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' numbers like "${lit}" can not start with a zero in ...${c.excerpt(num.pos)}...')
			}

			if is_float && lit[0] == `0` && float_decimal_index > 1 {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' numbers like "${lit}" can not start with a zero in ...${c.excerpt(num.pos)}...')
			}
		}
	}

	// `b` is a legitimate digit in hexadecimal literals (e.g. `0xbb`), so it
	// can only be treated as a non-repeatable notation character outside of them.
	// Sign prefixed hex literals have already been rejected above, which is why
	// `lit_sans_sign` is equal to `lit` whenever `hex_bin_oct` is true here.
	non_repeatable := if hex_bin_oct && lit_sans_sign.starts_with('0x') {
		[`_`, `.`, `o`, `x`]
	} else {
		[`_`, `.`, `b`, `o`, `x`]
	}
	if has_repeating(lit, non_repeatable) {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' numbers like "${lit}" can not have ${scanner.digit_extras} as repeating characters in ...${c.excerpt(num.pos)}...')
	}

	if hex_bin_oct {
		is_bin = lit_sans_sign.starts_with('0b')
		is_oct = lit_sans_sign.starts_with('0o')
		is_hex = lit_sans_sign.starts_with('0x')

		lit_sans_sign_and_type_prefix := lit_sans_sign[2..]

		if lit_sans_sign_and_type_prefix.starts_with('_')
			|| lit_sans_sign_and_type_prefix.ends_with('_') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not start or end with `_` in ...${c.excerpt(num.pos)}...')
		}

		if is_bin {
			if !c.is_valid_binary_literal(lit_sans_sign_and_type_prefix) {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' "${lit}" is not a valid binary number in ...${c.excerpt(num.pos)}...')
			}
		} else if is_oct {
			if !c.is_valid_octal_literal(lit_sans_sign_and_type_prefix) {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' "${lit}" is not a valid octal number in ...${c.excerpt(num.pos)}...')
			}
		} else {
			if !c.is_valid_hex_literal(lit_sans_sign_and_type_prefix) {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' "${lit}" is not a valid hexadecimal number in ...${c.excerpt(num.pos)}...')
			}
		}
	}

	if has_exponent_notation {
		if lit_lower_case.all_after('e').starts_with('_')
			|| lit_lower_case.all_before('e').ends_with('_') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' the exponent in "${lit}" can not start nor end with an underscore in ...${c.excerpt(num.pos)}...')
		}
		if lit_lower_case.all_after('e').contains('.') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (with exponent) can not have a decimal point in ...${c.excerpt(num.pos)}...')
		}
		// `e` is a digit in hex literals, so it can occur more than once there.
		if !is_hex && lit_lower_case.count('e') > 1 {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (with exponent) can only have one exponent in ...${c.excerpt(num.pos)}...')
		}
	}

	if is_float {
		if lit.count('.') > 1 {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (float) can only have one decimal point in ...${c.excerpt(num.pos)}...')
		}
		last := lit[lit.len - 1]
		if last in scanner.digit_extras {
			ascii = u8(last).ascii_str()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (float) can not end with `${ascii}` in ...${c.excerpt(num.pos)}...')
		}
		if lit.contains('_.') || lit.contains('._') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (float) can not have underscores before or after the decimal point in ...${c.excerpt(num.pos)}...')
		}
		if lit_lower_case.contains('e.') || lit.contains('.e') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" (float) can not have decimal points on either side of the exponent notation in ...${c.excerpt(num.pos)}...')
		}
		// Check if it contains other chars than the allowed
		for r in lit {
			if r !in [`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `.`, `e`, `E`, `-`, `+`,
				`_`] {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' numbers like "${lit}" (float) can not contain `${u8(r).ascii_str()}` in ...${c.excerpt(num.pos)}...')
			}
		}
	} else {
		if lit.len > 1 && lit.starts_with('0') && lit[1] !in [`b`, `o`, `x`] {
			ascii = u8(lit[0]).ascii_str()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' numbers like "${lit}" can not start with `${ascii}` in ...${c.excerpt(num.pos)}...')
		}
	}
}
246
// is_valid_binary_literal returns true if `num` consists only of
// the digits `0` and `1`, and `_` separators.
// `num` is expected to be passed without the `0b` prefix.
fn (c &Checker) is_valid_binary_literal(num string) bool {
	for digit in num {
		if digit != `_` && (digit < `0` || digit > `1`) {
			return false
		}
	}
	return true
}
259
// is_valid_octal_literal returns true if `num` consists only of
// the digits `0` to `7`, and `_` separators.
// `num` is expected to be passed without the `0o` prefix.
fn (c &Checker) is_valid_octal_literal(num string) bool {
	for digit in num {
		if digit != `_` && (digit < `0` || digit > `7`) {
			return false
		}
	}
	return true
}
272
// is_valid_hex_literal returns true if `num` consists only of
// hexadecimal digits, and `_` separators.
// `num` is expected to be passed without the `0x` prefix.
fn (c &Checker) is_valid_hex_literal(num string) bool {
	for digit in num {
		if digit != `_` && !digit.is_hex_digit() {
			return false
		}
	}
	return true
}
285
// check_boolean returns an error if the text of `b` is anything else than
// one of the literals `true` or `false`.
fn (c &Checker) check_boolean(b ast.Bool) ! {
	lit := b.text
	if lit !in ['true', 'false'] {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' boolean values like "${lit}" can only be `true` or `false` literals, not `${lit}` in ...${c.excerpt(b.pos)}...')
	}
}
295
// check_date_time returns an error if `dt` is not a valid TOML date-time string (RFC 3339).
// The text is split in a date part and a time part, each part is checked via
// `check_date` and `check_time`, and finally the complete text is parsed.
// See also https://ijmacd.github.io/rfc3339-iso8601 for a more
// visual representation of the RFC 3339 format.
fn (c &Checker) check_date_time(dt ast.DateTime) ! {
	lit := dt.text
	// RFC 3339 Date-Times can be split via 4 separators (` `, `_`, `T` and `t`).
	if !lit.to_lower().contains_any(' _t') {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Date-Time format string in ...${c.excerpt(dt.pos)}...')
	}
	// The first separator found, in this order of precedence, is used for the split.
	// The guard above guarantees that at least one of the four is present.
	separator := if lit.contains(' ') {
		' '
	} else if lit.contains('_') {
		'_'
	} else if lit.contains('T') {
		'T'
	} else {
		't'
	}
	date_and_time := lit.split(separator)
	// Validate the split into date and time parts.
	if date_and_time.len != 2 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" contains too many date/time separators in ...${c.excerpt(dt.pos)}...')
	}
	date_text := date_and_time[0]
	time_text := date_and_time[1]
	// Re-use date and time validation code for detailed testing of each part
	date_pos := token.Pos{
		len:     date_text.len
		line_nr: dt.pos.line_nr
		pos:     dt.pos.pos
		col:     dt.pos.col
	}
	c.check_date(ast.Date{
		text: date_text
		pos:  date_pos
	})!
	time_pos := token.Pos{
		len:     time_text.len
		line_nr: dt.pos.line_nr
		pos:     dt.pos.pos + date_text.len
		col:     dt.pos.col + date_text.len
	}
	c.check_time(ast.Time{
		text: time_text
		pos:  time_pos
	})!
	// TOML supports local date-times without an offset, RFC 3339 does not.
	// A `Z` offset is simulated when none is found from byte 19 and onwards,
	// so that the value can be parsed.
	has_time_offset := lit#[19..].contains_any('-+Z')
	lit_with_offset := if has_time_offset { lit } else { lit + 'Z' }

	toml_parse_time(lit_with_offset) or {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Date-Time format string "${err}". In ...${c.excerpt(dt.pos)}...')
	}
}
360
// check_date returns an error if `date` is not a valid TOML date string (RFC 3339).
// The text must have the form `YYYY-MM-DD`, February is limited to
// 28 days (29 in leap years), and the value must be parsable as a date.
fn (c &Checker) check_date(date ast.Date) ! {
	lit := date.text
	fields := lit.split('-')
	if fields.len != 3 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Date format string in ...${c.excerpt(date.pos)}...')
	}
	year_text, month_text, day_text := fields[0], fields[1], fields[2]
	if year_text.len != 4 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" does not have a valid RFC 3339 year indication in ...${c.excerpt(date.pos)}...')
	}
	if month_text.len != 2 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" does not have a valid RFC 3339 month indication in ...${c.excerpt(date.pos)}...')
	}
	if day_text.len != 2 {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" does not have a valid RFC 3339 day indication in ...${c.excerpt(date.pos)}...')
	}
	if month_text.int() == 2 {
		day := day_text.int()
		if day > 29 {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${lit}" is not a valid RFC 3339 date: February can not have more that 28 or 29 days in it ...${c.excerpt(date.pos)}...')
		}
		if day == 29 {
			year := year_text.int()
			// Leap years are divisible by 4, except for centuries not divisible by 400.
			is_leap_year := year % 4 == 0 && (year % 100 != 0 || year % 400 == 0)
			if !is_leap_year {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' "${lit}" is not a valid RFC 3339 date: ${year_text} is not a leap year so February can not have 29 days in it ...${c.excerpt(date.pos)}...')
			}
		}
	}
	toml_parse_time(lit) or {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Date format string "${err}". In ...${c.excerpt(date.pos)}...')
	}
}
404
// check_time returns an error if `t` is not a valid TOML time string (RFC 3339).
//
// The text of `t` is split at the sign of a time offset, if any, and then:
// - the time in front of any fraction must be 8 bytes long (`HH:MM:SS`),
//   or 9 when it ends with `Z`/`z`.
// - an offset must have the form `HH:MM`, with `HH` in 00..24 and `MM` in 00..59.
// - the time without the offset must be parsable.
fn (c &Checker) check_time(t ast.Time) ! {
	lit := t.text
	// Split any offsets from the time
	offset_splitter := if lit.contains('+') { '+' } else { '-' }
	parts := lit.split(offset_splitter)
	// Fractions of a second are not a part of the length check.
	hhmmss := parts[0].all_before('.')
	// Check for 2 digits in all fields
	mut check_length := 8
	if hhmmss.to_upper().ends_with('Z') {
		check_length++
	}
	if hhmmss.len != check_length {
		starts_with_zero := hhmmss.starts_with('0')
		if !starts_with_zero {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${lit}" must be zero prefixed in ...${c.excerpt(t.pos)}...')
		}
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Time format string in ...${c.excerpt(t.pos)}...')
	}

	if parts.len > 1 {
		// Offset
		offset_parts := parts[1].split(':')
		if offset_parts.len != 2 {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${parts[1]}" is not a valid RFC 3339 time offset specifier in ...${c.excerpt(t.pos)}...')
		}
		hh := offset_parts[0].int()
		if hh < 0 || hh > 24 {
			// Let the excerpt point at the offset, instead of at the start of the time.
			pos := token.Pos{
				...t.pos
				pos: t.pos.pos + check_length
			}
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${hh}" hour specifier in "${parts[1]}" should be between 00 and 24 in ...${c.excerpt(pos)}...')
		}
		mm := offset_parts[1].int()
		if mm < 0 || mm > 59 {
			pos := token.Pos{
				...t.pos
				pos: t.pos.pos + check_length
			}
			// `mm` holds the minutes of the offset, the message names it accordingly.
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' "${mm}" minute specifier in "${parts[1]}" should be between 00 and 59 in ...${c.excerpt(pos)}...')
		}
	}
	// Simulate a time offset if it's missing, so the time can be parsed.
	// TOML supports local times without an offset, RFC 3339 does not.
	mut has_time_offset := false
	for ch in parts[0]#[8..] {
		if ch in [u8(`-`), `+`, `Z`] {
			has_time_offset = true
			break
		}
	}

	mut part_with_offset := parts[0]
	if !has_time_offset {
		part_with_offset += 'Z'
	}

	toml_parse_time(part_with_offset) or {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' "${lit}" is not a valid RFC 3339 Time format string "${err}". In ...${c.excerpt(t.pos)}...')
	}
}
472
// check_quoted returns an error if `q` is not a valid quoted TOML string.
// A multi-line string can not end with an unescaped run of 3 quote characters.
// The escape sequences and the UTF-8 validity of the text are checked after that.
pub fn (c &Checker) check_quoted(q ast.Quoted) ! {
	lit := q.text
	quote := q.quote.ascii_str()
	closing_delimiter := quote + quote + quote
	if q.is_multiline && lit.ends_with(closing_delimiter) {
		escaped_delimiter := '\\' + closing_delimiter
		if !lit.ends_with(escaped_delimiter) {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' string values like "${lit}" has unbalanced quote literals `${quote}` in ...${c.excerpt(q.pos)}...')
		}
	}
	c.check_quoted_escapes(q)!
	c.check_utf8_validity(q)!
}
485
// check_quoted_escapes returns an error for any disallowed escape sequences.
// Delimiters in TOML has significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
// "/""" delimits *basic* strings
// Allowed escapes in *basic* strings are:
// \b         - backspace       (U+0008)
// \t         - tab             (U+0009)
// \n         - linefeed        (U+000A)
// \f         - form feed       (U+000C)
// \r         - carriage return (U+000D)
// \"         - quote           (U+0022)
// \\         - backslash       (U+005C)
// \uXXXX     - Unicode         (U+XXXX)
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
// Only strings quoted with `"` have their escapes checked,
// backslashes in all other strings are stepped over without any checks.
fn (c &Checker) check_quoted_escapes(q ast.Quoted) ! {
	// Setup a scanner in stack memory for easier navigation.
	mut s := scanner.new_simple_text(q.text)!

	// See https://toml.io/en/v1.0.0#string for more info on string types.
	is_basic := q.quote == `\"`
	contains_newlines := q.text.contains('\n')
	for {
		ch := s.next()
		if ch == scanner.end_of_text {
			break
		}
		ch_byte := u8(ch)
		if ch == `\\` {
			// Look at the character after the backslash, without consuming it.
			next_ch := u8(s.at())

			// An escaped backslash is consumed as a pair, so that the second
			// backslash is not mistaken for the start of a new escape sequence.
			if next_ch == `\\` {
				s.next()
				continue
			}

			escape := ch_byte.ascii_str() + next_ch.ascii_str()
			if is_basic {
				if q.is_multiline {
					if next_ch == ` ` {
						// A backslash followed by a space is only accepted when the
						// rest of the line, up to the newline, is whitespace.
						if !contains_newlines {
							st := s.state()
							return error(@MOD + '.' + @STRUCT + '.' + @FN +
								' can not escape whitespaces in multi-line strings (`\\ `) at `${escape}` (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
						}
						// Rest of line must only be space chars from this point on
						for {
							ch_ := s.next()
							if ch_ == `\n` {
								break
							}
							if !(ch_ == ` ` || ch_ == `\t`) {
								st := s.state()
								return error(@MOD + '.' + @STRUCT + '.' + @FN +
									' invalid character `${u8(ch_).ascii_str()}` after `${escape}` at (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
							}
						}
					}
					// A backslash directly followed by whitespace is accepted in multi-line strings.
					if next_ch in [`\t`, `\r`, `\n`, ` `] {
						s.next()
						continue
					}
				}
				if next_ch !in allowed_basic_escape_char_list() {
					st := s.state()
					return error(@MOD + '.' + @STRUCT + '.' + @FN +
						' unknown basic string escape character `${next_ch.ascii_str()}` in `${escape}` (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
				}
			}
			// Check Unicode escapes
			if is_basic && escape.to_lower() == '\\u' {
				// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters
				// we pass in 10 characters from the `u`/`U` which is the longest possible sequence
				// of 9 chars plus one extra.
				// NOTE(review): the guard below is `remaining() >= 10`, while the slice spans
				// 11 bytes (`pos..pos + 11`) - confirm against the scanner's `pos`/`remaining()`
				// semantics that the slice can not reach past the end of the text.
				if s.remaining() >= 10 {
					pos := s.state().pos
					c.check_unicode_escape(s.text[pos..pos + 11]) or {
						st := s.state()
						return error(@MOD + '.' + @STRUCT + '.' + @FN +
							' escaped Unicode is invalid. ${err.msg().capitalize()} (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
					}
				} else {
					// Less text is left than the longest possible sequence, check all of it.
					pos := s.state().pos
					c.check_unicode_escape(s.text[pos..]) or {
						st := s.state()
						return error(@MOD + '.' + @STRUCT + '.' + @FN +
							' escaped Unicode is invalid. ${err.msg().capitalize()} (${st.line_nr},${st.col}) in ...${c.excerpt(q.pos)}...')
					}
				}
			}
		}
	}
}
578
// check_utf8_validity returns an error if the text of `q` is not valid UTF-8.
fn (c &Checker) check_utf8_validity(q ast.Quoted) ! {
	lit := q.text
	if utf8.validate_str(lit) {
		return
	}
	return error(@MOD + '.' + @STRUCT + '.' + @FN +
		' the string value "${lit}" is not valid UTF-8 in ...${c.excerpt(q.pos)}...')
}
587
// validate_utf8_codepoint_string returns an error if `str` is not a valid Unicode code point.
// `str` is expected to be a `string` containing *only* hex values.
// Any preludes or prefixes like `0x` could potentially yield wrong results.
fn validate_utf8_codepoint_string(str string) ! {
	// A value that can not be parsed as hexadecimal is treated as out of range.
	code_point := strconv.parse_int(str, 16, 64) or { i64(-1) }
	if code_point < 0 || code_point > utf8_max {
		return error('Unicode code point `${str}` is outside the valid Unicode scalar value ranges.')
	}
	// Check if the Unicode value is actually in the valid Unicode scalar value ranges.
	// TODO: should probably be transferred / implemented in `utf8.validate(...)` also?
	outside_low_range := code_point < 0x0000 || code_point > 0xD7FF
	outside_high_range := code_point < 0xE000 || code_point > 0x10FFFF
	if outside_low_range && outside_high_range {
		return error('Unicode code point `${str}` is not a valid Unicode scalar value.')
	}
	raw := str.bytes()
	if !utf8.validate(raw.data, raw.len) {
		return error('Unicode code point `${str}` is not a valid UTF-8 code point.')
	}
}
606
// check_unicode_escape returns an error if `esc_unicode` is not
// a valid Unicode escape sequence. `esc_unicode` is expected to be
// prefixed with either `u` (followed by 4 hex digits) or `U` (followed
// by 8 hex digits). Anything after those digits is ignored.
fn (c &Checker) check_unicode_escape(esc_unicode string) ! {
	// Makes sure the input to this function is actually valid.
	has_valid_shape := esc_unicode.len >= 5 && esc_unicode.to_lower().starts_with('u')
	if !has_valid_shape {
		return error('`${esc_unicode}` is not a valid escaped Unicode sequence.')
	}
	wanted_digits := if esc_unicode.starts_with('U') { 8 } else { 4 }
	digits := esc_unicode[1..]
	if digits.len < wanted_digits {
		return error('Unicode escape sequence `${esc_unicode}` should be at least ${wanted_digits} in length.')
	}
	// TODO: upper case hex digits are not enforced (not enforced in BurnSushi testsuite??),
	// the digits are upper cased before the code point is validated.
	validate_utf8_codepoint_string(digits[..wanted_digits].to_upper())!
}
633
// check_comment returns an error if the contents of `comment` isn't
// a valid TOML comment. Carriage returns, illegal ASCII control
// characters and invalid UTF-8 are all rejected.
pub fn (c &Checker) check_comment(comment ast.Comment) ! {
	text := comment.text
	// Setup a scanner in stack memory for easier navigation.
	mut s := scanner.new_simple_text(text)!
	mut ch := s.next()
	for ch != scanner.end_of_text {
		current := u8(ch)
		// Check for carriage return
		if current == 0x0D {
			st := s.state()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' carriage return character `${current.hex()}` is not allowed in comments (${st.line_nr},${st.col}).')
		}
		// Check for control characters (allow TAB)
		if util.is_illegal_ascii_control_character(current) {
			st := s.state()
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' control character `${current.hex()}` is not allowed (${st.line_nr},${st.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(st.pos, 10)}...')
		}
		ch = s.next()
	}

	// Check for bad UTF-8 encoding
	if !utf8.validate_str(text) {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' comment "${text}" is not valid UTF-8 in ...${c.excerpt(comment.pos)}...')
	}
}
666