Gitly


1 // Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module scanner
5 
6 import toml.input
7 import toml.token
8 import toml.util
9 
// digit_extras lists the non-digit characters that `extract_number` accepts
// inside a number literal, in addition to (hex) digits.
10 pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]
// end_of_text is the sentinel value that `next()`, `at()` and `peek()` return
// when there is no character available at the requested position.
11 pub const end_of_text = u32(~0)
12 
13 // Scanner contains the necessary fields for the state of the scan process.
14 // The task the scanner performs is also referred to as "lexing" or "tokenizing".
15 // The Scanner methods are based on much of the work in `vlib/strings/textscanner`.
16 pub struct Scanner {
17 pub:
18     config Config // the configuration the scanner was created with
19     text   string // the input TOML text
20 mut:
21     col        int // current column number (x coordinate)
22     line_nr    int = 1 // current line number (y coordinate)
23     pos        int // current flat/index position in the `text` field
24     header_len int // length in bytes of the header (byte order mark) that was found and skipped
25     // Quirks
    // `is_left_of_assign` is cleared when a `=` is scanned and set again by `inc_line_number()`.
26     is_left_of_assign bool = true // indicates if the scanner is on the *left* side of an assignment
27 }
28 
29 // State is a read-only copy of the scanner's internal position state.
30 // It is produced by `Scanner.state()`, which copies `col`, `line_nr` and `pos`.
31 pub struct State {
32 pub:
33     col     int // column number (x coordinate) at the time of the copy
34     line_nr int = 1 // line number (y coordinate) at the time of the copy
35     pos     int // flat/index position in the scanner's `text` field at the time of the copy
36 }
37 
38 // Config is used to configure a Scanner instance.
39 // Only one of the `input` fields `text` and `file_path` is allowed to be set at time of configuration.
40 pub struct Config {
41 pub:
42     input               input.Config // where the TOML text comes from; read via `input.read_input()`
43     tokenize_formatting bool = true // if true, generate tokens for `\n`, ` `, `\t`, `\r` etc.
44 }
45 
// new_scanner creates a *heap* allocated `Scanner` from `config`.
// The text to scan is obtained through `config.input.read_input()`, i.e. from
// either `config.input.file_path` or `config.input.text`.
// Any error from reading the input is propagated to the caller.
pub fn new_scanner(config Config) !&Scanner {
    source := config.input.read_input()!
    return &Scanner{
        config: config
        text:   source
    }
}
55 
// new_simple creates a *stack* allocated `Scanner` from `config`.
// Any error from reading the configured input is propagated to the caller.
pub fn new_simple(config Config) !Scanner {
    source := config.input.read_input()!
    return Scanner{
        config: config
        text:   source
    }
}
63 
// new_simple_text creates a *stack* allocated `Scanner` instance
// that is ready for scanning the TOML given in `text`.
pub fn new_simple_text(text string) !Scanner {
    config := Config{
        input: input.Config{
            text: text
        }
    }
    source := config.input.read_input()!
    return Scanner{
        config: config
        text:   source
    }
}
78 
// new_simple_file creates a *stack* allocated `Scanner` instance
// that is ready for scanning the TOML read from the file at `path`.
pub fn new_simple_file(path string) !Scanner {
    config := Config{
        input: input.Config{
            file_path: path
        }
    }
    source := config.input.read_input()!
    return Scanner{
        config: config
        text:   source
    }
}
93 
// scan returns the next token from the input.
// It first validates/skips any byte order mark at the current position, then
// consumes characters until a complete token has been recognized.
// Formatting characters are skipped instead of returned when
// `config.tokenize_formatting` is false (a single space between two digits is
// always returned, see below).
// An error is returned for NULL bytes, for a `\r` that is not followed by `\n`,
// for characters that can not start any token, and for any error raised by the
// `extract_*` helpers.
@[direct_array_access]
pub fn (mut s Scanner) scan() !token.Token {
    s.validate_and_skip_headers()!

    for {
        c := s.next()
        byte_c := u8(c)
        if c == end_of_text {
            s.inc_line_number()
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'reached EOF')
            return s.new_token(.eof, '', 1)
        }

        ascii := byte_c.ascii_str()
        util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'current char "${ascii}"')

        if byte_c == u8(0x0) {
            // Build the message *before* resetting the scanner, otherwise the
            // reported line/column/excerpt would always point at the start of the input.
            msg := @MOD + '.' + @STRUCT + '.' + @FN +
                ' NULL control character `${c.hex()}` is not allowed at (${s.line_nr},${s.col}) "${ascii}" near ...${s.excerpt(s.pos, 5)}...'
            s.reset()
            return error(msg)
        }

        is_sign := c == `+` || c == `-`

        // (+/-)nan & (+/-)inf
        // `c` has already been consumed, so `s.at()` is the character right after `c`.
        peek_1 := s.peek(1)
        peek_2 := s.peek(2)
        is_nan := c == `n` && s.at() == `a` && peek_1 == `n`
        is_inf := !is_nan && c == `i` && s.at() == `n` && peek_1 == `f`
        is_signed_nan := is_sign && s.at() == `n` && peek_1 == `a` && peek_2 == `n`
        is_signed_inf := !is_signed_nan && is_sign && s.at() == `i` && peek_1 == `n`
            && peek_2 == `f`
        // Special numbers are only recognized on the value side; on the key side
        // `nan`/`inf` fall through and are scanned as bare keys.
        if !s.is_left_of_assign && (is_nan || is_inf || is_signed_nan || is_signed_inf) {
            num := s.extract_nan_or_inf_number()!
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                'identified a special number "${num}" (${num.len})')
            return s.new_token(.number, num, num.len)
        }

        // A sign only starts a number when it is followed by a digit and
        // the look-behind character is not a digit.
        is_signed_number := is_sign && u8(s.at()).is_digit() && !u8(s.peek(-1)).is_digit()
        is_digit := byte_c.is_digit()
        if is_digit || is_signed_number {
            num := s.extract_number()!
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                'identified a number "${num}" (${num.len})')
            return s.new_token(.number, num, num.len)
        }

        if util.is_key_char(byte_c) {
            key := s.extract_key()
            if u8(s.peek(1)) != `=` && (key == 'true' || key == 'false') {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified a boolean "${key}" (${key.len})')
                return s.new_token(.boolean, key, key.len)
            }
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                'identified a bare key "${key}" (${key.len})')
            return s.new_token(.bare, key, key.len)
        }

        match rune(c) {
            ` `, `\t`, `\n`, `\r` {
                if c == `\n` {
                    s.inc_line_number()
                    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                        'incremented line nr to ${s.line_nr}')
                } else if c == `\r` {
                    // CR should always be followed by a `\n`
                    if s.at() != `\n` {
                        return error(@MOD + '.' + @STRUCT + '.' + @FN +
                            ' missing newline/linefeed character after "\\c" carriage return at (${s.line_nr},${s.col}) "${ascii}" near ...${s.excerpt(s.pos, 5)}...')
                    }
                }
                // Date-Time in RFC 3339 is allowed to have a space between the date and time in supplement to the 'T'
                // so we allow space characters to slip through to the parser if the space is between two digits...
                // util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, '"'+u8(s.peek(-1)).ascii_str()+'" < "${ascii}" > "'+u8(s.at()).ascii_str()+'"')
                if c == ` ` && u8(s.peek(-1)).is_digit() && u8(s.at()).is_digit() {
                    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                        'identified, what could be, a space between a RFC 3339 date and time ("${ascii}") (${ascii.len})')
                    return s.new_token(token.Kind.whitespace, ascii, ascii.len)
                }
                if s.config.tokenize_formatting {
                    mut kind := token.Kind.whitespace
                    if c == `\t` {
                        kind = token.Kind.tab
                    } else if c == `\r` {
                        kind = token.Kind.cr
                    } else if c == `\n` {
                        kind = token.Kind.nl
                    }
                    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                        'identified formatting character ("${ascii}") (${ascii.len})')
                    return s.new_token(kind, ascii, ascii.len)
                } else {
                    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                        'skipping " ", "\\t" or "\\n" ("${ascii}") (${ascii.len})')
                }
                // Formatting is not tokenized: go on with the next character.
                continue
            }
            `-` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified minus "${ascii}" (${ascii.len})')
                return s.new_token(.minus, ascii, ascii.len)
            }
            `_` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified underscore "${ascii}" (${ascii.len})')
                return s.new_token(.underscore, ascii, ascii.len)
            }
            `+` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified plus "${ascii}" (${ascii.len})')
                return s.new_token(.plus, ascii, ascii.len)
            }
            `=` {
                // Everything after the `=` on this line is scanned as a value.
                s.is_left_of_assign = false
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified assignment "${ascii}" (${ascii.len})')
                return s.new_token(.assign, ascii, ascii.len)
            }
            `"`, `'` { // ... some string "/'
                ident_string := s.extract_string()!
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified quoted string `${ident_string}`')
                return s.new_token(.quoted, ident_string, ident_string.len)
            }
            `#` {
                hash := s.ignore_line()!
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified comment hash "${hash}" (${hash.len})')
                // `hash` does not contain the `#` itself, hence the `+ 1`.
                return s.new_token(.hash, hash, hash.len + 1)
            }
            `{` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified left curly bracket "${ascii}" (${ascii.len})')
                return s.new_token(.lcbr, ascii, ascii.len)
            }
            `}` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified right curly bracket "${ascii}" (${ascii.len})')
                return s.new_token(.rcbr, ascii, ascii.len)
            }
            `[` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified left square bracket "${ascii}" (${ascii.len})')
                return s.new_token(.lsbr, ascii, ascii.len)
            }
            `]` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified right square bracket "${ascii}" (${ascii.len})')
                return s.new_token(.rsbr, ascii, ascii.len)
            }
            `:` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified colon "${ascii}" (${ascii.len})')
                return s.new_token(.colon, ascii, ascii.len)
            }
            `,` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified comma "${ascii}" (${ascii.len})')
                return s.new_token(.comma, ascii, ascii.len)
            }
            `.` {
                util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                    'identified period "${ascii}" (${ascii.len})')
                return s.new_token(.period, ascii, ascii.len)
            }
            else {
                return error(@MOD + '.' + @STRUCT + '.' + @FN +
                    ' could not scan character `${ascii}` / ${c} at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
            }
        }
    }
    // Fallback after the scan loop; every path inside the loop returns or continues.
    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'unknown character code at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos,
        5)}...')
    return s.new_token(.unknown, '', 0)
}
272 
// free releases the memory held by the scanner's `text` field.
// The scanner must not be used after calling it.
@[unsafe]
pub fn (mut s Scanner) free() {
    unsafe { s.text.free() }
}
280 
// remaining returns the number of characters between the current
// position and the end of the text input.
@[inline]
pub fn (s &Scanner) remaining() int {
    total := s.text.len
    return total - s.pos
}
286 
// next consumes and returns the character code at the current position,
// advancing both `pos` and `col` by one.
// next returns `end_of_text`, without changing any state, when the
// scanner is already at the end of the input text.
@[direct_array_access; inline]
pub fn (mut s Scanner) next() u32 {
    if s.pos >= s.text.len {
        return end_of_text
    }
    ch := s.text[s.pos]
    s.pos++
    s.col++
    return ch
}
300 
// skip moves one character ahead.
// Nothing happens when that move would not land on a character of the text.
@[inline]
pub fn (mut s Scanner) skip() {
    if s.pos + 1 >= s.text.len {
        return
    }
    s.pos++
    s.col++
}
309 
// skip_n moves `n` characters ahead.
// If that goes beyond the length of `Scanner.text`, the position is
// clamped to `Scanner.text.len`.
// The column is set to the resulting position.
@[inline]
pub fn (mut s Scanner) skip_n(n int) {
    target := s.pos + n
    s.pos = if target > s.text.len { s.text.len } else { target }
    s.col = s.pos
}
321 
// at returns the character code at the *current* position of the input text.
// at returns `end_of_text` when the current position is past the last character.
// In contrast to `next()`, `at()` leaves the state of the scanner untouched.
@[direct_array_access; inline]
pub fn (s &Scanner) at() u32 {
    if s.pos >= s.text.len {
        return end_of_text
    }
    return s.text[s.pos]
}
332 
// at_crlf reports whether the scanner is positioned at a `\r`
// that is immediately followed by a `\n`.
fn (s &Scanner) at_crlf() bool {
    if s.at() != `\r` {
        return false
    }
    return s.peek(1) == `\n`
}
338 
// peek returns the character code from the input text at position + `n`.
// peek returns `end_of_text` if position + `n` lies outside of the text,
// on either side.
// For `n <= 0` the look-up is shifted one further back (position + `n` - 1)
// whenever that index exists. Right after `next()` has consumed a character `c`,
// `peek(-1)` therefore yields the character *before* `c`.
@[direct_array_access; inline]
pub fn (s &Scanner) peek(n int) u32 {
    target := s.pos + n
    // Bounds checks are disabled for this function, so both ends have to be
    // guarded by hand; a negative index must never reach `s.text[...]`.
    if target < 0 || target >= s.text.len {
        return end_of_text
    }
    // Allow peeking back - needed for spaces between date and time in RFC 3339 format :/
    if n - 1 < 0 && target - 1 >= 0 {
        // util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'LOOKING BAAAA-AACK - OOOVER MY SHOOOOULDEEEER "${s.text[s.pos + n-1]}"')
        return s.text[target - 1]
    }
    return s.text[target]
}
353 
// reset resets the internal state of the scanner to the values
// a freshly created `Scanner` has. `config` and `text` are left untouched.
pub fn (mut s Scanner) reset() {
    s.pos = 0
    s.col = 0
    s.line_nr = 1
    s.header_len = 0
    // A fresh scanner starts on the key side of an assignment; without this
    // a reset done after a `=` would leave the scanner in "value" mode.
    s.is_left_of_assign = true
}
361 
// new_token builds a `token.Token` of `kind` with the literal `lit`,
// that ends at the scanner's current position and is `len` characters long.
// The skipped header length is subtracted from the token's `pos`, and from
// its `col` while the scanner is on line 1; `col` is never less than 1.
// The token's `line_nr` is set to the scanner's `line_nr + 1`.
@[inline]
fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
    // println('new_token(${lit})')
    header_offset := if s.line_nr == 1 { s.header_len } else { 0 }
    start_col := s.col - len + 1 - header_offset
    return token.Token{
        kind:    kind
        lit:     lit
        col:     if start_col < 1 { 1 } else { start_col }
        line_nr: s.line_nr + 1
        pos:     s.pos - s.header_len - len + 1
        len:     len
    }
}
379 
// ignore_line advances the scanner to the end of the current line and
// returns the text that was passed over.
// The scanner stops *at* the terminating `\n` (or at the `\r` of a `\r\n` pair),
// or at the end of the text, so the line ending is not part of the result.
@[direct_array_access; inline]
fn (mut s Scanner) ignore_line() !string {
    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL...')
    start := s.pos
    for {
        c := s.at()
        if c == end_of_text || c == `\n` {
            break
        }
        util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "${u8(c).ascii_str()} / ${c}"')
        if s.at_crlf() {
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'letting `\\r\\n` slip through')
            break
        }
        s.next()
    }
    return s.text[start..s.pos]
}
395 
// inc_line_number moves the scanner's bookkeeping to the start of the next line:
// the line number is increased, the column is set back to 0 and the scanner
// is considered to be on the left side of an assignment again.
@[inline]
fn (mut s Scanner) inc_line_number() {
    s.line_nr++
    s.col = 0
    s.is_left_of_assign = true
}
403 
// extract_key collects and returns a bare TOML key as a string.
// It is called after the first character of the key has been consumed, so the
// scanner is rewound by one character first. The key runs for as long as the
// characters are key characters, digits, `_` or `-`.
@[direct_array_access; inline]
fn (mut s Scanner) extract_key() string {
    s.pos--
    s.col--
    start := s.pos
    for s.pos < s.text.len {
        ch := u8(s.at())
        belongs_to_key := util.is_key_char(ch) || ch.is_digit() || ch == `_` || ch == `-`
        if !belongs_to_key {
            break
        }
        s.pos++
        s.col++
    }
    return s.text[start..s.pos]
}
421 
// extract_string collects and returns a string containing
// any bytes recognized as a TOML string.
// TOML strings are everything found between two double or single quotation marks (`"`/`'`).
// The returned string includes the surrounding quotes.
// Strings opened with three quotes are handed over to `extract_multiline_string`.
// An error is returned if the string is not closed before the end of the line/text
// or if it contains an illegal ASCII control character.
@[direct_array_access; inline]
fn (mut s Scanner) extract_string() !string {
    // extract_string is called when the scanner has already reached
    // a byte that is the start of a string so we rewind it to start at the correct
    s.pos--
    s.col--
    quote := u8(s.at())
    start := s.pos
    mut lit := quote.ascii_str()

    // Use the bounds-aware `peek` here: bounds checks are disabled for this function,
    // and the opening quote may be the very last byte of the text.
    is_multiline := s.peek(1) == quote && s.peek(2) == quote
    // Check for escaped multiline quote
    if is_multiline {
        mls := s.extract_multiline_string()!
        return mls
    }

    for {
        s.pos++
        s.col++

        if s.pos >= s.text.len {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' unfinished single-line string literal `${quote.ascii_str()}` started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
        }

        c := u8(s.at())
        util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
            'c: `${c.ascii_str()}` / ${c} (quote type: ${quote}/${quote.ascii_str()})')

        // Check for escaped chars (92 is the backslash)
        if c == u8(92) {
            esc, skip := s.handle_escapes(quote, is_multiline)
            lit += esc
            if skip > 0 {
                // The escape sequence was collected as a whole; step over it.
                s.pos += skip
                s.col += skip
                continue
            }
        }
        // Check for control characters (allow TAB)
        if util.is_illegal_ascii_control_character(c) {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' control character `${c.hex()}` is not allowed at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
        }

        if c == quote {
            // Closing quote: step past it and return the literal including both quotes.
            s.pos++
            s.col++
            return lit + quote.ascii_str()
        }

        lit += c.ascii_str()

        // Don't eat multiple lines in single-line mode
        if lit.contains('\n') {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' unfinished single-line string literal `${quote.ascii_str()}` started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
        }
    }
    return lit
}
487 
// extract_multiline_string collects and returns a string containing
// any bytes recognized as a multi-line TOML string, i.e. everything found between
// two runs of three double or single quotation marks (`"""`/`'''`).
// The returned string includes the opening and closing delimiters.
// A `\r` directly followed by `\n` is dropped from the result.
// An error is returned if the string is not closed before the end of the text
// or if it contains an illegal ASCII control character.
@[direct_array_access; inline]
fn (mut s Scanner) extract_multiline_string() !string {
    // extract_multiline_string is called from extract_string so we know that
    // the first 3 characters are the quotes
    quote := u8(s.at())
    start := s.pos
    delimiter := quote.ascii_str() + quote.ascii_str() + quote.ascii_str()
    mut collected := delimiter

    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'multi-line `${quote.ascii_str()}${s.text[s.pos + 1].ascii_str()}${s.text[s.pos + 2].ascii_str()}` string started at pos ${start} (${s.line_nr},${s.col}) (quote type: ${quote.ascii_str()} / ${quote})')

    // Move to the last quote of the opening delimiter.
    s.pos += 2
    s.col += 2

    for {
        s.pos++
        s.col++

        if s.pos >= s.text.len {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' unfinished multi-line string literal (${quote.ascii_str()}${quote.ascii_str()}${quote.ascii_str()}) started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
        }

        c := u8(s.at())
        util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
            'c: `${c.ascii_str()}` / ${c} (quote type: ${quote}/${quote.ascii_str()})')

        // The `\r` of a `\r\n` pair is not collected.
        if c == `\r` && s.peek(1) == `\n` {
            continue
        }
        if c == `\n` {
            collected += c.ascii_str()
            s.inc_line_number()
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `\\n` / ${c}')
            continue
        }
        // Check for escaped chars (92 is the backslash)
        if c == u8(92) {
            escaped, consumed := s.handle_escapes(quote, true)
            collected += escaped
            if consumed > 0 {
                s.pos += consumed
                s.col += consumed
                continue
            }
        }
        // Check for control characters (allow TAB)
        if util.is_illegal_ascii_control_character(c) {
            return error(@MOD + '.' + @STRUCT + '.' + @FN +
                ' control character `${c.hex()}` is not allowed at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
        }

        // Three quotes in a row end the string, unless a fourth quote follows;
        // in that case the current quote is content. `end_of_text` never equals a quote,
        // so the end of the text right after the delimiter is covered here as well.
        if c == quote && s.peek(1) == quote && s.peek(2) == quote && s.peek(3) != quote {
            s.pos += 3
            s.col += 3
            collected += quote.ascii_str() + quote.ascii_str() + quote.ascii_str()
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                'returning at ${c.ascii_str()} `${collected}`')
            return collected
        }
        collected += c.ascii_str()
    }
    return collected
}
568 
// handle_escapes collects the escape sequence that starts at the current position.
// It returns the text of the sequence, and how many characters *after* the
// current one belong to it (the caller has to step over these).
// `'', 0` is returned when the current character should not be treated as
// the start of an escape sequence.
// For escape sequence validation see `Checker.check_quoted_escapes`.
fn (mut s Scanner) handle_escapes(quote u8, is_multiline bool) (string, int) {
    c := u8(s.at())
    mut lit := c.ascii_str()
    if quote == `'` {
        // Literal strings: a quote right after the current character is not escaped.
        if s.peek(1) == quote {
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                'ignore escape `${lit}${u8(s.peek(1)).ascii_str()}` in literal string')
            return '', 0
        }
    } else if s.peek(1) == `u` && u8(s.peek(2)).is_hex_digit() && u8(s.peek(3)).is_hex_digit()
        && u8(s.peek(4)).is_hex_digit() && u8(s.peek(5)).is_hex_digit() {
        // `u` followed by four hex digits
        lit += s.text[s.pos + 1..s.pos + 6] //.ascii_str()
        util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped unicode `${lit}`')
        return lit, 5
    } else if s.peek(1) == quote {
        ends_single_line := !is_multiline && s.peek(2) == `\n`
        ends_multi_line := is_multiline && s.peek(2) == quote && s.peek(3) == quote
            && s.peek(4) == `\n`
        if ends_single_line || ends_multi_line {
            util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
                'ignore special case escaped `${lit}` at end of string')
            return '', 0
        }
        lit += quote.ascii_str()
        util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `${lit}`')
        return lit, 1
    }

    // Any other sequence: the current character plus the one that follows it.
    lit += u8(s.peek(1)).ascii_str()
    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `${lit}`')
    return lit, 1
}
605 
606 // extract_number collects and returns a string containing
607 // any bytes recognized as a TOML number except for "(+/-)nan" and "(+/-)inf".
608 // Besides a leading sign, the collected bytes are hex digits and the characters in `digit_extras`.
// An error is returned if the byte the scanner is rewound to is neither a digit nor `+`/`-`.
609 @[direct_array_access; inline]
610 fn (mut s Scanner) extract_number() !string {
611     // extract_number is called when the scanner has already reached
612     // a byte that is a number or +/- - so we rewind it to start at the correct
613     // position to get the complete number. Even if it's only one digit
614     s.pos--
615     s.col--
616     start := s.pos
617
618     mut c := s.at()
619     is_digit := u8(c).is_digit()
620     if !(is_digit || c in [`+`, `-`]) {
621         return error(@MOD + '.' + @STRUCT + '.' + @FN +
622             ' ${u8(c).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}')
623     }
    // The first byte (digit or sign) is accepted as is.
624     s.pos++
625     s.col++
626     for s.pos < s.text.len {
627         c = s.at()
628         // Adjust scanner position to floating point numbers
        // When at a `.`, count the bytes that follow it up to the next `\n`,
        // as long as they are digits or `,`. Any other byte sets the count back to 0.
        // NOTE(review): `c_` is a `u8` while `end_of_text` is a `u32`, so the
        // `c_ != end_of_text` test presumably never fails; at the end of the text the
        // loop then appears to end through the "not a digit" branch instead - confirm.
629         mut float_precision := 0
630         if c == `.` {
631             mut i := 1
632             for c_ := u8(s.peek(i)); c_ != end_of_text && c_ != `\n`; c_ = u8(s.peek(i)) {
633                 if !c_.is_digit() && c_ != `,` {
634                     float_precision = 0
635                     break
636                 }
637                 float_precision++
638                 i++
639             }
640         }
        // Jump ahead by the counted amount (0 when nothing was counted).
641         s.pos += float_precision
642         s.col += float_precision
643         // Handle signed exponent notation. I.e.: 3e2, 3E2, 3e-2, 3E+2, 3e0, 3.1e2, 3.1E2, -1E-1
        // Moves the scanner from the `e`/`E` over the sign, onto the first exponent digit.
644         if c in [`e`, `E`] && s.peek(1) in [`+`, `-`] && u8(s.peek(2)).is_digit() {
645             s.pos += 2
646             s.col += 2
647         }
        // Re-read the current byte, since the position may have been moved above.
648         c = s.at()
        // Stop at the first byte that can not be part of a number. On the left side of
        // an assignment a `.` ends the number as well.
649         if !(u8(c).is_hex_digit() || c in digit_extras) || (c == `.` && s.is_left_of_assign) {
650             break
651         }
652         s.pos++
653         s.col++
654     }
655     key := s.text[start..s.pos]
656     util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
657         'identified number "${key}" in range [${start} .. ${s.pos}]')
658     return key
659 }
660 
// extract_nan_or_inf_number collects and returns a string containing
// any bytes recognized as an infinity or not-a-number TOML number.
// After the first byte (`+`, `-`, `n` or `i`), every following `n`, `a`, `i`
// or `f` is collected.
// An error is returned if the byte the scanner is rewound to is none of
// `+`, `-`, `n` or `i`.
@[direct_array_access; inline]
fn (mut s Scanner) extract_nan_or_inf_number() !string {
    // The first byte of the number has already been consumed by the caller,
    // so step back onto it to capture the number from its very start.
    s.pos--
    s.col--
    start := s.pos

    first := s.at()
    if first !in [`+`, `-`, `n`, `i`] {
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' ${u8(first).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}')
    }
    s.pos++
    s.col++
    for s.pos < s.text.len && s.at() in [`n`, `a`, `i`, `f`] {
        s.pos++
        s.col++
    }
    special := s.text[start..s.pos]
    util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
        'identified special number "${special}" in range [${start} .. ${s.pos}]')
    return special
}
691 
// excerpt returns a string excerpt of the input text centered
// at `pos`. The `margin` argument defines how many characters
// on each side of `pos` are returned.
// The excerpt is cut off at the start and the end of the text, and newlines in it
// are replaced by the two characters `\n`.
// A `pos` outside of the text, or a negative `margin`, results in a shorter
// (possibly empty) excerpt.
pub fn (s &Scanner) excerpt(pos int, margin int) string {
    mut start := if pos > 0 && pos >= margin { pos - margin } else { 0 }
    mut end := if pos + margin < s.text.len { pos + margin } else { s.text.len }
    // Keep the slice bounds valid for any input; excerpt is mostly used while
    // building error messages and should never fail itself.
    if end < 0 {
        end = 0
    }
    if start > end {
        start = end
    }
    return s.text[start..end].replace('\n', r'\n')
}
700 
// state returns a copy of the scanner's current column, line number and position.
// Later changes to the scanner are not reflected in the returned `State`.
pub fn (s &Scanner) state() State {
    snapshot := State{
        col:     s.col
        line_nr: s.line_nr
        pos:     s.pos
    }
    return snapshot
}
709 
// validate_and_skip_headers skips a UTF-8 byte order mark (BOM) at the current
// position, if there is one, and records its length in `header_len`.
// An error is returned if a UTF-16 or UTF-32 BOM is found, either at the
// current position or right after a skipped UTF-8 BOM.
fn (mut s Scanner) validate_and_skip_headers() ! {
    // UTF-16 / UTF-32 headers (BE/LE)
    s.check_utf16_or_32_bom()!

    // NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only.

    // Skip optional UTF-8 header, if any.
    has_utf8_bom := s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF
    if has_utf8_bom {
        util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)')
        s.header_len = 3
        s.skip_n(s.header_len)
    }

    // Check after we've skipped UTF-8 BOM
    s.check_utf16_or_32_bom()!
}
726 
// check_utf16_or_32_bom returns an error if the bytes at the current position
// form a UTF-32 or UTF-16 byte order mark (BOM), in either byte order.
// Before the error is returned, the length of the BOM is stored in `header_len`
// and the BOM is skipped.
fn (mut s Scanner) check_utf16_or_32_bom() ! {
    b0 := s.at()
    b1 := s.peek(1)

    // The 4 byte UTF-32 marks have to be tested first, since `FF FE 00 00`
    // starts with the same 2 bytes as one of the UTF-16 marks.
    is_utf32_le := b0 == 0xFF && b1 == 0xFE && s.peek(2) == 0x00 && s.peek(3) == 0x00
    is_utf32_be := b0 == 0x00 && b1 == 0x00 && s.peek(2) == 0xFE && s.peek(3) == 0xFF
    if is_utf32_le || is_utf32_be {
        s.header_len = 4
        s.skip_n(s.header_len)
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' UTF-32 is not a valid TOML encoding at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
    }

    is_utf16_be := b0 == 0xFE && b1 == 0xFF
    is_utf16_le := b0 == 0xFF && b1 == 0xFE
    if is_utf16_be || is_utf16_le {
        s.header_len = 2
        s.skip_n(s.header_len)
        return error(@MOD + '.' + @STRUCT + '.' + @FN +
            ' UTF-16 is not a valid TOML encoding at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
    }
}
742