v2 / vlib / toml / scanner / scanner.v
741 lines · 680 sloc · 22.94 KB · e2e5cf8db56f3562c7baa735061690be936bdf3e
Raw
1// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module scanner
5
6import toml.input
7import toml.token
8import toml.util
9
// digit_extras lists the non-digit characters that `extract_number` keeps
// consuming as part of a number literal (in addition to hexadecimal digits).
pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]

// end_of_text is the sentinel character code returned by `next`, `at` and `peek`
// when there is no character to return.
pub const end_of_text = u32(~0)
12
// Scanner holds the input text together with the cursor state of the scan process.
// The work a scanner does is also known as "lexing" or "tokenizing".
// Many of the Scanner methods are based on the work in `vlib/strings/textscanner`.
pub struct Scanner {
pub:
	config Config // the configuration the scanner was created with
	text   string // the input TOML text
mut:
	col        int     // current column number (x coordinate)
	line_nr    int = 1 // current line number (y coordinate)
	pos        int     // current flat/index position in the `text` field
	header_len int     // how many bytes of header (byte order mark) were found
	// Quirks
	is_left_of_assign bool = true // true while the scanner is on the *left* side of an assignment
}
28
// State is a read-only snapshot of the scanner's cursor.
// It is produced by `Scanner.state()`.
pub struct State {
pub:
	col     int     // column number (x coordinate) at the time of the snapshot
	line_nr int = 1 // line number (y coordinate) at the time of the snapshot
	pos     int     // flat/index position in the scanner's `text` at the time of the snapshot
}
37
// Config configures a Scanner instance.
// Only one of the `input` fields `text` and `file_path` is allowed to be set
// at the time of configuration.
pub struct Config {
pub:
	input               input.Config // where the TOML text is read from
	tokenize_formatting bool = true  // if true, generate tokens for `\n`, ` `, `\t`, `\r` etc.
}
45
// new_scanner returns a new *heap* allocated `Scanner` instance for `config`.
// The text to scan is obtained from `config.input.read_input()`, i.e. from the
// file in `config.input.file_path` or from the text in `config.input.text`.
// An error from reading the input is propagated to the caller.
pub fn new_scanner(config Config) !&Scanner {
	text := config.input.read_input()!
	return &Scanner{
		config: config
		text:   text
	}
}
55
// new_simple returns a new *stack* allocated `Scanner` instance for `config`.
// An error from reading the configured input is propagated to the caller.
pub fn new_simple(config Config) !Scanner {
	text := config.input.read_input()!
	return Scanner{
		config: config
		text:   text
	}
}
63
// new_simple_text returns a new *stack* allocated `Scanner` instance
// ready for scanning the TOML in `text`.
// All other configuration values keep their defaults.
pub fn new_simple_text(text string) !Scanner {
	config := Config{
		input: input.Config{
			text: text
		}
	}
	return Scanner{
		config: config
		text:   config.input.read_input()!
	}
}
78
// new_simple_file returns a new *stack* allocated `Scanner` instance
// ready for scanning the TOML in the file read from `path`.
// All other configuration values keep their defaults; an error from reading
// the input is propagated to the caller.
pub fn new_simple_file(path string) !Scanner {
	config := Config{
		input: input.Config{
			file_path: path
		}
	}
	return Scanner{
		config: config
		text:   config.input.read_input()!
	}
}
93
// scan returns the next token from the input.
// Any byte order mark is validated/skipped first. Characters are then consumed
// until a token can be produced:
// - end of input yields an `.eof` token,
// - `nan`/`inf` (optionally signed) on the right side of an assignment and
//   (optionally signed) digits yield `.number` tokens,
// - key characters yield `.bare` tokens, or `.boolean` for `true`/`false`,
// - quotes yield `.quoted` tokens, `#` yields a `.hash` token for the comment,
// - formatting characters yield tokens only if `config.tokenize_formatting`
//   is set (a space between two digits is always returned, see below).
// An error is returned for NULL bytes, a `\r` that is not followed by `\n`
// and for characters that match none of the above.
@[direct_array_access]
pub fn (mut s Scanner) scan() !token.Token {
	s.validate_and_skip_headers()!

	// Prefix shared by all debug and error messages of this method.
	ctx := @MOD + '.' + @STRUCT + '.' + @FN
	for {
		c := s.next()
		byte_c := u8(c)
		if c == end_of_text {
			s.inc_line_number()
			util.printdbg(ctx, 'reached EOF')
			return s.new_token(.eof, '', 1)
		}

		ascii := byte_c.ascii_str()
		util.printdbg(ctx, 'current char "${ascii}"')

		if byte_c == u8(0x0) {
			// The scanner is reset *before* the message is built, so the
			// position values in the message are the reset ones.
			s.reset()
			return error(ctx +
				' NULL control character `${c.hex()}` is not allowed at (${s.line_nr},${s.col}) "${ascii}" near ...${s.excerpt(s.pos, 5)}...')
		}

		is_sign := c == `+` || c == `-`

		// (+/-)nan & (+/-)inf
		// `c` has already been consumed, so `s.at()` is the character after `c`.
		cur := s.at()
		peek_1 := s.peek(1)
		peek_2 := s.peek(2)
		is_nan := c == `n` && cur == `a` && peek_1 == `n`
		is_inf := !is_nan && c == `i` && cur == `n` && peek_1 == `f`
		is_signed_nan := is_sign && cur == `n` && peek_1 == `a` && peek_2 == `n`
		is_signed_inf := !is_signed_nan && is_sign && cur == `i` && peek_1 == `n`
			&& peek_2 == `f`
		if !s.is_left_of_assign && (is_nan || is_inf || is_signed_nan || is_signed_inf) {
			num := s.extract_nan_or_inf_number()!
			util.printdbg(ctx, 'identified a special number "${num}" (${num.len})')
			return s.new_token(.number, num, num.len)
		}

		is_signed_number := is_sign && u8(cur).is_digit() && !u8(s.peek(-1)).is_digit()
		if byte_c.is_digit() || is_signed_number {
			num := s.extract_number()!
			util.printdbg(ctx, 'identified a number "${num}" (${num.len})')
			return s.new_token(.number, num, num.len)
		}

		if util.is_key_char(byte_c) {
			key := s.extract_key()
			if u8(s.peek(1)) != `=` && (key == 'true' || key == 'false') {
				util.printdbg(ctx, 'identified a boolean "${key}" (${key.len})')
				return s.new_token(.boolean, key, key.len)
			}
			util.printdbg(ctx, 'identified a bare key "${key}" (${key.len})')
			return s.new_token(.bare, key, key.len)
		}

		match rune(c) {
			` `, `\t`, `\n`, `\r` {
				if c == `\n` {
					s.inc_line_number()
					util.printdbg(ctx, 'incremented line nr to ${s.line_nr}')
				} else if c == `\r` && s.at() != `\n` {
					// CR should always be followed by a `\n`
					return error(ctx +
						' missing newline/linefeed character after "\\r" carriage return at (${s.line_nr},${s.col}) "${ascii}" near ...${s.excerpt(s.pos, 5)}...')
				}
				// Date-Time in RFC 3339 is allowed to have a space between the date and time in supplement to the 'T'
				// so we allow space characters to slip through to the parser if the space is between two digits...
				if c == ` ` && u8(s.peek(-1)).is_digit() && u8(s.at()).is_digit() {
					util.printdbg(ctx, 'identified, what could be, a space between a RFC 3339 date and time ("${ascii}") (${ascii.len})')
					return s.new_token(token.Kind.whitespace, ascii, ascii.len)
				}
				if !s.config.tokenize_formatting {
					util.printdbg(ctx, 'skipping " ", "\\t" or "\\n" ("${ascii}") (${ascii.len})')
					continue
				}
				kind := match rune(c) {
					`\t` { token.Kind.tab }
					`\r` { token.Kind.cr }
					`\n` { token.Kind.nl }
					else { token.Kind.whitespace }
				}
				util.printdbg(ctx, 'identified formatting character ("${ascii}") (${ascii.len})')
				return s.new_token(kind, ascii, ascii.len)
			}
			`-` {
				util.printdbg(ctx, 'identified minus "${ascii}" (${ascii.len})')
				return s.new_token(.minus, ascii, ascii.len)
			}
			`_` {
				util.printdbg(ctx, 'identified underscore "${ascii}" (${ascii.len})')
				return s.new_token(.underscore, ascii, ascii.len)
			}
			`+` {
				util.printdbg(ctx, 'identified plus "${ascii}" (${ascii.len})')
				return s.new_token(.plus, ascii, ascii.len)
			}
			`=` {
				// From here on, until the next line, the scanner is right of the assignment.
				s.is_left_of_assign = false
				util.printdbg(ctx, 'identified assignment "${ascii}" (${ascii.len})')
				return s.new_token(.assign, ascii, ascii.len)
			}
			`"`, `'` { // ... some string "/'
				ident_string := s.extract_string()!
				util.printdbg(ctx, 'identified quoted string `${ident_string}`')
				return s.new_token(.quoted, ident_string, ident_string.len)
			}
			`#` {
				hash := s.ignore_line()!
				util.printdbg(ctx, 'identified comment hash "${hash}" (${hash.len})')
				// + 1 accounts for the `#` itself, which is not part of `hash`.
				return s.new_token(.hash, hash, hash.len + 1)
			}
			`{` {
				util.printdbg(ctx, 'identified left curly bracket "${ascii}" (${ascii.len})')
				return s.new_token(.lcbr, ascii, ascii.len)
			}
			`}` {
				util.printdbg(ctx, 'identified right curly bracket "${ascii}" (${ascii.len})')
				return s.new_token(.rcbr, ascii, ascii.len)
			}
			`[` {
				util.printdbg(ctx, 'identified left square bracket "${ascii}" (${ascii.len})')
				return s.new_token(.lsbr, ascii, ascii.len)
			}
			`]` {
				util.printdbg(ctx, 'identified right square bracket "${ascii}" (${ascii.len})')
				return s.new_token(.rsbr, ascii, ascii.len)
			}
			`:` {
				util.printdbg(ctx, 'identified colon "${ascii}" (${ascii.len})')
				return s.new_token(.colon, ascii, ascii.len)
			}
			`,` {
				util.printdbg(ctx, 'identified comma "${ascii}" (${ascii.len})')
				return s.new_token(.comma, ascii, ascii.len)
			}
			`.` {
				util.printdbg(ctx, 'identified period "${ascii}" (${ascii.len})')
				return s.new_token(.period, ascii, ascii.len)
			}
			else {
				return error(ctx +
					' could not scan character `${ascii}` / ${c} at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
			}
		}
	}
	// The loop above has no `break`; every path through it returns or continues.
	util.printdbg(ctx, 'unknown character code at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos,
		5)}...')
	return s.new_token(.unknown, '', 0)
}
272
// free releases the memory of the scanner's `text`.
// The function is marked unsafe; the scanner's `text` is freed by this call.
@[unsafe]
pub fn (mut s Scanner) free() {
	unsafe { s.text.free() }
}
280
// remaining returns how many characters of the text input are
// left between the current position and the end of the text.
@[inline]
pub fn (s &Scanner) remaining() int {
	total := s.text.len
	return total - s.pos
}
286
// next returns the character code at the current position and then
// advances the position and the column by one.
// next returns `end_of_text`, leaving the scanner untouched, if the
// position is already at the end of the text.
@[direct_array_access; inline]
pub fn (mut s Scanner) next() u32 {
	if s.pos >= s.text.len {
		return end_of_text
	}
	c := s.text[s.pos]
	s.pos++
	s.col++
	return c
}
300
// skip skips one character ahead.
// Nothing happens if the scanner is already at the last character of
// the text (or beyond it).
@[inline]
pub fn (mut s Scanner) skip() {
	if s.pos + 1 >= s.text.len {
		return
	}
	s.pos++
	s.col++
}
309
// skip_n skips ahead `n` characters.
// If the skip goes out of bounds from the length of `Scanner.text`,
// the scanner position is set to the end of the text.
// The column is advanced by the number of characters that were actually
// skipped, so it stays relative to the current line.
@[inline]
pub fn (mut s Scanner) skip_n(n int) {
	before := s.pos
	s.pos += n
	if s.pos > s.text.len {
		s.pos = s.text.len
	}
	// On the first line `col` equals `pos`, so this matches the previous
	// `s.col = s.pos` there, while staying correct on later lines.
	s.col += s.pos - before
}
321
// at returns the character code at the *current* position of the input text.
// at returns `end_of_text` if the position is at or beyond the end of the text.
// Unlike `next()`, `at()` does not change the state of the scanner.
@[direct_array_access; inline]
pub fn (s &Scanner) at() u32 {
	if s.pos >= s.text.len {
		return end_of_text
	}
	return s.text[s.pos]
}
332
// at_crlf returns `true` if the scanner is at a `\r` character
// that is directly followed by a `\n` character.
fn (s &Scanner) at_crlf() bool {
	if s.at() != `\r` {
		return false
	}
	return s.peek(1) == `\n`
}
338
// peek returns the character code from the input text at position + `n`.
// peek returns `end_of_text` if it can't peek `n` characters ahead, or if the
// requested position lies before the start of the text.
// NOTE: for `n` < 1 the character at position + `n` - 1 is returned when that
// index is valid. `scan` relies on this for `peek(-1)`: it calls `peek` after
// the current character has been consumed, so this yields the character
// *before* the consumed one.
@[direct_array_access; inline]
pub fn (s &Scanner) peek(n int) u32 {
	target := s.pos + n
	if target >= s.text.len {
		return end_of_text
	}
	// Allow peeking back - needed for spaces between date and time in RFC 3339 format :/
	if n < 1 && target >= 1 {
		return s.text[target - 1]
	}
	if target < 0 {
		// Bounds checks are disabled by `direct_array_access`, so a negative
		// index must never reach the array access below.
		return end_of_text
	}
	return s.text[target]
}
353
// reset resets the internal state of the scanner to the values a
// freshly created scanner has: the start of the text, on line 1, with no
// header found and on the left side of an assignment.
pub fn (mut s Scanner) reset() {
	s.pos = 0
	s.col = 0
	s.line_nr = 1
	s.header_len = 0
	// Without this, a scanner that was reset after passing a `=` would still
	// treat the start of the text as the right side of an assignment.
	s.is_left_of_assign = true
}
361
// new_token returns a new `token.Token` of `kind` with the literal `lit`.
// `len` is the length of the token in the input; it is used to compute the
// token's start column and position from the scanner's current state.
// On the first line the header length is subtracted from the column, and the
// column is never reported as less than 1.
// NOTE(review): the token's `line_nr` is the scanner's `line_nr` + 1 - confirm
// that consumers expect this offset.
@[inline]
fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
	header_shift := if s.line_nr == 1 { s.header_len } else { 0 }
	raw_col := s.col - len + 1 - header_shift
	return token.Token{
		kind:    kind
		lit:     lit
		col:     if raw_col < 1 { 1 } else { raw_col }
		line_nr: s.line_nr + 1
		pos:     s.pos - s.header_len - len + 1
		len:     len
	}
}
379
// ignore_line forwards the scanner to the end of the current line and
// returns the text that was passed over.
// The scanner stops *at* the terminating `\n`, or at the `\r` of a `\r\n`
// pair, or at the end of the text; the line ending itself is not consumed.
@[direct_array_access; inline]
fn (mut s Scanner) ignore_line() !string {
	ctx := @MOD + '.' + @STRUCT + '.' + @FN
	util.printdbg(ctx, ' ignoring until EOL...')
	start := s.pos
	for {
		c := s.at()
		if c == end_of_text || c == `\n` {
			break
		}
		util.printdbg(ctx, 'skipping "${u8(c).ascii_str()} / ${c}"')
		if s.at_crlf() {
			util.printdbg(ctx, 'letting `\\r\\n` slip through')
			break
		}
		s.next()
	}
	return s.text[start..s.pos]
}
395
// inc_line_number moves the scanner's bookkeeping to a new line:
// the line number is increased, the column starts over at 0 and the
// scanner is on the left side of an assignment again.
@[inline]
fn (mut s Scanner) inc_line_number() {
	s.line_nr++
	s.col = 0
	s.is_left_of_assign = true
}
403
// extract_key collects and returns a TOML key as a string.
// It is called after the first character of the key has been consumed, so the
// scanner is rewound by one character first. The key then extends over all
// following key characters, digits, `_` and `-`.
@[direct_array_access; inline]
fn (mut s Scanner) extract_key() string {
	s.pos--
	s.col--
	start := s.pos
	for s.pos < s.text.len {
		ch := u8(s.at())
		is_key_part := util.is_key_char(ch) || ch.is_digit() || ch in [`_`, `-`]
		if !is_key_part {
			break
		}
		s.pos++
		s.col++
	}
	return s.text[start..s.pos]
}
421
// extract_string collects and returns a string containing
// any bytes recognized as a TOML string.
// TOML strings are everything found between two double or single quotation marks (`"`/`'`).
// The returned string includes the surrounding quotes. Strings that start with
// three quotes are handed over to `extract_multiline_string`.
// An error is returned if the string is not closed before the end of the text
// or the end of the line, or if it contains an illegal control character.
@[direct_array_access; inline]
fn (mut s Scanner) extract_string() !string {
	ctx := @MOD + '.' + @STRUCT + '.' + @FN
	// extract_string is called when the scanner has already reached
	// a byte that is the start of a string so we rewind it to start at the correct
	// position.
	s.pos--
	s.col--
	quote := u8(s.at())
	start := s.pos
	mut lit := quote.ascii_str()

	// `peek` is bounds-checked and returns `end_of_text` past the end of the
	// text. Indexing `s.text` directly here would read out of bounds (bounds
	// checks are disabled by `direct_array_access`) when the opening quote is
	// at the very end of the input.
	is_multiline := s.peek(1) == quote && s.peek(2) == quote
	// Check for escaped multiline quote
	if is_multiline {
		mls := s.extract_multiline_string()!
		return mls
	}

	for {
		s.pos++
		s.col++

		if s.pos >= s.text.len {
			return error(ctx +
				' unfinished single-line string literal `${quote.ascii_str()}` started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		c := u8(s.at())
		util.printdbg(ctx, 'c: `${c.ascii_str()}` / ${c} (quote type: ${quote}/${quote.ascii_str()})')

		// Check for escaped chars (92 is the backslash)
		if c == u8(92) {
			esc, skip := s.handle_escapes(quote, is_multiline)
			lit += esc
			if skip > 0 {
				s.pos += skip
				s.col += skip
				continue
			}
		}
		// Check for control characters (allow TAB)
		if util.is_illegal_ascii_control_character(c) {
			return error(ctx +
				' control character `${c.hex()}` is not allowed at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		if c == quote {
			// Step past the closing quote and include it in the result.
			s.pos++
			s.col++
			return lit + quote.ascii_str()
		}

		lit += c.ascii_str()

		// Don't eat multiple lines in single-line mode.
		// The whole literal is checked (not only `c`), since a newline can
		// also have been added to it by the escape handling above.
		if lit.contains('\n') {
			return error(ctx +
				' unfinished single-line string literal `${quote.ascii_str()}` started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}
	}
	return lit
}
487
// extract_multiline_string collects and returns a string containing
// any bytes recognized as a multi-line TOML string.
// Multi-line TOML strings are everything found between two runs of three
// double or single quotation marks. The returned string includes both
// delimiters.
// `\r` characters that are directly followed by `\n` are left out of the
// result, and the scanner's line number is increased for every `\n`.
// An error is returned if the string is not closed before the end of the text,
// or if it contains an illegal control character.
@[direct_array_access; inline]
fn (mut s Scanner) extract_multiline_string() !string {
	ctx := @MOD + '.' + @STRUCT + '.' + @FN
	// extract_multiline_string is called from extract_string so we know that the
	// first 3 characters are the quotes.
	quote := u8(s.at())
	quote_str := quote.ascii_str()
	delimiter := quote_str + quote_str + quote_str
	start := s.pos
	mut lit := delimiter

	util.printdbg(ctx, 'multi-line `${quote_str}${s.text[s.pos + 1].ascii_str()}${s.text[
		s.pos + 2].ascii_str()}` string started at pos ${start} (${s.line_nr},${s.col}) (quote type: ${quote_str} / ${quote})')

	// Move to the last quote of the opening delimiter.
	s.pos += 2
	s.col += 2

	for {
		s.pos++
		s.col++

		if s.pos >= s.text.len {
			return error(ctx +
				' unfinished multi-line string literal (${delimiter}) started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		c := u8(s.at())
		util.printdbg(ctx, 'c: `${c.ascii_str()}` / ${c} (quote type: ${quote}/${quote_str})')

		// Drop the `\r` of a `\r\n` pair; the `\n` is handled in the next round.
		if c == `\r` && s.peek(1) == `\n` {
			continue
		}
		if c == `\n` {
			s.inc_line_number()
			lit += c.ascii_str()
			util.printdbg(ctx, 'c: `\\n` / ${c}')
			continue
		}
		// Check for escaped chars (92 is the backslash)
		if c == u8(92) {
			esc, skip := s.handle_escapes(quote, true)
			lit += esc
			if skip > 0 {
				s.pos += skip
				s.col += skip
				continue
			}
		}
		// Check for control characters (allow TAB)
		if util.is_illegal_ascii_control_character(c) {
			return error(ctx +
				' control character `${c.hex()}` is not allowed at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		// Three quotes close the string, unless a fourth quote follows; in that
		// case the current quote is content. `peek` returns `end_of_text` at the
		// end of the text, which never equals a quote.
		if c == quote && s.peek(1) == quote && s.peek(2) == quote && s.peek(3) != quote {
			s.pos += 3
			s.col += 3
			lit += delimiter
			util.printdbg(ctx, 'returning at ${c.ascii_str()} `${lit}`')
			return lit
		}
		lit += c.ascii_str()
	}
	return lit
}
568
// handle_escapes returns the escape character sequence at the current
// position, together with the number of characters the caller has to skip.
// It is called with the scanner positioned at a backslash inside a string
// delimited by `quote`. A skip count of 0 (with an empty sequence) means
// that the backslash should be treated as an ordinary character.
// For escape sequence validation see `Checker.check_quoted_escapes`.
fn (mut s Scanner) handle_escapes(quote u8, is_multiline bool) (string, int) {
	ctx := @MOD + '.' + @STRUCT + '.' + @FN
	mut lit := u8(s.at()).ascii_str()
	if quote == `'` {
		// Literal string: a backslash in front of the quote does not escape it.
		if s.peek(1) == quote {
			util.printdbg(ctx, 'ignore escape `${lit}${u8(s.peek(1)).ascii_str()}` in literal string')
			return '', 0
		}
	} else {
		is_unicode := s.peek(1) == `u` && u8(s.peek(2)).is_hex_digit()
			&& u8(s.peek(3)).is_hex_digit() && u8(s.peek(4)).is_hex_digit()
			&& u8(s.peek(5)).is_hex_digit()
		if is_unicode {
			// `u` followed by 4 hex digits
			lit += s.text[s.pos + 1..s.pos + 6]
			util.printdbg(ctx, 'gulp escaped unicode `${lit}`')
			return lit, 5
		} else if s.peek(1) == quote {
			ends_line := !is_multiline && s.peek(2) == `\n`
			ends_multiline := is_multiline && s.peek(2) == quote && s.peek(3) == quote
				&& s.peek(4) == `\n`
			if ends_line || ends_multiline {
				util.printdbg(ctx, 'ignore special case escaped `${lit}` at end of string')
				return '', 0
			}
			lit += quote.ascii_str()
			util.printdbg(ctx, 'gulp escaped `${lit}`')
			return lit, 1
		}
	}

	// Any other sequence: the backslash plus the character following it.
	lit += u8(s.peek(1)).ascii_str()
	util.printdbg(ctx, 'gulp escaped `${lit}`')
	return lit, 1
}
605
// extract_number collects and returns a string containing
// any bytes recognized as a TOML number except for "(+/-)nan" and "(+/-)inf".
// After the leading digit or sign, hexadecimal digits and the characters in
// `digit_extras` are consumed. A `.` ends the number while the scanner is on
// the left side of an assignment.
// An error is returned if the first character is neither a digit nor a sign.
@[direct_array_access; inline]
fn (mut s Scanner) extract_number() !string {
	ctx := @MOD + '.' + @STRUCT + '.' + @FN
	// extract_number is called when the scanner has already reached
	// a byte that is a number or +/- - so we rewind it to start at the correct
	// position to get the complete number. Even if it's only one digit
	s.pos--
	s.col--
	start := s.pos

	mut c := s.at()
	if !(u8(c).is_digit() || c in [`+`, `-`]) {
		return error(ctx + ' ${u8(c).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}')
	}
	s.pos++
	s.col++
	for s.pos < s.text.len {
		c = s.at()
		// Adjust scanner position to floating point numbers
		mut float_precision := 0
		if c == `.` {
			// Count the characters after the `.` up to the end of the line; the
			// count is dropped again if anything but a digit or `,` is found.
			// NOTE(review): `ahead` is truncated to a u8, so the comparison
			// with `end_of_text` looks like it can never be equal - confirm.
			mut offset := 1
			for ahead := u8(s.peek(offset)); ahead != end_of_text && ahead != `\n`; ahead = u8(s.peek(offset)) {
				if !ahead.is_digit() && ahead != `,` {
					float_precision = 0
					break
				}
				float_precision++
				offset++
			}
		}
		s.pos += float_precision
		s.col += float_precision
		// Handle signed exponent notation. I.e.: 3e2, 3E2, 3e-2, 3E+2, 3e0, 3.1e2, 3.1E2, -1E-1
		if c in [`e`, `E`] && s.peek(1) in [`+`, `-`] && u8(s.peek(2)).is_digit() {
			s.pos += 2
			s.col += 2
		}
		c = s.at()
		is_number_char := u8(c).is_hex_digit() || c in digit_extras
		if !is_number_char || (c == `.` && s.is_left_of_assign) {
			break
		}
		s.pos++
		s.col++
	}
	num := s.text[start..s.pos]
	util.printdbg(ctx, 'identified number "${num}" in range [${start} .. ${s.pos}]')
	return num
}
660
// extract_nan_or_inf_number collects and returns a string containing
// any bytes recognized as infinity or not-a-number TOML numbers.
// After the first character (`+`, `-`, `n` or `i`), all following `n`, `a`,
// `i` and `f` characters are consumed.
// An error is returned if the first character is none of the expected ones.
@[direct_array_access; inline]
fn (mut s Scanner) extract_nan_or_inf_number() !string {
	ctx := @MOD + '.' + @STRUCT + '.' + @FN
	// extract_nan_or_inf_number is called when the scanner has already identified that
	// +/- or 'nan'/'inf' bytes is up but we rewind it to start at the correct position
	s.pos--
	s.col--
	start := s.pos

	first := s.at()
	if first !in [`+`, `-`, `n`, `i`] {
		return error(ctx + ' ${u8(first).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}')
	}
	s.pos++
	s.col++
	for s.pos < s.text.len {
		if s.at() !in [`n`, `a`, `i`, `f`] {
			break
		}
		s.pos++
		s.col++
	}
	num := s.text[start..s.pos]
	util.printdbg(ctx, 'identified special number "${num}" in range [${start} .. ${s.pos}]')
	return num
}
691
// excerpt returns a string excerpt of the input text centered
// at `pos`. The `margin` argument defines how many characters
// on each side of `pos` are returned; the excerpt is cut off at the
// start and at the end of the text.
// Newlines in the excerpt are replaced by the two characters `\n`.
pub fn (s &Scanner) excerpt(pos int, margin int) string {
	mut from := 0
	if pos > 0 && pos >= margin {
		from = pos - margin
	}
	mut to := s.text.len
	if pos + margin < s.text.len {
		to = pos + margin
	}
	return s.text[from..to].replace('\n', r'\n')
}
700
// state returns a read-only copy of the scanner's current
// column, line number and position.
pub fn (s &Scanner) state() State {
	snapshot := State{
		col:     s.col
		line_nr: s.line_nr
		pos:     s.pos
	}
	return snapshot
}
709
// validate_and_skip_headers checks for byte order marks at the current position.
// A UTF-8 byte order mark is skipped, and its length is stored in `header_len`.
// An error is returned for UTF-16 and UTF-32 byte order marks, also if
// they follow directly after a UTF-8 byte order mark.
fn (mut s Scanner) validate_and_skip_headers() ! {
	// UTF-16 / UTF-32 headers (BE/LE)
	s.check_utf16_or_32_bom()!

	// NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only.

	// Skip optional UTF-8 header, if any.
	has_utf8_bom := s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF
	if has_utf8_bom {
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)')
		s.header_len = 3
		s.skip_n(s.header_len)
	}

	// Check after we've skipped UTF-8 BOM
	s.check_utf16_or_32_bom()!
}
726
// check_utf16_or_32_bom returns an error if the text at the current position
// starts with a UTF-32 or UTF-16 byte order mark (in either byte order).
// Before the error is returned, `header_len` is set to the length of the byte
// order mark and the scanner is moved past it.
fn (mut s Scanner) check_utf16_or_32_bom() ! {
	ctx := @MOD + '.' + @STRUCT + '.' + @FN
	b0 := s.at()
	b1 := s.peek(1)
	b2 := s.peek(2)
	b3 := s.peek(3)

	// UTF-32 is checked first, since FF FE is also how the UTF-16 (LE) mark starts.
	is_utf32_le := b0 == 0xFF && b1 == 0xFE && b2 == 0x00 && b3 == 0x00
	is_utf32_be := b0 == 0x00 && b1 == 0x00 && b2 == 0xFE && b3 == 0xFF
	if is_utf32_le || is_utf32_be {
		s.header_len = 4
		s.skip_n(s.header_len)
		return error(ctx +
			' UTF-32 is not a valid TOML encoding at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
	}

	is_utf16_be := b0 == 0xFE && b1 == 0xFF
	is_utf16_le := b0 == 0xFF && b1 == 0xFE
	if is_utf16_be || is_utf16_le {
		s.header_len = 2
		s.skip_n(s.header_len)
		return error(ctx +
			' UTF-16 is not a valid TOML encoding at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
	}
}
742